Пример #1
0
 /**
  * Writes one sentence as a block delimited by {@code <D=id>} and {@code </D>}, with one token per
  * line in between.
  *
  * <p>The sentence is split on single space characters. If it has more than
  * {@code SENTENCE_MAXTOKENS} tokens, nothing at all is written.
  *
  * @param out writer receiving the formatted block
  * @param line the sentence text, tokens separated by single spaces
  * @param docQuerySnrId identifier whose {@code toValidString()} value is placed in the opening tag
  * @throws IOException if writing to {@code out} fails
  */
 private static void writeLine(Writer out, String line, TextIdentifier docQuerySnrId)
     throws IOException {
   final String[] words = line.split(" ");
   if (words.length <= SENTENCE_MAXTOKENS) {
     // Opening tag carries the sentence identifier.
     out.write("<D=" + docQuerySnrId.toValidString() + ">\n");
     for (int i = 0; i < words.length; i++) {
       out.write(words[i] + "\n");
     }
     out.write("</D>\n");
   }
 }
Пример #2
0
  /**
   * Reads sentences (one per line, UTF-8) from standard input, keeps those that match the query of
   * their document, and writes the retained sentences in formatted form to standard output.
   *
   * <p>A line starting with {@code #} is a document header: the rest of the line is parsed with
   * {@code TextIdentifier.fromDelimited} and must carry a non-empty query id. A header resets the
   * sentence counter to 1, clears the look-behind buffer and closes any open look-forward window.
   * Every other line is treated as a sentence of the most recent header.
   *
   * <p>With an extraction window of {@code n > 0}, up to {@code n} unprinted sentences directly
   * before a matching sentence and up to {@code n} sentences directly after it are printed as
   * well. With {@code -1}, every sentence is printed without consulting the queries.
   *
   * @param args {@code args[0]} is passed to the {@code QueryList} constructor; the optional
   *     {@code args[1]} is the extraction window, an integer {@code >= -1} (default 0)
   * @throws IOException if reading standard input or writing standard output fails
   * @throws IllegalArgumentException if a header has no query id, if a sentence appears before
   *     the first header, or if processing a sentence fails with an
   *     {@code IllegalArgumentException} (which is chained as the cause)
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1 || args.length > 2) {
      System.err.println("Format <query_expanded.xml> [extraction_window:int]");
      System.err.println("A stream with one sentece per line is read in, sentences containing");
      System.err.println("a query token are retained and printed out, formatted.");
      System.err.println(
          "The optional extraction_window parameter must be an integer between -1"
              + " and inf. It specifies the number of sentences printed out before and "
              + " after a sentence containing a query. E.g., extraction_window=1 will"
              + " print sentences immediately preceding and following query sentences."
              + " If set to -1, all sentences will be printed. Default is 0.");
      return;
    }
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(System.out, "UTF-8"));
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    QueryList ql = new QueryList(args[0]);
    int extractionWin = 0;
    if (args.length > 1) {
      try {
        extractionWin = Integer.parseInt(args[1]);
        if (extractionWin < -1) {
          // Route out-of-range values through the same error path as unparsable ones.
          throw new NumberFormatException();
        }
      } catch (NumberFormatException e) {
        System.err.println("Extraction window must be an integer between -1" + " and inf.");
        return;
      }
    }

    // Look-behind: holds the most recent unprinted sentences (at most extractionWin of them).
    // Only needed for a positive window; stays null for 0 and -1.
    CircularFifoBuffer lastSentences = null;
    if (extractionWin > 0) {
      lastSentences = new CircularFifoBuffer(extractionWin);
    }
    // Look-forward: number of sentences still to print after the last matching sentence.
    int currentExtractionWin = 0;

    int sNr = 1;
    TextIdentifier docQueryId = null;
    for (String line; (line = br.readLine()) != null; ) {
      if (line.startsWith("#")) {
        // Document header: start a new document and reset all per-document state.
        docQueryId = TextIdentifier.fromDelimited(line.substring(1));
        if (docQueryId.getQueryId() == null || docQueryId.getQueryId().isEmpty()) {
          throw new IllegalArgumentException("TextIdentifier without query id:" + line);
        }
        sNr = 1;
        if (lastSentences != null) {
          lastSentences.clear();
        }
        currentExtractionWin = 0;
      } else {
        // Malformed input must be rejected even when assertions are disabled.
        if (docQueryId == null) {
          throw new IllegalArgumentException(
              "Sentence line before any '#' document header:" + line);
        }
        TextIdentifier docQuerySnrId = new TextIdentifier(docQueryId).setSentNr(sNr);
        try {
          if (extractionWin == -1
              || hasQueryMatch(ql.getQueryById(docQuerySnrId.getQueryId()), line)) {
            // Flush the look-behind buffer first. The buffered sentences are the ones
            // directly preceding the current sentence, so numbering starts at sNr - size.
            if (lastSentences != null) {
              int oldSNr = sNr - lastSentences.size();
              while (lastSentences.size() > 0) {
                Object lineStrObject = lastSentences.remove();
                writeLine(
                    out,
                    (String) lineStrObject,
                    new TextIdentifier(docQueryId).setSentNr(oldSNr++));
              }
            }
            // Current (matching) sentence; it also re-opens the look-forward window.
            writeLine(out, line, docQuerySnrId);
            currentExtractionWin = extractionWin;
          } else { // no match in sentence
            if (currentExtractionWin > 0) {
              // Within the look-forward window of the last matching sentence.
              writeLine(out, line, docQuerySnrId);
              --currentExtractionWin;
            } else if (lastSentences != null) {
              // Possibly in the look-behind window of a later match: remember it.
              lastSentences.add(line);
            }
          }
        } catch (IllegalArgumentException e) {
          // Add the sentence identifiers for context and chain the original exception
          // so that its stack trace is preserved.
          throw new IllegalArgumentException(
              "Error processing sentence with id:"
                  + " \n"
                  + docQueryId
                  + "\n"
                  + docQuerySnrId
                  + "\n"
                  + e,
              e);
        }
        sNr += 1;
      }
    }
    out.flush();
    br.close();
  }