Ejemplo n.º 1
0
  /**
   * Improves German sentence boundaries; the treetagger splits German sentences incorrectly on
   * some occasions.
   *
   * <p>Every pair of neighbouring {@link Sentence} annotations is inspected. The pair is marked for
   * merging when either
   *
   * <ul>
   *   <li>the POS values of the last two tokens of the earlier sentence and of the first token of
   *       the later sentence match one of the built-in rule triples, or
   *   <li>the covered text of the later sentence's first token matches {@code ^[a-z/].*}.
   * </ul>
   *
   * Chains of marked pairs are collected into groups. Afterwards all sentences of a group are
   * removed from the indexes and replaced by one new sentence spanning from the smallest begin to
   * the largest end offset of the group.
   *
   * <p>NOTE(review): a pair is skipped entirely when the earlier sentence has fewer than two tokens
   * or the later one has none, even though the lower-case rule only needs the first token. This
   * mirrors the previous behaviour; confirm whether it is intended.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void improveGermanSentences(JCas jcas) {
    /*
     * POS tag sequences (penultimate token, last token, first token of the next sentence; each
     * entry is a regular expression) that decide whether two sentences that have (supposedly
     * wrongfully) been split are merged again.
     */
    final String[][] posRules = {
      {"CARD", "\\$.", "NN"},
      {"CARD", "\\$.", "NE"}
    };

    /*
     * Groups of sentences to be merged. A list is used on purpose: the groups keep growing after
     * they have been stored, and elements of a hash-based set must not change their hash code
     * while they are contained in it.
     */
    java.util.List<HashSet<Sentence>> toMerge = new java.util.ArrayList<HashSet<Sentence>>();

    // compare two sentences at a time in order to have access to all POS tags
    FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator();
    Sentence prevSent = null;
    Sentence thisSent = null;
    while (sentIter.hasNext()) {
      prevSent = thisSent;
      thisSent = (Sentence) sentIter.next();
      if (prevSent == null) {
        // very first sentence: there is nothing to compare it with yet
        continue;
      }

      /*
       * select the last two tokens within the previous sentence as well as the
       * first of the current one and check for matches.
       */
      Token firstToken = null;
      FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent);
      if (tokIter.hasNext()) {
        firstToken = (Token) tokIter.next();
      }

      Token penultimateToken = null;
      Token ultimateToken = null;
      tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent);
      while (tokIter.hasNext()) {
        // shift the window; penultimateToken stays null until a second token has been seen
        penultimateToken = ultimateToken;
        ultimateToken = (Token) tokIter.next();
      }

      // check that all tokens for further analysis are present. if not: skip
      if (penultimateToken == null || ultimateToken == null || firstToken == null) {
        continue;
      }

      // the lower-case criterion is independent of the POS rules, so evaluate it only once
      boolean shouldMerge = firstToken.getCoveredText().matches("^[a-z/].*");

      // otherwise one of the pre-defined POS rules has to fit; the first hit is sufficient
      String penultimatePos = penultimateToken.getPos();
      String ultimatePos = ultimateToken.getPos();
      String firstPos = firstToken.getPos();
      for (int r = 0; !shouldMerge && r < posRules.length; r++) {
        shouldMerge =
            penultimatePos != null
                && penultimatePos.matches(posRules[r][0])
                && ultimatePos != null
                && ultimatePos.matches(posRules[r][1])
                && firstPos != null
                && firstPos.matches(posRules[r][2]);
      }

      if (!shouldMerge) {
        continue;
      }

      // check whether one of the previous groups already contains one of our sentences
      HashSet<Sentence> group = null;
      for (HashSet<Sentence> mergeCandidate : toMerge) {
        if (mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) {
          group = mergeCandidate;
          break;
        }
      }

      // if none of the sentences was already to be merged with another, open a new group
      if (group == null) {
        group = new HashSet<Sentence>();
        toMerge.add(group);
      }

      // we add both here because sets ignore duplicates
      group.add(prevSent);
      group.add(thisSent);
    }

    // replace every collected group by a single sentence covering all of its members
    for (HashSet<Sentence> mergeCandidate : toMerge) {
      // find the earliest beginning and latest end for the set of sentences
      int beginIndex = Integer.MAX_VALUE;
      int endIndex = Integer.MIN_VALUE;

      for (Sentence s : mergeCandidate) {
        beginIndex = Math.min(beginIndex, s.getBegin());
        endIndex = Math.max(endIndex, s.getEnd());

        s.removeFromIndexes();
      }

      // set values, add to jcas
      Sentence mergedSent = new Sentence(jcas);
      mergedSent.setBegin(beginIndex);
      mergedSent.setEnd(endIndex);
      mergedSent.addToIndexes();
    }
  }
Ejemplo n.º 2
0
  /**
   * Based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas
   * object using the treetagger program.
   *
   * <p>The covered text of every non-empty token is written, one token per line, to a temporary
   * UTF-8 file which is handed to the process obtained from {@code ttprops}. Each line the process
   * prints is stored as the POS value of the corresponding token. If {@code annotate_sentences} is
   * set, {@link Sentence} annotations are created as well: a sentence is closed when the tagger
   * output equals one of the known end-of-sentence tags, when the last token has been reached, or
   * when an empty token whose POS value is {@code "EMPTYLINE"} is encountered.
   *
   * <p>If the temporary file cannot be written, an error is logged and the JVM is terminated via
   * {@code System.exit(-1)}. Any failure while talking to the tagger is logged and otherwise
   * ignored; the reader, the process and the temporary file are released in every case.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void doTreeTag(JCas jcas) {
    File tmpDocument = null;
    ArrayList<Token> tokens = new ArrayList<Token>();

    try {
      // create a temporary file and write our pre-existing tokens to it.
      tmpDocument = File.createTempFile("postokens", null);
      BufferedWriter tmpFileWriter =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
      try {
        // iterate over existing tokens
        FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
        while (ai.hasNext()) {
          Token t = (Token) ai.next();

          // every token is remembered, but zero-length tokens are not passed to the tagger
          tokens.add(t);
          if (t.getBegin() != t.getEnd()) {
            tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
          }
        }
      } finally {
        // release the file handle even if writing failed half-way
        tmpFileWriter.close();
      }
    } catch (IOException e) {
      Logger.printError(
          "Something went wrong creating a temporary file for the treetagger to process.");
      // do not leave a half-written temporary file behind before terminating
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
      System.exit(-1);
    }

    // Possible End-of-Sentence Tags
    HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
    hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
    hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
    hsEndOfSentenceTag.add("FS"); // SPANISH
    hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
    hsEndOfSentenceTag.add("ew"); // CHINESE

    Process p = null;
    BufferedReader in = null;
    try {
      p = ttprops.getTreeTaggingProcess(tmpDocument);
      Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

      in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

      Sentence sentence = null;
      // iterate over all the output lines and tokens array (which have the same source and are
      // hence symmetric)
      int i = 0;
      String s = null;
      while ((s = in.readLine()) != null) {
        if (i >= tokens.size()) {
          // more output lines than tokens: stop instead of running past the end of the list
          Logger.printError("TreeTagger returned more lines than there are tokens.");
          break;
        }

        // grab a token
        Token token = tokens.get(i++);
        boolean tokensExhausted = false;
        // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file.
        // Empty tokens were never sent to the tagger, so they are skipped (and removed from the
        // indexes) until the token belonging to the current output line is found.
        while (token.getCoveredText().equals("")) {
          // if part of the configuration, also add sentences to the jcas document
          if (annotate_sentences && token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
            // Establish sentence structure
            if (sentence == null) {
              sentence = new Sentence(jcas);
              sentence.setBegin(token.getBegin());
            }

            // An empty line ends the current sentence; zero-length sentences are not indexed
            sentence.setEnd(token.getEnd());
            if (sentence.getBegin() < sentence.getEnd()) {
              sentence.addToIndexes();
            }

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
          token.removeFromIndexes();

          if (i >= tokens.size()) {
            tokensExhausted = true;
            break;
          }
          token = tokens.get(i++);
        }
        if (tokensExhausted) {
          // only empty tokens were left, so this output line has no token to attach to
          Logger.printError("TreeTagger returned more lines than there are tokens.");
          break;
        }

        // remove the token first, otherwise it would be in the index twice, then set the part of
        // speech tag and add it again (the token is known to be non-empty at this point)
        token.removeFromIndexes();
        token.setPos(s);
        token.addToIndexes();

        // if part of the configuration, also add sentences to the jcas document
        if (annotate_sentences) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }

          // Finish current sentence if end-of-sentence pos was found or document ended
          if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
            sentence.setEnd(token.getEnd());
            sentence.addToIndexes();

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
        }
      }

      // A sentence that is still open here ends with the last token of the document. It is added
      // exactly once; adding it again for every leftover token would duplicate the index entry.
      if (sentence != null) {
        sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
        sentence.addToIndexes();
        sentence = null;
      }

      // tokens the tagger produced no output for: drop the remaining empty-line markers
      while (i < tokens.size()) {
        Token token = tokens.get(i++);
        if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
          token.removeFromIndexes();
        }
      }
    } catch (Exception e) {
      Logger.printError("Something went wrong while running the treetagger: " + e);
      e.printStackTrace();
    } finally {
      // release the reader and the external process even if tagging failed
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // nothing sensible can be done if closing the process output fails
        }
      }
      if (p != null) {
        p.destroy();
      }
      // Delete temporary files
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
    }
  }