Java Token.getPos примеры использования

Язык программирования: Java

Пространство имен/Пакет: de.unihd.dbs.uima.types.heideltime

Класс/Тип: Token

Метод/Функция: getPos

Примеров на hotexamples.com: 2

Java Token.getPos — найдено 2 примера. Это лучшие примеры Java-кода для de.unihd.dbs.uima.types.heideltime.Token.getPos, полученные из проектов с открытым исходным кодом. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

addToIndexes(3)

getBegin(3)

getEnd(3)

getCoveredText(2)

getPos(2)

setBegin(2)

setEnd(2)

setPos(2)

removeFromIndexes(1)

Пример #1

Показать файл

Файл: TreeTaggerWrapper.java Проект: qwaider/heideltime

  /**
   * Improve german sentences; the treetagger splits german sentences incorrectly on some
   * occasions.
   *
   * <p>Consecutive sentences, in the order delivered by the sentence annotation index, are
   * inspected pairwise. A pair is marked for merging when either
   *
   * <ul>
   *   <li>the POS tags of the last two tokens of the earlier sentence and of the first token of
   *       the later sentence match one of the hard-coded POS rules, or
   *   <li>the covered text of the later sentence's first token starts with a lower case ASCII
   *       letter or a slash.
   * </ul>
   *
   * <p>Pairs that share a sentence end up in the same group. Every group is then replaced by one
   * new sentence reaching from the smallest begin to the largest end offset of its members; the
   * members themselves are removed from the indexes.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void improveGermanSentences(JCas jcas) {
    /*
     * these POS tag sequences (regular expressions) will decide whether we want to merge two
     * sentences that have (supposedly wrongfully) been split. order:
     * penultimate token, ultimate token of the previous sentence, first token of this sentence.
     */
    String[][] posRules = {
      {"CARD", "\\$.", "NN"},
      {"CARD", "\\$.", "NE"}
    };

    FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator();

    /*
     * groups of sentences to be merged. kept in a list rather than a set: the groups are
     * modified after insertion, which would change their hash code while stored in a hash set.
     */
    ArrayList<HashSet<Sentence>> toMerge = new ArrayList<HashSet<Sentence>>();

    // compare two sentences at a time in order to have access to all POS tags
    Sentence prevSent = null, thisSent = null;
    while (sentIter.hasNext()) {
      if (thisSent == null) {
        // very first sentence: there is no predecessor to compare with yet
        thisSent = (Sentence) sentIter.next();
        continue;
      }

      prevSent = thisSent;
      thisSent = (Sentence) sentIter.next();

      /*
       * select the last two tokens within the previous sentence as well as the
       * first of the current one and check for matches.
       */
      Token penultimateToken = null, ultimateToken = null, firstToken = null;
      FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent);
      if (tokIter.hasNext()) {
        firstToken = (Token) tokIter.next();
      }

      tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent);
      while (tokIter.hasNext()) {
        // slide a two-token window to the end of the previous sentence
        penultimateToken = ultimateToken;
        ultimateToken = (Token) tokIter.next();
      }

      // check that all tokens for further analysis are present. if not: skip
      if (penultimateToken == null || ultimateToken == null || firstToken == null) {
        continue;
      }

      /*
       * either the first token's covered text begins with a lower case character (or slash),
       * which is independent of the rules and therefore checked once, or one of the
       * pre-defined POS rules fits.
       */
      boolean shouldMerge = firstToken.getCoveredText().matches("^[a-z/].*");
      if (!shouldMerge) {
        String penultimatePos = penultimateToken.getPos();
        String ultimatePos = ultimateToken.getPos();
        String firstPos = firstToken.getPos();

        for (int r = 0; !shouldMerge && r < posRules.length; r++) {
          String[] posRule = posRules[r];
          shouldMerge =
              penultimatePos != null
                  && penultimatePos.matches(posRule[0])
                  && ultimatePos != null
                  && ultimatePos.matches(posRule[1])
                  && firstPos != null
                  && firstPos.matches(posRule[2]);
        }
      }

      if (!shouldMerge) {
        continue;
      }

      /*
       * check whether one of the previous candidate groups already
       * contains one of our sentences.
       */
      boolean candidateExisted = false;
      for (HashSet<Sentence> mergeCandidate : toMerge) {
        if (mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) {
          // we add both here because sets ignore duplicates
          mergeCandidate.add(prevSent);
          mergeCandidate.add(thisSent);

          candidateExisted = true;
          break;
        }
      }

      /*
       * if neither of the sentences was already to be merged with another,
       * create a new merge candidate group
       */
      if (!candidateExisted) {
        HashSet<Sentence> newCandidate = new HashSet<Sentence>();
        newCandidate.add(prevSent);
        newCandidate.add(thisSent);

        toMerge.add(newCandidate);
      }
    }

    // iterate over the previously collected merge candidates
    for (HashSet<Sentence> mergeCandidate : toMerge) {
      // find the earliest beginning and latest end for the group of sentences
      int beginIndex = Integer.MAX_VALUE;
      int endIndex = Integer.MIN_VALUE;

      Sentence mergedSent = new Sentence(jcas);
      for (Sentence s : mergeCandidate) {
        beginIndex = Math.min(beginIndex, s.getBegin());
        endIndex = Math.max(endIndex, s.getEnd());

        // the merged sentence replaces its parts
        s.removeFromIndexes();
      }

      // set values, add to jcas
      mergedSent.setBegin(beginIndex);
      mergedSent.setEnd(endIndex);
      mergedSent.addToIndexes();
    }
  }

Пример #2

Показать файл

Файл: TreeTaggerWrapper.java Проект: qwaider/heideltime

  /**
   * Based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas
   * object using the treetagger program.
   *
   * <p>The covered text of every token with a non-empty span is written, one per line, to a
   * temporary file which is handed to the process obtained from
   * {@code ttprops.getTreeTaggingProcess}. Each line read back from that process is stored as the
   * POS value of the next token with non-empty covered text. If {@code annotate_sentences} is
   * set, sentence annotations are created along the way; a sentence is closed when the line read
   * is one of the known end-of-sentence tags or when the last token has been consumed.
   *
   * <p>If the temporary file cannot be written, an error is logged and the JVM is terminated via
   * {@code System.exit(-1)}. Any exception during tagging is printed to standard error; the
   * reader, the external process and the temporary file are released in every case.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void doTreeTag(JCas jcas) {
    File tmpDocument = null;
    BufferedWriter tmpFileWriter;
    ArrayList<Token> tokens = new ArrayList<Token>();

    try {
      // create a temporary file and write our pre-existing tokens to it.
      tmpDocument = File.createTempFile("postokens", null);
      tmpFileWriter =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

      // iterate over existing tokens
      FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
      while (ai.hasNext()) {
        Token t = (Token) ai.next();

        // every token is remembered, but zero-length tokens are not sent to the tagger
        tokens.add(t);
        if (!(t.getBegin() == t.getEnd())) {
          tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
        }
      }

      tmpFileWriter.close();
    } catch (IOException e) {
      Logger.printError(
          "Something went wrong creating a temporary file for the treetagger to process.");
      System.exit(-1);
    }

    // Possible End-of-Sentence Tags
    HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
    hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
    hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
    hsEndOfSentenceTag.add("FS"); // SPANISH
    hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
    hsEndOfSentenceTag.add("ew"); // CHINESE

    // declared outside the try block so that they can be released in the finally block
    Process p = null;
    BufferedReader in = null;
    try {
      p = ttprops.getTreeTaggingProcess(tmpDocument);
      Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

      in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

      Sentence sentence = null;
      // iterate over all the output lines and tokens array (which have the same source and are
      // hence symmetric)
      int i = 0;
      String s = null;
      while ((s = in.readLine()) != null) {
        // grab a token
        Token token = tokens.get(i++);
        // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
        while (token.getCoveredText().equals("")) {
          // if part of the configuration, also add sentences to the jcas document
          if ((annotate_sentences)
              && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
            // Establish sentence structure
            if (sentence == null) {
              sentence = new Sentence(jcas);
              sentence.setBegin(token.getBegin());
            }

            // An empty line ends the current sentence; zero-length sentences are not indexed
            sentence.setEnd(token.getEnd());
            if (sentence.getBegin() < sentence.getEnd()) {
              sentence.addToIndexes();
            }

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
          // empty tokens were not sent to the tagger: drop them and move on to the next token
          token.removeFromIndexes();
          token = tokens.get(i++);
        }
        // remove tokens, otherwise they are in the index twice
        token.removeFromIndexes();
        // set part of speech tag and add to indexes again
        if (!(token.getCoveredText().equals(""))) {
          token.setPos(s);
          token.addToIndexes();
        }

        // if part of the configuration, also add sentences to the jcas document
        if (annotate_sentences) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }

          // Finish current sentence if end-of-sentence pos was found or document ended
          if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
            sentence.setEnd(token.getEnd());
            sentence.addToIndexes();

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
        }
      }

      // The tagger output ended before all tokens were consumed: close a sentence that is still
      // open at the end of the last token. This is done exactly once; the sentence must not be
      // added to the indexes again for every remaining token.
      if (i < tokens.size() && sentence != null) {
        sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
        sentence.addToIndexes();
        sentence = null;
      }

      // remove the left-over empty-line marker tokens from the indexes
      while (i < tokens.size()) {
        Token token = tokens.get(i++);
        if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
          token.removeFromIndexes();
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // release the reader and the external process even if tagging failed half-way
      if (in != null) {
        try {
          in.close();
        } catch (IOException closeError) {
          closeError.printStackTrace();
        }
      }
      if (p != null) {
        p.destroy();
      }

      // Delete temporary files
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
    }
  }