Java Token.getBegin примеры использования

Язык программирования: Java

Пространство имен/Пакет: de.unihd.dbs.uima.types.heideltime

Класс/Тип: Token

Метод/Функция: getBegin

Примеров на hotexamples.com: 3

Java Token.getBegin - 3 примера найдено. Это лучшие примеры Java кода для de.unihd.dbs.uima.types.heideltime.Token.getBegin, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

addToIndexes(3)

getBegin(3)

getEnd(3)

getCoveredText(2)

getPos(2)

setBegin(2)

setEnd(2)

setPos(2)

removeFromIndexes(1)

Пример #1

Показать файл

Файл: TreeTaggerWrapper.java Проект: qwaider/heideltime

  /**
   * Tokenizes the document text of the given JCas with the TreeTagger tokenizer and adds one
   * {@link Token} annotation per recognized token to the JCas.
   *
   * <p>Every double newline in the text is turned into an {@code EMPTYLINE} marker before
   * tokenizing. For each marker a zero-length token with POS {@code "EMPTYLINE"} is created at the
   * current offset; that token is only indexed when {@code annotate_partofspeech} is set.
   *
   * @param jcas JCas object supplied by the pipeline
   * @throws RuntimeException if a token returned by the tokenizer cannot be found in the document
   *     text at or after the end of the previously matched token
   */
  private void tokenize(JCas jcas) {
    Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);

    EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);

    // Honor the configured abbreviation file. The name must not be overwritten here: forcing
    // "english-abbreviations" would make the null branch unreachable, apply English abbreviations
    // to every language and silently mutate the shared properties object.
    TreeTaggerTokenizer ttt;
    if (ttprops.abbFileName != null) {
      String abbreviationPath =
          ttprops.rootPath
              + ttprops.fileSeparator
              + "lib"
              + ttprops.fileSeparator
              + ttprops.abbFileName;
      ttt = new TreeTaggerTokenizer(abbreviationPath, flags);
    } else {
      ttt = new TreeTaggerTokenizer(null, flags);
    }

    // Offsets are always resolved against the unmodified document text; only the tokenizer input
    // carries the EMPTYLINE markers.
    final String documentText = jcas.getDocumentText();
    List<String> tokenized = ttt.tokenize(documentText.replaceAll("\n\n", "\nEMPTYLINE\n"));

    int tokenOffset = 0;
    for (String s : tokenized) {
      if (s.equals("EMPTYLINE")) {
        // zero-length marker token; it does not advance the offset
        Token emptyLine = new Token(jcas);
        emptyLine.setBegin(tokenOffset);
        emptyLine.setEnd(tokenOffset);
        emptyLine.setPos("EMPTYLINE");
        if (annotate_partofspeech) {
          emptyLine.addToIndexes();
        }
        continue;
      }

      // Look the token up once, starting after the previous match.
      int begin = documentText.indexOf(s, tokenOffset);
      if (begin < 0) {
        // charset mismatch fallback: signal the (invalid) token
        throw new RuntimeException(
            "Opps! Could not find token "
                + s
                + " in JCas after tokenizing with TreeTagger."
                + " Hmm, there may exist a charset missmatch!"
                + " Default encoding is "
                + Charset.defaultCharset().name()
                + " and should always be UTF-8 (use -Dfile.encoding=UTF-8)."
                + " If input document is not UTF-8 use -e option to set it according to the input, additionally.");
      }

      // create the token and add it to the jcas's indexes
      Token newToken = new Token(jcas);
      newToken.setBegin(begin);
      newToken.setEnd(begin + s.length());
      newToken.addToIndexes();
      tokenOffset = newToken.getEnd();
    }
  }

Пример #2

Показать файл

Файл: TreeTaggerWrapper.java Проект: qwaider/heideltime

  /**
   * Tokenizes the document text of the given JCas with the Chinese tokenization script and adds
   * one {@link Token} annotation per recognized token to the JCas.
   *
   * <p>The document is split on line breaks; each line is written to the script's stdin and the
   * script's stdout is read back line by line. Every whitespace-separated piece of output is
   * located in the document text (searching from the end of the previous token) and indexed as a
   * token.
   *
   * <p>Any exception raised while tokenizing (including the "token not found" error) is caught and
   * its stack trace printed; the streams and the external process are released in all cases.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void tokenizeChinese(JCas jcas) {
    Process proc = null;
    BufferedReader in = null;
    BufferedWriter out = null;
    try {
      proc = ttprops.getChineseTokenizationProcess();
      Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);

      in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
      out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));

      final String documentText = jcas.getDocumentText();
      int tokenOffset = 0;

      // feed the document to the script one line at a time
      for (String inSplit : documentText.split("[\\r\\n]+")) {
        out.write(inSplit);
        out.newLine();
        out.flush();

        // Read at least one line of output, then keep reading only while more output is already
        // available so that the next readLine() cannot block.
        String s = in.readLine();
        while (s != null) {
          for (String tok : s.split("\\s+")) {
            int begin = documentText.indexOf(tok, tokenOffset);
            if (begin < 0) {
              throw new RuntimeException(
                  "Could not find token "
                      + tok
                      + " in JCas after tokenizing with Chinese tokenization script.");
            }

            // create the token and add it to the jcas's indexes
            Token newToken = new Token(jcas);
            newToken.setBegin(begin);
            newToken.setEnd(begin + tok.length());
            newToken.addToIndexes();
            tokenOffset = newToken.getEnd();
          }

          if (!in.ready()) {
            break;
          }
          s = in.readLine();
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the streams and the external process on every path, not only on success.
      if (out != null) {
        try {
          out.close();
        } catch (IOException ignored) {
          // best-effort cleanup; the process may already have gone away
        }
      }
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // best-effort cleanup
        }
      }
      if (proc != null) {
        proc.destroy();
      }
    }
  }

Пример #3

Показать файл

Файл: TreeTaggerWrapper.java Проект: qwaider/heideltime

  /**
   * Adds part-of-speech tags to the tokens already present in the JCas and, if {@code
   * annotate_sentences} is set, adds {@link Sentence} annotations, using the TreeTagger program.
   *
   * <p>The covered text of every non-empty token is written to a temporary file, one token per
   * line, which is handed to the tagging process. Each line of the process output is assigned as
   * the POS value of the next non-empty token. Zero-length tokens are removed from the indexes;
   * those with POS {@code "EMPTYLINE"} additionally terminate the current sentence.
   *
   * <p>If the temporary file cannot be written the JVM is terminated via {@code System.exit(-1)}.
   * Exceptions during tagging are caught and their stack trace printed. The reader, the tagging
   * process and the temporary file are released in all cases.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void doTreeTag(JCas jcas) {
    File tmpDocument = null;
    BufferedWriter tmpFileWriter;
    ArrayList<Token> tokens = new ArrayList<Token>();

    try {
      // create a temporary file and write our pre-existing tokens to it.
      tmpDocument = File.createTempFile("postokens", null);
      tmpFileWriter =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

      // iterate over existing tokens
      FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
      while (ai.hasNext()) {
        Token t = (Token) ai.next();

        tokens.add(t);
        // zero-length tokens (e.g. EMPTYLINE markers) are remembered but not sent to the tagger
        if (!(t.getBegin() == t.getEnd())) {
          tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
        }
      }

      tmpFileWriter.close();
    } catch (IOException e) {
      Logger.printError(
          "Something went wrong creating a temporary file for the treetagger to process.");
      System.exit(-1);
    }

    // Possible End-of-Sentence Tags
    HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
    hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
    hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
    hsEndOfSentenceTag.add("FS"); // SPANISH
    hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
    hsEndOfSentenceTag.add("ew"); // CHINESE

    Process p = null;
    BufferedReader in = null;
    try {
      p = ttprops.getTreeTaggingProcess(tmpDocument);
      Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

      in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

      Sentence sentence = null;
      // iterate over all the output lines and tokens array (which have the same source and are
      // hence symmetric)
      int i = 0;
      String s = null;
      while ((s = in.readLine()) != null) {
        // grab a token
        Token token = tokens.get(i++);
        // Skip empty tokens (such as empty lines): they were not written to the tagger input, so
        // the current output line belongs to the next non-empty token.
        while (token.getCoveredText().equals("")) {
          // if part of the configuration, also add sentences to the jcas document
          if ((annotate_sentences)
              && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
            // Establish sentence structure
            if (sentence == null) {
              sentence = new Sentence(jcas);
              sentence.setBegin(token.getBegin());
            }

            // An empty line ends the current sentence; only non-empty sentences are indexed.
            sentence.setEnd(token.getEnd());
            if (sentence.getBegin() < sentence.getEnd()) {
              sentence.addToIndexes();
            }

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
          token.removeFromIndexes();
          token = tokens.get(i++);
        }
        // remove tokens, otherwise they are in the index twice
        token.removeFromIndexes();
        // set part of speech tag and add to indexes again
        if (!(token.getCoveredText().equals(""))) {
          token.setPos(s);
          token.addToIndexes();
        }

        // if part of the configuration, also add sentences to the jcas document
        if (annotate_sentences) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }

          // Finish current sentence if end-of-sentence pos was found or document ended
          if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
            sentence.setEnd(token.getEnd());
            sentence.addToIndexes();

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
        }
      }

      // Tagger output is exhausted but tokens remain: close a still-open sentence exactly once,
      // ending at the last token, so the same Sentence is not re-added to the indexes for every
      // leftover token.
      if (i < tokens.size() && sentence != null) {
        sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
        sentence.addToIndexes();
      }
      // Drop the leftover EMPTYLINE marker tokens from the indexes.
      while (i < tokens.size()) {
        Token token = tokens.get(i++);
        if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
          token.removeFromIndexes();
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the reader and the tagging process on every path, not only on success.
      if (in != null) {
        try {
          in.close();
        } catch (IOException ignored) {
          // best-effort cleanup
        }
      }
      if (p != null) {
        p.destroy();
      }
      // Delete temporary files
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
    }
  }