Пример #1
0
  /**
   * tokenizes a given JCas object's document text using the treetagger program and adds the
   * recognized tokens to the JCas object.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void tokenize(JCas jcas) {
    // read tokenized text to add tokens to the jcas
    Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);

    EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);
    TreeTaggerTokenizer ttt;
    ttprops.abbFileName = "english-abbreviations";
    if (ttprops.abbFileName != null) {
      ttt =
          new TreeTaggerTokenizer(
              ttprops.rootPath
                  + ttprops.fileSeparator
                  + "lib"
                  + ttprops.fileSeparator
                  + ttprops.abbFileName,
              flags);
    } else {
      ttt = new TreeTaggerTokenizer(null, flags);
    }

    String docText = jcas.getDocumentText().replaceAll("\n\n", "\nEMPTYLINE\n");
    List<String> tokenized = ttt.tokenize(docText);

    int tokenOffset = 0;
    // loop through all the lines in the treetagger output
    for (String s : tokenized) {
      // charset missmatch fallback: signal (invalid) s
      if ((!(s.equals("EMPTYLINE"))) && (jcas.getDocumentText().indexOf(s, tokenOffset) < 0))
        throw new RuntimeException(
            "Opps! Could not find token "
                + s
                + " in JCas after tokenizing with TreeTagger."
                + " Hmm, there may exist a charset missmatch!"
                + " Default encoding is "
                + Charset.defaultCharset().name()
                + " and should always be UTF-8 (use -Dfile.encoding=UTF-8)."
                + " If input document is not UTF-8 use -e option to set it according to the input, additionally.");

      // create tokens and add them to the jcas's indexes.
      Token newToken = new Token(jcas);
      if (s.equals("EMPTYLINE")) {
        newToken.setBegin(tokenOffset);
        newToken.setEnd(tokenOffset);
        newToken.setPos("EMPTYLINE");
        if (annotate_partofspeech) {
          newToken.addToIndexes();
        }
      } else {
        newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset));
        newToken.setEnd(newToken.getBegin() + s.length());
        newToken.addToIndexes();
        tokenOffset = newToken.getEnd();
      }
    }
  }
Пример #2
0
  /**
   * tokenizes a given JCas object's document text using the chinese tokenization script and adds
   * the recognized tokens to the JCas object.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void tokenizeChinese(JCas jcas) {
    try {
      // read tokenized text to add tokens to the jcas
      Process proc = ttprops.getChineseTokenizationProcess();
      Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);

      BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
      BufferedWriter out =
          new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));

      Integer tokenOffset = 0;
      // loop through all the lines in the stdout output
      String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+");
      for (String inSplit : inSplits) {
        out.write(inSplit);
        out.newLine();
        out.flush();

        // do one initial read
        String s = in.readLine();
        do {
          // break out of the loop if we've read a null
          if (s == null) break;

          String[] outSplits = s.split("\\s+");
          for (String tok : outSplits) {
            if (jcas.getDocumentText().indexOf(tok, tokenOffset) < 0)
              throw new RuntimeException(
                  "Could not find token "
                      + tok
                      + " in JCas after tokenizing with Chinese tokenization script.");

            // create tokens and add them to the jcas's indexes.
            Token newToken = new Token(jcas);
            newToken.setBegin(jcas.getDocumentText().indexOf(tok, tokenOffset));
            newToken.setEnd(newToken.getBegin() + tok.length());
            newToken.addToIndexes();
            tokenOffset = newToken.getEnd();
          }

          // break out of the loop if the next read will block
          if (!in.ready()) break;

          s = in.readLine();
        } while (true);
      }

      // clean up
      in.close();
      proc.destroy();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
  private void populateFileList(String dirPath) throws ResourceInitializationException {
    ArrayList<File> myFiles = new ArrayList<File>();
    File dir = new File(dirPath);

    // check if the given directory path is valid
    if (!dir.exists() || !dir.isDirectory()) throw new ResourceInitializationException();
    else myFiles.addAll(Arrays.asList(dir.listFiles()));

    // check for existence and readability; add handle to the list
    for (File f : myFiles) {
      if (!f.exists() || !f.isFile() || !f.canRead()) {
        Logger.printDetail(
            component,
            "File \""
                + f.getAbsolutePath()
                + "\" was ignored because it either didn't exist, wasn't a file or wasn't readable.");
      } else {
        files.add(f);
      }
    }

    numberOfDocuments = files.size();
  }
Пример #4
0
  /**
   * based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas
   * object using the treetagger program.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void doTreeTag(JCas jcas) {
    File tmpDocument = null;
    BufferedWriter tmpFileWriter;
    ArrayList<Token> tokens = new ArrayList<Token>();

    try {
      // create a temporary file and write our pre-existing tokens to it.
      tmpDocument = File.createTempFile("postokens", null);
      tmpFileWriter =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));

      // iterate over existing tokens
      FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
      while (ai.hasNext()) {
        Token t = (Token) ai.next();

        tokens.add(t);
        if (!(t.getBegin() == t.getEnd())) {
          tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
        }
      }

      tmpFileWriter.close();
    } catch (IOException e) {
      Logger.printError(
          "Something went wrong creating a temporary file for the treetagger to process.");
      System.exit(-1);
    }

    // Possible End-of-Sentence Tags
    HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
    hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
    hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
    hsEndOfSentenceTag.add("FS"); // SPANISH
    hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
    hsEndOfSentenceTag.add("ew"); // CHINESE

    try {
      Process p = ttprops.getTreeTaggingProcess(tmpDocument);
      Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

      BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

      Sentence sentence = null;
      // iterate over all the output lines and tokens array (which have the same source and are
      // hence symmetric)
      int i = 0;
      String s = null;
      while ((s = in.readLine()) != null) {
        // grab a token
        Token token = tokens.get(i++);
        // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
        while (token.getCoveredText().equals("")) {
          // if part of the configuration, also add sentences to the jcas document
          if ((annotate_sentences)
              && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
            // Establish sentence structure
            if (sentence == null) {
              sentence = new Sentence(jcas);
              sentence.setBegin(token.getBegin());
            }

            // Finish current sentence if end-of-sentence pos was found or document ended
            sentence.setEnd(token.getEnd());
            if (sentence.getBegin() < sentence.getEnd()) {
              sentence.addToIndexes();
            }

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
            //						sentence = new Sentence(jcas);
          }
          token.removeFromIndexes();
          token = tokens.get(i++);
        }
        // remove tokens, otherwise they are in the index twice
        token.removeFromIndexes();
        // set part of speech tag and add to indexes again
        if (!(token.getCoveredText().equals(""))) {
          token.setPos(s);
          token.addToIndexes();
        }

        // if part of the configuration, also add sentences to the jcas document
        if (annotate_sentences) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }

          // Finish current sentence if end-of-sentence pos was found or document ended
          if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
            sentence.setEnd(token.getEnd());
            sentence.addToIndexes();

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
            //						sentence = new Sentence(jcas);
          }
        }
      }
      while (i < tokens.size()) {
        if (!(sentence == null)) {
          sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
          sentence.addToIndexes();
        }
        Token token = tokens.get(i++);
        if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
          token.removeFromIndexes();
        }
      }
      in.close();
      p.destroy();
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Delete temporary files
      tmpDocument.delete();
    }
  }