Ejemplo n.º 1
0
  /**
   * Initialization method where we fill configuration values and check some prerequisites.
   *
   * <p>Reads the language and the annotation switches from the UIMA context, derives the
   * TreeTagger file names from them and verifies that the required files exist below the
   * TreeTagger root directory. The JVM is terminated with {@code System.exit(-1)} when the root
   * directory is unknown or a required file is missing.
   *
   * @param aContext UIMA context supplying the configuration parameter values
   */
  public void initialize(UimaContext aContext) {
    // check if the supplied language is one that we can currently handle
    this.language =
        Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));

    // get configuration from the descriptor
    annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
    annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES);
    annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH);
    String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH);

    // set some configuration based upon these values
    ttprops.languageName = language.getTreeTaggerLangName();
    if (ttprops.rootPath == null) {
      ttprops.rootPath = System.getenv("TREETAGGER_HOME");
    }

    // Abort before any path is derived from the root: otherwise the file probes below would be
    // run against paths that start with the literal text "null".
    if (ttprops.rootPath == null) {
      Logger.printError("TreeTagger environment variable is not present, aborting.");
      System.exit(-1);
    }

    ttprops.tokScriptName = "utf8-tokenize.perl";

    // directories below the TreeTagger root that hold the model files and the scripts
    String libDir = ttprops.rootPath + ttprops.fileSeparator + "lib";
    String cmdDir = ttprops.rootPath + ttprops.fileSeparator + "cmd";

    // parameter file: get UTF8 version if it exists
    String utf8ParFileName = ttprops.languageName + "-utf8.par";
    if (new File(libDir, utf8ParFileName).exists()) {
      ttprops.parFileName = utf8ParFileName;
    } else {
      ttprops.parFileName = ttprops.languageName + ".par";
    }

    // abbreviation file: get UTF8 version if it exists
    String utf8AbbFileName = ttprops.languageName + "-abbreviations-utf8";
    if (new File(libDir, utf8AbbFileName).exists()) {
      ttprops.abbFileName = utf8AbbFileName;
    } else {
      ttprops.abbFileName = ttprops.languageName + "-abbreviations";
    }

    ttprops.languageSwitch = language.getTreeTaggerSwitch();
    if (cnTokPath != null && !cnTokPath.equals("")) {
      ttprops.chineseTokenizerPath = new File(cnTokPath);
    } else {
      ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd");
    }

    // Check for whether the required treetagger parameter files are present
    boolean abbFileFlag = new File(libDir, ttprops.abbFileName).exists();
    boolean parFileFlag = new File(libDir, ttprops.parFileName).exists();
    boolean tokScriptFlag = new File(cmdDir, ttprops.tokScriptName).exists();

    if (!abbFileFlag) {
      if (language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) {
        // a missing abbreviation file is tolerated for these two languages
        abbFileFlag = true;
        ttprops.abbFileName = null;
      } else {
        Logger.printError(
            component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName);
      }
    }
    if (!parFileFlag) {
      Logger.printError(
          component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName);
    }
    if (!tokScriptFlag) {
      if (language.equals(Language.CHINESE)) {
        // a missing tokenizer script is tolerated for Chinese
        tokScriptFlag = true;
      } else {
        Logger.printError(
            component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName);
      }
    }

    if (!abbFileFlag || !parFileFlag || !tokScriptFlag) {
      Logger.printError(
          component,
          "Cannot find tree tagger ("
              + cmdDir
              + ttprops.fileSeparator
              + ttprops.tokScriptName
              + ")."
              + " Make sure that path to tree tagger is set correctly in config.props!");
      Logger.printError(component, "If path is set correctly:");
      Logger.printError(
          component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz");
      Logger.printError(
          component,
          "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz");
      Logger.printError(
          component,
          "Extract this file and copy the missing file into the corresponding TreeTagger directories.");
      Logger.printError(component, "If missing, copy " + ttprops.abbFileName + " into " + libDir);
      Logger.printError(component, "If missing, copy " + ttprops.parFileName + " into " + libDir);
      Logger.printError(
          component, "If missing, copy " + ttprops.tokScriptName + " into " + cmdDir);
      System.exit(-1);
    }
  }
Ejemplo n.º 2
0
  /**
   * Based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas
   * object using the treetagger program.
   *
   * <p>The covered text of every non-empty token is written to a temporary file, one token per
   * line. The process obtained from {@code ttprops.getTreeTaggingProcess} is run on that file and
   * each output line is stored as the POS value of the corresponding token. When sentence
   * annotation is enabled, sentences are closed at end-of-sentence tags, at tokens marked
   * {@code EMPTYLINE} and at the end of the token list.
   *
   * <p>A failure to write the temporary file terminates the JVM; failures while tagging are
   * printed and otherwise ignored. The reader, the process and the temporary file are released
   * in every case.
   *
   * @param jcas JCas object supplied by the pipeline
   */
  private void doTreeTag(JCas jcas) {
    File tmpDocument = null;
    ArrayList<Token> tokens = new ArrayList<Token>();

    try {
      // create a temporary file and write our pre-existing tokens to it.
      tmpDocument = File.createTempFile("postokens", null);
      BufferedWriter tmpFileWriter =
          new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
      try {
        // iterate over existing tokens
        FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
        while (ai.hasNext()) {
          Token t = (Token) ai.next();

          tokens.add(t);
          // zero-length tokens are remembered but not handed to the tagger
          if (!(t.getBegin() == t.getEnd())) {
            tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
          }
        }
      } finally {
        // close even if a runtime exception escapes the loop above
        tmpFileWriter.close();
      }
    } catch (IOException e) {
      // do not leave the half-written file behind when giving up
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
      Logger.printError(
          "Something went wrong creating a temporary file for the treetagger to process.");
      System.exit(-1);
    }

    // Possible End-of-Sentence Tags
    HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
    hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
    hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
    hsEndOfSentenceTag.add("FS"); // SPANISH
    hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
    hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
    hsEndOfSentenceTag.add("ew"); // CHINESE

    Process p = null;
    BufferedReader in = null;
    try {
      p = ttprops.getTreeTaggingProcess(tmpDocument);
      Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);

      in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

      Sentence sentence = null;
      // iterate over all the output lines and tokens array (which have the same source and are
      // hence symmetric)
      int i = 0;
      String s = null;
      while ((s = in.readLine()) != null) {
        // grab a token
        Token token = tokens.get(i++);
        // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
        while (token.getCoveredText().equals("")) {
          // if part of the configuration, also add sentences to the jcas document
          if ((annotate_sentences)
              && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
            // Establish sentence structure
            if (sentence == null) {
              sentence = new Sentence(jcas);
              sentence.setBegin(token.getBegin());
            }

            // An empty line always finishes the current sentence; zero-length sentences are
            // not indexed
            sentence.setEnd(token.getEnd());
            if (sentence.getBegin() < sentence.getEnd()) {
              sentence.addToIndexes();
            }

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
          token.removeFromIndexes();
          token = tokens.get(i++);
        }
        // remove tokens, otherwise they are in the index twice
        token.removeFromIndexes();
        // set part of speech tag and add to indexes again
        if (!(token.getCoveredText().equals(""))) {
          token.setPos(s);
          token.addToIndexes();
        }

        // if part of the configuration, also add sentences to the jcas document
        if (annotate_sentences) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }

          // Finish current sentence if end-of-sentence pos was found or document ended
          if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
            sentence.setEnd(token.getEnd());
            sentence.addToIndexes();

            // Make sure current sentence is not active anymore so that a new one might be created
            sentence = null;
          }
        }
      }

      // Tokens left over after the tagger output ended: close a still open sentence exactly
      // once at the end of the last token, so the same annotation is neither indexed
      // repeatedly nor modified after it has been added to the indexes.
      if (sentence != null && i < tokens.size()) {
        sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
        sentence.addToIndexes();
        sentence = null;
      }
      while (i < tokens.size()) {
        Token token = tokens.get(i++);
        if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
          token.removeFromIndexes();
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // Release the reader and the external process on the failure path as well
      if (in != null) {
        try {
          in.close();
        } catch (IOException closeFailure) {
          closeFailure.printStackTrace();
        }
      }
      if (p != null) {
        p.destroy();
      }
      // Delete temporary files
      if (tmpDocument != null) {
        tmpDocument.delete();
      }
    }
  }
  /**
   * Fills the given JCas from the next queued input file.
   *
   * <p>The file is parsed as XML. The content of its first {@code TEXT} element becomes the
   * document text, and a {@code Dct} annotation spanning the whole text is added to the indexes.
   * The Dct value is the {@code value} attribute of the first {@code TIMEX3} element inside
   * {@code DCT}; if that cannot be read, the first {@code TIMEX3} inside {@code TEXT} is used,
   * provided its {@code functionInDocument} attribute is {@code CREATION_TIME}. The value stays
   * {@code null} when neither lookup succeeds. The Dct file name is the content of the first
   * {@code DOCID} element, or the file name without its last extension when there is none.
   *
   * <p>Parsing problems are printed and reported through the logger; they are not rethrown.
   *
   * @param jcas JCas object to populate
   */
  private void fillJCas(JCas jcas) {
    // grab a file to process
    File f = files.poll();
    if (f == null) {
      // Nothing queued: report it here, because the error handler below needs the file's path.
      Logger.printError(component, "No input file left to process.");
      return;
    }

    try {
      // create xml parsing facilities
      DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
      // parse input xml file
      Document doc = db.parse(f);

      doc.getDocumentElement().normalize();

      // get the <text> tag's content to set the document text
      NodeList nList = doc.getElementsByTagName("TEXT");
      Node textNode = nList.item(0);
      String text = textNode.getTextContent();

      jcas.setDocumentText(text);

      // get the <dct> timex tag's value attribute for the dct
      boolean gotDCT = false;
      String dctText = null;
      try {
        nList = doc.getDocumentElement().getElementsByTagName("DCT");
        nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag
        Node dctTimex = nList.item(0);
        NamedNodeMap dctTimexAttr = dctTimex.getAttributes();
        Node dctValue = dctTimexAttr.getNamedItem("value");
        dctText = dctValue.getTextContent();
        gotDCT = true;
      } catch (Exception ignored) {
        // any missing element or attribute on the way means: not found here, try the fallback
      }

      if (!gotDCT) {
        try { // try a different location for the DCT timex element
          nList = doc.getDocumentElement().getElementsByTagName("TEXT");
          nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag
          Node dctTimex = nList.item(0);
          NamedNodeMap dctTimexAttr = dctTimex.getAttributes();
          Node function = dctTimexAttr.getNamedItem("functionInDocument");
          if (function != null && function.getTextContent().equals("CREATION_TIME")) {
            Node dctValue = dctTimexAttr.getNamedItem("value");
            dctText = dctValue.getTextContent();
          }
        } catch (Exception ignored) {
          // no usable creation-time timex; the Dct value stays null
        }
      }

      // get the document id
      nList = doc.getElementsByTagName("DOCID");
      String filename;
      if (nList != null && nList.getLength() > 0) {
        filename = nList.item(0).getTextContent();
      } else {
        // fall back to the file name with its last extension stripped
        filename = f.getName().replaceAll("\\.[^\\.]+$", "");
      }

      Dct dct = new Dct(jcas);
      dct.setBegin(0);
      dct.setEnd(text.length());
      dct.setFilename(filename);
      dct.setValue(dctText);
      dct.setTimexId("t0");
      dct.addToIndexes();
    } catch (Exception e) {
      e.printStackTrace();
      Logger.printError(
          component, "File " + f.getAbsolutePath() + " could not be properly parsed.");
    }
  }