/** initialization method where we fill configuration values and check some prerequisites */ public void initialize(UimaContext aContext) { // check if the supplied language is one that we can currently handle this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE)); // get configuration from the descriptor annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS); annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES); annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH); String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH); // set some configuration based upon these values ttprops.languageName = language.getTreeTaggerLangName(); if (ttprops.rootPath == null) ttprops.rootPath = System.getenv("TREETAGGER_HOME"); ttprops.tokScriptName = "utf8-tokenize.perl"; // parameter file if (!(new File( ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-utf8.par") .exists())) // get UTF8 version if it exists ttprops.parFileName = ttprops.languageName + ".par"; else ttprops.parFileName = ttprops.languageName + "-utf8.par"; // abbreviation file if (new File( ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-abbreviations-utf8") .exists()) { // get UTF8 version if it exists ttprops.abbFileName = ttprops.languageName + "-abbreviations-utf8"; } else { ttprops.abbFileName = ttprops.languageName + "-abbreviations"; } ttprops.languageSwitch = language.getTreeTaggerSwitch(); if (cnTokPath != null && !cnTokPath.equals("")) ttprops.chineseTokenizerPath = new File(cnTokPath); else ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd"); // handle the treetagger path from the environment variables if (ttprops.rootPath == null) { Logger.printError("TreeTagger environment variable is not present, aborting."); System.exit(-1); } // Check for whether the required treetagger parameter files are present Boolean abbFileFlag = true; Boolean parFileFlag = true; Boolean tokScriptFlag = true; File abbFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.abbFileName); File parFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.parFileName); File tokFile = new File(ttprops.rootPath + ttprops.fileSeparator + "cmd", ttprops.tokScriptName); if (!(abbFileFlag = abbFile.exists())) { if (language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) { abbFileFlag = true; ttprops.abbFileName = null; } else { Logger.printError( component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName); } } if (!(parFileFlag = parFile.exists())) { Logger.printError( component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName); } if (!(tokScriptFlag = tokFile.exists())) { if (language.equals(Language.CHINESE)) tokScriptFlag = true; else Logger.printError( component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName); } if (!abbFileFlag || !parFileFlag || !tokScriptFlag) { Logger.printError( component, "Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator + "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")." + " Make sure that path to tree tagger is set correctly in config.props!"); Logger.printError(component, "If path is set correctly:"); Logger.printError( component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz"); Logger.printError( component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"); Logger.printError( component, "Extract this file and copy the missing file into the corresponding TreeTagger directories."); Logger.printError( component, "If missing, copy " + ttprops.abbFileName + " into " + ttprops.rootPath + ttprops.fileSeparator + "lib"); Logger.printError( component, "If missing, copy " + ttprops.parFileName + " into " + ttprops.rootPath + ttprops.fileSeparator + "lib"); Logger.printError( component, "If missing, copy " + ttprops.tokScriptName + " into " + ttprops.rootPath + ttprops.fileSeparator + "cmd"); System.exit(-1); } }
/** * based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas * object using the treetagger program. * * @param jcas JCas object supplied by the pipeline */ private void doTreeTag(JCas jcas) { File tmpDocument = null; BufferedWriter tmpFileWriter; ArrayList<Token> tokens = new ArrayList<Token>(); try { // create a temporary file and write our pre-existing tokens to it. tmpDocument = File.createTempFile("postokens", null); tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8")); // iterate over existing tokens FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator(); while (ai.hasNext()) { Token t = (Token) ai.next(); tokens.add(t); if (!(t.getBegin() == t.getEnd())) { tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator); } } tmpFileWriter.close(); } catch (IOException e) { Logger.printError( "Something went wrong creating a temporary file for the treetagger to process."); System.exit(-1); } // Possible End-of-Sentence Tags HashSet<String> hsEndOfSentenceTag = new HashSet<String>(); hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK, hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH hsEndOfSentenceTag.add("FS"); // SPANISH hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN hsEndOfSentenceTag.add("ew"); // CHINESE try { Process p = ttprops.getTreeTaggingProcess(tmpDocument); Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName); BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8")); Sentence sentence = null; // iterate over all the output lines and tokens array (which have the same source and are // hence symmetric) int i = 0; String s = null; while ((s = in.readLine()) != null) { // grab a token Token token = tokens.get(i++); // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file while (token.getCoveredText().equals("")) { // if part of the configuration, also add sentences to the jcas document if ((annotate_sentences) && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) { // Establish sentence structure if (sentence == null) { sentence = new Sentence(jcas); sentence.setBegin(token.getBegin()); } // Finish current sentence if end-of-sentence pos was found or document ended sentence.setEnd(token.getEnd()); if (sentence.getBegin() < sentence.getEnd()) { sentence.addToIndexes(); } // Make sure current sentence is not active anymore so that a new one might be created sentence = null; // sentence = new Sentence(jcas); } token.removeFromIndexes(); token = tokens.get(i++); } // remove tokens, otherwise they are in the index twice token.removeFromIndexes(); // set part of speech tag and add to indexes again if (!(token.getCoveredText().equals(""))) { token.setPos(s); token.addToIndexes(); } // if part of the configuration, also add sentences to the jcas document if (annotate_sentences) { // Establish sentence structure if (sentence == null) { sentence = new Sentence(jcas); sentence.setBegin(token.getBegin()); } // Finish current sentence if end-of-sentence pos was found or document ended if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) { sentence.setEnd(token.getEnd()); sentence.addToIndexes(); // Make sure current sentence is not active anymore so that a new one might be created sentence = null; // sentence = new Sentence(jcas); } } } while (i < tokens.size()) { if (!(sentence == null)) { sentence.setEnd(tokens.get(tokens.size() - 1).getEnd()); sentence.addToIndexes(); } Token token = tokens.get(i++); if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) { token.removeFromIndexes(); } } in.close(); p.destroy(); } catch (Exception e) { e.printStackTrace(); } finally { // Delete temporary files tmpDocument.delete(); } }
private void fillJCas(JCas jcas) { // grab a file to process File f = files.poll(); try { // create xml parsing facilities DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); // parse input xml file Document doc = db.parse(f); doc.getDocumentElement().normalize(); // get the <text> tag's content to set the document text NodeList nList = doc.getElementsByTagName("TEXT"); Node textNode = nList.item(0); String text = textNode.getTextContent(); jcas.setDocumentText(text); // get the <dct> timex tag's value attribute for the dct Boolean gotDCT = false; String dctText = null; try { nList = doc.getDocumentElement().getElementsByTagName("DCT"); nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag Node dctTimex = nList.item(0); NamedNodeMap dctTimexAttr = dctTimex.getAttributes(); Node dctValue = dctTimexAttr.getNamedItem("value"); dctText = dctValue.getTextContent(); gotDCT = true; } catch (Exception e) { gotDCT = false; } if (!gotDCT) try { // try a different location for the DCT timex element nList = doc.getDocumentElement().getElementsByTagName("TEXT"); nList = ((Element) nList.item(0)).getElementsByTagName("TIMEX3"); // timex3 tag Node dctTimex = nList.item(0); NamedNodeMap dctTimexAttr = dctTimex.getAttributes(); if (dctTimexAttr.getNamedItem("functionInDocument") != null && dctTimexAttr .getNamedItem("functionInDocument") .getTextContent() .equals("CREATION_TIME")) { Node dctValue = dctTimexAttr.getNamedItem("value"); dctText = dctValue.getTextContent(); } gotDCT = true; } catch (Exception e) { gotDCT = false; } // get the document id nList = doc.getElementsByTagName("DOCID"); String filename = null; if (nList != null && nList.getLength() > 0) filename = nList.item(0).getTextContent(); else filename = f.getName().replaceAll("\\.[^\\.]+$", ""); Dct dct = new Dct(jcas); dct.setBegin(0); dct.setEnd(text.length()); dct.setFilename(filename); dct.setValue(dctText); dct.setTimexId("t0"); dct.addToIndexes(); } catch (Exception e) { e.printStackTrace(); Logger.printError( component, "File " + f.getAbsolutePath() + " could not be properly parsed."); } }