/**
 * Tokenizes a given JCas object's document text using the TreeTagger program and adds the
 * recognized tokens to the JCas object.
 *
 * @param jcas JCas object supplied by the pipeline
 */
private void tokenize(JCas jcas) {
    // read tokenized text to add tokens to the jcas
    Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);
    EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);

    // use the language-specific abbreviation file if one is configured
    TreeTaggerTokenizer ttt;
    if (ttprops.abbFileName != null) {
        ttt = new TreeTaggerTokenizer(ttprops.rootPath + ttprops.fileSeparator + "lib"
                + ttprops.fileSeparator + ttprops.abbFileName, flags);
    } else {
        ttt = new TreeTaggerTokenizer(null, flags);
    }

    // mark empty lines (paragraph breaks) so they survive tokenization
    String docText = jcas.getDocumentText().replaceAll("\n\n", "\nEMPTYLINE\n");
    List<String> tokenized = ttt.tokenize(docText);

    int tokenOffset = 0;
    // loop through all the lines in the treetagger output
    for (String s : tokenized) {
        // charset mismatch: signal the (invalid) token s that cannot be located
        if (!s.equals("EMPTYLINE") && jcas.getDocumentText().indexOf(s, tokenOffset) < 0)
            throw new RuntimeException("Could not find token " + s
                    + " in JCas after tokenizing with TreeTagger."
                    + " There may be a charset mismatch!"
                    + " The default encoding is " + Charset.defaultCharset().name()
                    + " and should always be UTF-8 (use -Dfile.encoding=UTF-8)."
                    + " If the input document is not UTF-8, additionally use the -e option to set the input encoding.");

        // create tokens and add them to the jcas's indexes.
        Token newToken = new Token(jcas);
        if (s.equals("EMPTYLINE")) {
            // zero-length placeholder token marking an empty line (paragraph break)
            newToken.setBegin(tokenOffset);
            newToken.setEnd(tokenOffset);
            newToken.setPos("EMPTYLINE");
            if (annotate_partofspeech) {
                newToken.addToIndexes();
            }
        } else {
            newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset));
            newToken.setEnd(newToken.getBegin() + s.length());
            newToken.addToIndexes();
            tokenOffset = newToken.getEnd();
        }
    }
}
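// The offset-tracked indexOf lookup above is the core alignment step: each string
// emitted by the tokenizer is located in the original document text at or after the
// end of the previously matched token, which keeps annotation offsets correct even
// when the tokenizer normalizes whitespace. A minimal standalone sketch of that
// idea (hypothetical helper, not part of the original class):
private static int[] alignToken(String docText, String token, int fromOffset) {
    // search only from the end of the previous token onwards
    int begin = docText.indexOf(token, fromOffset);
    if (begin < 0) {
        // mirrors the charset-mismatch failure signaled in tokenize(JCas)
        throw new RuntimeException("Token \"" + token + "\" not found at or after offset " + fromOffset);
    }
    return new int[] { begin, begin + token.length() };
}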
/**
 * Tokenizes a given JCas object's document text using the Chinese tokenization script and adds
 * the recognized tokens to the JCas object.
 *
 * @param jcas JCas object supplied by the pipeline
 */
private void tokenizeChinese(JCas jcas) {
    try {
        // read tokenized text to add tokens to the jcas
        Process proc = ttprops.getChineseTokenizationProcess();
        Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);
        BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));

        int tokenOffset = 0;
        // feed the document to the tokenizer line by line and read back its output
        String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+");
        for (String inSplit : inSplits) {
            out.write(inSplit);
            out.newLine();
            out.flush();

            // do one initial read, then keep reading while output is immediately available
            String s = in.readLine();
            while (s != null) {
                String[] outSplits = s.split("\\s+");
                for (String tok : outSplits) {
                    int begin = jcas.getDocumentText().indexOf(tok, tokenOffset);
                    if (begin < 0)
                        throw new RuntimeException("Could not find token " + tok
                                + " in JCas after tokenizing with the Chinese tokenization script.");

                    // create tokens and add them to the jcas's indexes.
                    Token newToken = new Token(jcas);
                    newToken.setBegin(begin);
                    newToken.setEnd(begin + tok.length());
                    newToken.addToIndexes();
                    tokenOffset = newToken.getEnd();
                }

                // break out of the loop if the next read would block
                if (!in.ready())
                    break;
                s = in.readLine();
            }
        }

        // clean up
        out.close();
        in.close();
        proc.destroy();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
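// Design note: the write-flush-read handshake above keeps the external tokenizer
// process from deadlocking on its buffered pipes; every input line is flushed before
// reading, and in.ready() is polled so readLine() never blocks on output that will
// only arrive after more input is written. A hedged sketch of the same per-line
// round trip against an arbitrary line-oriented child process (the helper name is
// an assumption, not part of the original class):
private static String roundTripLine(BufferedWriter toProc, BufferedReader fromProc, String line)
        throws IOException {
    toProc.write(line);
    toProc.newLine();
    toProc.flush();               // flush so the child process actually sees the line
    return fromProc.readLine();   // blocks until the child answers (or returns null on EOF)
}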