/** * improve german sentences; the treetagger splits german sentences incorrectly on some occasions * * @param jcas JCas object supplied by the pipeline */ private void improveGermanSentences(JCas jcas) { /* * these POS tag sequences will decide whether we want to merge two sentences * that have (supposedly wrongfully) been split. */ HashSet<String[]> posRules = new HashSet<String[]>(); posRules.add(new String[] {"CARD", "\\$.", "NN"}); posRules.add(new String[] {"CARD", "\\$.", "NE"}); FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator(); // compare two sentences at a time in order to have access to all POS tags HashSet<HashSet<Sentence>> toMerge = new HashSet<HashSet<Sentence>>(); Sentence prevSent = null, thisSent = null; while (sentIter.hasNext()) { if (thisSent == null) { thisSent = (Sentence) sentIter.next(); continue; } prevSent = thisSent; thisSent = (Sentence) sentIter.next(); /* * select the last two tokens within the previous sentence as well as the * first of the current one and check for matches. */ Token penultimateToken = null, ultimateToken = null, firstToken = null; FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent); if (tokIter.hasNext()) { firstToken = (Token) tokIter.next(); } tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent); while (tokIter.hasNext()) { if (ultimateToken == null) { ultimateToken = (Token) tokIter.next(); continue; } penultimateToken = ultimateToken; ultimateToken = (Token) tokIter.next(); } // check that all tokens for further analysis are present. if not: skip if (penultimateToken == null || ultimateToken == null || firstToken == null) { continue; } // check rules, memorize sentences to be merged for (String[] posRule : posRules) { /* * either one of the pre-defined POS rules fit, or the first token's * covered text begins with lower case characters. 
*/ if ((penultimateToken.getPos() != null && penultimateToken.getPos().matches(posRule[0]) && ultimateToken.getPos() != null && ultimateToken.getPos().matches(posRule[1]) && firstToken.getPos() != null && firstToken.getPos().matches(posRule[2])) || (firstToken.getCoveredText().matches("^[a-z/].*"))) { /* * check whether one of the previous candidate pairs already * contains one of our sentences. */ Boolean candidateExisted = false; for (HashSet<Sentence> mergeCandidate : toMerge) { if (mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) { // we add both here because sets ignore duplicates mergeCandidate.add(prevSent); mergeCandidate.add(thisSent); candidateExisted = true; break; } } /* * if one of the sentences was not already to be merged with another, * create a new merge candidate set */ if (!candidateExisted) { HashSet<Sentence> newCandidate = new HashSet<Sentence>(); newCandidate.add(prevSent); newCandidate.add(thisSent); toMerge.add(newCandidate); } break; // don't need to do the next rules; already merging. } } } // iterate over the previously collected merge candidates for (HashSet<Sentence> mergeCandidate : toMerge) { // find the earliest beginning and latest end for the set of sentences Integer beginIndex = Integer.MAX_VALUE, endIndex = Integer.MIN_VALUE; Sentence mergedSent = new Sentence(jcas); for (Sentence s : mergeCandidate) { if (s.getBegin() < beginIndex) { beginIndex = s.getBegin(); } if (s.getEnd() > endIndex) { endIndex = s.getEnd(); } s.removeFromIndexes(); } // set values, add to jcas mergedSent.setBegin(beginIndex); mergedSent.setEnd(endIndex); mergedSent.addToIndexes(); } }
/**
 * Based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the
 * jcas object using the treetagger program.
 *
 * <p>The pre-tokenized text is written to a temporary file (one token per line), fed to the
 * external TreeTagger process, and the tagger's output lines are then walked in lock-step
 * with the in-memory token list: output line {@code i} is assumed to correspond to token
 * {@code i} (empty tokens were not written out and are skipped on the fly).
 *
 * @param jcas JCas object supplied by the pipeline
 */
private void doTreeTag(JCas jcas) {
  File tmpDocument = null;
  BufferedWriter tmpFileWriter;
  ArrayList<Token> tokens = new ArrayList<Token>();
  try {
    // create a temporary file and write our pre-existing tokens to it.
    tmpDocument = File.createTempFile("postokens", null);
    tmpFileWriter =
        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8"));
    // iterate over existing tokens
    FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator();
    while (ai.hasNext()) {
      Token t = (Token) ai.next();
      // every token (including zero-length ones) is kept in the list so that the
      // index-based pairing with the tagger output below stays aligned
      tokens.add(t);
      // zero-length (empty) tokens are NOT written to the tagger input file
      if (!(t.getBegin() == t.getEnd())) {
        tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator);
      }
    }
    // NOTE(review): if write() throws, this writer is never closed (no finally) — leak
    tmpFileWriter.close();
  } catch (IOException e) {
    Logger.printError(
        "Something went wrong creating a temporary file for the treetagger to process.");
    // NOTE(review): System.exit in a pipeline component kills the whole JVM — confirm intended
    System.exit(-1);
  }
  // Possible End-of-Sentence Tags
  HashSet<String> hsEndOfSentenceTag = new HashSet<String>();
  hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK,
  hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH
  hsEndOfSentenceTag.add("FS"); // SPANISH
  hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN
  hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN
  hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN
  hsEndOfSentenceTag.add("ew"); // CHINESE
  try {
    Process p = ttprops.getTreeTaggingProcess(tmpDocument);
    Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName);
    BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
    // the sentence currently being built; null means "no sentence open"
    Sentence sentence = null;
    // iterate over all the output lines and tokens array (which have the same source and are
    // hence symmetric)
    int i = 0;
    String s = null;
    while ((s = in.readLine()) != null) {
      // grab a token
      // NOTE(review): assumes the tagger never emits more lines than there are tokens;
      // otherwise this throws IndexOutOfBoundsException — confirm against TreeTagger output
      Token token = tokens.get(i++);
      // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file
      while (token.getCoveredText().equals("")) {
        // if part of the configuration, also add sentences to the jcas document
        if ((annotate_sentences)
            && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) {
          // Establish sentence structure
          if (sentence == null) {
            sentence = new Sentence(jcas);
            sentence.setBegin(token.getBegin());
          }
          // Finish current sentence if end-of-sentence pos was found or document ended
          sentence.setEnd(token.getEnd());
          // only index non-empty sentences
          if (sentence.getBegin() < sentence.getEnd()) {
            sentence.addToIndexes();
          }
          // Make sure current sentence is not active anymore so that a new one might be created
          sentence = null;
          // sentence = new Sentence(jcas);
        }
        // empty tokens are dropped from the index and the next token is consumed
        // without consuming a tagger output line (empties were never written out)
        token.removeFromIndexes();
        token = tokens.get(i++);
      }
      // remove tokens, otherwise they are in the index twice
      token.removeFromIndexes();
      // set part of speech tag and add to indexes again
      if (!(token.getCoveredText().equals(""))) {
        token.setPos(s);
        token.addToIndexes();
      }
      // if part of the configuration, also add sentences to the jcas document
      if (annotate_sentences) {
        // Establish sentence structure
        if (sentence == null) {
          sentence = new Sentence(jcas);
          sentence.setBegin(token.getBegin());
        }
        // Finish current sentence if end-of-sentence pos was found or document ended
        if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
          sentence.setEnd(token.getEnd());
          sentence.addToIndexes();
          // Make sure current sentence is not active anymore so that a new one might be created
          sentence = null;
          // sentence = new Sentence(jcas);
        }
      }
    }
    // drain any tokens left over after the tagger output ended (e.g. trailing empty lines)
    while (i < tokens.size()) {
      // NOTE(review): sentence is never reset to null in this loop, so a still-open
      // sentence gets setEnd/addToIndexes called once per remaining token — likely
      // creating duplicate index entries; confirm whether this is intended
      if (!(sentence == null)) {
        sentence.setEnd(tokens.get(tokens.size() - 1).getEnd());
        sentence.addToIndexes();
      }
      Token token = tokens.get(i++);
      if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) {
        token.removeFromIndexes();
      }
    }
    // NOTE(review): in/p are not released in a finally block, so an exception above
    // skips both close() and destroy()
    in.close();
    p.destroy();
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    // Delete temporary files
    tmpDocument.delete();
  }
}