// todo: give options for document splitting. A line or the whole file or // sentence splitting as now public Iterator<List<IN>> getIterator(Reader r) { Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r); // PTBTokenizer.newPTBTokenizer(r, false, true); List<IN> words = new ArrayList<IN>(); IN previous = tokenFactory.makeToken(); StringBuilder prepend = new StringBuilder(); /* * This changes SGML tags into whitespace -- it should maybe be moved * elsewhere */ while (tokenizer.hasNext()) { IN w = tokenizer.next(); String word = w.get(CoreAnnotations.TextAnnotation.class); Matcher m = sgml.matcher(word); if (m.matches()) { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); prepend.append(before).append(word); String previousTokenAfter = StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class)); previous.set(AfterAnnotation.class, previousTokenAfter + word + after); // previous.appendAfter(w.word() + w.after()); } else { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); if (prepend.length() > 0) { w.set(BeforeAnnotation.class, prepend.toString() + before); // w.prependBefore(prepend.toString()); prepend = new StringBuilder(); } words.add(w); previous = w; } } List<List<IN>> sentences = wts.process(words); String after = ""; IN last = null; for (List<IN> sentence : sentences) { int pos = 0; for (IN w : sentence) { w.set(PositionAnnotation.class, Integer.toString(pos)); after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); w.remove(AfterAnnotation.class); last = w; } } if (last != null) { last.set(AfterAnnotation.class, after); } return sentences.iterator(); }
/**
 * Builds an annotator whose sentence splitter is configured from the given
 * boundary options, with one-sentence mode disabled.
 *
 * @param verbose whether to print progress messages
 * @param boundaryTokenRegex regex matching tokens that end a sentence
 * @param boundaryToDiscard tokens treated as boundaries but dropped from output
 * @param htmlElementsToDiscard HTML/XML element names whose content is skipped
 * @param newlineIsSentenceBreak string form of the newline-splitting policy
 */
public WordsToSentencesAnnotator(
    boolean verbose,
    String boundaryTokenRegex,
    Set<String> boundaryToDiscard,
    Set<String> htmlElementsToDiscard,
    String newlineIsSentenceBreak) {
  this(
      verbose,
      false,
      new WordToSentenceProcessor<CoreLabel>(
          boundaryTokenRegex,
          boundaryToDiscard,
          htmlElementsToDiscard,
          WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak)));
}
/**
 * Builds an annotator whose sentence splitter additionally supports a
 * multi-token boundary pattern and token-regex filters, with one-sentence
 * mode disabled.
 *
 * @param verbose whether to print progress messages
 * @param boundaryTokenRegex regex matching tokens that end a sentence
 * @param boundaryToDiscard tokens treated as boundaries but dropped from output
 * @param htmlElementsToDiscard HTML/XML element names whose content is skipped
 * @param newlineIsSentenceBreak string form of the newline-splitting policy
 * @param boundaryMultiTokenRegex optional TokensRegex pattern over token
 *        sequences that marks a boundary; may be null
 * @param tokenRegexesToDiscard regexes for tokens to drop from the output
 */
public WordsToSentencesAnnotator(
    boolean verbose,
    String boundaryTokenRegex,
    Set<String> boundaryToDiscard,
    Set<String> htmlElementsToDiscard,
    String newlineIsSentenceBreak,
    String boundaryMultiTokenRegex,
    Set<String> tokenRegexesToDiscard) {
  this(
      verbose,
      false,
      new WordToSentenceProcessor<CoreLabel>(
          boundaryTokenRegex,
          boundaryToDiscard,
          htmlElementsToDiscard,
          WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
          // only compile the multi-token pattern when one was actually supplied
          boundaryMultiTokenRegex == null
              ? null
              : TokenSequencePattern.compile(boundaryMultiTokenRegex),
          tokenRegexesToDiscard));
}
/**
 * Splits the document's token list into sentences and stores them on the
 * document as its SentencesAnnotation.
 *
 * If setCountLineNumbers is set to true, we count line numbers by telling the underlying splitter
 * to return empty lists of tokens and then treating those empty lists as empty lines. We don't
 * actually include empty sentences in the annotation, though.
 *
 * @param annotation the document; must already have a TokensAnnotation
 * @throws IllegalArgumentException if the annotation carries no tokens
 * @throws IllegalStateException if the splitter yields an empty sentence while
 *         not in line-counting mode
 */
@Override
public void annotate(Annotation annotation) {
  if (VERBOSE) {
    System.err.print("Sentence splitting ...");
  }
  if (!annotation.has(CoreAnnotations.TokensAnnotation.class)) {
    throw new IllegalArgumentException(
        "WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
  }

  // get text and tokens from the document
  String text = annotation.get(CoreAnnotations.TextAnnotation.class);
  List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
  // System.err.println("Tokens are: " + tokens);

  // assemble the sentence annotations
  int tokenOffset = 0;  // cumulative token count across sentences
  int lineNumber = 0;   // only meaningful when countLineNumbers is true
  // section annotations to mark sentences with; carried from the sentence that
  // opens a section until the sentence that closes it
  CoreMap sectionAnnotations = null;
  List<CoreMap> sentences = new ArrayList<>();
  for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
    if (countLineNumbers) {
      ++lineNumber;
    }
    if (sentenceTokens.isEmpty()) {
      if (!countLineNumbers) {
        // empty sentences only occur (deliberately) in line-counting mode
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      } else {
        continue;  // empty line: counted above, but not added as a sentence
      }
    }

    // get the sentence text from the first and last character offsets
    int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int last = sentenceTokens.size() - 1;
    int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    String sentenceText = text.substring(begin, end);

    // create a sentence annotation with text and token offsets
    Annotation sentence = new Annotation(sentenceText);
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
    sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenOffset);
    tokenOffset += sentenceTokens.size();
    sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
    sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());
    if (countLineNumbers) {
      sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
    }

    // Annotate sentence with section information.
    // Assume section start and end appear as first and last tokens of sentence
    CoreLabel sentenceStartToken = sentenceTokens.get(0);
    CoreLabel sentenceEndToken = sentenceTokens.get(sentenceTokens.size() - 1);

    CoreMap sectionStart = sentenceStartToken.get(CoreAnnotations.SectionStartAnnotation.class);
    if (sectionStart != null) {
      // Section is started
      sectionAnnotations = sectionStart;
    }
    if (sectionAnnotations != null) {
      // transfer annotations over to sentence
      ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
    }
    String sectionEnd = sentenceEndToken.get(CoreAnnotations.SectionEndAnnotation.class);
    if (sectionEnd != null) {
      // section closed: stop copying its annotations onto later sentences
      sectionAnnotations = null;
    }

    if (docID != null) {
      sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
    }

    // give each token its 1-based index within the sentence, plus the
    // sentence index and document id
    int index = 1;
    for (CoreLabel token : sentenceTokens) {
      token.setIndex(index++);
      token.setSentIndex(sentences.size());
      if (docID != null) {
        token.setDocID(docID);
      }
    }

    // add the sentence to the list
    sentences.add(sentence);
  }

  // the condition below is possible if sentenceBoundaryToDiscard is initialized!
  /*
  if (tokenOffset != tokens.size()) {
    throw new RuntimeException(String.format(
        "expected %d tokens, found %d", tokens.size(), tokenOffset));
  }
  */

  // add the sentences annotations to the document
  annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
}