// TODO: give options for document splitting: by line, by whole file, or by
  // sentence (the current behavior).
  /**
   * Tokenizes the text from {@code r}, folds SGML-like tokens into the whitespace of their
   * neighbours, splits the remaining tokens into sentences and returns an iterator over them.
   *
   * <p>A token whose text matches the {@code sgml} pattern is not emitted. Its text is instead
   * appended (with its trailing whitespace) to the {@code AfterAnnotation} of the previously kept
   * token and prepended (with its leading whitespace) to the {@code BeforeAnnotation} of the next
   * kept token.
   *
   * <p>Every emitted token receives a {@code PositionAnnotation} holding its zero-based index
   * within its sentence, rendered as a string. The {@code AfterAnnotation} is removed from every
   * token and then restored only on the very last token of the last sentence.
   *
   * @param r reader supplying the raw text to tokenize
   * @return an iterator over the sentences, each a list of tokens
   */
  public Iterator<List<IN>> getIterator(Reader r) {
    Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r);
    List<IN> words = new ArrayList<IN>();
    // Placeholder so that SGML seen before the first real token has something to attach to.
    IN previous = tokenFactory.makeToken();
    // Text of skipped SGML tokens waiting to be prepended to the next kept token.
    StringBuilder prepend = new StringBuilder();

    /*
     * This changes SGML tags into whitespace -- it should maybe be moved
     * elsewhere
     */
    while (tokenizer.hasNext()) {
      IN w = tokenizer.next();
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      Matcher m = sgml.matcher(word);
      if (m.matches()) {
        // SGML token: drop it from the token stream but keep its text as surrounding whitespace.
        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        prepend.append(before).append(word);
        String previousTokenAfter =
            StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
        previous.set(AfterAnnotation.class, previousTokenAfter + word + after);
      } else {
        // Ordinary token: attach any pending SGML text in front of it and keep it.
        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        if (prepend.length() > 0) {
          w.set(BeforeAnnotation.class, prepend.toString() + before);
          prepend = new StringBuilder();
        }
        words.add(w);
        previous = w;
      }
    }

    List<List<IN>> sentences = wts.process(words);
    String after = "";
    IN last = null;
    for (List<IN> sentence : sentences) {
      int pos = 0;
      for (IN w : sentence) {
        w.set(PositionAnnotation.class, Integer.toString(pos));
        // Advance the in-sentence position; without this every token was labelled "0".
        pos++;
        after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        w.remove(AfterAnnotation.class);
        last = w;
      }
    }
    // Only the final token keeps its trailing text.
    if (last != null) {
      last.set(AfterAnnotation.class, after);
    }

    return sentences.iterator();
  }
 /**
  * Builds an annotator around a {@link WordToSentenceProcessor} configured from the given
  * boundary settings, then delegates to the three-argument constructor with {@code false} as its
  * second argument (presumably the line-counting flag -- confirm against that constructor).
  *
  * @param verbose forwarded unchanged to the delegated constructor
  * @param boundaryTokenRegex passed to the {@code WordToSentenceProcessor} constructor
  * @param boundaryToDiscard passed to the {@code WordToSentenceProcessor} constructor
  * @param htmlElementsToDiscard passed to the {@code WordToSentenceProcessor} constructor
  * @param newlineIsSentenceBreak converted with
  *     {@code WordToSentenceProcessor.stringToNewlineIsSentenceBreak} before being passed on
  */
 public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
                                  Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
                                  String newlineIsSentenceBreak) {
   this(verbose, false,
        new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryToDiscard,
            htmlElementsToDiscard,
            WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak)));
 }
 /**
  * Builds an annotator around a {@link WordToSentenceProcessor} configured from the given
  * boundary settings, including an optional multi-token boundary pattern, then delegates to the
  * three-argument constructor with {@code false} as its second argument (presumably the
  * line-counting flag -- confirm against that constructor).
  *
  * @param verbose forwarded unchanged to the delegated constructor
  * @param boundaryTokenRegex passed to the {@code WordToSentenceProcessor} constructor
  * @param boundaryToDiscard passed to the {@code WordToSentenceProcessor} constructor
  * @param htmlElementsToDiscard passed to the {@code WordToSentenceProcessor} constructor
  * @param newlineIsSentenceBreak converted with
  *     {@code WordToSentenceProcessor.stringToNewlineIsSentenceBreak} before being passed on
  * @param boundaryMultiTokenRegex compiled with {@code TokenSequencePattern.compile} when
  *     non-null; when {@code null}, {@code null} is passed in place of a pattern
  * @param tokenRegexesToDiscard passed to the {@code WordToSentenceProcessor} constructor
  */
 public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
                                  Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
                                  String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
                                  Set<String> tokenRegexesToDiscard) {
   this(verbose, false,
        new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryToDiscard,
            htmlElementsToDiscard,
            WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
            // No multi-token pattern is compiled when the regex is absent.
            boundaryMultiTokenRegex == null
                ? null
                : TokenSequencePattern.compile(boundaryMultiTokenRegex),
            tokenRegexesToDiscard));
 }
  /**
   * Splits the document's tokens into sentences and stores the result under
   * {@code SentencesAnnotation}.
   *
   * <p>Each non-empty token group returned by the splitter becomes a sentence annotation carrying
   * its text, character offsets, token offsets and sentence index, plus the document id and any
   * active section annotations when present. Every token is also stamped with its 1-based index
   * within the sentence and with the sentence index.
   *
   * <p>If setCountLineNumbers is set to true, line numbers are counted by having the underlying
   * splitter return empty token lists and treating each of those as an empty line. Empty
   * sentences are never included in the annotation; with line counting off, an empty token list
   * is reported as an error instead.
   *
   * @param annotation the document to split; must already contain {@code TokensAnnotation}
   * @throws IllegalArgumentException if {@code annotation} has no {@code TokensAnnotation}
   * @throws IllegalStateException if the splitter returns an empty sentence while line counting
   *     is disabled
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Sentence splitting ...");
    }
    boolean hasTokens = annotation.has(CoreAnnotations.TokensAnnotation.class);
    if (!hasTokens) {
      throw new IllegalArgumentException(
          "WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // Pull the raw text, the token list and the (optional) document id off the document.
    String documentText = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreLabel> documentTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);

    List<CoreMap> sentences = new ArrayList<>();
    // Number of tokens already assigned to earlier sentences.
    int tokensConsumed = 0;
    // Number of token groups seen so far; only advanced when line counting is on.
    int linesSeen = 0;
    // Section whose annotations are currently being copied onto sentences, if any.
    CoreMap activeSection = null;

    for (List<CoreLabel> sentenceTokens : wts.process(documentTokens)) {
      if (countLineNumbers) {
        linesSeen++;
      }
      if (sentenceTokens.isEmpty()) {
        if (countLineNumbers) {
          // An empty group stands for a blank line: counted above, but not a sentence.
          continue;
        }
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      }

      int sentenceIndex = sentences.size();
      CoreLabel firstToken = sentenceTokens.get(0);
      CoreLabel lastToken = sentenceTokens.get(sentenceTokens.size() - 1);

      // The sentence text spans from the first token's start to the last token's end.
      int charBegin = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int charEnd = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);

      Annotation sentence = new Annotation(documentText.substring(charBegin, charEnd));
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);
      sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);

      int tokenBegin = tokensConsumed;
      tokensConsumed = tokenBegin + sentenceTokens.size();
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin);
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokensConsumed);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);

      if (countLineNumbers) {
        sentence.set(CoreAnnotations.LineNumberAnnotation.class, linesSeen);
      }

      // Section handling: section start/end markers are assumed to sit on the first and last
      // tokens of a sentence.
      CoreMap sectionStart = firstToken.get(CoreAnnotations.SectionStartAnnotation.class);
      if (sectionStart != null) {
        activeSection = sectionStart;
      }
      if (activeSection != null) {
        ChunkAnnotationUtils.copyUnsetAnnotations(activeSection, sentence);
      }
      String sectionEndMarker = lastToken.get(CoreAnnotations.SectionEndAnnotation.class);
      if (sectionEndMarker != null) {
        activeSection = null;
      }

      if (docID != null) {
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
      }

      // Stamp each token with its 1-based position, its sentence and (if known) its document.
      int indexInSentence = 0;
      for (CoreLabel token : sentenceTokens) {
        indexInSentence++;
        token.setIndex(indexInSentence);
        token.setSentIndex(sentenceIndex);
        if (docID != null) {
          token.setDocID(docID);
        }
      }

      sentences.add(sentence);
    }

    // Deliberately no check that tokensConsumed equals documentTokens.size(): the two can
    // differ when sentenceBoundaryToDiscard is initialized.

    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }