Java WordToSentenceProcessor Examples

Programming Language: Java

Namespace/Package Name: edu.stanford.nlp.process

Examples at hotexamples.com: 4

Java WordToSentenceProcessor - 4 examples found. These are the top-rated real-world Java examples of edu.stanford.nlp.process.WordToSentenceProcessor, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

process(2)

stringToNewlineIsSentenceBreak(2)

setSentenceBoundaryToDiscard(1)

Example #1

Show file

File: PlainTextDocumentReaderAndWriter.java Project: PeterisP/LVTagger

  // todo: give options for document splitting. A line or the whole file or
  // sentence splitting as now
  /**
   * Tokenizes the text read from {@code r}, folds tokens matching the {@code sgml} pattern into
   * the whitespace annotations of their neighbours, splits the remaining tokens into sentences
   * with {@code wts}, and returns an iterator over those sentences.
   *
   * <p>Tokens that match {@code sgml} are not part of the output. Their text is appended to the
   * {@code AfterAnnotation} of the preceding kept token and prepended to the
   * {@code BeforeAnnotation} of the next kept token.
   *
   * <p>Every returned token carries a {@code PositionAnnotation} holding its zero-based index
   * within its sentence (as a string). The {@code AfterAnnotation} is removed from every token
   * and then restored on the very last token only.
   *
   * @param r source of the text to tokenize
   * @return an iterator over the sentences, each a list of tokens
   */
  public Iterator<List<IN>> getIterator(Reader r) {
    Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r);
    // PTBTokenizer.newPTBTokenizer(r, false, true);
    List<IN> words = new ArrayList<IN>();
    // Placeholder that absorbs any SGML text seen before the first kept token; it is never
    // added to the word list.
    IN previous = tokenFactory.makeToken();
    // Text of skipped SGML tokens waiting to be attached to the next kept token.
    StringBuilder prepend = new StringBuilder();

    /*
     * This changes SGML tags into whitespace -- it should maybe be moved
     * elsewhere
     */
    while (tokenizer.hasNext()) {
      IN w = tokenizer.next();
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      Matcher m = sgml.matcher(word);
      if (m.matches()) {
        // SGML token: drop it from the token stream but keep its text in the "after" of the
        // previous token and queue it for the "before" of the next one.
        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        prepend.append(before).append(word);
        String previousTokenAfter =
            StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
        previous.set(AfterAnnotation.class, previousTokenAfter + word + after);
      } else {
        if (prepend.length() > 0) {
          String before =
              StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
          w.set(BeforeAnnotation.class, prepend.toString() + before);
          prepend.setLength(0);
        }
        words.add(w);
        previous = w;
      }
    }

    List<List<IN>> sentences = wts.process(words);
    String after = "";
    IN last = null;
    for (List<IN> sentence : sentences) {
      int pos = 0;
      for (IN w : sentence) {
        w.set(PositionAnnotation.class, Integer.toString(pos));
        // Advance the in-sentence position; without this every token was labelled "0".
        pos++;
        after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        w.remove(AfterAnnotation.class);
        last = w;
      }
    }
    // Only the final token of the document keeps its trailing text.
    if (last != null) {
      last.set(AfterAnnotation.class, after);
    }

    return sentences.iterator();
  }

Example #2

Show file

File: WordsToSentencesAnnotator.java Project: StonyBrookNLP/stingysentiment

 /**
  * Creates an annotator backed by a {@link WordToSentenceProcessor} built from the given
  * splitting options. Delegates to the three-argument constructor, passing {@code false} as its
  * second argument.
  *
  * @param verbose forwarded unchanged to the delegated constructor
  * @param boundaryTokenRegex first argument of the {@code WordToSentenceProcessor} constructor
  * @param boundaryToDiscard second argument of the {@code WordToSentenceProcessor} constructor
  * @param htmlElementsToDiscard third argument of the {@code WordToSentenceProcessor} constructor
  * @param newlineIsSentenceBreak string form of the newline policy; converted with
  *     {@code WordToSentenceProcessor.stringToNewlineIsSentenceBreak} before being passed on
  */
 public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
     Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
     String newlineIsSentenceBreak) {
   this(verbose, false,
       new WordToSentenceProcessor<CoreLabel>(
           boundaryTokenRegex, boundaryToDiscard, htmlElementsToDiscard,
           WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak)));
 }

Example #3

Show file

File: WordsToSentencesAnnotator.java Project: StonyBrookNLP/stingysentiment

 /**
  * Creates an annotator backed by a {@link WordToSentenceProcessor} built from the given
  * splitting options, including an optional multi-token boundary pattern. Delegates to the
  * three-argument constructor, passing {@code false} as its second argument.
  *
  * @param verbose forwarded unchanged to the delegated constructor
  * @param boundaryTokenRegex first argument of the {@code WordToSentenceProcessor} constructor
  * @param boundaryToDiscard second argument of the {@code WordToSentenceProcessor} constructor
  * @param htmlElementsToDiscard third argument of the {@code WordToSentenceProcessor} constructor
  * @param newlineIsSentenceBreak string form of the newline policy; converted with
  *     {@code WordToSentenceProcessor.stringToNewlineIsSentenceBreak} before being passed on
  * @param boundaryMultiTokenRegex pattern source compiled with
  *     {@code TokenSequencePattern.compile}; when {@code null}, {@code null} is passed instead
  * @param tokenRegexesToDiscard last argument of the {@code WordToSentenceProcessor} constructor
  */
 public WordsToSentencesAnnotator(boolean verbose, String boundaryTokenRegex,
     Set<String> boundaryToDiscard, Set<String> htmlElementsToDiscard,
     String newlineIsSentenceBreak, String boundaryMultiTokenRegex,
     Set<String> tokenRegexesToDiscard) {
   this(verbose, false,
       new WordToSentenceProcessor<CoreLabel>(
           boundaryTokenRegex, boundaryToDiscard, htmlElementsToDiscard,
           WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
           // Only compile the multi-token pattern when one was actually supplied.
           boundaryMultiTokenRegex == null
               ? null
               : TokenSequencePattern.compile(boundaryMultiTokenRegex),
           tokenRegexesToDiscard));
 }

Example #4

Show file

File: WordsToSentencesAnnotator.java Project: nicholas-leonard/CoreNLP

  /**
   * Groups the document's tokens into sentences using {@code wts} and stores the resulting
   * sentence annotations on {@code annotation} under {@code SentencesAnnotation}.
   *
   * <p>When {@code countLineNumbers} is on, every list returned by the splitter (including an
   * empty one) advances the line counter; empty lists are then skipped and never become
   * sentences. When it is off, an empty list is treated as an error.
   *
   * @param annotation document annotation that must already contain {@code TokensAnnotation}
   * @throws IllegalArgumentException if {@code annotation} has no {@code TokensAnnotation}
   * @throws IllegalStateException if the splitter yields an empty sentence while
   *     {@code countLineNumbers} is off
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      System.err.print("Sentence splitting ...");
    }
    if (!annotation.has(CoreAnnotations.TokensAnnotation.class)) {
      throw new IllegalArgumentException(
          "WordsToSentencesAnnotator: unable to find words/tokens in: " + annotation);
    }

    // Pull the raw text, the token list and the (optional) document id off the document.
    String text = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);

    int tokenOffset = 0; // running count of tokens placed into sentences so far
    int lineNumber = 0;
    CoreMap sectionAnnotations = null; // annotations of the section currently open, if any
    List<CoreMap> sentences = new ArrayList<>();

    for (List<CoreLabel> sentenceTokens : wts.process(tokens)) {
      if (countLineNumbers) {
        lineNumber++;
      }
      if (sentenceTokens.isEmpty()) {
        if (countLineNumbers) {
          // An empty list only stands for an empty line; it is counted above and dropped here.
          continue;
        }
        throw new IllegalStateException("unexpected empty sentence: " + sentenceTokens);
      }

      CoreLabel firstToken = sentenceTokens.get(0);
      CoreLabel lastToken = sentenceTokens.get(sentenceTokens.size() - 1);
      int sentenceIndex = sentences.size();

      // The sentence text is the slice of the document between the first token's start
      // offset and the last token's end offset.
      int begin = firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int end = lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      Annotation sentence = new Annotation(text.substring(begin, end));

      // Character span, tokens, token span and sentence index.
      sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
      sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
      sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
      int tokenBegin = tokenOffset;
      tokenOffset = tokenBegin + sentenceTokens.size();
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin);
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenOffset);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIndex);

      if (countLineNumbers) {
        sentence.set(CoreAnnotations.LineNumberAnnotation.class, lineNumber);
      }

      // Section handling: a section is assumed to open on the first token of a sentence and
      // to close on the last token of a sentence.
      CoreMap sectionStart = firstToken.get(CoreAnnotations.SectionStartAnnotation.class);
      if (sectionStart != null) {
        sectionAnnotations = sectionStart;
      }
      if (sectionAnnotations != null) {
        // Copy the open section's annotations onto the sentence where not already set.
        ChunkAnnotationUtils.copyUnsetAnnotations(sectionAnnotations, sentence);
      }
      String sectionEnd = lastToken.get(CoreAnnotations.SectionEndAnnotation.class);
      if (sectionEnd != null) {
        sectionAnnotations = null;
      }

      if (docID != null) {
        sentence.set(CoreAnnotations.DocIDAnnotation.class, docID);
      }

      // Tokens are numbered from 1 within the sentence and tagged with the sentence index.
      int tokenIndex = 0;
      for (CoreLabel token : sentenceTokens) {
        tokenIndex++;
        token.setIndex(tokenIndex);
        token.setSentIndex(sentenceIndex);
        if (docID != null) {
          token.setDocID(docID);
        }
      }

      sentences.add(sentence);
    }

    // Deliberately no check that tokenOffset equals tokens.size(): per the original note, the
    // two can differ when sentenceBoundaryToDiscard is initialized.

    // Attach the finished sentence list to the document.
    annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
  }