Example #1
 /**
  * Construct a fall-through tree in case we can't parse this sentence.
  *
  * @param words The words of the sentence
  * @return A tree with X for all the internal nodes
  */
 public static Tree xTree(List<? extends HasWord> words) {
   TreeFactory lstf = new LabeledScoredTreeFactory();
   List<Tree> lst2 = new ArrayList<Tree>();
   for (HasWord obj : words) {
     String s = obj.word();
     Tree t = lstf.newLeaf(s);
     // wrap each word leaf in an X preterminal
     Tree t2 = lstf.newTreeNode("X", Collections.singletonList(t));
     lst2.add(t2);
   }
   // a single X root spanning all the preterminals
   return lstf.newTreeNode("X", lst2);
 }
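
For reference, a minimal usage sketch (the sample tokens are made up for illustration; it assumes the xTree method above is in scope and the usual edu.stanford.nlp classes are on the classpath):

 // import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.trees.Tree;
 // import java.util.Arrays; import java.util.List;
 List<Word> words = Arrays.asList(new Word("colorless"), new Word("green"), new Word("ideas"));
 Tree fallback = xTree(words); // Word implements HasWord
 fallback.pennPrint();         // prints (X (X colorless) (X green) (X ideas))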
Example #2
 @SuppressWarnings("OverlyStrongTypeCast")
 private static String getString(Object o) {
   if (o instanceof HasWord) {
     HasWord h = (HasWord) o;
     return h.word();
   } else if (o instanceof String) {
     return (String) o;
   } else if (o instanceof CoreMap) {
     return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
   } else {
      throw new RuntimeException("Expected token to be a Word, String, or CoreMap.");
   }
 }
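
Since getString is private, the following is only an illustration of its dispatch over the three supported token types (assuming the method were visible here, plus the usual edu.stanford.nlp.ling and edu.stanford.nlp.util imports):

 String a = getString(new Word("dog"));  // HasWord branch -> "dog"
 String b = getString("dog");            // String branch  -> "dog"
 CoreLabel token = new CoreLabel();      // CoreLabel implements CoreMap
 token.set(CoreAnnotations.TextAnnotation.class, "dog");
 String c = getString(token);            // CoreMap branch -> "dog"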
Example #3
 /**
  * Turns a sentence into a flat phrasal tree. The structure is S -> tag*, where each tag
  * dominates a word. The tag is taken from the word if it is a TaggedWord, or made "WD"
  * otherwise. The tag and phrasal nodes have a StringLabel.
  *
  * @param s The Sentence to make the Tree from
  * @param lf The LabelFactory with which to create the new Tree labels
  * @return The one-phrasal-level Tree
  */
 public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
   List<Tree> daughters = new ArrayList<Tree>(s.length());
   for (HasWord word : s) {
     Tree wordNode = new LabeledScoredTreeLeaf(lf.newLabel(word.word()));
     if (word instanceof TaggedWord) {
       // wrap the leaf in a preterminal labeled with the word's tag
       TaggedWord taggedWord = (TaggedWord) word;
       wordNode =
           new LabeledScoredTreeNode(
               new StringLabel(taggedWord.tag()), Collections.singletonList(wordNode));
     } else {
       // no tag is available, so use the dummy tag "WD"
       wordNode =
           new LabeledScoredTreeNode(lf.newLabel("WD"), Collections.singletonList(wordNode));
     }
     daughters.add(wordNode);
   }
   return new LabeledScoredTreeNode(new StringLabel("S"), daughters);
 }
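
A minimal usage sketch, assuming the older JavaNLP API in which Sentence<T> is a list of words (the sample sentence is made up for illustration):

 // import edu.stanford.nlp.ling.*; import edu.stanford.nlp.trees.Tree;
 Sentence<TaggedWord> s = new Sentence<TaggedWord>();
 s.add(new TaggedWord("Dogs", "NNS"));
 s.add(new TaggedWord("bark", "VBP"));
 Tree flat = toFlatTree(s, new StringLabelFactory());
 flat.pennPrint(); // prints (S (NNS Dogs) (VBP bark))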
Example #4
  /**
   * Returns a List of Lists where each element is built from a run of Words in the input Document.
   * Specifically, reads through each word in the input document and breaks off a sentence after
   * finding a valid sentence boundary token or end of file. Note that for this to work, the words
   * in the input document must have been tokenized with a tokenizer that makes sentence boundary
   * tokens their own tokens (e.g., {@link PTBTokenizer}).
   *
   * @param words A list of already tokenized words (each must implement HasWord, be a String, or
   *     be a CoreMap)
   * @return A List of Lists of tokens, one inner List per sentence
   * @see #WordToSentenceProcessor(String, Set, Set, Pattern, Pattern)
   */
  public List<List<IN>> wordsToSentences(List<? extends IN> words) {
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = null;
    List<IN> lastSentence = null;
    boolean insideRegion = false;
    for (IN o : words) {
      String word;
      // normalize the three supported token representations to a plain String
      if (o instanceof HasWord) {
        HasWord h = (HasWord) o;
        word = h.word();
      } else if (o instanceof String) {
        word = (String) o;
      } else if (o instanceof CoreMap) {
        word = ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
      } else {
        throw new RuntimeException("Expected token to be a Word, String, or CoreMap.");
      }

      boolean forcedEnd = false;
      if (o instanceof CoreMap) {
        Boolean forcedEndValue =
            ((CoreMap) o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
        if (forcedEndValue != null) forcedEnd = forcedEndValue;
      }

      if (DEBUG) {
        EncodingPrintWriter.err.println("Word is " + word, "UTF-8");
      }
      if (currentSentence == null) {
        currentSentence = new ArrayList<IN>();
      }
      if (sentenceRegionBeginPattern != null && !insideRegion) {
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
        }
        // tokens outside the region (including the region-begin token itself) are discarded
        if (DEBUG) {
          System.err.println("  outside region");
        }
        continue;
      }
      if (sentenceBoundaryFollowers.contains(word)
          && lastSentence != null
          && currentSentence.isEmpty()) {
        lastSentence.add(o);
        if (DEBUG) {
          System.err.println("  added to last");
        }
      } else {
        boolean newSent = false;
        if (matchesSentenceBoundaryToDiscard(word)) {
          newSent = true;
        } else if (sentenceRegionEndPattern != null
            && sentenceRegionEndPattern.matcher(word).matches()) {
          insideRegion = false;
          newSent = true;
        } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
          currentSentence.add(o);
          if (DEBUG) {
            System.err.println("  is sentence boundary; added to current");
          }
          newSent = true;
        } else if (forcedEnd) {
          currentSentence.add(o);
          newSent = true;
          if (DEBUG) {
            System.err.println("  annotated to be the end of a sentence");
          }
        } else {
          currentSentence.add(o);
          if (DEBUG) {
            System.err.println("  added to current");
          }
        }
        if (newSent && currentSentence.size() > 0) {
          if (DEBUG) {
            System.err.println("  beginning new sentence");
          }
          sentences.add(currentSentence);
          // adds this sentence now that it's complete
          lastSentence = currentSentence;
          currentSentence = null; // clears the current sentence
        }
      }
    }

    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if (currentSentence != null && currentSentence.size() > 0) {
      sentences.add(currentSentence); // adds last sentence
    }
    return sentences;
  }
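
A minimal end-to-end sketch (assuming the no-argument WordToSentenceProcessor constructor with its default boundary tokens, and PTBTokenizer producing Word tokens):

  // import edu.stanford.nlp.ling.Word; import edu.stanford.nlp.process.*;
  // import java.io.StringReader; import java.util.List;
  PTBTokenizer<Word> tokenizer =
      PTBTokenizer.newPTBTokenizer(new StringReader("Dogs bark. Cats meow."));
  List<Word> tokens = tokenizer.tokenize();
  WordToSentenceProcessor<Word> splitter = new WordToSentenceProcessor<Word>();
  List<List<Word>> sentences = splitter.wordsToSentences(tokens);
  System.out.println(sentences.size()); // 2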