Beispiel #1
0
  /**
   * Tokenizes the text held in the first field of {@code input} with a {@code PTBTokenizer} and
   * returns the tokens as a bag of single-field tuples.
   *
   * <p>A token is kept only when its {@code value()} is longer than two characters; the string
   * stored in the output tuple is the token's {@code word()}.
   *
   * @param input tuple whose first field is converted with {@code toString()} and tokenized
   * @return a bag holding one tuple per retained token, or {@code null} when {@code input} is
   *     {@code null}, has no fields, or has a null first field
   * @throws IOException propagated from the calls made on {@code input}
   */
  public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
      return null;
    }

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    StringReader textInput = new StringReader(input.get(0).toString());
    // Parameterizing the tokenizer removes the unchecked cast on every next() call.
    PTBTokenizer<CoreLabel> ptbt =
        new PTBTokenizer<CoreLabel>(textInput, new CoreLabelTokenFactory(), "");

    while (ptbt.hasNext()) {
      CoreLabel label = ptbt.next();
      // Length filter is applied to value(), while word() is what gets emitted.
      if (label.value().length() > 2) {
        bagOfTokens.add(tupleFactory.newTuple(label.word()));
      }
    }
    return bagOfTokens;
  }
 /**
  * Tokenizes the file named by {@code filename} and writes it back out one sentence per line to
  * {@code filename + ".tokenized.SPL"}.
  *
  * <p>Tokens are collected into a {@code Sentence} until a {@code "."}, {@code "?"} or
  * {@code "!"} token is seen. At that point the sentence is printed if
  * {@code isCleanSentence()} returns true, and a new sentence is started either way. Tokens
  * that follow the last terminator are never written.
  *
  * <p>NOTE(review): {@code filename} is not declared in this method and {@code args} is not
  * read; presumably {@code filename} is a static field of the enclosing class -- confirm.
  *
  * @param args command-line arguments (unused)
  * @throws IOException if the input file cannot be opened or closed, or the output file cannot
  *     be created
  */
 public static void main(String[] args) throws IOException {
   // Open the input first so that a missing input file does not create or truncate the output.
   FileReader reader = new FileReader(filename);
   try {
     PrintWriter pw = new PrintWriter(filename + ".tokenized.SPL");
     try {
       PTBTokenizer<CoreLabel> tokenizer =
           new PTBTokenizer<CoreLabel>(reader, new CoreLabelTokenFactory(), "");
       Sentence s = new Sentence();
       while (tokenizer.hasNext()) {
         CoreLabel label = tokenizer.next();
         String token = restoreBracket(label.toString("value"));
         s.words.add(token);
         // is it sentence splitter?
         if (token.equals(".") || token.equals("?") || token.equals("!")) {
           if (s.isCleanSentence()) {
             pw.println(s.toString());
           }
           s = new Sentence();
         }
       }
     } finally {
       // Always flush and release the output, even if tokenizing fails part-way through.
       pw.close();
     }
   } finally {
     reader.close();
   }
 }

 /** Maps a PTB bracket escape such as {@code -LRB-} back to its literal bracket character. */
 private static String restoreBracket(String token) {
   if (token.equals("-LRB-")) {
     return "(";
   } else if (token.equals("-RRB-")) {
     return ")";
   } else if (token.equals("-LSB-")) {
     return "[";
   } else if (token.equals("-RSB-")) {
     return "]";
   } else if (token.equals("-LCB-")) {
     return "{";
   } else if (token.equals("-RCB-")) {
     return "}";
   }
   return token;
 }