public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
public static void main(String[] args) throws IOException { PrintWriter pw = new PrintWriter(filename + ".tokenized.SPL"); @SuppressWarnings({"unchecked", "rawtypes"}) PTBTokenizer tokenizer = new PTBTokenizer(new FileReader(filename), new CoreLabelTokenFactory(), ""); Sentence s = new Sentence(); for (CoreLabel label; tokenizer.hasNext(); ) { label = (CoreLabel) tokenizer.next(); String token = label.toString("value"); if (token.equals("-LRB-")) { token = "("; } if (token.equals("-RRB-")) { token = ")"; } if (token.equals("-LSB-")) { token = "["; } if (token.equals("-RSB-")) { token = "]"; } if (token.equals("-LCB-")) { token = "{"; } if (token.equals("-RCB-")) { token = "}"; } s.words.add(token); // is it sentence splitter? if (token.equals(".") || token.equals("?") || token.equals("!")) { if (s.isCleanSentence()) { pw.println(s.toString()); } s = new Sentence(); } } pw.close(); }