// todo: give options for document splitting. A line or the whole file or // sentence splitting as now public Iterator<List<IN>> getIterator(Reader r) { Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r); // PTBTokenizer.newPTBTokenizer(r, false, true); List<IN> words = new ArrayList<IN>(); IN previous = tokenFactory.makeToken(); StringBuilder prepend = new StringBuilder(); /* * This changes SGML tags into whitespace -- it should maybe be moved * elsewhere */ while (tokenizer.hasNext()) { IN w = tokenizer.next(); String word = w.get(CoreAnnotations.TextAnnotation.class); Matcher m = sgml.matcher(word); if (m.matches()) { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); prepend.append(before).append(word); String previousTokenAfter = StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class)); previous.set(AfterAnnotation.class, previousTokenAfter + word + after); // previous.appendAfter(w.word() + w.after()); } else { String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class)); if (prepend.length() > 0) { w.set(BeforeAnnotation.class, prepend.toString() + before); // w.prependBefore(prepend.toString()); prepend = new StringBuilder(); } words.add(w); previous = w; } } List<List<IN>> sentences = wts.process(words); String after = ""; IN last = null; for (List<IN> sentence : sentences) { int pos = 0; for (IN w : sentence) { w.set(PositionAnnotation.class, Integer.toString(pos)); after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class)); w.remove(AfterAnnotation.class); last = w; } } if (last != null) { last.set(AfterAnnotation.class, after); } return sentences.iterator(); }
@Override public Sequence<IString> process(String input) { String tokenizerInput = toUncased(input.trim()); Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput)); List<String> outputStrings = new LinkedList<>(); while (tokenizer.hasNext()) { String string = tokenizer.next().get(TextAnnotation.class); outputStrings.add(string); } return IStrings.toIStringSequence(outputStrings); }
public static void main(String[] args) throws IOException { Tokenizer<String> att = new ArabicTreebankTokenizer(new FileReader(args[0])); while (att.hasNext()) { System.out.print(att.next()); } }