Java Tokenizer.next Examples

Programming Language: Java

Namespace/Package Name: edu.stanford.nlp.process

Class/Type: Tokenizer

Method/Function: next

Examples at hotexamples.com: 3

Java Tokenizer.next - 3 examples found. These are the top-rated real-world Java examples of edu.stanford.nlp.process.Tokenizer.next, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

tokenize(9)

hasNext(3)

next(3)

Example #1

Show file

File: PlainTextDocumentReaderAndWriter.java Project: PeterisP/LVTagger

  // TODO: give options for document splitting: a line, the whole file, or
  // sentence splitting (the current behavior).
  /**
   * Tokenizes the text read from {@code r}, folds tokens that match the
   * {@code sgml} pattern into the before/after annotations of their
   * neighbours, and hands the remaining tokens to {@code wts.process} to be
   * grouped into sentences.
   *
   * <p>Every returned token gets a {@code PositionAnnotation} holding its
   * zero-based index within its sentence, as a decimal string. The
   * {@code AfterAnnotation} is removed from every token and then restored on
   * the final token of the final sentence only.
   *
   * @param r source of the text to tokenize
   * @return an iterator over the sentences returned by {@code wts.process}
   */
  public Iterator<List<IN>> getIterator(Reader r) {
    Tokenizer<IN> tokenizer = tokenizerFactory.getTokenizer(r);
    List<IN> words = new ArrayList<IN>();
    // Placeholder "previous" token so tags seen before the first real token
    // have something to attach their text to.
    IN previous = tokenFactory.makeToken();
    // Text of skipped tags that must be prepended to the next real token.
    StringBuilder prepend = new StringBuilder();

    // This changes SGML tags into whitespace -- it should maybe be moved
    // elsewhere.
    while (tokenizer.hasNext()) {
      IN w = tokenizer.next();
      String word = w.get(CoreAnnotations.TextAnnotation.class);
      Matcher m = sgml.matcher(word);
      if (m.matches()) {
        // Tag token: do not emit it; record its text on both neighbours.
        String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
        String after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        prepend.append(before).append(word);
        String previousTokenAfter =
            StringUtils.getNotNullString(previous.get(CoreAnnotations.AfterAnnotation.class));
        previous.set(AfterAnnotation.class, previousTokenAfter + word + after);
      } else {
        if (prepend.length() > 0) {
          // Flush the accumulated tag text in front of this token's own
          // leading whitespace.
          String before = StringUtils.getNotNullString(w.get(CoreAnnotations.BeforeAnnotation.class));
          w.set(BeforeAnnotation.class, prepend.toString() + before);
          prepend.setLength(0);
        }
        words.add(w);
        previous = w;
      }
    }

    List<List<IN>> sentences = wts.process(words);
    String after = "";
    IN last = null;
    for (List<IN> sentence : sentences) {
      int pos = 0;
      for (IN w : sentence) {
        w.set(PositionAnnotation.class, Integer.toString(pos));
        // Advance the index; previously it was never incremented, so every
        // token was annotated with position "0".
        pos++;
        after = StringUtils.getNotNullString(w.get(CoreAnnotations.AfterAnnotation.class));
        w.remove(AfterAnnotation.class);
        last = w;
      }
    }
    if (last != null) {
      // Only the very last token keeps its trailing text.
      last.set(AfterAnnotation.class, after);
    }

    return sentences.iterator();
  }

Example #2

Show file

File: CoreNLPPreprocessor.java Project: wentaouc/phrasal

 /**
  * Tokenizes {@code input} and returns the text of each token as a sequence.
  *
  * <p>The input is trimmed and passed through {@code toUncased} before it is
  * given to the tokenizer obtained from {@code tf}. The {@code TextAnnotation}
  * of every token, in tokenizer order, makes up the result.
  *
  * @param input raw text to preprocess
  * @return the token texts as converted by {@code IStrings.toIStringSequence}
  */
 @Override
 public Sequence<IString> process(String input) {
   final StringReader source = new StringReader(toUncased(input.trim()));
   final Tokenizer<CoreLabel> tokens = tf.getTokenizer(source);
   final List<String> texts = new LinkedList<>();
   while (tokens.hasNext()) {
     final CoreLabel token = tokens.next();
     texts.add(token.get(TextAnnotation.class));
   }
   return IStrings.toIStringSequence(texts);
 }

Example #3

Show file

File: ArabicTreebankTokenizer.java Project: photon3710/es_test

 /**
  * Tokenizes the file named by {@code args[0]} with
  * {@code ArabicTreebankTokenizer} and prints each token to standard output
  * with no separator between tokens.
  *
  * @param args command-line arguments; {@code args[0]} is the path of the
  *     file to tokenize
  * @throws IOException if the file cannot be opened or the reader cannot be
  *     closed
  */
 public static void main(String[] args) throws IOException {
   FileReader reader = new FileReader(args[0]);
   try {
     Tokenizer<String> att = new ArabicTreebankTokenizer(reader);
     while (att.hasNext()) {
       System.out.print(att.next());
     }
   } finally {
     // Release the file handle even if tokenizing fails; the reader was
     // previously never closed.
     reader.close();
   }
 }