Example #1
0
 /**
  * Checks that tokenizing {@code "it's"} yields the same token list whether or not the input is
  * padded with a leading and trailing space.
  */
 @Test
 public void testJacobEisensteinApostropheCase() {
   List<Word> bareTokens = PTBTokenizer.newPTBTokenizer(new StringReader("it's")).tokenize();
   List<Word> paddedTokens = PTBTokenizer.newPTBTokenizer(new StringReader(" it's ")).tokenize();

   assertEquals(bareTokens, paddedTokens);
 }
Example #2
0
  /**
   * Tokenizes a whitespace-padded sentence with the three-argument {@code newPTBTokenizer} factory
   * (flags {@code false, true}) and checks the before/after whitespace, character offsets and
   * original text recorded on each token, then verifies that the input string can be rebuilt
   * exactly from those annotations.
   */
  @Test
  public void testInvertible() {
    final String text = "  This     is     a      colourful sentence.    ";
    final StringReader input = new StringReader(text);
    final PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(input, false, true);
    final List<CoreLabel> tokens = tokenizer.tokenize();
    assertEquals(6, tokens.size());

    // Annotations on the first token ("This").
    final CoreLabel first = tokens.get(0);
    assertEquals("  ", first.get(CoreAnnotations.BeforeAnnotation.class));
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    final int beginOffset = first.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    assertEquals("Wrong begin char offset", 2, beginOffset);
    final int endOffset = first.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    assertEquals("Wrong end char offset", 6, endOffset);
    assertEquals("This", first.get(CoreAnnotations.OriginalTextAnnotation.class));

    // The whitespace after token x is the same string as the whitespace before token x+1.
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));

    // americanize is now off by default, so the British spelling is kept in both annotations
    final CoreLabel colourful = tokens.get(3);
    assertEquals("colourful", colourful.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("colourful", colourful.get(CoreAnnotations.OriginalTextAnnotation.class));

    // No whitespace separates "sentence" from the final "."; the trailing padding follows the ".".
    final CoreLabel lastToken = tokens.get(5);
    assertEquals("", tokens.get(4).after());
    assertEquals("", lastToken.before());
    assertEquals("    ", lastToken.get(CoreAnnotations.AfterAnnotation.class));

    // Rebuild the input: leading whitespace, then each token's original text and trailing space.
    final StringBuilder rebuilt =
        new StringBuilder().append(first.get(CoreAnnotations.BeforeAnnotation.class));
    for (CoreLabel label : tokens) {
      rebuilt.append(label.get(CoreAnnotations.OriginalTextAnnotation.class));
      final String trailing = label.get(CoreAnnotations.AfterAnnotation.class);
      if (trailing != null) {
        rebuilt.append(trailing);
      }
    }
    assertEquals(text, rebuilt.toString());

    // Every adjacent pair of tokens must agree on the whitespace between them.
    for (int next = 1; next < tokens.size(); next++) {
      final CoreLabel previous = tokens.get(next - 1);
      final CoreLabel current = tokens.get(next);
      assertEquals(
          previous.get(CoreAnnotations.AfterAnnotation.class),
          current.get(CoreAnnotations.BeforeAnnotation.class));
    }
  }
Example #3
0
 /**
  * Test program for demonstrating the Stemmer. When the first argument is {@code -file}, the file
  * named by the second argument is read as UTF-8, tokenized with {@code PTBTokenizer}, and each
  * token is stemmed; otherwise every command-line argument is stemmed directly. The stems are
  * written to standard output, each followed by a space, and the output ends with a newline. Note
  * that the word stemmed is expected to be in lower case: forcing lower case must be done outside
  * the Stemmer class.
  *
  * <p>Usage: {@code Stemmer -file file-name} or {@code Stemmer word word ...}
  *
  * @param args either {@code -file} followed by a file name, or the words to stem
  * @throws IOException if the input file cannot be opened or closed
  */
 public static void main(String[] args) throws IOException {
   Stemmer s = new Stemmer();
   // Guard the index: with no arguments there is nothing to stem, so fall through to the
   // word loop (which is then empty) instead of failing on args[0].
   if (args.length > 0 && args[0].equals("-file")) {
     if (args.length < 2) {
       System.err.println("Usage: Stemmer -file file-name");
       return;
     }
     // try-with-resources closes the file even if tokenizing or stemming throws.
     try (FileInputStream in = new FileInputStream(args[1]);
         InputStreamReader reader = new InputStreamReader(in, "utf-8")) {
       Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
       while (it.hasNext()) {
         Word token = it.next();
         System.out.print(s.stem(token.word()));
         System.out.print(' ');
       }
     }
   } else {
     for (String arg : args) {
       System.out.print(s.stem(arg));
       System.out.print(' ');
     }
   }
   System.out.println();
 }