@Test public void testJacobEisensteinApostropheCase() { StringReader reader = new StringReader("it's"); PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens = tokenizer.tokenize(); // for (Word word : stemmedTokens) System.out.print (word+" "); reader = new StringReader(" it's "); tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens2 = tokenizer.tokenize(); // System.out.println (); // for (Word word : stemmedTokens2) System.out.print (word+" "); // System.out.println(); assertEquals(stemmedTokens, stemmedTokens2); }
@Test public void testInvertible() { String text = " This is a colourful sentence. "; PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true); List<CoreLabel> tokens = tokenizer.tokenize(); assertEquals(6, tokens.size()); assertEquals(" ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class)); assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class)); assertEquals( "Wrong begin char offset", 2, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)); assertEquals( "Wrong end char offset", 6, (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class)); assertEquals("This", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class)); // note: after(x) and before(x+1) are the same assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class)); assertEquals(" ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class)); // americanize is now off by default assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class)); assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class)); assertEquals("", tokens.get(4).after()); assertEquals("", tokens.get(5).before()); assertEquals(" ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class)); StringBuilder result = new StringBuilder(); result.append(tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class)); for (CoreLabel token : tokens) { result.append(token.get(CoreAnnotations.OriginalTextAnnotation.class)); String after = token.get(CoreAnnotations.AfterAnnotation.class); if (after != null) result.append(after); } assertEquals(text, result.toString()); for (int i = 0; i < tokens.size() - 1; ++i) { assertEquals( tokens.get(i).get(CoreAnnotations.AfterAnnotation.class), tokens.get(i + 1).get(CoreAnnotations.BeforeAnnotation.class)); } }
/**
 * Test program for demonstrating the Stemmer. It stems each word it is given and writes the
 * results to standard output, separated by spaces and followed by a newline.
 *
 * <p>With {@code -file file-name} the named file is read as UTF-8, tokenized with
 * {@link PTBTokenizer}, and every token is stemmed. Otherwise each command-line argument is
 * treated as a word and stemmed directly. With no arguments only the trailing newline is
 * printed.
 *
 * <p>Note that the word stemmed is expected to be in lower case: forcing lower case must be done
 * outside the Stemmer class.
 *
 * <p>Usage: {@code Stemmer -file file-name} or {@code Stemmer word word ...}
 *
 * @param args either {@code -file} followed by a file name, or the words to stem
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  Stemmer s = new Stemmer();
  // Check the length first: an empty argument list must not index past the end of the array.
  if (args.length > 0 && args[0].equals("-file")) {
    if (args.length < 2) {
      System.err.println("Usage: Stemmer -file file-name | Stemmer word word ...");
      return;
    }
    // try-with-resources so the file handle is released even if tokenizing or stemming fails.
    try (FileInputStream fileIn = new FileInputStream(args[1]);
        InputStreamReader in = new InputStreamReader(fileIn, "utf-8")) {
      Iterator<Word> it = PTBTokenizer.newPTBTokenizer(in);
      while (it.hasNext()) {
        Word token = it.next();
        System.out.print(s.stem(token.word()));
        System.out.print(' ');
      }
    }
  } else {
    for (String arg : args) {
      System.out.print(s.stem(arg));
      System.out.print(' ');
    }
  }
  System.out.println();
}