@Test public void testCorp() { // We test a 2x2 design: {strict, regular} x {no following context, following context} for (int sent = 0; sent < 4; sent++) { PTBTokenizer<CoreLabel> ptbTokenizer = new PTBTokenizer<>( new StringReader(corpInputs[sent / 2]), new CoreLabelTokenFactory(), (sent % 2 == 0) ? "strictTreebank3" : ""); int i = 0; while (ptbTokenizer.hasNext()) { CoreLabel w = ptbTokenizer.next(); try { assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word()); } catch (ArrayIndexOutOfBoundsException aioobe) { // the assertion below outside the loop will fail } i++; } if (i != corpGold[sent % 2].length) { System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2])); List<CoreLabel> tokens = new PTBTokenizer<>( new StringReader(corpInputs[sent / 2]), new CoreLabelTokenFactory(), (sent % 2 == 0) ? "strictTreebank3" : "") .tokenize(); System.out.println("Guess: " + SentenceUtils.listToString(tokens)); System.out.flush(); } assertEquals("PTBTokenizer num tokens problem", i, corpGold[sent % 2].length); } }
@Test
public void testFractions() {
  // Fraction handling differs by mode: the default tokenizer keeps mixed
  // numbers glued with a non-breaking space ("2\u00A03/16"), while
  // strictTreebank3 splits the whole-number part from the fraction and also
  // separates the final period of "U.S.S.R.".
  String[] input = {"5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"};
  String[][] goldNormal = {
    {
      "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16", "in", "the", "U.S.S.R.", ".",
      "Why", "not", "?"
    }
  };
  String[][] goldStrict = {
    {
      "5-1/4", "plus", "2", "3/16", "=", "7", "7/16", "in", "the", "U.S.S.R", ".",
      "Why", "not", "?"
    }
  };
  runOnTwoArrays(PTBTokenizer.coreLabelFactory(), input, goldNormal);
  runOnTwoArrays(PTBTokenizer.coreLabelFactory("strictTreebank3"), input, goldStrict);
}
@Test public void testJacobEisensteinApostropheCase() { StringReader reader = new StringReader("it's"); PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens = tokenizer.tokenize(); // for (Word word : stemmedTokens) System.out.print (word+" "); reader = new StringReader(" it's "); tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens2 = tokenizer.tokenize(); // System.out.println (); // for (Word word : stemmedTokens2) System.out.print (word+" "); // System.out.println(); assertEquals(stemmedTokens, stemmedTokens2); }
@Test
public void testUntok() {
  // Use a JUnit assertion for the fixture sanity check rather than the Java
  // `assert` keyword, which is silently skipped unless the JVM runs with -ea.
  assertEquals("untok test fixtures are mismatched", untokInputs.length, untokOutputs.length);
  // ptb2Text should invert tokenization: each tokenized input maps back to
  // the expected plain-text form.
  for (int i = 0; i < untokInputs.length; i++) {
    assertEquals(
        "untok gave the wrong result", untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
  }
}
// Verifies that invertible tokenization (newPTBTokenizer(reader, false, true))
// records enough Before/After/OriginalText/offset annotations to reconstruct
// the original string exactly, including leading/trailing/interior whitespace.
@Test
public void testInvertible() {
  String text = "  This     is     a colourful sentence.    ";
  PTBTokenizer<CoreLabel> tokenizer =
      PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true);
  List<CoreLabel> tokens = tokenizer.tokenize();
  assertEquals(6, tokens.size());
  // First token: preceded by the two leading spaces, followed by the run of
  // spaces before "is"; char offsets are into the original string.
  assertEquals("  ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  assertEquals("     ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals(
      "Wrong begin char offset",
      2,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals(
      "Wrong end char offset",
      6,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("This", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class));
  // note: after(x) and before(x+1) are the same
  assertEquals("     ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
  // americanize is now off by default
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class));
  // "sentence" and "." are adjacent: no whitespace between tokens 4 and 5.
  assertEquals("", tokens.get(4).after());
  assertEquals("", tokens.get(5).before());
  assertEquals("    ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class));
  // Round-trip: Before(first) + sum of OriginalText + After must rebuild the
  // input string byte-for-byte.
  StringBuilder result = new StringBuilder();
  result.append(tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  for (CoreLabel token : tokens) {
    result.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
    String after = token.get(CoreAnnotations.AfterAnnotation.class);
    if (after != null) result.append(after);
  }
  assertEquals(text, result.toString());
  // Invariant: After of token i equals Before of token i+1 for every pair.
  for (int i = 0; i < tokens.size() - 1; ++i) {
    assertEquals(
        tokens.get(i).get(CoreAnnotations.AfterAnnotation.class),
        tokens.get(i + 1).get(CoreAnnotations.BeforeAnnotation.class));
  }
}
@Test
public void testPTBTokenizerSGML() {
  // SGML/XML markup inputs tokenized with the default CoreLabel factory.
  runOnTwoArrays(PTBTokenizer.coreLabelFactory(), sgmlInputs, sgmlGold);
}
@Test
public void testPTBTokenizerWord() {
  // Exercise the Word-based (non-CoreLabel) factory on the standard PTB fixtures.
  TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
  runOnTwoArrays(wordFactory, ptbInputs, ptbGold);
}
@Test
public void testPTBTokenizerMT() {
  // MT-style inputs through the default Word factory.
  runOnTwoArrays(PTBTokenizer.factory(), mtInputs, mtGold);
}
@Test
public void testPTBTokenizerTokenizeSplitHyphens() {
  // With splitHyphenated=true, hyphenated words are split into separate tokens,
  // so the PTB inputs map to the hyphen-split gold fixtures.
  runOnTwoArrays(
      PTBTokenizer.coreLabelFactory("splitHyphenated=true"), ptbInputs, ptbGoldSplitHyphenated);
}
@Test
public void testPTBTokenizerTokenizePerLineSGML() {
  // tokenizePerLine=true tokenizes each input line independently, giving the
  // per-line SGML gold fixtures.
  runOnTwoArrays(
      PTBTokenizer.coreLabelFactory("tokenizePerLine=true"), sgmlInputs, sgmlPerLineGold);
}