/**
 * Tokenizes one sentence containing fractions and a sentence-final abbreviation with two
 * factories: the default CoreLabel factory and one built with the "strictTreebank3" option.
 *
 * <p>The gold arrays below expect the default factory to produce mixed numbers as single
 * tokens joined by a no-break space (U+00A0) and to keep "U.S.S.R." whole before a separate
 * ".", while the strict factory is expected to emit the whole number and the fraction as
 * separate tokens and to produce "U.S.S.R" followed by ".".
 */
@Test
public void testFractions() {
  final TokenizerFactory<CoreLabel> defaultFactory = PTBTokenizer.coreLabelFactory();
  final TokenizerFactory<CoreLabel> strictFactory = PTBTokenizer.coreLabelFactory("strictTreebank3");

  // Note the mix of a plain space ("2 3/16") and an escaped no-break space ("7\u00A07/16").
  final String[] inputs = {
    "5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"
  };

  final String[][] expectedDefault = {
    {
      "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16",
      "in", "the", "U.S.S.R.", ".",
      "Why", "not", "?"
    }
  };

  final String[][] expectedStrict = {
    {
      "5-1/4", "plus", "2", "3/16", "=", "7", "7/16",
      "in", "the", "U.S.S.R", ".",
      "Why", "not", "?"
    }
  };

  // NOTE(review): runOnTwoArrays is defined elsewhere in this file; presumably it tokenizes
  // each input and compares the result with the matching gold row -- confirm.
  runOnTwoArrays(defaultFactory, inputs, expectedDefault);
  runOnTwoArrays(strictFactory, inputs, expectedStrict);
}
/**
 * Runs the default CoreLabel tokenizer factory (no options) over the shared
 * {@code sgmlInputs} fixture, using {@code sgmlGold} as the expected output.
 */
@Test
public void testPTBTokenizerSGML() {
  final TokenizerFactory<CoreLabel> defaultFactory = PTBTokenizer.coreLabelFactory();
  runOnTwoArrays(defaultFactory, sgmlInputs, sgmlGold);
}
/**
 * Runs a CoreLabel tokenizer factory built with the "splitHyphenated=true" option over the
 * shared {@code ptbInputs} fixture, using {@code ptbGoldSplitHyphenated} as the expected output.
 */
@Test
public void testPTBTokenizerTokenizeSplitHyphens() {
  final String options = "splitHyphenated=true";
  final TokenizerFactory<CoreLabel> splitHyphenFactory = PTBTokenizer.coreLabelFactory(options);
  runOnTwoArrays(splitHyphenFactory, ptbInputs, ptbGoldSplitHyphenated);
}
/**
 * Runs a CoreLabel tokenizer factory built with the "tokenizePerLine=true" option over the
 * shared {@code sgmlInputs} fixture, using {@code sgmlPerLineGold} as the expected output.
 */
@Test
public void testPTBTokenizerTokenizePerLineSGML() {
  final String options = "tokenizePerLine=true";
  final TokenizerFactory<CoreLabel> perLineFactory = PTBTokenizer.coreLabelFactory(options);
  runOnTwoArrays(perLineFactory, sgmlInputs, sgmlPerLineGold);
}