예제 #1
0
 /**
  * Checks how a sentence containing fractions and a sentence-final abbreviation is tokenized,
  * once with the default factory and once with a factory built with {@code strictTreebank3}.
  *
  * <p>The default expectation keeps each space-separated mixed number as a single token joined
  * by a no-break space (U+00A0) and keeps the trailing period on {@code U.S.S.R.}; the strict
  * expectation splits the whole number from its fraction and lists {@code U.S.S.R} without the
  * trailing period. Both expectations contain a separate {@code .} token after the abbreviation.
  */
 @Test
 public void testFractions() {
   TokenizerFactory<CoreLabel> defaultFactory = PTBTokenizer.coreLabelFactory();
   TokenizerFactory<CoreLabel> strictFactory = PTBTokenizer.coreLabelFactory("strictTreebank3");

   // A single input: the first mixed number uses a plain space, the second a no-break space.
   String[] input = {"5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"};

   // Expected tokens for the default factory.
   String[][] expectedDefault = {
     {
       "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16",
       "in", "the", "U.S.S.R.", ".",
       "Why", "not", "?"
     }
   };

   // Expected tokens for the strictTreebank3 factory.
   String[][] expectedStrict = {
     {
       "5-1/4", "plus", "2", "3/16", "=", "7", "7/16",
       "in", "the", "U.S.S.R", ".",
       "Why", "not", "?"
     }
   };

   runOnTwoArrays(defaultFactory, input, expectedDefault);
   runOnTwoArrays(strictFactory, input, expectedStrict);
 }
예제 #2
0
 /**
  * Runs the {@code sgmlInputs} fixture through a default-configured core-label factory and
  * compares the result against {@code sgmlGold} via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerSGML() {
   runOnTwoArrays(PTBTokenizer.coreLabelFactory(), sgmlInputs, sgmlGold);
 }
예제 #3
0
 /**
  * Runs the {@code ptbInputs} fixture through a factory built with {@code splitHyphenated=true}
  * and compares the result against {@code ptbGoldSplitHyphenated} via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerTokenizeSplitHyphens() {
   final String options = "splitHyphenated=true";
   final TokenizerFactory<CoreLabel> hyphenSplittingFactory =
       PTBTokenizer.coreLabelFactory(options);
   runOnTwoArrays(hyphenSplittingFactory, ptbInputs, ptbGoldSplitHyphenated);
 }
예제 #4
0
 /**
  * Runs the {@code sgmlInputs} fixture through a factory built with {@code tokenizePerLine=true}
  * and compares the result against {@code sgmlPerLineGold} via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerTokenizePerLineSGML() {
   runOnTwoArrays(
       PTBTokenizer.coreLabelFactory("tokenizePerLine=true"), sgmlInputs, sgmlPerLineGold);
 }