예제 #1
0
 /**
  * Tokenizes each entry of {@code corpInputs} with and without the {@code strictTreebank3} option
  * and compares the resulting words, and their count, against the matching row of {@code
  * corpGold}.
  *
  * <p>On a token-count mismatch the gold row and the full tokenizer output are printed to stdout
  * before the final count assertion fails.
  */
 @Test
 public void testCorp() {
   // We test a 2x2 design: {strict, regular} x {no following context, following context}
   for (int sent = 0; sent < 4; sent++) {
     // Even iterations use the strict option; the input advances every second iteration.
     final String options = (sent % 2 == 0) ? "strictTreebank3" : "";
     final String input = corpInputs[sent / 2];
     final int goldLength = corpGold[sent % 2].length;

     PTBTokenizer<CoreLabel> ptbTokenizer =
         new PTBTokenizer<>(new StringReader(input), new CoreLabelTokenFactory(), options);
     int i = 0;
     while (ptbTokenizer.hasNext()) {
       CoreLabel w = ptbTokenizer.next();
       // Compare only while gold tokens remain; surplus tokens are still counted and are
       // reported by the token-count assertion after the loop.
       if (i < goldLength) {
         assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
       }
       i++;
     }
     if (i != goldLength) {
       // Dump both sides so the count failure below is easy to diagnose.
       System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
       List<CoreLabel> tokens =
           new PTBTokenizer<>(new StringReader(input), new CoreLabelTokenFactory(), options)
               .tokenize();
       System.out.println("Guess: " + SentenceUtils.listToString(tokens));
       System.out.flush();
     }
     // Expected value first, actual second, so the failure message reads correctly.
     assertEquals("PTBTokenizer num tokens problem", goldLength, i);
   }
 }
예제 #2
0
 /**
  * Runs one sentence containing fractions and an abbreviation through both the default CoreLabel
  * factory and one built with {@code strictTreebank3}, checking each against its own expected
  * token sequence via {@code runOnTwoArrays}.
  */
 @Test
 public void testFractions() {
   final String[] inputs = {"5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"};

   // Expected tokens from the default factory.
   final String[][] expectedNormal = {
     {
       "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16", "in", "the", "U.S.S.R.", ".", "Why",
       "not", "?"
     }
   };
   // Expected tokens from the strictTreebank3 factory.
   final String[][] expectedStrict = {
     {
       "5-1/4", "plus", "2", "3/16", "=", "7", "7/16", "in", "the", "U.S.S.R", ".", "Why", "not",
       "?"
     }
   };

   final TokenizerFactory<CoreLabel> normalFactory = PTBTokenizer.coreLabelFactory();
   final TokenizerFactory<CoreLabel> strictFactory =
       PTBTokenizer.coreLabelFactory("strictTreebank3");

   runOnTwoArrays(normalFactory, inputs, expectedNormal);
   runOnTwoArrays(strictFactory, inputs, expectedStrict);
 }
예제 #3
0
 /**
  * Tokenizes {@code "it's"} once bare and once surrounded by single spaces, and asserts that the
  * two token lists are equal.
  */
 @Test
 public void testJacobEisensteinApostropheCase() {
   // Input without surrounding whitespace.
   PTBTokenizer<Word> bareTokenizer = PTBTokenizer.newPTBTokenizer(new StringReader("it's"));
   List<Word> bareTokens = bareTokenizer.tokenize();

   // Same text padded with one space on each side.
   PTBTokenizer<Word> paddedTokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(" it's "));
   List<Word> paddedTokens = paddedTokenizer.tokenize();

   assertEquals(bareTokens, paddedTokens);
 }
예제 #4
0
 /**
  * Checks that {@code PTBTokenizer.ptb2Text} maps every entry of {@code untokInputs} to the entry
  * at the same index of {@code untokOutputs}.
  */
 @Test
 public void testUntok() {
   // Use a JUnit assertion rather than the Java `assert` keyword: the keyword is skipped when
   // the JVM runs without -ea, and the loop below could then index past untokOutputs.
   assertEquals(
       "untokInputs and untokOutputs must have the same length",
       untokOutputs.length,
       untokInputs.length);
   for (int i = 0; i < untokInputs.length; i++) {
     assertEquals(
         "untok gave the wrong result", untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
   }
 }
예제 #5
0
  /**
   * Tokenizes a whitespace-padded sentence and checks the per-token before/after whitespace,
   * character offsets and original text, then verifies that concatenating the first token's
   * "before" string with every token's original text and "after" string reproduces the input
   * exactly.
   *
   * <p>NOTE(review): the {@code false, true} arguments to {@code newPTBTokenizer} presumably
   * select the invertible mode this test relies on; confirm against that method's signature.
   */
  @Test
  public void testInvertible() {
    final String text = "  This     is     a      colourful sentence.    ";
    final StringReader reader = new StringReader(text);
    PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(reader, false, true);
    List<CoreLabel> tokens = tokenizer.tokenize();
    assertEquals(6, tokens.size());

    // First token: surrounding whitespace, offsets and original text.
    final CoreLabel first = tokens.get(0);
    assertEquals("  ", first.get(CoreAnnotations.BeforeAnnotation.class));
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    int beginOffset = (int) first.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    assertEquals("Wrong begin char offset", 2, beginOffset);
    int endOffset = (int) first.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    assertEquals("Wrong end char offset", 6, endOffset);
    assertEquals("This", first.get(CoreAnnotations.OriginalTextAnnotation.class));

    // note: after(x) and before(x+1) are the same
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));

    // americanize is now off by default
    final CoreLabel colourful = tokens.get(3);
    assertEquals("colourful", colourful.get(CoreAnnotations.TextAnnotation.class));
    assertEquals("colourful", colourful.get(CoreAnnotations.OriginalTextAnnotation.class));

    // The last two tokens have no whitespace between them; trailing padding follows the last.
    final CoreLabel last = tokens.get(5);
    assertEquals("", tokens.get(4).after());
    assertEquals("", last.before());
    assertEquals("    ", last.get(CoreAnnotations.AfterAnnotation.class));

    // Rebuild the input: leading whitespace, then each token's original text and "after".
    StringBuilder rebuilt = new StringBuilder();
    rebuilt.append(first.get(CoreAnnotations.BeforeAnnotation.class));
    for (int k = 0; k < tokens.size(); k++) {
      CoreLabel token = tokens.get(k);
      rebuilt.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
      String trailing = token.get(CoreAnnotations.AfterAnnotation.class);
      if (trailing != null) {
        rebuilt.append(trailing);
      }
    }
    assertEquals(text, rebuilt.toString());

    // Each token's "after" must equal the following token's "before".
    for (int next = 1; next < tokens.size(); next++) {
      assertEquals(
          tokens.get(next - 1).get(CoreAnnotations.AfterAnnotation.class),
          tokens.get(next).get(CoreAnnotations.BeforeAnnotation.class));
    }
  }
예제 #6
0
 /**
  * Checks the default CoreLabel tokenizer factory on {@code sgmlInputs} against {@code sgmlGold}
  * via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerSGML() {
   final TokenizerFactory<CoreLabel> coreLabelFactory = PTBTokenizer.coreLabelFactory();
   runOnTwoArrays(coreLabelFactory, sgmlInputs, sgmlGold);
 }
예제 #7
0
 /**
  * Checks the {@code Word} tokenizer factory on {@code ptbInputs} against {@code ptbGold} via
  * {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerWord() {
   final TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
   runOnTwoArrays(wordFactory, ptbInputs, ptbGold);
 }
예제 #8
0
 /**
  * Checks the {@code Word} tokenizer factory on {@code mtInputs} against {@code mtGold} via
  * {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerMT() {
   final TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
   runOnTwoArrays(wordFactory, mtInputs, mtGold);
 }
예제 #9
0
 /**
  * Checks a CoreLabel tokenizer factory built with the {@code splitHyphenated=true} option on
  * {@code ptbInputs} against {@code ptbGoldSplitHyphenated} via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerTokenizeSplitHyphens() {
   final String options = "splitHyphenated=true";
   final TokenizerFactory<CoreLabel> splitHyphenFactory = PTBTokenizer.coreLabelFactory(options);
   runOnTwoArrays(splitHyphenFactory, ptbInputs, ptbGoldSplitHyphenated);
 }
예제 #10
0
 /**
  * Checks a CoreLabel tokenizer factory built with the {@code tokenizePerLine=true} option on
  * {@code sgmlInputs} against {@code sgmlPerLineGold} via {@code runOnTwoArrays}.
  */
 @Test
 public void testPTBTokenizerTokenizePerLineSGML() {
   final String options = "tokenizePerLine=true";
   final TokenizerFactory<CoreLabel> perLineFactory = PTBTokenizer.coreLabelFactory(options);
   runOnTwoArrays(perLineFactory, sgmlInputs, sgmlPerLineGold);
 }