@Test public void testCorp() { // We test a 2x2 design: {strict, regular} x {no following context, following context} for (int sent = 0; sent < 4; sent++) { PTBTokenizer<CoreLabel> ptbTokenizer = new PTBTokenizer<>( new StringReader(corpInputs[sent / 2]), new CoreLabelTokenFactory(), (sent % 2 == 0) ? "strictTreebank3" : ""); int i = 0; while (ptbTokenizer.hasNext()) { CoreLabel w = ptbTokenizer.next(); try { assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word()); } catch (ArrayIndexOutOfBoundsException aioobe) { // the assertion below outside the loop will fail } i++; } if (i != corpGold[sent % 2].length) { System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2])); List<CoreLabel> tokens = new PTBTokenizer<>( new StringReader(corpInputs[sent / 2]), new CoreLabelTokenFactory(), (sent % 2 == 0) ? "strictTreebank3" : "") .tokenize(); System.out.println("Guess: " + SentenceUtils.listToString(tokens)); System.out.flush(); } assertEquals("PTBTokenizer num tokens problem", i, corpGold[sent % 2].length); } }
@Test
public void testFractions() {
  // Fraction handling differs by mode: the default tokenizer keeps mixed
  // numbers glued with a non-breaking space ("2\u00A03/16"), while
  // strictTreebank3 splits the whole-number part from the fraction and also
  // separates the final period of "U.S.S.R.".
  String[] input = {"5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"};
  String[][] goldNormal = {
    {
      "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16", "in", "the", "U.S.S.R.", ".",
      "Why", "not", "?"
    }
  };
  String[][] goldStrict = {
    {
      "5-1/4", "plus", "2", "3/16", "=", "7", "7/16", "in", "the", "U.S.S.R", ".",
      "Why", "not", "?"
    }
  };
  runOnTwoArrays(PTBTokenizer.coreLabelFactory(), input, goldNormal);
  runOnTwoArrays(PTBTokenizer.coreLabelFactory("strictTreebank3"), input, goldStrict);
}
@Test public void testJacobEisensteinApostropheCase() { StringReader reader = new StringReader("it's"); PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens = tokenizer.tokenize(); // for (Word word : stemmedTokens) System.out.print (word+" "); reader = new StringReader(" it's "); tokenizer = PTBTokenizer.newPTBTokenizer(reader); List<Word> stemmedTokens2 = tokenizer.tokenize(); // System.out.println (); // for (Word word : stemmedTokens2) System.out.print (word+" "); // System.out.println(); assertEquals(stemmedTokens, stemmedTokens2); }
@Test
public void testUntok() {
  // Use a JUnit assertion for the fixture sanity check rather than the Java
  // `assert` keyword, which is silently skipped unless the JVM runs with -ea.
  assertEquals("untok test fixtures are mismatched", untokInputs.length, untokOutputs.length);
  // ptb2Text should invert tokenization: each tokenized input maps back to
  // the expected plain-text form.
  for (int i = 0; i < untokInputs.length; i++) {
    assertEquals(
        "untok gave the wrong result", untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
  }
}
// Verifies that invertible tokenization (newPTBTokenizer(reader, false, true))
// records enough Before/After/OriginalText/offset annotations to reconstruct
// the original string exactly, including leading/trailing/interior whitespace.
@Test
public void testInvertible() {
  String text = "  This     is     a colourful sentence.    ";
  PTBTokenizer<CoreLabel> tokenizer =
      PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true);
  List<CoreLabel> tokens = tokenizer.tokenize();
  assertEquals(6, tokens.size());
  // First token: preceded by the two leading spaces, followed by the run of
  // spaces before "is"; char offsets are into the original string.
  assertEquals("  ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  assertEquals("     ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals(
      "Wrong begin char offset",
      2,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals(
      "Wrong end char offset",
      6,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("This", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class));
  // note: after(x) and before(x+1) are the same
  assertEquals("     ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
  // americanize is now off by default
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class));
  // "sentence" and "." are adjacent: no whitespace between tokens 4 and 5.
  assertEquals("", tokens.get(4).after());
  assertEquals("", tokens.get(5).before());
  assertEquals("    ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class));
  // Round-trip: Before(first) + sum of OriginalText + After must rebuild the
  // input string byte-for-byte.
  StringBuilder result = new StringBuilder();
  result.append(tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  for (CoreLabel token : tokens) {
    result.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
    String after = token.get(CoreAnnotations.AfterAnnotation.class);
    if (after != null) result.append(after);
  }
  assertEquals(text, result.toString());
  // Invariant: After of token i equals Before of token i+1 for every pair.
  for (int i = 0; i < tokens.size() - 1; ++i) {
    assertEquals(
        tokens.get(i).get(CoreAnnotations.AfterAnnotation.class),
        tokens.get(i + 1).get(CoreAnnotations.BeforeAnnotation.class));
  }
}
@Test
public void testPTBTokenizerSGML() {
  // SGML/XML markup inputs tokenized with the default CoreLabel factory.
  runOnTwoArrays(PTBTokenizer.coreLabelFactory(), sgmlInputs, sgmlGold);
}
@Test
public void testPTBTokenizerWord() {
  // Exercise the Word-based (non-CoreLabel) factory on the standard PTB fixtures.
  TokenizerFactory<Word> wordFactory = PTBTokenizer.factory();
  runOnTwoArrays(wordFactory, ptbInputs, ptbGold);
}
@Test
public void testPTBTokenizerMT() {
  // MT-style inputs through the default Word factory.
  runOnTwoArrays(PTBTokenizer.factory(), mtInputs, mtGold);
}
@Test
public void testPTBTokenizerTokenizeSplitHyphens() {
  // With splitHyphenated=true, hyphenated words are split into separate tokens,
  // so the PTB inputs map to the hyphen-split gold fixtures.
  runOnTwoArrays(
      PTBTokenizer.coreLabelFactory("splitHyphenated=true"), ptbInputs, ptbGoldSplitHyphenated);
}
@Test
public void testPTBTokenizerTokenizePerLineSGML() {
  // tokenizePerLine=true tokenizes each input line independently, giving the
  // per-line SGML gold fixtures.
  runOnTwoArrays(
      PTBTokenizer.coreLabelFactory("tokenizePerLine=true"), sgmlInputs, sgmlPerLineGold);
}