Beispiel #1
0
 /**
  * Tests tokenization of corporate-name inputs over a 2x2 design:
  * {strictTreebank3, regular options} x {no following context, following context}.
  * Inputs and expected tokens come from the class-level corpInputs / corpGold fixtures.
  */
 @Test
 public void testCorp() {
   for (int sent = 0; sent < 4; sent++) {
     PTBTokenizer<CoreLabel> ptbTokenizer =
         new PTBTokenizer<>(
             new StringReader(corpInputs[sent / 2]),
             new CoreLabelTokenFactory(),
             (sent % 2 == 0) ? "strictTreebank3" : "");
     int i = 0;
     while (ptbTokenizer.hasNext()) {
       CoreLabel w = ptbTokenizer.next();
       try {
         assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
       } catch (ArrayIndexOutOfBoundsException ignored) {
         // More tokens were produced than gold expects; the length assertion below will fail.
       }
       i++;
     }
     if (i != corpGold[sent % 2].length) {
       // Print a gold-vs-guess diagnostic before the token-count assertion fails.
       System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
       List<CoreLabel> tokens =
           new PTBTokenizer<>(
                   new StringReader(corpInputs[sent / 2]),
                   new CoreLabelTokenFactory(),
                   (sent % 2 == 0) ? "strictTreebank3" : "")
               .tokenize();
       System.out.println("Guess: " + SentenceUtils.listToString(tokens));
       System.out.flush();
     }
     // assertEquals takes (message, expected, actual): the gold length is the expected value.
     assertEquals("PTBTokenizer num tokens problem", corpGold[sent % 2].length, i);
   }
 }
Beispiel #2
0
 /** Checks fraction tokenization under both the default and the strictTreebank3 option sets. */
 @Test
 public void testFractions() {
   String[] inputs = {"5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?"};
   String[][] goldDefault = {
     {
       "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16", "in", "the", "U.S.S.R.", ".",
       "Why", "not", "?"
     }
   };
   String[][] goldStrict = {
     {
       "5-1/4", "plus", "2", "3/16", "=", "7", "7/16", "in", "the", "U.S.S.R", ".", "Why",
       "not", "?"
     }
   };
   runOnTwoArrays(PTBTokenizer.coreLabelFactory(), inputs, goldDefault);
   runOnTwoArrays(PTBTokenizer.coreLabelFactory("strictTreebank3"), inputs, goldStrict);
 }
Beispiel #3
0
 /**
  * Verifies that surrounding whitespace does not change apostrophe tokenization:
  * "it's" must tokenize identically with and without leading/trailing spaces.
  */
 @Test
 public void testJacobEisensteinApostropheCase() {
   PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader("it's"));
   List<Word> tokens = tokenizer.tokenize();
   tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(" it's "));
   List<Word> tokensPadded = tokenizer.tokenize();
   assertEquals(tokens, tokensPadded);
 }
Beispiel #4
0
 /** Round-trips PTB-tokenized text back to plain text via {@code PTBTokenizer.ptb2Text}. */
 @Test
 public void testUntok() {
   // Use a JUnit assertion, not the Java assert keyword: the latter is a silent
   // no-op unless the JVM is started with -ea.
   assertEquals(
       "untok test data arrays must be parallel", untokInputs.length, untokOutputs.length);
   for (int i = 0; i < untokInputs.length; i++) {
     assertEquals(
         "untok gave the wrong result", untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
   }
 }
Beispiel #5
0
  /**
   * Checks that tokenizing with invertible=true preserves enough information
   * (before/after/original-text annotations and character offsets) to reconstruct
   * the original string exactly, including all runs of whitespace.
   */
  @Test
  public void testInvertible() {
    String text = "  This     is     a      colourful sentence.    ";
    List<CoreLabel> tokens =
        PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true).tokenize();
    assertEquals(6, tokens.size());
    CoreLabel first = tokens.get(0);
    assertEquals("  ", first.get(CoreAnnotations.BeforeAnnotation.class));
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    assertEquals(
        "Wrong begin char offset",
        2,
        (int) first.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    assertEquals(
        "Wrong end char offset",
        6,
        (int) first.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    assertEquals("This", first.get(CoreAnnotations.OriginalTextAnnotation.class));
    // note: after(x) and before(x+1) are the same
    assertEquals("     ", first.get(CoreAnnotations.AfterAnnotation.class));
    assertEquals("     ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
    // americanize is now off by default
    assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class));
    assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class));
    assertEquals("", tokens.get(4).after());
    assertEquals("", tokens.get(5).before());
    assertEquals("    ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class));

    // Rebuild the input as before(0) + sum over tokens of originalText + after.
    StringBuilder rebuilt = new StringBuilder();
    rebuilt.append(first.get(CoreAnnotations.BeforeAnnotation.class));
    for (CoreLabel token : tokens) {
      rebuilt.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
      String after = token.get(CoreAnnotations.AfterAnnotation.class);
      if (after != null) {
        rebuilt.append(after);
      }
    }
    assertEquals(text, rebuilt.toString());

    // after(i - 1) must equal before(i) for every adjacent pair.
    for (int i = 1; i < tokens.size(); i++) {
      assertEquals(
          tokens.get(i - 1).get(CoreAnnotations.AfterAnnotation.class),
          tokens.get(i).get(CoreAnnotations.BeforeAnnotation.class));
    }
  }
Beispiel #6
0
 /**
  * Test program for demonstrating the Stemmer. It reads text from a list of files, stems each
  * word, and writes the result to standard output. Note that the word stemmed is expected to be
  * in lower case: forcing lower case must be done outside the Stemmer class.
  *
  * <p>Usage: {@code Stemmer -file file-name} or {@code Stemmer word word ...}
  *
  * @param args either {@code -file} followed by a file to tokenize and stem, or words to stem
  * @throws IOException if the input file cannot be read
  */
 public static void main(String[] args) throws IOException {
   Stemmer s = new Stemmer();
   if (args.length == 0) {
     // Avoid an ArrayIndexOutOfBoundsException on args[0] below.
     System.err.println("Usage: Stemmer (-file file-name | word ...)");
     return;
   }
   if (args[0].equals("-file")) {
     // Tokenize the file with the PTB tokenizer and stem each resulting word.
     // try-with-resources ensures the stream is closed even if stemming fails.
     try (InputStreamReader reader =
         new InputStreamReader(new FileInputStream(args[1]), "utf-8")) {
       Iterator<Word> it = PTBTokenizer.newPTBTokenizer(reader);
       while (it.hasNext()) {
         Word token = it.next();
         System.out.print(s.stem(token.word()));
         System.out.print(' ');
       }
     }
   } else {
     // Treat each command-line argument as a word to stem.
     for (String arg : args) {
       System.out.print(s.stem(arg));
       System.out.print(' ');
     }
   }
   System.out.println();
 }
  public static void main(String[] args) // start of the main method
      {
    System.out.println("\n\n\nSTART\n\n\n"); // print START
    try // device to handle potential errors
    {
      // open file whose path is passed
      // as the first argument of the main method:
      FileInputStream fis = new FileInputStream(args[0]);
      DataInputStream dis = new DataInputStream(fis);
      BufferedReader br = new BufferedReader(new InputStreamReader(dis));

      // prepare Parser, Tokenizer and Tree printer:
      LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
      TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
      TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

      String sentence; // initialization
      // for each line of the file
      // retrieve it as a string called 'sentence':
      while ((sentence = br.readLine()) != null) {
        // print sentence:
        System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);
        // put tokens in a list:
        List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
        lp.parse(tokens); // parse the tokens
        Tree t = lp.getBestParse(); // get the best parse tree
        System.out.println("\nPROCESSED:\n\n");
        tp.printTree(t); // print tree
      }
      dis.close(); // close input file
    } catch (Exception e) // catch error if any
    {
      System.err.println("ERROR: " + e.getMessage()); // print error message
    }
    System.out.println("\n\n\nTHE END\n\n\n"); // print THE END
  } // end of the main method
Beispiel #8
0
 /** Runs the PTB tokenizer over the SGML inputs and compares against the SGML gold tokens. */
 @Test
 public void testPTBTokenizerSGML() {
   runOnTwoArrays(PTBTokenizer.coreLabelFactory(), sgmlInputs, sgmlGold);
 }
Beispiel #9
0
 /** Runs the Word-based PTB tokenizer over the standard PTB inputs and gold tokens. */
 @Test
 public void testPTBTokenizerWord() {
   runOnTwoArrays(PTBTokenizer.factory(), ptbInputs, ptbGold);
 }
Beispiel #10
0
 /** Runs the Word-based PTB tokenizer over the MT inputs and compares against the MT gold. */
 @Test
 public void testPTBTokenizerMT() {
   runOnTwoArrays(PTBTokenizer.factory(), mtInputs, mtGold);
 }
Beispiel #11
0
 /** Checks the splitHyphenated=true option against the hyphen-split gold tokens. */
 @Test
 public void testPTBTokenizerTokenizeSplitHyphens() {
   runOnTwoArrays(
       PTBTokenizer.coreLabelFactory("splitHyphenated=true"), ptbInputs, ptbGoldSplitHyphenated);
 }
Beispiel #12
0
 /** Checks the tokenizePerLine=true option over the SGML inputs against per-line gold tokens. */
 @Test
 public void testPTBTokenizerTokenizePerLineSGML() {
   runOnTwoArrays(
       PTBTokenizer.coreLabelFactory("tokenizePerLine=true"), sgmlInputs, sgmlPerLineGold);
 }