@Test
public void testCorp() {
  // We test a 2x2 design: {strict, regular} x {no following context, following context}
  for (int sent = 0; sent < 4; sent++) {
    PTBTokenizer<CoreLabel> ptbTokenizer = new PTBTokenizer<>(
        new StringReader(corpInputs[sent / 2]),
        new CoreLabelTokenFactory(),
        (sent % 2 == 0) ? "strictTreebank3" : "");
    int i = 0;
    while (ptbTokenizer.hasNext()) {
      CoreLabel w = ptbTokenizer.next();
      try {
        assertEquals("PTBTokenizer problem", corpGold[sent % 2][i], w.word());
      } catch (ArrayIndexOutOfBoundsException aioobe) {
        // ignore: too many tokens; the token-count assertion below the loop will fail
      }
      i++;
    }
    if (i != corpGold[sent % 2].length) {
      System.out.println("Gold: " + Arrays.toString(corpGold[sent % 2]));
      List<CoreLabel> tokens = new PTBTokenizer<>(
          new StringReader(corpInputs[sent / 2]),
          new CoreLabelTokenFactory(),
          (sent % 2 == 0) ? "strictTreebank3" : "").tokenize();
      System.out.println("Guess: " + SentenceUtils.listToString(tokens));
      System.out.flush();
    }
    assertEquals("PTBTokenizer num tokens problem", corpGold[sent % 2].length, i);
  }
}
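// For reference, a minimal standalone sketch of the constructor-with-options
// pattern the test above exercises. The sample text and class name are
// illustrative only, not taken from the test data; exact output depends on
// the CoreNLP version.
import java.io.StringReader;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;

public class StrictVsRegularSketch {
  public static void main(String[] args) {
    String text = "The U.S. economy grew 3-1/2 percent.";
    for (String options : new String[] { "", "strictTreebank3" }) {
      PTBTokenizer<CoreLabel> tok = new PTBTokenizer<>(
          new StringReader(text), new CoreLabelTokenFactory(), options);
      StringBuilder line = new StringBuilder(options.isEmpty() ? "regular:" : "strict: ");
      while (tok.hasNext()) {
        line.append(' ').append(tok.next().word());
      }
      System.out.println(line);
    }
  }
}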
@Test
public void testFractions() {
  String[] sample = { "5-1/4 plus 2 3/16 = 7\u00A07/16 in the U.S.S.R. Why not?" };
  String[][] tokenizedNormal = { { "5-1/4", "plus", "2\u00A03/16", "=", "7\u00A07/16",
      "in", "the", "U.S.S.R.", ".", "Why", "not", "?" } };
  String[][] tokenizedStrict = { { "5-1/4", "plus", "2", "3/16", "=", "7", "7/16",
      "in", "the", "U.S.S.R", ".", "Why", "not", "?" } };
  TokenizerFactory<CoreLabel> tokFactoryNormal = PTBTokenizer.coreLabelFactory();
  TokenizerFactory<CoreLabel> tokFactoryStrict = PTBTokenizer.coreLabelFactory("strictTreebank3");
  runOnTwoArrays(tokFactoryNormal, sample, tokenizedNormal);
  runOnTwoArrays(tokFactoryStrict, sample, tokenizedStrict);
}
@Test
public void testJacobEisensteinApostropheCase() {
  StringReader reader = new StringReader("it's");
  PTBTokenizer<Word> tokenizer = PTBTokenizer.newPTBTokenizer(reader);
  List<Word> stemmedTokens = tokenizer.tokenize();
  reader = new StringReader(" it's ");
  tokenizer = PTBTokenizer.newPTBTokenizer(reader);
  List<Word> stemmedTokens2 = tokenizer.tokenize();
  // Surrounding whitespace should not change how the apostrophe is tokenized.
  assertEquals(stemmedTokens, stemmedTokens2);
}
@Test
public void testUntok() {
  // Use a JUnit assertion rather than a plain Java assert, which is disabled by default.
  assertEquals("untok test data arrays are mismatched", untokInputs.length, untokOutputs.length);
  for (int i = 0; i < untokInputs.length; i++) {
    assertEquals("untok gave the wrong result",
        untokOutputs[i], PTBTokenizer.ptb2Text(untokInputs[i]));
  }
}
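// A minimal sketch of the inverse operation the test above checks:
// PTBTokenizer.ptb2Text() joins space-separated PTB tokens back into running
// text. The sample string is illustrative, not drawn from untokInputs.
import edu.stanford.nlp.process.PTBTokenizer;

public class UntokSketch {
  public static void main(String[] args) {
    System.out.println(PTBTokenizer.ptb2Text("I ca n't believe it 's not butter !"));
    // expected to print something close to: I can't believe it's not butter!
  }
}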
@Test
public void testInvertible() {
  String text = "  This is a colourful sentence. ";
  PTBTokenizer<CoreLabel> tokenizer =
      PTBTokenizer.newPTBTokenizer(new StringReader(text), false, true);
  List<CoreLabel> tokens = tokenizer.tokenize();
  assertEquals(6, tokens.size());
  assertEquals("  ", tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals("Wrong begin char offset", 2,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
  assertEquals("Wrong end char offset", 6,
      (int) tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
  assertEquals("This", tokens.get(0).get(CoreAnnotations.OriginalTextAnnotation.class));
  // note: after(x) and before(x+1) are the same
  assertEquals(" ", tokens.get(0).get(CoreAnnotations.AfterAnnotation.class));
  assertEquals(" ", tokens.get(1).get(CoreAnnotations.BeforeAnnotation.class));
  // americanize is now off by default
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.TextAnnotation.class));
  assertEquals("colourful", tokens.get(3).get(CoreAnnotations.OriginalTextAnnotation.class));
  assertEquals("", tokens.get(4).after());
  assertEquals("", tokens.get(5).before());
  assertEquals(" ", tokens.get(5).get(CoreAnnotations.AfterAnnotation.class));
  // Reassemble the original text from the Before/Original/After annotations.
  StringBuilder result = new StringBuilder();
  result.append(tokens.get(0).get(CoreAnnotations.BeforeAnnotation.class));
  for (CoreLabel token : tokens) {
    result.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
    String after = token.get(CoreAnnotations.AfterAnnotation.class);
    if (after != null) {
      result.append(after);
    }
  }
  assertEquals(text, result.toString());
  for (int i = 0; i < tokens.size() - 1; ++i) {
    assertEquals(tokens.get(i).get(CoreAnnotations.AfterAnnotation.class),
        tokens.get(i + 1).get(CoreAnnotations.BeforeAnnotation.class));
  }
}
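// The Before/After/OriginalText bookkeeping verified above can be used outside
// a test to reconstruct the input losslessly. A minimal sketch, assuming the
// "invertible=true" factory option (the options-string factory appears in the
// other tests in this file); the sample text is illustrative.
import java.io.StringReader;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class RoundTripSketch {
  public static void main(String[] args) {
    String text = "  Some text,   oddly    spaced. ";
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.coreLabelFactory("invertible=true");
    StringBuilder rebuilt = new StringBuilder();
    boolean first = true;
    for (CoreLabel token : factory.getTokenizer(new StringReader(text)).tokenize()) {
      if (first) {
        // before(token 0) carries any text preceding the first token
        rebuilt.append(token.get(CoreAnnotations.BeforeAnnotation.class));
        first = false;
      }
      rebuilt.append(token.get(CoreAnnotations.OriginalTextAnnotation.class));
      String after = token.get(CoreAnnotations.AfterAnnotation.class);
      if (after != null) {
        rebuilt.append(after);
      }
    }
    System.out.println(rebuilt.toString().equals(text)); // should print true
  }
}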
/**
 * Test program for demonstrating the Stemmer. It reads text from a file (or stems its
 * command-line arguments directly), stems each word, and writes the result to standard
 * output. Note that the word stemmed is expected to be in lower case: forcing lower case
 * must be done outside the Stemmer class.
 *
 * Usage: Stemmer -file file-name | Stemmer word ...
 */
public static void main(String[] args) throws IOException {
  Stemmer s = new Stemmer();
  if (args[0].equals("-file")) {
    Iterator<Word> it = PTBTokenizer.newPTBTokenizer(
        new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
    while (it.hasNext()) {
      Word token = it.next();
      System.out.print(s.stem(token.word()));
      System.out.print(' ');
    }
  } else {
    for (String arg : args) {
      System.out.print(s.stem(arg));
      System.out.print(' ');
    }
  }
  System.out.println();
}
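// A minimal programmatic usage sketch of the same Stemmer class (assumes
// edu.stanford.nlp.process.Stemmer is on the classpath); the printed stem is
// indicative of Porter-style stemming, not a verified output.
Stemmer stemmer = new Stemmer();
System.out.println(stemmer.stem("tokenizing")); // e.g. prints something like "token"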
public static void main(String[] args) {
  System.out.println("\n\n\nSTART\n\n\n");
  try { // handle potential I/O and parsing errors in one place
    // Open the file whose path is passed as the first argument of the main method:
    FileInputStream fis = new FileInputStream(args[0]);
    BufferedReader br = new BufferedReader(new InputStreamReader(fis));
    // Prepare the parser, tokenizer factory, and tree printer:
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory<Word> tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
    String sentence;
    // Read the file line by line, treating each line as one sentence:
    while ((sentence = br.readLine()) != null) {
      System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);
      // Tokenize the sentence and parse the tokens:
      List<Word> tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
      lp.parse(tokens);
      Tree t = lp.getBestParse(); // get the best parse tree
      System.out.println("\nPROCESSED:\n\n");
      tp.printTree(t); // print the tree
    }
    br.close(); // close the input file
  } catch (Exception e) {
    System.err.println("ERROR: " + e.getMessage());
  }
  System.out.println("\n\n\nTHE END\n\n\n");
}
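// The demo above uses the old constructor-based parser API. A sketch of the
// rough equivalent in recent CoreNLP releases, where the parser is loaded
// through a static factory method; the model path is the one bundled in the
// CoreNLP models jar, and the exact calls should be treated as indicative
// rather than a verified drop-in replacement.
import java.io.StringReader;
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.Tree;

public class ModernParserSketch {
  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    List<CoreLabel> tokens = new PTBTokenizer<>(
        new StringReader("This is a sentence."),
        new CoreLabelTokenFactory(), "").tokenize();
    Tree tree = lp.apply(tokens); // returns the best parse directly
    tree.pennPrint();             // print the tree in Penn Treebank format
  }
}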
@Test
public void testPTBTokenizerSGML() {
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.coreLabelFactory();
  runOnTwoArrays(tokFactory, sgmlInputs, sgmlGold);
}
@Test
public void testPTBTokenizerWord() {
  TokenizerFactory<Word> tokFactory = PTBTokenizer.factory();
  runOnTwoArrays(tokFactory, ptbInputs, ptbGold);
}
@Test
public void testPTBTokenizerMT() {
  TokenizerFactory<Word> tokFactory = PTBTokenizer.factory();
  runOnTwoArrays(tokFactory, mtInputs, mtGold);
}
@Test
public void testPTBTokenizerTokenizeSplitHyphens() {
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.coreLabelFactory("splitHyphenated=true");
  runOnTwoArrays(tokFactory, ptbInputs, ptbGoldSplitHyphenated);
}
@Test
public void testPTBTokenizerTokenizePerLineSGML() {
  TokenizerFactory<CoreLabel> tokFactory = PTBTokenizer.coreLabelFactory("tokenizePerLine=true");
  runOnTwoArrays(tokFactory, sgmlInputs, sgmlPerLineGold);
}
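// The runOnTwoArrays helper used throughout these tests is not shown in this
// excerpt. A minimal sketch of what it presumably does: tokenize each input
// string and compare the tokens one by one against the corresponding gold
// array (names and assertion messages here are illustrative).
private static <T extends HasWord> void runOnTwoArrays(
    TokenizerFactory<T> tokFactory, String[] inputs, String[][] gold) {
  assertEquals("Test data arrays are mismatched", inputs.length, gold.length);
  for (int i = 0; i < inputs.length; i++) {
    Tokenizer<T> tokenizer = tokFactory.getTokenizer(new StringReader(inputs[i]));
    int j = 0;
    while (tokenizer.hasNext()) {
      T token = tokenizer.next();
      assertTrue("Unexpected extra token: " + token.word(), j < gold[i].length);
      assertEquals("Wrong token", gold[i][j], token.word());
      j++;
    }
    assertEquals("Too few tokens for input " + i, gold[i].length, j);
  }
}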