@Override
public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
  if (gold == null || guess == null) {
    System.err.printf("%s: Cannot compare against a null gold or guess tree!\n",
        this.getClass().getName());
    return;
  } else if (guess.yield().size() != gold.yield().size()) {
    System.err.println("Warning: yield differs:");
    System.err.println("Guess: " + Sentence.listToString(guess.yield()));
    System.err.println("Gold:  " + Sentence.listToString(gold.yield()));
  }
  super.evaluate(guess, gold, pw);
}
public static ArrayList<ArrayList<TaggedWord>> tag(String sentencedoc) {
  try {
    MaxentTagger tagger = new MaxentTagger("data/wsj-0-18-bidirectional-distsim.tagger");
    List<List<HasWord>> sentences =
        MaxentTagger.tokenizeText(new BufferedReader(new FileReader(sentencedoc)));
    ArrayList<ArrayList<TaggedWord>> taggedsentences =
        new ArrayList<ArrayList<TaggedWord>>(sentences.size());
    for (List<HasWord> sentence : sentences) {
      ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence);
      taggedsentences.add(tSentence);
      System.out.println(Sentence.listToString(tSentence, false));
    }
    return taggedsentences;
  } catch (IOException e) {
    e.printStackTrace();
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  }
  return null;
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r =
      new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }

  // Print the adjectives in one more sentence. This shows how to get at words and tags
  // in a tagged sentence.
  List<HasWord> sent = Sentence.toWordList(
      "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
public static void handletaggedsentences(ArrayList<ArrayList<TaggedWord>> taggedsentences) {
  // rule 7: if a noun follows a plural demonstrative adjective (these or those)
  for (ArrayList<TaggedWord> sentence : taggedsentences) {
    for (int i = 0; i < sentence.size(); i++) {
      // adv adj
      if (i > 0) {
        // often/RB, become/VBN, robbery/NN, ,/,, sexual/JJ, harassment/NN, and/CC, other/JJ,
        // violence/NN, behavior/NN, of/IN, victim/NN
        if (sentence.get(i - 1).tag().equals("IN")
            && sentence.get(i - 1).word().equals("of")
            && sentence.get(i).tag().equals("NN")) {
          String word1 = sentence.get(i - 1).word();
          String word2 = sentence.get(i).word();
          int a = 1;
        }
      }

      // // rule 1: DT NN DT --> DT NN
      // if (i < sentence.size() - 2) {
      //   if (sentence.get(i).tag().equals("DT") && sentence.get(i + 1).tag().equals("NN")
      //       && sentence.get(i + 2).tag().equals("DT")) {
      //     sentence.remove(i + 2);
      //   }
      // }
      //
      // // rule 2: JJ NN --> NN JJ
      // if (i > 0) {
      //   if (sentence.get(i).tag().equals("JJ") && sentence.get(i - 1).tag().equals("NN")) {
      //     TaggedWord temptagged2 = sentence.get(i);
      //     sentence.remove(i);
      //     sentence.add(i - 1, temptagged2);
      //   }
      // }
    }
  }

  // if any rules need to be run after the first set, go here:
  for (ArrayList<TaggedWord> sentence : taggedsentences) {
    for (int i = 0; i < sentence.size(); i++) {}
    System.out.println(Sentence.listToString(sentence, false));
  }
}
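Taken together, tag and handletaggedsentences form a small two-stage pipeline: the first reads a file, POS-tags each sentence with MaxentTagger, and returns one tagged word list per sentence; the second walks those lists and applies the hand-written rewrite rules. A minimal driver sketch is below; it assumes both methods live in the same class, and the default input path data/input.txt is a placeholder, not something taken from the original code.

// Hypothetical driver; assumes tag() and handletaggedsentences() are declared in this class.
public static void main(String[] args) {
  // Placeholder input path; tag() hard-codes its tagger model path internally.
  String inputFile = args.length > 0 ? args[0] : "data/input.txt";

  ArrayList<ArrayList<TaggedWord>> tagged = tag(inputFile);
  if (tagged != null) { // tag() returns null when the model or input file cannot be read
    handletaggedsentences(tagged);
  }
}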
/** TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */
@Override
public void restoreOriginalWords(Tree tree) {
  if (originalSentence == null || tree == null) {
    return;
  }
  List<Tree> leaves = tree.getLeaves();
  if (leaves.size() != originalSentence.size()) {
    throw new IllegalStateException("originalWords and sentence of different sizes: "
        + originalSentence.size() + " vs. " + leaves.size()
        + "\n Orig: " + Sentence.listToString(originalSentence)
        + "\n Pars: " + Sentence.listToString(leaves));
  }
  // TODO: get rid of this cast
  Iterator<? extends Label> wordsIterator =
      (Iterator<? extends Label>) originalSentence.iterator();
  for (Tree leaf : leaves) {
    leaf.setLabel(wordsIterator.next());
  }
}
/** Reads an annotation from the given filename using the requested input. */
public static List<Annotation> getAnnotations(
    StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
    case TEXT: {
      String text = IOUtils.slurpFileNoExceptions(filename);
      Annotation annotation = new Annotation(text);
      tokenizer.annotate(annotation);
      List<Annotation> annotations = Generics.newArrayList();
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Annotation nextAnnotation =
            new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
        nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class,
            Collections.singletonList(sentence));
        annotations.add(nextAnnotation);
      }
      return annotations;
    }
    case TREES: {
      List<Tree> trees;
      if (filterUnknown) {
        trees = SentimentUtils.readTreesWithGoldLabels(filename);
        trees = SentimentUtils.filterUnknownRoots(trees);
      } else {
        trees = Generics.newArrayList();
        MemoryTreebank treebank = new MemoryTreebank("utf-8");
        treebank.loadPath(filename, null);
        for (Tree tree : treebank) {
          trees.add(tree);
        }
      }
      List<Annotation> annotations = Generics.newArrayList();
      for (Tree tree : trees) {
        CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        List<CoreMap> sentences = Collections.singletonList(sentence);
        Annotation annotation = new Annotation("");
        annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
        annotations.add(annotation);
      }
      return annotations;
    }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
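For the TEXT branch above, the tokenizer argument only needs to tokenize and sentence-split, so a tokenize/ssplit-only StanfordCoreNLP pipeline is enough. The following is a hedged sketch of a caller, assumed to sit in the same class as getAnnotations and to use the Input enum declared alongside it; the file name sample.txt is a placeholder.

// Assumed imports for this sketch (at the top of the file):
// import java.util.List;
// import java.util.Properties;
// import edu.stanford.nlp.pipeline.Annotation;
// import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public static void demoTextInput() {
  // A tokenize/ssplit-only pipeline; no tagging or parsing is needed just to split sentences.
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit");
  StanfordCoreNLP tokenizer = new StanfordCoreNLP(props);

  // "sample.txt" is a placeholder path; each returned Annotation wraps a single sentence.
  List<Annotation> perSentence = getAnnotations(tokenizer, Input.TEXT, "sample.txt", false);
  System.out.println("Read " + perSentence.size() + " sentence annotations");
}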
public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
  String parseTrees = "";
  // Could we just split on newlines, since the paragraph is already sentence-split?
  Reader reader = new StringReader(paragraph);
  DocumentPreprocessor dp = new DocumentPreprocessor(reader);
  List<String> sentenceList = new ArrayList<String>();
  for (List<HasWord> sentence : dp) {
    String sentenceString = Sentence.listToString(sentence);
    sentenceList.add(sentenceString);
  }
  for (String sentence : sentenceList) {
    // System.out.println(sentence);
    parseTrees += createParseTree(sentence);
  }
  writeToFile(parseTrees, "trees.txt");
}
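The snippet above calls two helpers, createParseTree and writeToFile, that are not shown. One plausible sketch of createParseTree using LexicalizedParser follows; the englishPCFG model path, the static parser field, and the Penn-Treebank string output are all assumptions rather than part of the original code.

// Assumed imports for this sketch:
// import java.io.StringReader;
// import java.util.List;
// import edu.stanford.nlp.ling.CoreLabel;
// import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
// import edu.stanford.nlp.process.CoreLabelTokenFactory;
// import edu.stanford.nlp.process.PTBTokenizer;
// import edu.stanford.nlp.process.Tokenizer;
// import edu.stanford.nlp.trees.Tree;

// Hypothetical parser instance; the model path is an assumption.
private static final LexicalizedParser PARSER =
    LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

public static String createParseTree(String sentence) {
  // Tokenize the sentence, parse it, and return its Penn-Treebank-style bracketing.
  Tokenizer<CoreLabel> tokenizer = PTBTokenizer.factory(new CoreLabelTokenFactory(), "")
      .getTokenizer(new StringReader(sentence));
  List<CoreLabel> words = tokenizer.tokenize();
  Tree tree = PARSER.apply(words);
  return tree.pennString() + "\n";
}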
/**
 * for testing -- CURRENTLY BROKEN!!!
 *
 * @param args treebankPath trainNums testNums
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;

  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      System.err.println("Writing parser in serialized format to file " + filename + ' ');
      System.err.flush();
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(lp);
      out.close();
      System.err.println("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    lp = LexicalizedParser.loadModel(args[1], op);
  }

  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
  WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
  WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
  EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
  // System.out.println("Preterminals:" + preterminals);
  System.out.println("Testing...");
  for (Tree gold : testTreebank) {
    Tree tree;
    try {
      tree = lp.parseTree(gold.yieldHasWord());
      if (tree == null) {
        System.out.println("Failed to parse " + gold.yieldHasWord());
        continue;
      }
    } catch (Exception e) {
      e.printStackTrace();
      continue;
    }
    gold = gold.firstChild();
    pw.println(Sentence.listToString(gold.preTerminalYield()));
    pw.println(Sentence.listToString(gold.yield()));
    gold.pennPrint(pw);
    pw.println(tree.preTerminalYield());
    pw.println(tree.yield());
    tree.pennPrint(pw);
    // Collection allBrackets = WordCatConstituent.allBrackets(tree);
    // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
    // eval.eval(allBrackets, goldBrackets);
    eval.displayLast();
  }
  System.out.println();
  System.out.println();
  eval.display();
}