private static Sentence wordify(List wList) {
  Sentence s = new Sentence();
  for (Object obj : wList) {
    s.add(new Word(obj.toString()));
  }
  return s;
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
  BufferedReader r =
      new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }

  // Print the adjectives in one more sentence. This shows how to get at
  // words and tags in a tagged sentence.
  List<HasWord> sent =
      Sentence.toWordList(
          "The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
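// A typical invocation of the demo above (the model and input file names are
// illustrative assumptions, not part of the original source):
//
//   java TaggerDemo2 english-left3words-distsim.tagger sample.txt
//
// Each input sentence is printed as "word/tag word/tag ...", followed by the
// adjectives ("slimy", "long", "green") from the hardcoded example sentence.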
private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) {
  // Parse the original sentence.
  String[] sent = toSentence(words);
  List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
  Tree parse = lp.apply(rawWords);
  String oldTree = parse.toString();

  // Substitute the candidate word and parse again.
  words.get(index).setNewValue(newWord);
  sent = toSentence(words);
  rawWords = Sentence.toCoreLabelList(sent);
  parse = lp.apply(rawWords);
  String newTree = parse.toString();

  // Splice the replacement word into the old tree string so the two parses
  // can be compared as strings. Pattern.quote (java.util.regex) guards
  // against original words that contain regex metacharacters.
  oldTree =
      oldTree.replaceAll(
          Pattern.quote(words.get(index).getOrigValue()) + "[)]", newWord + ")");
  System.out.println(oldTree + "\n" + newTree);

  if (oldTree.equals(newTree)) {
    // The substitution preserved the parse; capitalize sentence-initial words.
    if (index == 0) {
      String str = words.get(index).getNewValue();
      String cap = str.substring(0, 1).toUpperCase() + str.substring(1);
      words.get(index).setNewValue(cap);
    }
    return true;
  } else {
    words.get(index).setNewValue(null);
    return false;
  }
}
@Override
public void evaluate(Tree guess, Tree gold, PrintWriter pw) {
  if (gold == null || guess == null) {
    System.err.printf(
        "%s: Cannot compare against a null gold or guess tree!\n", this.getClass().getName());
    return;
  } else if (guess.yield().size() != gold.yield().size()) {
    System.err.println("Warning: yield differs:");
    System.err.println("Guess: " + Sentence.listToString(guess.yield()));
    System.err.println("Gold: " + Sentence.listToString(gold.yield()));
  }
  super.evaluate(guess, gold, pw);
}
public List<? extends HasWord> defaultTestSentence() {
  String[] sent = {
    "H", "MWX", "MTPLC", "LA", "RQ", "M", "H", "TWPEH", "H", "MBIFH",
    "ALA", "GM", "M", "DRKI", "H", "HERMH", "yyDOT"
  };
  return Sentence.toWordList(sent);
}
public static void generate(String model, String fileToTag, String outfile) throws Exception {
  MaxentTagger tagger = new MaxentTagger(model);
  PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream(outfile), "utf-8"));
  BufferedReader br = new BufferedReader(new FileReader(fileToTag));
  // Input is one token per line; a blank line ends a sentence.
  String line;
  List<String> toks = new ArrayList<>();
  while ((line = br.readLine()) != null) {
    if (line.length() == 0) {
      if (!toks.isEmpty()) {
        List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord tw : taggedSent) {
          pw.println(tw.word() + " " + tw.tag());
        }
        pw.println();
        toks.clear();
      }
    } else {
      toks.add(line);
    }
  }
  // Flush a final sentence that is not followed by a trailing blank line
  // (the original version silently dropped it).
  if (!toks.isEmpty()) {
    List<HasWord> sent = Sentence.toWordList(toks.toArray(new String[0]));
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      pw.println(tw.word() + " " + tw.tag());
    }
    pw.println();
  }
  br.close();
  pw.close();
}
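// Hypothetical caller for generate() above; the model path and file names are
// assumptions, not part of the original source. The input file holds one
// token per line with a blank line between sentences; the output is
// "word tag" per line in the same one-sentence-per-block layout.
public static void demoGenerate() throws Exception {
  generate("models/english-left3words-distsim.tagger", "tokens.txt", "tagged.txt");
}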
public static ArrayList<ArrayList<TaggedWord>> tag(String sentencedoc) {
  try {
    MaxentTagger tagger = new MaxentTagger("data/wsj-0-18-bidirectional-distsim.tagger");
    List<List<HasWord>> sentences =
        MaxentTagger.tokenizeText(new BufferedReader(new FileReader(sentencedoc)));
    ArrayList<ArrayList<TaggedWord>> taggedsentences =
        new ArrayList<ArrayList<TaggedWord>>(sentences.size());
    for (List<HasWord> sentence : sentences) {
      ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence);
      taggedsentences.add(tSentence);
      System.out.println(Sentence.listToString(tSentence, false));
    }
    return taggedsentences;
  } catch (IOException | ClassNotFoundException e) {
    e.printStackTrace();
  }
  return null;
}
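// Hypothetical caller for tag() above, assuming it lives in the same class as
// handletaggedsentences() below; the input file name is an assumption. A null
// result means the hardcoded tagger model could not be loaded.
public static void demoTag() {
  ArrayList<ArrayList<TaggedWord>> tagged = tag("data/sentences.txt");
  if (tagged != null) {
    handletaggedsentences(tagged);  // apply the rule-based pass below
  }
}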
/** Return a default sentence for the language (for testing) */
@Override
public ArrayList<Word> defaultTestSentence() {
  // Note: "\u951f\u65a4\u62f7" decodes to the classic GBK/UTF-8 mojibake
  // string, so only "\u5b66\u6821" (school) and "\u5b66\u4e60" (study) are
  // real words; the originally intended characters are not recoverable here.
  return Sentence.toUntaggedList(
      "\u951f\u65a4\u62f7", "\u951f\u65a4\u62f7", "\u5b66\u6821",
      "\u951f\u65a4\u62f7", "\u5b66\u4e60", "\u951f\u65a4\u62f7");
}
public static void handletaggedsentences(ArrayList<ArrayList<TaggedWord>> taggedsentences) {
  // rule 7: if a noun follows a plural demonstrative adjective (these or those)
  // (the check below currently matches "of <NN>" instead)
  for (ArrayList<TaggedWord> sentence : taggedsentences) {
    for (int i = 0; i < sentence.size(); i++) {
      if (i > 0) {
        // e.g. often/RB become/VBN robbery/NN ,/, sexual/JJ harassment/NN
        // and/CC other/JJ violence/NN behavior/NN of/IN victim/NN
        if (sentence.get(i - 1).tag().equals("IN")
            && sentence.get(i - 1).word().equals("of")
            && sentence.get(i).tag().equals("NN")) {
          // Matched "of <noun>"; no rewrite is applied yet.
          String word1 = sentence.get(i - 1).word();
          String word2 = sentence.get(i).word();
        }
      }

      // rule 1: DT NN DT --> DT NN
      // if (i < sentence.size() - 2
      //     && sentence.get(i).tag().equals("DT")
      //     && sentence.get(i + 1).tag().equals("NN")
      //     && sentence.get(i + 2).tag().equals("DT")) {
      //   sentence.remove(i + 2);
      // }

      // rule 2: JJ NN --> NN JJ
      // if (i > 0
      //     && sentence.get(i).tag().equals("JJ")
      //     && sentence.get(i - 1).tag().equals("NN")) {
      //   TaggedWord temptagged2 = sentence.get(i);
      //   sentence.remove(i);
      //   sentence.add(i - 1, temptagged2);
      // }
    }
  }

  // If any rules need to be run after the first set, they go here; for now
  // this pass just prints the (possibly rewritten) sentences.
  for (ArrayList<TaggedWord> sentence : taggedsentences) {
    System.out.println(Sentence.listToString(sentence, false));
  }
}
public void testInitialStateFromTagged() {
  String[] words = {"This", "is", "a", "short", "test", "."};
  String[] tags = {"DT", "VBZ", "DT", "JJ", "NN", "."};
  assertEquals(words.length, tags.length);
  List<TaggedWord> sentence = Sentence.toTaggedList(Arrays.asList(words), Arrays.asList(tags));
  State state = ShiftReduceParser.initialStateFromTaggedSentence(sentence);
  for (int i = 0; i < words.length; ++i) {
    // Each sentence item is a preterminal: a tag node dominating one word leaf.
    assertEquals(tags[i], state.sentence.get(i).value());
    assertEquals(1, state.sentence.get(i).children().length);
    assertEquals(words[i], state.sentence.get(i).children()[0].value());
  }
}
/** TODO: clearly this should be a default method in ParserQuery once Java 8 comes out */
@Override
public void restoreOriginalWords(Tree tree) {
  if (originalSentence == null || tree == null) {
    return;
  }
  List<Tree> leaves = tree.getLeaves();
  if (leaves.size() != originalSentence.size()) {
    throw new IllegalStateException(
        "originalWords and sentence of different sizes: "
            + originalSentence.size() + " vs. " + leaves.size()
            + "\n Orig: " + Sentence.listToString(originalSentence)
            + "\n Pars: " + Sentence.listToString(leaves));
  }
  // TODO: get rid of this cast
  Iterator<? extends Label> wordsIterator =
      (Iterator<? extends Label>) originalSentence.iterator();
  for (Tree leaf : leaves) {
    leaf.setLabel(wordsIterator.next());
  }
}
/** Reads an annotation from the given filename using the requested input. */
public static List<Annotation> getAnnotations(
    StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
  switch (inputFormat) {
    case TEXT: {
      String text = IOUtils.slurpFileNoExceptions(filename);
      Annotation annotation = new Annotation(text);
      tokenizer.annotate(annotation);
      List<Annotation> annotations = Generics.newArrayList();
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Annotation nextAnnotation =
            new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
        nextAnnotation.set(
            CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
        annotations.add(nextAnnotation);
      }
      return annotations;
    }
    case TREES: {
      List<Tree> trees;
      if (filterUnknown) {
        trees = SentimentUtils.readTreesWithGoldLabels(filename);
        trees = SentimentUtils.filterUnknownRoots(trees);
      } else {
        trees = Generics.newArrayList();
        MemoryTreebank treebank = new MemoryTreebank("utf-8");
        treebank.loadPath(filename, null);
        for (Tree tree : treebank) {
          trees.add(tree);
        }
      }
      List<Annotation> annotations = Generics.newArrayList();
      for (Tree tree : trees) {
        CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        List<CoreMap> sentences = Collections.singletonList(sentence);
        Annotation annotation = new Annotation("");
        annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
        annotations.add(annotation);
      }
      return annotations;
    }
    default:
      throw new IllegalArgumentException("Unknown format " + inputFormat);
  }
}
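// Hypothetical caller for getAnnotations(), assuming Input is the TEXT/TREES
// enum used in the switch above and that a tokenize/ssplit pipeline is
// sufficient for the TEXT branch; "reviews.txt" is an illustrative file name.
public static List<Annotation> demoGetAnnotations() {
  Properties props = new Properties();
  props.setProperty("annotators", "tokenize, ssplit");
  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
  return getAnnotations(pipeline, Input.TEXT, "reviews.txt", false);
}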
/**
 * Turns a sentence into a flat phrasal tree. The structure is S -> tag*. And then each tag goes
 * to a word. The tag is either found from the label or made "WD". The tag and phrasal node have a
 * StringLabel.
 *
 * @param s The Sentence to make the Tree from
 * @param lf The LabelFactory with which to create the new Tree labels
 * @return The one phrasal level Tree
 */
public static Tree toFlatTree(Sentence<?> s, LabelFactory lf) {
  List<Tree> daughters = new ArrayList<Tree>(s.length());
  for (HasWord word : s) {
    Tree wordNode = new LabeledScoredTreeLeaf(lf.newLabel(word.word()));
    if (word instanceof TaggedWord) {
      TaggedWord taggedWord = (TaggedWord) word;
      wordNode =
          new LabeledScoredTreeNode(
              new StringLabel(taggedWord.tag()), Collections.singletonList(wordNode));
    } else {
      wordNode =
          new LabeledScoredTreeNode(lf.newLabel("WD"), Collections.singletonList(wordNode));
    }
    daughters.add(wordNode);
  }
  return new LabeledScoredTreeNode(new StringLabel("S"), daughters);
}
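// Minimal usage sketch for toFlatTree(), assuming the legacy generic
// Sentence<T> collection API used above; the words and tags are illustrative.
public static void demoToFlatTree() {
  Sentence<TaggedWord> s = new Sentence<TaggedWord>();
  s.add(new TaggedWord("Dogs", "NNS"));
  s.add(new TaggedWord("bark", "VBP"));
  Tree flat = toFlatTree(s, new StringLabelFactory());
  flat.pennPrint();  // prints (S (NNS Dogs) (VBP bark))
}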
public static void startParsing(String paragraph) throws FileNotFoundException, IOException {
  // Could we just split on newlines, since the paragraph is already
  // sentence-split? DocumentPreprocessor re-splits it here regardless.
  Reader reader = new StringReader(paragraph);
  DocumentPreprocessor dp = new DocumentPreprocessor(reader);
  List<String> sentenceList = new ArrayList<String>();
  for (List<HasWord> sentence : dp) {
    sentenceList.add(Sentence.listToString(sentence));
  }

  StringBuilder parseTrees = new StringBuilder();
  for (String sentence : sentenceList) {
    parseTrees.append(createParseTree(sentence));
  }
  writeToFile(parseTrees.toString(), "trees.txt");
}
/**
 * For testing -- CURRENTLY BROKEN!!!
 *
 * @param args treebankPath trainNums testNums
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    throw new RuntimeException("args: treebankPath trainNums testNums");
  }
  ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();
  ctpp.charTags = true;
  // TODO: these options are getting clobbered by reading in the
  // parser object (unless it's a text file parser?)
  Options op = new Options(ctpp);
  op.doDep = false;
  op.testOptions.maxLength = 90;

  LexicalizedParser lp;
  try {
    FileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
    lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op);
    try {
      String filename = "chineseCharTagPCFG.ser.gz";
      System.err.println("Writing parser in serialized format to file " + filename + ' ');
      System.err.flush();
      ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
      out.writeObject(lp);
      out.close();
      System.err.println("done.");
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
  } catch (IllegalArgumentException e) {
    // args[1] was not a number range, so treat it as a serialized parser path.
    lp = LexicalizedParser.loadModel(args[1], op);
  }

  FileFilter testFilt = new NumberRangesFileFilter(args[2], false);
  MemoryTreebank testTreebank = ctpp.memoryTreebank();
  testTreebank.loadPath(new File(args[0]), testFilt);
  PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
  WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
  WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
  EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck);
  System.out.println("Testing...");
  for (Tree gold : testTreebank) {
    Tree tree;
    try {
      tree = lp.parseTree(gold.yieldHasWord());
      if (tree == null) {
        System.out.println("Failed to parse " + gold.yieldHasWord());
        continue;
      }
    } catch (Exception e) {
      e.printStackTrace();
      continue;
    }
    gold = gold.firstChild();
    pw.println(Sentence.listToString(gold.preTerminalYield()));
    pw.println(Sentence.listToString(gold.yield()));
    gold.pennPrint(pw);
    pw.println(tree.preTerminalYield());
    pw.println(tree.yield());
    tree.pennPrint(pw);
    // Collection allBrackets = WordCatConstituent.allBrackets(tree);
    // Collection goldBrackets = WordCatConstituent.allBrackets(gold);
    // eval.eval(allBrackets, goldBrackets);
    eval.displayLast();
  }
  System.out.println();
  System.out.println();
  eval.display();
}
private static Sentence addLast(Sentence s) {
  Sentence s2 = new Sentence(s);
  s2.add(new Word(Lexicon.BOUNDARY));
  return s2;
}
private static Sentence cutLast(Sentence s) {
  // Drop the final token (e.g. the boundary symbol appended by addLast()).
  return new Sentence(s.subList(0, s.size() - 1));
}
public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
  LexicalizedParser lp =
      LexicalizedParser.loadModel(
          "/home/mingrui/Desktop/englishPCFG.ser.gz",
          "-maxLength", "80", "-retainTmpSubcategories");
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  // Uncomment the following line to obtain original Stanford Dependencies
  // tlp.setGenerateOriginalDependencies(true);
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  String[] array = sentence.split("\\s+");
  Tree parse = lp.apply(Sentence.toWordList(array));
  GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

  ArrayList<String> keywordsDependency = new ArrayList<String>();
  ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();
  for (TypedDependency t : tdl) {
    // A typed dependency prints as, e.g., "nsubj(crawled-4, slug-3)". The
    // original code took substring(indexOf("(") + 1, indexOf("(")), which
    // always throws; the second bound must be lastIndexOf(")").
    String d = t.toString();
    String pair = d.substring(d.indexOf("(") + 1, d.lastIndexOf(")"));
    String[] terms = pair.split(",");
    String term1 = terms[0].trim();
    String term2 = terms[1].trim();
    // Match keywords with the terms in the tuples; if matched, keep the tuple.
    for (String key : keyword.split(" ")) {
      if (term1.equals(key) || term2.equals(key)) {
        keywordsDependency.add(d);
      }
    }
  }

  String lemmatizedKeywords = lemmatize(keyword);
  int lbefore = keyword.split(" ").length;
  int lafter = lemmatizedKeywords.split(" ").length;
  if (lbefore == lafter) {
    return keywordsDependency;
  }
  // Lemmatization changed the token count, so compare each keyword token
  // individually. This branch is unfinished: lemmatized matches are never
  // collected, so the returned list stays empty.
  for (String s : keyword.split(" ")) {
    String[] lemmas = lemmatize(s).split(" ");
    if (lemmas.length == 1) {
      // One lemma for one token: nothing extra to do.
      continue;
    }
    for (String tuple : keywordsDependency) {
      if (getTupleTerms(tuple)[0].equals(s)) {
        // Tuple whose head is the original (unlemmatized) keyword token.
        String dependent = getTupleTerms(tuple)[1];
      }
    }
  }
  return keywordsDependencyWithLemmatization;
}