/** * Parses a sentence and returns the parse tree. * * @param sentence a sentence * @return Tree character offsets in keys BEGIN_KEY and END_KEY */ @SuppressWarnings("unchecked") public static Tree parseTree(String sentence) { if (tlp == null || parser == null) throw new RuntimeException("Parser has not been initialized"); // parse the sentence to produce stanford Tree log.debug("Parsing sentence"); Tree tree = null; synchronized (parser) { Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence)); List<Word> words = tokenizer.tokenize(); log.debug("Tokenization: " + words); parser.parse(new Sentence(words)); tree = parser.getBestParse(); } // label tree with character extents // log.debug("Setting character extents"); // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1)); // log.debug("Creating offset mapping"); // List<RangeMap> mapping = createMapping(sentence); // log.debug(mapping.toString()); // log.debug("Applying offset mapping"); // mapOffsets(tree, mapping); return tree; }
private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) { String[] sent = toSentence(words); /// lexical analyzer List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); // PrintStream outa = new PrintStream(new FileOutputStream("output1.txt")); // System.setOut(outa); // System.out.println("KKKKKKK"); // parse.pennPrint(); String oldTree = parse.toString(); // String oldTree=baos.toString(); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); // System.out.println(oldTree); words.get(index).setNewValue(newWord); sent = toSentence(words); rawWords = Sentence.toCoreLabelList(sent); parse = lp.apply(rawWords); // PrintStream outb = new PrintStream(new FileOutputStream("output2.txt")); // System.setOut(outb); // parse.pennPrint(); String newTree = parse.toString(); oldTree = oldTree.replaceAll(words.get(index).getOrigValue() + "[)]", newWord + ")"); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); System.out.println(oldTree + "\n" + newTree); // System.out.println(oldTree.equals(newTree)); if (oldTree.equals(newTree)) { if (index == 0) { String str = words.get(index).getNewValue(); String cap = str.substring(0, 1).toUpperCase() + str.substring(1); words.get(index).setNewValue(cap); } return true; } else { words.get(index).setNewValue(null); return false; } /* catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; }*/ // return true; }
/**
 * Returns the words of a sentence that the parser tags as {@code NN} or {@code NNS}.
 *
 * <p>The sentence is split on single spaces, each piece is handed to the parser as one token, and
 * the tagged yield of the resulting tree is scanned. Tag and token position are read from the
 * label accessors rather than by parsing the label's string form, and no dependency structure is
 * built because nothing here consumes it.
 *
 * @param string the sentence, tokens separated by single spaces
 * @return the matching tokens, in sentence order
 */
public LinkedList<String> getKeyWrodsFromSentence(String string) {
  LinkedList<String> list = new LinkedList<String>();
  String[] sent = string.split(" ");
  List<HasWord> sentence = new ArrayList<HasWord>();
  for (String word : sent) {
    sentence.add(new Word(word));
  }

  Tree parse = lp.parse(sentence);
  for (CoreLabel label : parse.taggedLabeledYield()) {
    String tag = label.tag();
    if ("NN".equals(tag) || "NNS".equals(tag)) {
      // The label index is the token's position in the yield, i.e. its position in sent.
      list.add(sent[label.index()]);
    }
  }
  return list;
}
public static void main(String args[]) { // String sentence1 = "A large bird standing on a table picks up a plastic glass // containing liquid and places it in a bowl of something."; // String sentence2 = "A bird picks up a plastic cup containing a liquid with it's beak // and puts the cup into a bowl."; // LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); // LeskWSD tm = new LeskWSD(lp); // WordNetSimilarity ws = new WordNetSimilarity(); // // System.out.println(LexicalSimilarityScoreWordNet(sentence1, sentence2, tm, lp, ws)); String sentence = "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10."; LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); Tree parse = lp.apply(sentence); ArrayList<TaggedWord> taggedWords = parse.taggedYield(); taggedWords = Preprocess(taggedWords); for (int i = 0; i < taggedWords.size(); i++) System.out.println(taggedWords.get(i).word()); }
/** * Parses a sentence and returns the PCFG score as a confidence measure. * * @param sentence a sentence * @return PCFG score */ @SuppressWarnings("unchecked") public static double getPCFGScore(String sentence) { if (tlp == null || parser == null) throw new RuntimeException("Parser has not been initialized"); // parse the sentence to produce PCFG score log.debug("Parsing sentence"); double score; synchronized (parser) { Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence)); List<Word> words = tokenizer.tokenize(); log.debug("Tokenization: " + words); parser.parse(new Sentence(words)); score = parser.getPCFGScore(); } return score; }
public ParseEssay() { System.setProperty("wordnet.database.dir", "../war/dict"); synonyms = new ArrayList<String>(); database = WordNetDatabase.getFileInstance(); baos = new ByteArrayOutputStream(); lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); // ?? }
/**
 * Tokenizes a sentence with a PTB tokenizer (default options), parses the tokens with the given
 * parser and returns the tagged yield of the resulting tree.
 *
 * @param sentence the raw sentence text
 * @param lp the parser to apply
 * @return the tagged words of the parse, in sentence order
 */
public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
  StringReader input = new StringReader(sentence);
  List<CoreLabel> tokens =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(input).tokenize();
  return lp.apply(tokens).taggedYield();
}
/**
 * Splits the text into sentences, parses each one, and returns the typed dependencies of the
 * last sentence.
 *
 * <p>Every sentence is parsed even when the language pack offers no grammatical structures; in
 * that case, or when the text yields no sentences, the result is {@code null}.
 *
 * @param text the text to parse
 * @return all typed dependencies of the last sentence, or {@code null} as described above
 */
private Collection<TypedDependency> parseSentenceTDL(String text) {
  System.out.println("Parsing sentence...");

  TreebankLanguagePack languagePack = lp.treebankLanguagePack();
  GrammaticalStructureFactory structureFactory =
      languagePack.supportsGrammaticalStructures()
          ? languagePack.grammaticalStructureFactory()
          : null;

  Collection<TypedDependency> lastDependencies = null;
  Reader reader = new StringReader(text);
  for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
    Tree parse = lp.apply(sentence);
    if (structureFactory == null) {
      continue;
    }
    // Each sentence overwrites the previous result, so only the last one is returned.
    lastDependencies = structureFactory.newGrammaticalStructure(parse).allTypedDependencies();
  }
  return lastDependencies;
}
private static List<TypedDependency> getDependencies(String sentence) { if (pipeline == null) { loadModels(); } TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence)); List<CoreLabel> rawWords2 = tok.tokenize(); Tree parse = lp.apply(rawWords2); // parse.pennPrint(); // // System.out.println(parse.toString()); TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); return tdl; }
public static void main(String[] args) // start of the main method { System.out.println("\n\n\nSTART\n\n\n"); // print START try // device to handle potential errors { // open file whose path is passed // as the first argument of the main method: FileInputStream fis = new FileInputStream(args[0]); DataInputStream dis = new DataInputStream(fis); BufferedReader br = new BufferedReader(new InputStreamReader(dis)); // prepare Parser, Tokenizer and Tree printer: LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory()); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); String sentence; // initialization // for each line of the file // retrieve it as a string called 'sentence': while ((sentence = br.readLine()) != null) { // print sentence: System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence); // put tokens in a list: List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize(); lp.parse(tokens); // parse the tokens Tree t = lp.getBestParse(); // get the best parse tree System.out.println("\nPROCESSED:\n\n"); tp.printTree(t); // print tree } dis.close(); // close input file } catch (Exception e) // catch error if any { System.err.println("ERROR: " + e.getMessage()); // print error message } System.out.println("\n\n\nTHE END\n\n\n"); // print THE END } // end of the main method
/**
 * Splits the text into sentences, parses each one, and returns the tagged words of the last
 * sentence.
 *
 * @param text the text to parse
 * @return the tagged yield of the last sentence, or an empty list when the text yields no
 *     sentences
 */
private ArrayList<TaggedWord> parseSentenceTD(String text) {
  System.out.println("Parsing sentence...");

  ArrayList<TaggedWord> lastTagged = new ArrayList<TaggedWord>();
  Reader reader = new StringReader(text);
  DocumentPreprocessor sentences = new DocumentPreprocessor(reader);
  for (List<HasWord> sentence : sentences) {
    // Each sentence replaces the previous result, so only the last one survives.
    lastTagged = lp.apply(sentence).taggedYield();
  }
  return lastTagged;
}
public static double LexicalSimilarity2Level( String sentence1, String sentence2, DISCOSimilarity discoRAM, LexicalizedParser lp) { Tree parse1 = lp.apply(sentence1); Tree parse2 = lp.apply(sentence2); int phraseSizeLimit = 2; ArrayList<ArrayList<TaggedWord>> phrasesList1 = getPhrases(parse1, phraseSizeLimit); ArrayList<ArrayList<TaggedWord>> phrasesList2 = getPhrases(parse2, phraseSizeLimit); int length1 = phrasesList1.size(); int length2 = phrasesList2.size(); int arrSize = Math.max(length1, length2); double[][] array = new double[arrSize][arrSize]; for (int i = 0; i < arrSize; i++) { for (int j = 0; j < arrSize; j++) { array[i][j] = 0; } } for (int i = 0; i < length1; i++) { for (int j = 0; j < length2; j++) { double edgeWeight = 0; ArrayList<TaggedWord> taggedWords1 = phrasesList1.get(i); ArrayList<TaggedWord> taggedWords2 = phrasesList2.get(j); // edgeWeight = LexicalSimilarityScore(taggedWords1, taggedWords2, discoRAM, lp)/5.0; edgeWeight = BestWordMatchEdgeWeight(taggedWords1, taggedWords2, discoRAM); array[i][j] = edgeWeight; } } // System.out.println("Hungarian starts " + arrSize); double finalScore; String sumType = "max"; // int minLength = Math.min(length1, length2); // finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType)/minLength * 5; if (arrSize == 0) finalScore = 0; else finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5; return finalScore; }
/**
 * Diagnostic variant of the keyword extractor: parses the sentence and prints the Penn tree, the
 * CC-processed typed dependencies, every word label, the tagged yield, and one
 * {@code "<tag> <index> <token>"} line per token.
 *
 * <p>Tag and index are read from the label accessors. The previous approach split the label's
 * string form on {@code '-'}, which threw {@code NumberFormatException} for tags that themselves
 * contain a hyphen (for example {@code -LRB-}).
 *
 * @param string the sentence, tokens separated by single spaces
 * @return the keyword list, which this diagnostic method never populates (always empty)
 */
public LinkedList<String> getKeyWrodsFromSentenceTest(String string) {
  LinkedList<String> list = new LinkedList<String>();
  String[] sent = string.split(" ");
  List<HasWord> sentence = new ArrayList<HasWord>();
  for (String word : sent) {
    sentence.add(new Word(word));
  }

  Tree parse = lp.parse(sentence);
  parse.pennPrint();

  GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
  List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
  System.out.println(tdl);
  System.out.println();

  System.out.println("The words of the sentence:");
  for (Label lab : parse.yield()) {
    if (lab instanceof CoreLabel) {
      System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
    } else {
      System.out.println(lab);
    }
  }
  System.out.println();

  System.out.println("tagged");
  System.out.println(parse.taggedYield());

  for (CoreLabel label : parse.taggedLabeledYield()) {
    int position = label.index();
    System.out.println(label.tag() + " " + position + " " + sent[position]);
  }

  for (String keyword : list) {
    System.out.println(keyword);
  }
  return list;
}
public class Parser { private String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; private String[] options = {"-maxLength", "80", "-retainTmpSubcategories"}; private LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options); private TreebankLanguagePack tlp = lp.getOp().langpack(); private GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); public Parser() {} public LinkedList<String> getKeyWrodsFromSentence(String string) { LinkedList<String> list = new LinkedList<String>(); String[] sent = string.split(" "); List<HasWord> sentence = new ArrayList<HasWord>(); for (String word : sent) sentence.add(new Word(word)); Tree parse = lp.parse(sentence); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); String[] current; String type, key; List<CoreLabel> labelsList = parse.taggedLabeledYield(); for (Label l : labelsList) { current = l.toString().split("-"); type = current[0]; if (type.equals("NN") || type.equals("NNS")) { key = sent[Integer.parseInt(current[1])]; list.add(key); } } return list; } public LinkedList<String> getKeyWrodsFromSentenceTest(String string) { LinkedList<String> list = new LinkedList<String>(); String[] sent = string.split(" "); List<HasWord> sentence = new ArrayList<HasWord>(); for (String word : sent) { sentence.add(new Word(word)); } Tree parse = lp.parse(sentence); parse.pennPrint(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); System.out.println("The words of the sentence:"); for (Label lab : parse.yield()) { if (lab instanceof CoreLabel) { System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP)); } else { System.out.println(lab); } } System.out.println(); System.out.println("tagged"); System.out.println(parse.taggedYield()); List<CoreLabel> temp = parse.taggedLabeledYield(); 
for (Label l : temp) { String[] sss = l.toString().split("-"); String type = sss[0]; System.out.println(sss[0] + " " + sss[1] + " " + sent[Integer.parseInt(sss[1])]); } for (Iterator<String> ite = list.iterator(); ite.hasNext(); ) System.out.println(ite.next()); return list; } public static void main(String[] args) throws IOException { Parser parser = new Parser(); parser.getKeyWrodsFromSentence( "When athletes begin to exercise, their heart rates and respiration rates increase. At what level of organization does the human body coordinate these functions?"); parser.getKeyWrodsFromSentenceTest( "When athletes begin to exercise, their heart rates and respiration rates increase. At what level of organization does the human body coordinate these functions?"); // main2(); } }
public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ String grammar = "./jsan_resources/englishPCFG.ser.gz"; String[] options = { "-maxLength", "120", "-retainTmpSubcategories" }; // LexicalizedParser lp = new LexicalizedParser(grammar, options); LexicalizedParser lp = new LexicalizedParser() TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); Iterable<List<? extends HasWord>> sentences; ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); everything.add(0,otherSampleStrings); everything.add(1,authorSampleStrings); everything.add(2,toModifyStrings); Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator(); int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings int numLoaded = 0; while(everythingIter.hasNext()){ docTypeNumber++; HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder(); Set<String> currentDocStrings = currentSampleStrings.keySet(); Iterator<String> docStrIter = currentDocStrings.iterator(); String docID; ArrayList<String> sentenceTokens; allTreeProcessors[docTypeNumber] = new TreeProcessor(); allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps(); numLoaded=0; while(docStrIter.hasNext()){ docID = docStrIter.next(); sentenceTokens = currentSampleStrings.get(docID); if(sentenceTokens == null){ allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false); numLoaded++; continue; } //System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext()); numSentences = sentenceTokens.size(); //initialize(numSentences); Iterator<String> sentIter = sentenceTokens.iterator(); List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>(); String tempSent; while(sentIter.hasNext()){ tempSent = sentIter.next(); Tokenizer<? 
extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent)); List<? extends HasWord> sentenceTokenized = toke.tokenize(); tmp.add(sentenceTokenized); } sentences = tmp; //int numDone = 0; TreeProcessor.singleDocMap.clear(); boolean willSaveResults = true; for (List<? extends HasWord> sentence : sentences) { Tree parse = lp.apply(sentence); //parse.pennPrint(); //System.out.println(parse.treeSkeletonCopy().toString()); //System.out.println(parse.taggedYield()); //System.out.println(); //printSubTrees(parse); //TreeContainer.recurseTree(parse,"breadth"); allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //numDone++; //System.out.println("sent "+numDone+" of "+numSentences+" done "); //System.out.println(tc.processedTrees.toString()); //in.nextLine(); //TreeContainer.recurseTree(parse, "depth"); //in.nextLine(); //addTree(parse); //GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS //Collection tdl = gs.typedDependenciesCCprocessed(true); //System.out.println(tdl); //System.out.println(); } if(willSaveResults == true) ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR); //System.out.println("After all sents: "); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //String sent3 = "This is one last test!"; //Tree parse3 = lp.apply(sent3); //parse3.pennPrint(); //System.out.println("After sorting and writing:"); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //Scanner in = new Scanner(System.in); //System.out.println("First one done."); //in.nextLine(); //viewTrees(); } //TreeProcessor.writeTreeDataToCSV(sortedTD,docID); allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1); } int i= 0; allParsedAndOrdered.clear(); String[] docTypes = new 
String[]{"otherSample","authorSample","toModify"}; for(i=0; i < 3; i++){ allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees); allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps); allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps); allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap)); } //ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees); //TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS"); return allParsedAndOrdered; }
/** * parse sentence and generate .trees file * * @param en * @param align * @param out */ public static void parse(String en, String align, String out, boolean verbose) { // use alignments? boolean use_alignments = true; if (align.startsWith("no_align")) { use_alignments = false; System.err.println("Not using alignments."); } else { System.err.println("Using alignments from " + align); } // setup stanfordparser String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"}; LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options); TreebankLanguagePack tlp = lp.getOp().langpack(); java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true; GrammaticalStructureFactory gsf = new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter); // read document Iterable<List<? extends HasWord>> sentences; Reader r = new Reader(en); String line = null; List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>(); while ((line = r.getNext()) != null) { Tokenizer<? extends HasWord> token = tlp.getTokenizerFactory().getTokenizer(new StringReader(line)); List<? extends HasWord> sentence = token.tokenize(); tmp.add(sentence); } sentences = tmp; // set up alignment file reader Reader alignment = new Reader(); if (use_alignments) { alignment = new Reader(align); } // set up tree file writer Writer treeWriter = new Writer(out); // parse long start = System.currentTimeMillis(); // System.err.print("Parsing sentences "); int sentID = 0; for (List<? 
extends HasWord> sentence : sentences) { Tree t = new Tree(); // t.setSentID(++sentID); System.err.println("parse Sentence :" + sentence + "..."); // System.err.print("."); System.err.println("-----------------------------------------------------------------------"); edu.stanford.nlp.trees.Tree parse = lp.parse(sentence); // parse.pennPrint(); // List for root node and lexical nodes List<Node> loneNodes = new LinkedList<Node>(); List<Node> governingNodes = new LinkedList<Node>(); // ROOT node Node root = new Node(true, true); root.setTag("ROOT"); t.setRoot(root); loneNodes.add(root); governingNodes.add(root); // tagging int counter = 0; String surface = ""; String tag = ""; for (TaggedWord tw : parse.taggedYield()) { Node n = new Node(); Node governingNode = new Node(); n.setNodeID(++counter); surface = tw.value(); tag = tw.tag(); if (surface.startsWith("-LRB-")) { surface = "("; } else if (surface.startsWith("-RRB-")) { surface = ")"; // } else if (surface.startsWith("-LSB-")){ // surface = "["; // } else if (surface.startsWith("-RSB-")){ // surface = "]"; // } else if (surface.startsWith("-LCB-")){ // surface = "{"; // } else if (surface.startsWith("-RCB-")){ // surface = "}"; } else if (surface.startsWith("''")) { surface = "\""; } tag = tag.replaceAll("#", "-NUM-"); surface = surface.replaceAll("&", "-AMP-"); surface = surface.replaceAll("#", "-NUM-"); surface = surface.replaceAll(">", "-GRE-"); surface = surface.replaceAll("=", "-EQU-"); n.setInitialLexicalIndex(counter); governingNode.setInitialLexicalIndex(counter); n.setSurface(surface); // System.out.print("("+tw.value()+" : "); n.setTag(tag); governingNode.setTag("_" + tag); governingNode.setLabel("_gov"); // System.out.print(tw.tag()+")"); loneNodes.add(n); governingNodes.add(governingNode); governingNode.setChild(n); } // System.out.println(""); // t.setSentLength(t.getNodes().size() - 1); // List<Node> loneNodes = new LinkedList<Node>(); Node[] nodes = new Node[2000]; // labeling int depIndex; int 
govIndex; String[] depInfo; String[] govInfo; GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependencies(false); // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); for (TypedDependency td : tdl) { depIndex = td.dep().index(); govIndex = td.gov().index(); // System.out.println("Index1:"+depIndex); // System.out.println("Index2:"+govIndex); // if (nodes[depIndex] == null){ // System.out.println("Making node!"); // nodes[depIndex] = new Node(); // } // if (nodes[govIndex] == null){ // System.out.println("Making node!"); // nodes[govIndex] = new Node(); // } Node dep = loneNodes.get((depIndex)); Node gov = governingNodes.get((govIndex)); Node depcopy = governingNodes.get((depIndex)); Node govcopy = loneNodes.get((govIndex)); dep.setLabel(td.reln().toString()); depcopy.setLabel(td.reln().toString()); govcopy.setLabel("head"); // System.out.println(td.toString()); govInfo = td.gov().toString().split("/"); depInfo = td.dep().toString().split("/"); // System.out.println(td.gov().toString()); // System.out.println(td.dep().toString()); // dep.setSurface(depInfo[0]); // dep.setTag(depInfo[1]); gov.setChild(governingNodes.get(depIndex)); governingNodes.get(depIndex).setParent(gov); // gov.setChild(dep); dep.setParent(governingNodes.get(depIndex)); } // t.setRoot(nodes[0]); // Collapse tree to remove unneeded governing nodes: Node gov; Node dep; Node parent; List<Node> children; for (int i = 1; i < governingNodes.size(); i++) { // start with index 1 to skip root gov = governingNodes.get(i); dep = loneNodes.get(i); if (gov.getChildren().size() <= 1) { int k = 0; parent = gov.getParent(); children = parent.getChildren(); for (Node n : children) { if (n == gov) { gov.getParent().replaceChild(k, dep); dep.setParent(gov.getParent()); } k++; } } } // Mark head nodes with appropriate label: int k = 0; for (Node n : loneNodes) { if (k != 0) { if (n.getLabel() == n.getParent().getLabel()) { n.setLabel("head"); } } else 
{ n.setLabel("null"); } k++; } // Sort lexical children of each governing node in lexical order for (Node n : governingNodes) { n.sortChildrenByInitialIndex(); } // combine with alignment if (use_alignments) { t.initialize(alignment.readNextAlign()); } else { t.initializeUnaligned(); } // write tree to file treeWriter.write(t); // print tree to console System.out.println(t.toSentence()); if (verbose) { System.err.println(t.toString()); // t.recursivePrint(); } System.err.println("#######################################################################"); } long stop = System.currentTimeMillis(); System.err.println("...done! [" + (stop - start) / 1000 + " sec]."); treeWriter.close(); }
/**
 * Tokenizes a sentence with the configured tokenizer factory and parses the tokens.
 *
 * @param sentence the raw sentence text
 * @return the parse tree produced by the parser for the tokenized sentence
 */
private Tree getPosTree(String sentence) {
  final StringReader input = new StringReader(sentence);
  return parser.apply(tokenizerFactory.getTokenizer(input).tokenize());
}
public static void main(String args[]) throws IOException { long startTime = System.currentTimeMillis(); LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory()); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); String sentence = "Where did the first President die ?"; System.out.println("Enter the question or press enter for default : "); String tempInput; BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in)); tempInput = b1.readLine(); if (tempInput.length() == 0) System.out.println("The question is the default one : " + sentence); else { sentence = tempInput; System.out.println("The question entered is : " + sentence); } String sentence1 = PreProcess.removeStopWords1(sentence); System.out.println(sentence1); StringTokenizer st1 = new StringTokenizer(sentence1, " "); int n = 0; while (st1.hasMoreTokens()) { String temp1 = st1.nextToken(); // System.out.println("temp replace all is // "+temp1.replaceAll("'s","").replaceAll("[^A-Za-z]","")); map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", "")); n++; } // for(int s=0;s<n;s++) // System.out.println(map.get(s)); List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize(); lp.parse(tokens); // parse the tokens Tree t = lp.getBestParse(); // get the best parse tree\ tp.printTree(t); System.out.println("\nPROCESSED:\n\n"); // tp.printTree(t); // print tree // dependencies only print TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(t); // dependencies // Tree b = t.firstChild(); // System.out.println("\nFirst child of the tree is :\n\n"); tp.printTree(b); String dependency = gs.typedDependenciesCollapsed().toString(); System.out.println("Dependencies :" + dependency); // BufferedReader reader = new BufferedReader( new InputStreamReader(System.in) ); // 
String wordForm = reader.readLine(); String wordForm = "yes"; int i = -1; String s[][] = new String[20][3]; if (wordForm.equals("yes")) { StringTokenizer st = new StringTokenizer(dependency, " ([)],"); while (st.hasMoreTokens()) { String as = st.nextToken(); System.out.println(as); if (!as.contains("-")) { i++; s[i][0] = as; } else { s[i][1] = as; s[i][2] = st.nextToken(); } } } length = i + 1; interchange1(s); System.out.println("The sorted version is "); // System.out.println("\n\n***********Li8 from here on***********"); for (i = 0; i < length; i++) { for (int j = 0; j < 3; j++) { System.out.print(s[i][j] + " "); } System.out.println(); } // int adjmatrix[][] = new int[length][length]; System.out.println("What answer type is required: "); BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); String answtype = reader.readLine(); String[] temp; temp = sentence.split(" ", 2); int g = 0; int h = 0; String secque = null; // dijikstra implementation int adjmatrix[][] = new int[length][length]; int j = 0; for (i = 0; i < length; i++) for (j = 0; j < length; j++) adjmatrix[i][j] = 100; formadj(adjmatrix, s); print(adjmatrix); // Dijikstraalgo.dijikstra(adjmatrix,length-2); // Dijikstraalgo.dijikstra(adjmatrix,length-1); if (Dijikstraalgo.dijikstra(adjmatrix, length - 1) - Dijikstraalgo.dijikstra(adjmatrix, length - 2) >= 0) { System.out.println("Type 1"); if (makesentence(s, length - 1) == null) { secque = s[length - 1][2] + " " + s[length - 1][1]; System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?"); } else { secque = makesentence(s, length - 1); System.out.println(answtype + " is " + secque + " ?"); } } else { System.out.println("Type 2"); System.out.println( "Before entering the makesentence function(the cause of the null pointer exception) " + s[length - 2][0] + " " + s[length - 2][1]); if (makesentence(s, length - 2) == null) { secque = s[length - 2][2] + " " + s[length - 2][1]; System.out.println(answtype 
+ " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?"); } else { // System.out.println("null"); secque = makesentence(s, length - 2); System.out.println(answtype + " is " + secque + " ?"); } } // System.out.println("Secque is "+secque.replaceAll("[^A-Za-z ]","")); System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), "")); long endTime = System.currentTimeMillis(); System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000); System.out.println("The end"); }
/**
 * Bundles the Stanford tokenizer, lexicalized parser and NER classifier behind a few
 * sentence-level operations: full parsing, tense detection and named-entity lookup.
 */
class StanfordParser {
  private final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

  private final TokenizerFactory<CoreLabel> tokenizerFactory =
      PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");

  private final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

  private final String serializedClassifier =
      "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf" + ".ser.gz";

  private final AbstractSequenceClassifier<CoreLabel> classifier =
      CRFClassifier.getClassifierNoExceptions(serializedClassifier);

  /**
   * Parses a sentence into its tree, collapsed typed dependencies and named entities.
   *
   * @param sentence the raw sentence
   * @param removePunctuation when {@code true}, punctuation is stripped before anything else, so
   *     the tree, dependencies and entities are all computed on the cleaned text
   * @return the combined parse result
   */
  public ParsedSentence parseSentence(String sentence, boolean removePunctuation) {
    final String text = removePunctuation ? cleanSentence(sentence) : sentence;
    final Tree posTree = getPosTree(text);
    return new ParsedSentence(posTree, getDependencies(posTree), findNamedEntities(text));
  }

  /**
   * Derives a tense from the POS tag of the first word of a clause: {@code MD} gives FUTURE,
   * {@code VBD} or {@code VBN} gives PAST, anything else PRESENT.
   *
   * @param clause the clause to inspect
   * @return the tense implied by the first word's tag
   */
  public Tense calculateTense(String clause) {
    final Tree posTree = getPosTree(clause);
    final Tree firstLeaf = posTree.getLeaves().get(0);
    final String tag = firstLeaf.parent(posTree).label().value().toLowerCase();
    switch (tag) {
      case "md":
        return Tense.FUTURE;
      case "vbd":
      case "vbn":
        return Tense.PAST;
      default:
        return Tense.PRESENT;
    }
  }

  /**
   * Finds the named entities of a sentence.
   *
   * @param sentence the sentence to classify
   * @return a map from each entity's text, cut out of the sentence by character offsets, to its
   *     entity type
   */
  public Map<String, NamedEntity> findNamedEntities(String sentence) {
    final Map<String, NamedEntity> entitiesByText = new HashMap<>();
    for (final Triple<String, Integer, Integer> span : findNerSubstrings(sentence)) {
      final String entityText = sentence.substring(span.second(), span.third());
      entitiesByText.put(entityText, NamedEntity.getNamedEntity(span.first()));
    }
    return entitiesByText;
  }

  /** Asks the classifier for (type, begin offset, end offset) triples over the sentence. */
  private List<Triple<String, Integer, Integer>> findNerSubstrings(String sentence) {
    return classifier.classifyToCharacterOffsets(sentence);
  }

  /** Removes all punctuation characters, then collapses each run of spaces into one space. */
  private String cleanSentence(String sentence) {
    final String withoutPunctuation = sentence.replaceAll("\\p{Punct}", "");
    return withoutPunctuation.replaceAll("[ ]+", " ");
  }

  /** Tokenizes the sentence and returns the parser's tree for it. */
  private Tree getPosTree(String sentence) {
    final List<CoreLabel> tokens =
        tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    return parser.apply(tokens);
  }

  /** Builds the grammatical structure of a parse tree and returns its collapsed dependencies. */
  private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final GrammaticalStructureFactory structureFactory =
        new PennTreebankLanguagePack().grammaticalStructureFactory();
    return structureFactory.newGrammaticalStructure(sentenceParseTree).typedDependenciesCollapsed();
  }
}
/**
 * Tokenizes the given text and applies the shared parser to the tokens.
 *
 * @param str the text to parse
 * @return the tree returned by {@code parser.apply} for the tokenized text
 */
public static Tree parse(String str) {
  return parser.apply(tokenize(str));
}
@SuppressWarnings("serial") public class TextSimplification { public static List<String> replacementList = new ArrayList<String>() { { add("he"); add("him"); add("his"); add("she"); add("her"); add("they"); add("them"); add("their"); add("i"); add("her's"); add("you"); add("your"); add("your's"); add("mine"); add("my"); add("us"); add("we"); // add("it"); // add("its"); // add("this"); // add("that"); } }; public static String resolvedSentences = ""; private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; private static final TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL); public static void main(String[] args) throws IOException { // :TODO // * Do not consider roots with more than 2 words // * Root should not be he, she her, his, him etc... // * If it is, den take the last known gender noun and make it the root. 
String text = new String(Files.readAllBytes(Paths.get(args[0])), StandardCharsets.UTF_8); text = text.replace("\n", " "); // Resolve Anaphora System.out.println("Anaphora Resolution..."); resolveAnaphora(text); System.out.println( "Anaphora Resolution Completed!\nIntermediate Output in \"AnaphoraResolved.txt\""); writeToFile(resolvedSentences, "AnaphoraResolved.txt"); // Create ParseTrees System.out.println("Parse Tree Generation..."); startParsing((resolvedSentences)); System.out.println("Parse Tree Generation Completed!\nIntermediate Output in \"Tree.txt\""); } public static void resolveAnaphora(String text) { RedwoodConfiguration.empty().capture(System.err).apply(); Annotation document = new Annotation(text); Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); props.put("dcoref.female", "female.unigram.txt"); props.put("dcoref.male", "male.unigram.txt"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); pipeline.annotate(document); RedwoodConfiguration.current().clear().apply(); Map<Integer, CorefChain> graph = document.get(CorefChainAnnotation.class); List<CoreMap> stnfrdSentences = document.get(SentencesAnnotation.class); ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> records = ImmutableMultimap.builder(); ImmutableMultimap.Builder<Integer, Pair<CorefChain, CorefMention>> recordsOrdered = ImmutableMultimap.builder(); graph.forEach( (key, value) -> { value .getMentionMap() .forEach( (intPair, corefSet) -> { corefSet.forEach( mention -> records.put(mention.sentNum, Pair.of(value, mention))); }); }); recordsOrdered = records.orderKeysBy( new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return o1 - o2; } }); recordsOrdered .build() .asMap() .forEach( (sentNum, mentionList) -> { CoreMap sentence = stnfrdSentences.get(sentNum - 1); List<CoreLabel> stnfrdtokens = sentence.get(TokensAnnotation.class); mentionList.forEach( pair -> { CorefChain chain 
= pair.getLeft(); CorefMention mention = pair.getRight(); String root = chain.getRepresentativeMention().mentionSpan; if (!mention.mentionSpan.equalsIgnoreCase(root) && (!root.contains(mention.mentionSpan) && !mention.mentionSpan.contains(root)) && (!replacementList.contains(root.toLowerCase())) && (root.split("\\s").length < 3) && (replacementList.contains(mention.mentionSpan.toLowerCase()))) { if (mention.mentionSpan.equalsIgnoreCase("her") || mention.mentionSpan.equalsIgnoreCase("his")) { root += "'s"; } stnfrdtokens.get(mention.startIndex - 1).setOriginalText(root); } }); String sent = ""; for (CoreLabel token : stnfrdtokens) { sent += token.originalText() + " "; } ; resolvedSentences += sent + "\n"; }); } public static Tree parse(String str) { List<CoreLabel> tokens = tokenize(str); Tree tree = parser.apply(tokens); return tree; } private static List<CoreLabel> tokenize(String str) { Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str)); return tokenizer.tokenize(); } public static void startParsing(String paragraph) throws FileNotFoundException, IOException { String parseTrees = ""; // Can we just split on new line as paragraph is already sentence splitted. 
Reader reader = new StringReader(paragraph); DocumentPreprocessor dp = new DocumentPreprocessor(reader); List<String> sentenceList = new ArrayList<String>(); for (List<HasWord> sentence : dp) { String sentenceString = Sentence.listToString(sentence); sentenceList.add(sentenceString); } for (String sentence : sentenceList) { // System.out.println(sentence); parseTrees += createParseTree(sentence); } writeToFile(parseTrees, "trees.txt"); } public static void writeToFile(String content, String filename) throws IOException { File file = new File(filename); file.delete(); FileWriter fout = new FileWriter(filename); fout.write(content); fout.close(); } public static String createParseTree(String sentence) { Tree tree = parse(sentence); // System.out.println(tree.toString()); return (tree.toString() + "\n"); } }
public LexicalParsingEngine(String parserModel) throws FileNotFoundException, UnsupportedEncodingException { System.out.println("Initializing Lexical Parser..."); lp = LexicalizedParser.loadModel(parserModel); }
/** * for testing -- CURRENTLY BROKEN!!! * * @param args input dir and output filename * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 3) { throw new RuntimeException("args: treebankPath trainNums testNums"); } ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams(); ctpp.charTags = true; // TODO: these options are getting clobbered by reading in the // parser object (unless it's a text file parser?) Options op = new Options(ctpp); op.doDep = false; op.testOptions.maxLength = 90; LexicalizedParser lp; try { FileFilter trainFilt = new NumberRangesFileFilter(args[1], false); lp = LexicalizedParser.trainFromTreebank(args[0], trainFilt, op); try { String filename = "chineseCharTagPCFG.ser.gz"; System.err.println("Writing parser in serialized format to file " + filename + ' '); System.err.flush(); ObjectOutputStream out = IOUtils.writeStreamFromString(filename); out.writeObject(lp); out.close(); System.err.println("done."); } catch (IOException ioe) { ioe.printStackTrace(); } } catch (IllegalArgumentException e) { lp = LexicalizedParser.loadModel(args[1], op); } FileFilter testFilt = new NumberRangesFileFilter(args[2], false); MemoryTreebank testTreebank = ctpp.memoryTreebank(); testTreebank.loadPath(new File(args[0]), testFilt); PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true); WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser(); WordCatEqualityChecker eqcheck = new WordCatEqualityChecker(); EquivalenceClassEval eval = new EquivalenceClassEval(eqclass, eqcheck); // System.out.println("Preterminals:" + preterminals); System.out.println("Testing..."); for (Tree gold : testTreebank) { Tree tree; try { tree = lp.parseTree(gold.yieldHasWord()); if (tree == null) { System.out.println("Failed to parse " + gold.yieldHasWord()); continue; } } catch (Exception e) { e.printStackTrace(); continue; } gold = gold.firstChild(); 
pw.println(Sentence.listToString(gold.preTerminalYield())); pw.println(Sentence.listToString(gold.yield())); gold.pennPrint(pw); pw.println(tree.preTerminalYield()); pw.println(tree.yield()); tree.pennPrint(pw); // Collection allBrackets = WordCatConstituent.allBrackets(tree); // Collection goldBrackets = WordCatConstituent.allBrackets(gold); // eval.eval(allBrackets, goldBrackets); eval.displayLast(); } System.out.println(); System.out.println(); eval.display(); }
public ParseResult parseSentence(String sentence) { String result = ""; // see if a parser socket server is available int port = new Integer(ARKref.getProperties().getProperty("parserServerPort", "5556")); String host = "127.0.0.1"; Socket client; PrintWriter pw; BufferedReader br; String line; try { client = new Socket(host, port); pw = new PrintWriter(client.getOutputStream()); br = new BufferedReader(new InputStreamReader(client.getInputStream())); pw.println(sentence); pw.flush(); // flush to complete the transmission while ((line = br.readLine()) != null) { // if(!line.matches(".*\\S.*")){ // System.out.println(); // } if (br.ready()) { line = line.replaceAll("\n", ""); line = line.replaceAll("\\s+", " "); result += line + " "; } else { lastParseScore = new Double(line); } } br.close(); pw.close(); client.close(); System.err.println("parser output:" + result); lastParse = readTreeFromString(result); boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))"); return new ParseResult(success, lastParse, lastParseScore); } catch (Exception ex) { // ex.printStackTrace(); } // if socket server not available, then use a local parser object if (parser == null) { if (DEBUG) System.err.println("Could not connect to parser server. 
Loading parser..."); try { Options op = new Options(); String serializedInputFileOrUrl = ClassLoader.getSystemResource( ARKref.getProperties() .getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz")) .toExternalForm(); parser = LexicalizedParser.loadModel(serializedInputFileOrUrl, op); // int maxLength = new Integer(ARKref.getProperties().getProperty("parserMaxLength", // "40")).intValue(); // parser.setMaxLength(maxLength); parser.setOptionFlags("-outputFormat", "oneline"); } catch (Exception e) { e.printStackTrace(); } } try { DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sentence)); LexicalizedParserQuery query = parser.parserQuery(); if (query.parse(dp.iterator().next())) { lastParse = query.getBestParse(); lastParseScore = query.getPCFGScore(); TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack()); StringWriter sb = new StringWriter(); pw = new PrintWriter(sb); tp.printTree(lastParse, pw); pw.flush(); lastParse = readTreeFromString(sb.getBuffer().toString()); return new ParseResult(true, lastParse, lastParseScore); } } catch (Exception e) { } lastParse = readTreeFromString("(ROOT (. .))"); lastParseScore = -99999.0; return new ParseResult(false, lastParse, lastParseScore); }
public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) { LexicalizedParser lp = LexicalizedParser.loadModel( "/home/mingrui/Desktop/englishPCFG.ser.gz", "-maxLength", "80", "-retainTmpSubcategories"); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); // Uncomment the following line to obtain original Stanford Dependencies // tlp.setGenerateOriginalDependencies(true); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); String[] array = sentence.split("\\s+"); Tree parse = lp.apply(Sentence.toWordList(array)); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); ArrayList<String> keywordsDependency = new ArrayList<String>(); ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>(); // String lemmatizedKeyword = lemmatize(keyword); for (TypedDependency t : tdl) { String d = t.toString(); String dependencyType = d.substring(0, d.indexOf("(")); String pair = d.substring(d.indexOf("(") + 1, d.indexOf("(")); String[] terms = pair.split(","); String term1 = terms[0].trim(); String term2 = terms[1].trim(); // Match keywords with the terms in the tuples, if matched, add the // tuple into the arraylist String[] wordsplitted = keyword.split(" "); for (String key : wordsplitted) { if (term1.equals(key)) { keywordsDependency.add(t.toString()); } if (term2.equals(key)) { keywordsDependency.add(t.toString()); } } } String lemmatizedKeywords = lemmatize(keyword); int lbefore = keyword.split(" ").length; int lafter = lemmatizedKeywords.split(" ").length; if (lbefore == lafter) { return keywordsDependency; } else { String[] split = keyword.split(" "); for (String s : split) { String[] lemmas = lemmatize(s).split(" "); boolean sameLength = lemmas.length == s.split(" ").length; if (sameLength) { // Compare the length of one key_word or key_phrase before and after // lemmatization continue; } else { for (String tuple : 
keywordsDependency) { if (getTupleTerms(tuple)[0].equals( s)) { // Find the tuple that contains the original keyword/key_phrase String dependent = getTupleTerms(tuple)[1]; // String[] } } // for(String l : lemma) } } return keywordsDependencyWithLemmatization; } }