/**
 * Initializes static resources (parser, language pack, grammatical structure factory).
 *
 * @throws Exception if the required 'modelFile' property is undefined
 */
public static void initialize() throws Exception {
    if (parser != null) {
        return;
    }
    Properties properties = Properties.loadFromClassName(StanfordParser.class.getName());
    tlp = new PennTreebankLanguagePack();
    gsf = tlp.grammaticalStructureFactory();
    String modelFile = properties.getProperty("modelFile");
    if (modelFile == null) {
        throw new Exception("Required property 'modelFile' is undefined");
    }
    parser = new LexicalizedParser(modelFile);
}
private Collection<TypedDependency> parseSentenceTDL(String text) {
    System.out.println("Parsing sentence...");
    Collection<TypedDependency> tdl = null;
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf = null;
    if (tlp.supportsGrammaticalStructures()) {
        gsf = tlp.grammaticalStructureFactory();
    }
    Reader reader = new StringReader(text);
    for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
        Tree parse = lp.apply(sentence);
        if (gsf != null) {
            GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
            // Note: tdl is overwritten on every iteration, so only the
            // dependencies of the last sentence in `text` are returned.
            tdl = gs.allTypedDependencies();
        }
    }
    return tdl;
}
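A minimal sketch of a variant (assuming the same `lp` field and imports as above) that accumulates the dependencies of every sentence instead of returning only the last one:

private Collection<TypedDependency> parseAllSentencesTDL(String text) {
    List<TypedDependency> all = new ArrayList<TypedDependency>();
    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    if (!tlp.supportsGrammaticalStructures()) {
        return all; // this language pack cannot produce grammatical structures
    }
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    for (List<HasWord> sentence : new DocumentPreprocessor(new StringReader(text))) {
        Tree parse = lp.apply(sentence);
        all.addAll(gsf.newGrammaticalStructure(parse).allTypedDependencies());
    }
    return all;
}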
private static List<TypedDependency> getDependencies(String sentence) {
    if (pipeline == null) {
        loadModels();
    }
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> rawWords2 = tok.tokenize();
    Tree parse = lp.apply(rawWords2);
    // parse.pennPrint();
    // System.out.println(parse.toString());
    TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    return tdl;
}
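A hypothetical caller for the method above, shown as a sketch: it uses the TypedDependency accessors (reln(), gov(), dep()) rather than parsing the toString() output.

private static void printDependencies(String sentence) {
    for (TypedDependency td : getDependencies(sentence)) {
        // Prints each dependency in "reln(gov, dep)" form, e.g. "nsubj(die, President)"
        System.out.printf("%s(%s, %s)%n", td.reln(), td.gov().value(), td.dep().value());
    }
}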
public class Parser {

    private String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    private String[] options = {"-maxLength", "80", "-retainTmpSubcategories"};
    private LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    private TreebankLanguagePack tlp = lp.getOp().langpack();
    private GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

    public Parser() {}

    public LinkedList<String> getKeyWordsFromSentence(String string) {
        LinkedList<String> list = new LinkedList<String>();
        String[] sent = string.split(" ");
        List<HasWord> sentence = new ArrayList<HasWord>();
        for (String word : sent) {
            sentence.add(new Word(word));
        }
        Tree parse = lp.parse(sentence);
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); // computed but unused here
        // Labels from taggedLabeledYield() print as "TAG-index", e.g. "NN-5".
        List<CoreLabel> labelsList = parse.taggedLabeledYield();
        for (Label l : labelsList) {
            String[] current = l.toString().split("-");
            String type = current[0];
            if (type.equals("NN") || type.equals("NNS")) {
                String key = sent[Integer.parseInt(current[1])];
                list.add(key);
            }
        }
        return list;
    }

    public LinkedList<String> getKeyWordsFromSentenceTest(String string) {
        LinkedList<String> list = new LinkedList<String>();
        String[] sent = string.split(" ");
        List<HasWord> sentence = new ArrayList<HasWord>();
        for (String word : sent) {
            sentence.add(new Word(word));
        }
        Tree parse = lp.parse(sentence);
        parse.pennPrint();

        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        System.out.println();

        System.out.println("The words of the sentence:");
        for (Label lab : parse.yield()) {
            if (lab instanceof CoreLabel) {
                System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
            } else {
                System.out.println(lab);
            }
        }
        System.out.println();

        System.out.println("tagged");
        System.out.println(parse.taggedYield());

        List<CoreLabel> temp = parse.taggedLabeledYield();
        for (Label l : temp) {
            String[] sss = l.toString().split("-");
            System.out.println(sss[0] + " " + sss[1] + " " + sent[Integer.parseInt(sss[1])]);
        }

        // Note: `list` is never populated in this test method, so this loop
        // prints nothing and the returned list is empty.
        for (Iterator<String> ite = list.iterator(); ite.hasNext(); ) {
            System.out.println(ite.next());
        }
        return list;
    }

    public static void main(String[] args) throws IOException {
        Parser parser = new Parser();
        parser.getKeyWordsFromSentence(
                "When athletes begin to exercise, their heart rates and respiration rates increase."
                        + " At what level of organization does the human body coordinate these functions?");
        parser.getKeyWordsFromSentenceTest(
                "When athletes begin to exercise, their heart rates and respiration rates increase."
                        + " At what level of organization does the human body coordinate these functions?");
    }
}
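The noun filter in getKeyWordsFromSentence matches only NN and NNS. A sketch of a broader variant (hypothetical helper, assuming the same `lp` field and an import of edu.stanford.nlp.ling.TaggedWord) that also keeps proper nouns by testing the tag prefix, using taggedYield() instead of splitting label strings:

public List<String> getNouns(String text) {
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String w : text.split(" ")) {
        sentence.add(new Word(w));
    }
    Tree parse = lp.parse(sentence);
    List<String> nouns = new ArrayList<String>();
    for (TaggedWord tw : parse.taggedYield()) {
        if (tw.tag().startsWith("NN")) { // NN, NNS, NNP, NNPS
            nouns.add(tw.word());
        }
    }
    return nouns;
}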
private Collection<TypedDependency> getDependencies(Tree sentenceParseTree) {
    final TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    final GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    final GrammaticalStructure gs = gsf.newGrammaticalStructure(sentenceParseTree);
    return gs.typedDependenciesCollapsed();
}
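Hypothetical usage of this helper, assuming the englishPCFG model is on the classpath and edu.stanford.nlp.ling.Sentence is imported; the sentence is illustrative:

LexicalizedParser lp = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
Tree tree = lp.apply(Sentence.toWordList("The", "first", "President", "died", "."));
Collection<TypedDependency> deps = getDependencies(tree);
System.out.println(deps);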
public static void main(String[] args) throws IOException {
    long startTime = System.currentTimeMillis();

    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

    String sentence = "Where did the first President die ?";
    System.out.println("Enter the question or press enter for default : ");
    BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
    String tempInput = b1.readLine();
    if (tempInput.length() == 0) {
        System.out.println("The question is the default one : " + sentence);
    } else {
        sentence = tempInput;
        System.out.println("The question entered is : " + sentence);
    }

    // `map` and `length` are class fields defined elsewhere in this class.
    String sentence1 = PreProcess.removeStopWords1(sentence);
    System.out.println(sentence1);
    StringTokenizer st1 = new StringTokenizer(sentence1, " ");
    int n = 0;
    while (st1.hasMoreTokens()) {
        String temp1 = st1.nextToken();
        map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));
        n++;
    }

    List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    lp.parse(tokens);           // parse the tokens
    Tree t = lp.getBestParse(); // get the best parse tree
    tp.printTree(t);

    System.out.println("\nPROCESSED:\n\n");

    // dependencies
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(t);
    // Tree b = t.firstChild();
    // System.out.println("\nFirst child of the tree is :\n\n"); tp.printTree(b);

    String dependency = gs.typedDependenciesCollapsed().toString();
    System.out.println("Dependencies :" + dependency);

    String wordForm = "yes";
    int i = -1;
    String s[][] = new String[20][3];
    if (wordForm.equals("yes")) {
        // Tokens alternate between relation names (no '-') and
        // governor/dependent tokens of the form "word-index".
        StringTokenizer st = new StringTokenizer(dependency, " ([)],");
        while (st.hasMoreTokens()) {
            String as = st.nextToken();
            System.out.println(as);
            if (!as.contains("-")) {
                i++;
                s[i][0] = as;
            } else {
                s[i][1] = as;
                s[i][2] = st.nextToken();
            }
        }
    }
    length = i + 1;

    interchange1(s);
    System.out.println("The sorted version is ");
    for (i = 0; i < length; i++) {
        for (int j = 0; j < 3; j++) {
            System.out.print(s[i][j] + " ");
        }
        System.out.println();
    }

    System.out.println("What answer type is required: ");
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    String answtype = reader.readLine();
    String secque = null;

    // Dijkstra implementation over the dependency graph
    int adjmatrix[][] = new int[length][length];
    for (i = 0; i < length; i++)
        for (int j = 0; j < length; j++)
            adjmatrix[i][j] = 100;
    formadj(adjmatrix, s);
    print(adjmatrix);

    if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
            - Dijikstraalgo.dijikstra(adjmatrix, length - 2) >= 0) {
        System.out.println("Type 1");
        if (makesentence(s, length - 1) == null) {
            secque = s[length - 1][2] + " " + s[length - 1][1];
            System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");
        } else {
            secque = makesentence(s, length - 1);
            System.out.println(answtype + " is " + secque + " ?");
        }
    } else {
        System.out.println("Type 2");
        if (makesentence(s, length - 2) == null) {
            secque = s[length - 2][2] + " " + s[length - 2][1];
            System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
        } else {
            secque = makesentence(s, length - 2);
            System.out.println(answtype + " is " + secque + " ?");
        }
    }

    System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));

    long endTime = System.currentTimeMillis();
    System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
    System.out.println("The end");
}
public HashMap<String, ArrayList<TreeData>> parseAllDocs() throws IOException {
    String grammar = "./jsan_resources/englishPCFG.ser.gz";
    String[] options = {"-maxLength", "120", "-retainTmpSubcategories"};
    LexicalizedParser lp = new LexicalizedParser(grammar, options);
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    Iterable<List<? extends HasWord>> sentences;

    ArrayList<HashMap<String, ArrayList<String>>> everything =
            new ArrayList<HashMap<String, ArrayList<String>>>(3);
    everything.add(0, otherSampleStrings);
    everything.add(1, authorSampleStrings);
    everything.add(2, toModifyStrings);
    Iterator<HashMap<String, ArrayList<String>>> everythingIter = everything.iterator();

    // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings
    int docTypeNumber = -1;
    int numLoaded = 0;
    while (everythingIter.hasNext()) {
        docTypeNumber++;
        HashMap<String, ArrayList<String>> currentSampleStrings = docPathFinder();
        Set<String> currentDocStrings = currentSampleStrings.keySet();
        Iterator<String> docStrIter = currentDocStrings.iterator();
        String docID;
        ArrayList<String> sentenceTokens;
        allTreeProcessors[docTypeNumber] = new TreeProcessor();
        allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps();
        numLoaded = 0;
        while (docStrIter.hasNext()) {
            docID = docStrIter.next();
            sentenceTokens = currentSampleStrings.get(docID);
            if (sentenceTokens == null) {
                // No raw sentences: load previously parsed tree data from disk.
                allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false);
                numLoaded++;
                continue;
            }
            numSentences = sentenceTokens.size();
            Iterator<String> sentIter = sentenceTokens.iterator();
            List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
            while (sentIter.hasNext()) {
                String tempSent = sentIter.next();
                Tokenizer<? extends HasWord> toke =
                        tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent));
                tmp.add(toke.tokenize());
            }
            sentences = tmp;

            TreeProcessor.singleDocMap.clear();
            boolean willSaveResults = true;
            for (List<? extends HasWord> sentence : sentences) {
                Tree parse = lp.apply(sentence);
                allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults);
                // GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); // TODO: LOOK AT THIS
                // Collection tdl = gs.typedDependenciesCCprocessed(true);
            }
            if (willSaveResults) {
                ObjectIO.writeObject(TreeProcessor.singleDocMap, docID, GRAMMAR_DIR);
            }
        }
        allTreeProcessors[docTypeNumber].unmergedMaps =
                new ArrayList<HashMap<String, TreeData>>(numLoaded + 1);
    }

    allParsedAndOrdered.clear();
    String[] docTypes = new String[] {"otherSample", "authorSample", "toModify"};
    for (int i = 0; i < 3; i++) {
        allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees);
        allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps);
        allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps);
        allParsedAndOrdered.put(docTypes[i],
                allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap));
    }
    return allParsedAndOrdered;
}
public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) {
    LexicalizedParser lp = LexicalizedParser.loadModel(
            "/home/mingrui/Desktop/englishPCFG.ser.gz",
            "-maxLength", "80", "-retainTmpSubcategories");
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    // Uncomment the following line to obtain original Stanford Dependencies
    // tlp.setGenerateOriginalDependencies(true);
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    String[] array = sentence.split("\\s+");
    Tree parse = lp.apply(Sentence.toWordList(array));
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed();

    ArrayList<String> keywordsDependency = new ArrayList<String>();
    ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>();
    for (TypedDependency t : tdl) {
        // A typed dependency prints as "reln(gov-idx, dep-idx)".
        String d = t.toString();
        String dependencyType = d.substring(0, d.indexOf("("));
        String pair = d.substring(d.indexOf("(") + 1, d.lastIndexOf(")"));
        String[] terms = pair.split(",");
        String term1 = terms[0].trim();
        String term2 = terms[1].trim();
        // Match keywords against the terms in the tuple; if matched, add the
        // tuple to the list.
        String[] wordsplitted = keyword.split(" ");
        for (String key : wordsplitted) {
            if (term1.equals(key) || term2.equals(key)) {
                keywordsDependency.add(t.toString());
            }
        }
    }

    String lemmatizedKeywords = lemmatize(keyword);
    int lbefore = keyword.split(" ").length;
    int lafter = lemmatizedKeywords.split(" ").length;
    if (lbefore == lafter) {
        return keywordsDependency;
    } else {
        String[] split = keyword.split(" ");
        for (String s : split) {
            String[] lemmas = lemmatize(s).split(" ");
            // Compare the length of one keyword/key phrase before and after lemmatization.
            boolean sameLength = lemmas.length == s.split(" ").length;
            if (sameLength) {
                continue;
            }
            for (String tuple : keywordsDependency) {
                // Find the tuple that contains the original keyword/key phrase.
                if (getTupleTerms(tuple)[0].equals(s)) {
                    String dependent = getTupleTerms(tuple)[1];
                    // TODO: this branch is unfinished; nothing is ever added to
                    // keywordsDependencyWithLemmatization.
                }
            }
        }
        return keywordsDependencyWithLemmatization;
    }
}
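For reference, the first matching loop above can be written against the TypedDependency accessors instead of substring parsing, which sidesteps the indexOf arithmetic. A sketch follows; note it is not strictly behavior-identical, since value() yields the bare token without the "-index" suffix that appears in the printed tuple:

for (TypedDependency t : tdl) {
    String gov = t.gov().value(); // governor token text
    String dep = t.dep().value(); // dependent token text
    for (String key : keyword.split(" ")) {
        if (gov.equals(key) || dep.equals(key)) {
            keywordsDependency.add(t.toString());
            break; // avoid adding the same tuple twice
        }
    }
}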