public LinkedList<String> getKeywordsFromSentence(String string) {
    LinkedList<String> list = new LinkedList<String>();
    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) sentence.add(new Word(word));
    Tree parse = lp.parse(sentence);
    // The typed dependencies are computed here but not used by this method.
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    // Collect the nouns (NN/NNS) from the tagged yield of the parse tree;
    // each label's toString() has the form "TAG-index".
    List<CoreLabel> labelsList = parse.taggedLabeledYield();
    for (Label l : labelsList) {
        String[] current = l.toString().split("-");
        String type = current[0];
        if (type.equals("NN") || type.equals("NNS")) {
            String key = sent[Integer.parseInt(current[1])];
            list.add(key);
        }
    }
    return list;
}
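// A minimal initialization sketch for the `lp` and `gsf` fields that
// getKeywordsFromSentence assumes; the model path and the field wiring are
// assumptions, not part of the original snippet.
private static LexicalizedParser lp =
    LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
private static GrammaticalStructureFactory gsf =
    new PennTreebankLanguagePack().grammaticalStructureFactory();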
/**
 * Parses a sentence and returns the parse tree.
 *
 * @param sentence a sentence
 * @return the parse tree (character-extent labeling in BEGIN_KEY/END_KEY is currently disabled)
 */
@SuppressWarnings("unchecked")
public static Tree parseTree(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce a Stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);
    return tree;
}
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce the PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }
    return score;
}
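// A usage sketch for parseTree and getPCFGScore; the sample sentence is an
// illustrative assumption. A higher (less negative) log score indicates a
// parse the grammar finds more plausible.
Tree tree = parseTree("The quick brown fox jumps over the lazy dog .");
double score = getPCFGScore("The quick brown fox jumps over the lazy dog .");
System.out.println("Best parse: " + tree);
System.out.println("PCFG log score: " + score);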
public LinkedList<String> getKeywordsFromSentenceTest(String string) {
    LinkedList<String> list = new LinkedList<String>();
    String[] sent = string.split(" ");
    List<HasWord> sentence = new ArrayList<HasWord>();
    for (String word : sent) {
        sentence.add(new Word(word));
    }
    Tree parse = lp.parse(sentence);
    parse.pennPrint();

    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    System.out.println();
    System.out.println("The words of the sentence:");
    for (Label lab : parse.yield()) {
        if (lab instanceof CoreLabel) {
            System.out.println(((CoreLabel) lab).toString(CoreLabel.OutputFormat.VALUE_MAP));
        } else {
            System.out.println(lab);
        }
    }

    System.out.println();
    System.out.println("tagged");
    System.out.println(parse.taggedYield());

    List<CoreLabel> temp = parse.taggedLabeledYield();
    for (Label l : temp) {
        String[] sss = l.toString().split("-");
        String type = sss[0];
        System.out.println(sss[0] + " " + sss[1] + " " + sent[Integer.parseInt(sss[1])]);
        // Collect nouns (NN/NNS), mirroring getKeywordsFromSentence, so the
        // returned keyword list is actually populated.
        if (type.equals("NN") || type.equals("NNS")) {
            list.add(sent[Integer.parseInt(sss[1])]);
        }
    }
    for (Iterator<String> ite = list.iterator(); ite.hasNext(); )
        System.out.println(ite.next());
    return list;
}
public static void main(String[] args) // start of the main method
{
    System.out.println("\n\n\nSTART\n\n\n"); // print START
    try // device to handle potential errors
    {
        // open the file whose path is passed
        // as the first argument of the main method:
        FileInputStream fis = new FileInputStream(args[0]);
        DataInputStream dis = new DataInputStream(fis);
        BufferedReader br = new BufferedReader(new InputStreamReader(dis));

        // prepare parser, tokenizer and tree printer:
        LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
        TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
        TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

        String sentence; // initialization
        // for each line of the file,
        // retrieve it as a string called 'sentence':
        while ((sentence = br.readLine()) != null) {
            // print sentence:
            System.out.println("\n\n\n\nORIGINAL:\n\n" + sentence);
            // put tokens in a list:
            List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
            lp.parse(tokens); // parse the tokens
            Tree t = lp.getBestParse(); // get the best parse tree
            System.out.println("\nPROCESSED:\n\n");
            tp.printTree(t); // print tree
        }
        dis.close(); // close input file
    } catch (Exception e) // catch error if any
    {
        System.err.println("ERROR: " + e.getMessage()); // print error message
    }
    System.out.println("\n\n\nTHE END\n\n\n"); // print THE END
} // end of the main method
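// A sketch of how this demo might be invoked; the class name, jar name and
// input file are assumptions. It expects one sentence per line in the input
// file and englishPCFG.ser.gz in the working directory.
//
//   java -cp stanford-parser.jar:. ParserDemo sentences.txt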
/**
 * parse sentences and generate a .trees file
 *
 * @param en path to the English input file
 * @param align path to the alignment file, or "no_align" to disable alignments
 * @param out path of the output .trees file
 */
public static void parse(String en, String align, String out, boolean verbose) {

    // use alignments?
    boolean use_alignments = true;
    if (align.startsWith("no_align")) {
        use_alignments = false;
        System.err.println("Not using alignments.");
    } else {
        System.err.println("Using alignments from " + align);
    }

    // set up the Stanford parser
    String grammar = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String[] options = {"-outputFormat", "wordsAndTags, typedDependencies"};
    LexicalizedParser lp = LexicalizedParser.loadModel(grammar, options);
    TreebankLanguagePack tlp = lp.getOp().langpack();
    java.util.function.Predicate<java.lang.String> punctuationFilter = x -> true;
    GrammaticalStructureFactory gsf =
        new edu.stanford.nlp.trees.EnglishGrammaticalStructureFactory(punctuationFilter);

    // read document
    Iterable<List<? extends HasWord>> sentences;
    Reader r = new Reader(en);
    String line = null;
    List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>();
    while ((line = r.getNext()) != null) {
        Tokenizer<? extends HasWord> token =
            tlp.getTokenizerFactory().getTokenizer(new StringReader(line));
        List<? extends HasWord> sentence = token.tokenize();
        tmp.add(sentence);
    }
    sentences = tmp;

    // set up alignment file reader
    Reader alignment = new Reader();
    if (use_alignments) {
        alignment = new Reader(align);
    }

    // set up tree file writer
    Writer treeWriter = new Writer(out);

    // parse
    long start = System.currentTimeMillis();
    // System.err.print("Parsing sentences ");
    int sentID = 0;
    for (List<? extends HasWord> sentence : sentences) {
        Tree t = new Tree();
        // t.setSentID(++sentID);
        System.err.println("parse Sentence :" + sentence + "...");
        // System.err.print(".");
        System.err.println("-----------------------------------------------------------------------");
        edu.stanford.nlp.trees.Tree parse = lp.parse(sentence);
        // parse.pennPrint();

        // lists for the root node and lexical nodes
        List<Node> loneNodes = new LinkedList<Node>();
        List<Node> governingNodes = new LinkedList<Node>();

        // ROOT node
        Node root = new Node(true, true);
        root.setTag("ROOT");
        t.setRoot(root);
        loneNodes.add(root);
        governingNodes.add(root);

        // tagging: one lexical node and one governing node per tagged word
        int counter = 0;
        String surface = "";
        String tag = "";
        for (TaggedWord tw : parse.taggedYield()) {
            Node n = new Node();
            Node governingNode = new Node();
            n.setNodeID(++counter);
            surface = tw.value();
            tag = tw.tag();
            if (surface.startsWith("-LRB-")) {
                surface = "(";
            } else if (surface.startsWith("-RRB-")) {
                surface = ")";
            // } else if (surface.startsWith("-LSB-")) {
            //     surface = "[";
            // } else if (surface.startsWith("-RSB-")) {
            //     surface = "]";
            // } else if (surface.startsWith("-LCB-")) {
            //     surface = "{";
            // } else if (surface.startsWith("-RCB-")) {
            //     surface = "}";
            } else if (surface.startsWith("''")) {
                surface = "\"";
            }
            tag = tag.replaceAll("#", "-NUM-");
            surface = surface.replaceAll("&", "-AMP-");
            surface = surface.replaceAll("#", "-NUM-");
            surface = surface.replaceAll(">", "-GRE-");
            surface = surface.replaceAll("=", "-EQU-");
            n.setInitialLexicalIndex(counter);
            governingNode.setInitialLexicalIndex(counter);
            n.setSurface(surface);
            // System.out.print("("+tw.value()+" : ");
            n.setTag(tag);
            governingNode.setTag("_" + tag);
            governingNode.setLabel("_gov");
            // System.out.print(tw.tag()+")");
            loneNodes.add(n);
            governingNodes.add(governingNode);
            governingNode.setChild(n);
        }
        // System.out.println("");
        // t.setSentLength(t.getNodes().size() - 1);
        // List<Node> loneNodes = new LinkedList<Node>();
        Node[] nodes = new Node[2000];

        // labeling: attach dependency relation labels to the nodes
        int depIndex;
        int govIndex;
        String[] depInfo;
        String[] govInfo;
        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        List<TypedDependency> tdl = gs.typedDependencies(false);
        // List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
        for (TypedDependency td : tdl) {
            depIndex = td.dep().index();
            govIndex = td.gov().index();
            // System.out.println("Index1:"+depIndex);
            // System.out.println("Index2:"+govIndex);
            // if (nodes[depIndex] == null){
            //     System.out.println("Making node!");
            //     nodes[depIndex] = new Node();
            // }
            // if (nodes[govIndex] == null){
            //     System.out.println("Making node!");
            //     nodes[govIndex] = new Node();
            // }
            Node dep = loneNodes.get(depIndex);
            Node gov = governingNodes.get(govIndex);
            Node depcopy = governingNodes.get(depIndex);
            Node govcopy = loneNodes.get(govIndex);
            dep.setLabel(td.reln().toString());
            depcopy.setLabel(td.reln().toString());
            govcopy.setLabel("head");
            // System.out.println(td.toString());
            govInfo = td.gov().toString().split("/");
            depInfo = td.dep().toString().split("/");
            // System.out.println(td.gov().toString());
            // System.out.println(td.dep().toString());
            // dep.setSurface(depInfo[0]);
            // dep.setTag(depInfo[1]);
            gov.setChild(governingNodes.get(depIndex));
            governingNodes.get(depIndex).setParent(gov);
            // gov.setChild(dep);
            dep.setParent(governingNodes.get(depIndex));
        }
        // t.setRoot(nodes[0]);

        // collapse tree to remove unneeded governing nodes:
        Node gov;
        Node dep;
        Node parent;
        List<Node> children;
        for (int i = 1; i < governingNodes.size(); i++) { // start at index 1 to skip root
            gov = governingNodes.get(i);
            dep = loneNodes.get(i);
            if (gov.getChildren().size() <= 1) {
                int k = 0;
                parent = gov.getParent();
                children = parent.getChildren();
                for (Node n : children) {
                    if (n == gov) {
                        gov.getParent().replaceChild(k, dep);
                        dep.setParent(gov.getParent());
                    }
                    k++;
                }
            }
        }

        // mark head nodes with the appropriate label:
        int k = 0;
        for (Node n : loneNodes) {
            if (k != 0) {
                // compare labels by value, not by reference
                if (n.getLabel() != null && n.getLabel().equals(n.getParent().getLabel())) {
                    n.setLabel("head");
                }
            } else {
                n.setLabel("null");
            }
            k++;
        }

        // sort the lexical children of each governing node in lexical order
        for (Node n : governingNodes) {
            n.sortChildrenByInitialIndex();
        }

        // combine with alignment
        if (use_alignments) {
            t.initialize(alignment.readNextAlign());
        } else {
            t.initializeUnaligned();
        }

        // write tree to file
        treeWriter.write(t);

        // print tree to console
        System.out.println(t.toSentence());
        if (verbose) {
            System.err.println(t.toString());
            // t.recursivePrint();
        }
        System.err.println("#######################################################################");
    }
    long stop = System.currentTimeMillis();
    System.err.println("...done! [" + (stop - start) / 1000 + " sec].");
    treeWriter.close();
}
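// A usage sketch for parse(); the file names are assumptions, and Reader,
// Writer, Node and Tree here are project-local classes, not Stanford NLP ones.
// Passing "no_align" as the alignment argument disables alignment handling.
public static void main(String[] args) {
    parse("corpus.en", "no_align", "corpus.trees", /* verbose = */ false);
}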
public static void main(String args[]) throws IOException {
    long startTime = System.currentTimeMillis();
    LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz");
    TokenizerFactory tf = PTBTokenizer.factory(false, new WordTokenFactory());
    TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

    String sentence = "Where did the first President die ?";
    System.out.println("Enter the question or press enter for default : ");
    String tempInput;
    BufferedReader b1 = new BufferedReader(new InputStreamReader(System.in));
    tempInput = b1.readLine();
    if (tempInput.length() == 0)
        System.out.println("The question is the default one : " + sentence);
    else {
        sentence = tempInput;
        System.out.println("The question entered is : " + sentence);
    }

    String sentence1 = PreProcess.removeStopWords1(sentence);
    System.out.println(sentence1);
    StringTokenizer st1 = new StringTokenizer(sentence1, " ");
    int n = 0;
    while (st1.hasMoreTokens()) {
        String temp1 = st1.nextToken();
        // System.out.println("temp replace all is "+temp1.replaceAll("'s","").replaceAll("[^A-Za-z]",""));
        map.put(n, temp1.replaceAll("'s", "").replaceAll("[^A-Za-z]", ""));
        n++;
    }
    // for (int s = 0; s < n; s++)
    //     System.out.println(map.get(s));

    List tokens = tf.getTokenizer(new StringReader(sentence)).tokenize();
    lp.parse(tokens); // parse the tokens
    Tree t = lp.getBestParse(); // get the best parse tree
    tp.printTree(t);
    System.out.println("\nPROCESSED:\n\n");
    // tp.printTree(t); // print tree

    // print the dependencies only
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(t); // dependencies
    // Tree b = t.firstChild();
    // System.out.println("\nFirst child of the tree is :\n\n");
    // tp.printTree(b);
    String dependency = gs.typedDependenciesCollapsed().toString();
    System.out.println("Dependencies :" + dependency);

    // BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    // String wordForm = reader.readLine();
    String wordForm = "yes";
    int i = -1;
    String s[][] = new String[20][3];
    if (wordForm.equals("yes")) {
        StringTokenizer st = new StringTokenizer(dependency, " ([)],");
        while (st.hasMoreTokens()) {
            String as = st.nextToken();
            System.out.println(as);
            if (!as.contains("-")) {
                i++;
                s[i][0] = as;
            } else {
                s[i][1] = as;
                s[i][2] = st.nextToken();
            }
        }
    }
    length = i + 1;
    interchange1(s);
    System.out.println("The sorted version is ");
    // System.out.println("\n\n***********Li8 from here on***********");
    for (i = 0; i < length; i++) {
        for (int j = 0; j < 3; j++) {
            System.out.print(s[i][j] + " ");
        }
        System.out.println();
    }
    // int adjmatrix[][] = new int[length][length];

    System.out.println("What answer type is required: ");
    BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
    String answtype = reader.readLine();
    String[] temp;
    temp = sentence.split(" ", 2);
    int g = 0;
    int h = 0;
    String secque = null;

    // Dijkstra implementation
    int adjmatrix[][] = new int[length][length];
    int j = 0;
    for (i = 0; i < length; i++)
        for (j = 0; j < length; j++)
            adjmatrix[i][j] = 100;
    formadj(adjmatrix, s);
    print(adjmatrix);
    // Dijikstraalgo.dijikstra(adjmatrix, length - 2);
    // Dijikstraalgo.dijikstra(adjmatrix, length - 1);
    if (Dijikstraalgo.dijikstra(adjmatrix, length - 1)
            - Dijikstraalgo.dijikstra(adjmatrix, length - 2) >= 0) {
        System.out.println("Type 1");
        if (makesentence(s, length - 1) == null) {
            secque = s[length - 1][2] + " " + s[length - 1][1];
            System.out.println(answtype + " is " + s[length - 1][2] + " " + s[length - 1][1] + " ?");
        } else {
            secque = makesentence(s, length - 1);
            System.out.println(answtype + " is " + secque + " ?");
        }
    } else {
        System.out.println("Type 2");
        System.out.println("Before entering the makesentence function (the cause of the null pointer exception) "
                + s[length - 2][0] + " " + s[length - 2][1]);
        if (makesentence(s, length - 2) == null) {
            secque = s[length - 2][2] + " " + s[length - 2][1];
            System.out.println(answtype + " is " + s[length - 2][2] + " " + s[length - 2][1] + " ?");
        } else {
            // System.out.println("null");
            secque = makesentence(s, length - 2);
            System.out.println(answtype + " is " + secque + " ?");
        }
    }
    // System.out.println("Secque is " + secque.replaceAll("[^A-Za-z ]", ""));
    System.out.println(sentence.replace(secque.replaceAll("[^A-Za-z ]", ""), ""));
    long endTime = System.currentTimeMillis();
    System.out.println("The time elapsed is : " + (int) (endTime - startTime) / 1000);
    System.out.println("The end");
}
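// Dijikstraalgo, formadj and makesentence are project-local and not shown
// here. A minimal sketch of what a dijikstra(adjmatrix, source) helper could
// look like, under the assumption that it returns the shortest-path distance
// from `source` to vertex 0 and that 100 marks a missing edge (matching the
// adjacency-matrix initialization above):
static int dijikstra(int[][] adj, int source) {
    int n = adj.length;
    int[] dist = new int[n];
    boolean[] done = new boolean[n];
    java.util.Arrays.fill(dist, Integer.MAX_VALUE);
    dist[source] = 0;
    for (int step = 0; step < n; step++) {
        int u = -1;
        for (int v = 0; v < n; v++) // pick the closest unvisited vertex
            if (!done[v] && (u == -1 || dist[v] < dist[u])) u = v;
        if (dist[u] == Integer.MAX_VALUE) break; // remaining vertices unreachable
        done[u] = true;
        for (int v = 0; v < n; v++) // relax all edges out of u
            if (adj[u][v] < 100 && dist[u] + adj[u][v] < dist[v])
                dist[v] = dist[u] + adj[u][v];
    }
    return dist[0];
}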