private boolean LexicalAnalyzer(ArrayList<Word> words, int index, String newWord) { String[] sent = toSentence(words); /// lexical analyzer List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); // PrintStream outa = new PrintStream(new FileOutputStream("output1.txt")); // System.setOut(outa); // System.out.println("KKKKKKK"); // parse.pennPrint(); String oldTree = parse.toString(); // String oldTree=baos.toString(); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); // System.out.println(oldTree); words.get(index).setNewValue(newWord); sent = toSentence(words); rawWords = Sentence.toCoreLabelList(sent); parse = lp.apply(rawWords); // PrintStream outb = new PrintStream(new FileOutputStream("output2.txt")); // System.setOut(outb); // parse.pennPrint(); String newTree = parse.toString(); oldTree = oldTree.replaceAll(words.get(index).getOrigValue() + "[)]", newWord + ")"); // System.setOut(new PrintStream(new FileOutputStream(FileDescriptor.out))); System.out.println(oldTree + "\n" + newTree); // System.out.println(oldTree.equals(newTree)); if (oldTree.equals(newTree)) { if (index == 0) { String str = words.get(index).getNewValue(); String cap = str.substring(0, 1).toUpperCase() + str.substring(1); words.get(index).setNewValue(cap); } return true; } else { words.get(index).setNewValue(null); return false; } /* catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; }*/ // return true; }
/**
 * Tokenizes a raw sentence with the PTB tokenizer, parses it with the supplied
 * parser, and returns the POS-tagged yield of the resulting tree.
 *
 * @param sentence raw sentence text
 * @param lp       parser to apply
 * @return tagged words in sentence order
 */
public static ArrayList<TaggedWord> StanfordParse(String sentence, LexicalizedParser lp) {
    TokenizerFactory<CoreLabel> factory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = factory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(tokens);
    return tree.taggedYield();
}
/**
 * Parses the given text and returns the tagged yield of its parse.
 *
 * <p>NOTE: when the text contains multiple sentences, only the LAST sentence's
 * tagged words are returned (each iteration overwrites the previous result).
 *
 * @param text raw text, possibly several sentences
 * @return tagged words of the last parsed sentence, or an empty list for empty input
 */
private ArrayList<TaggedWord> parseSentenceTD(String text) {
    System.out.println("Parsing sentence...");
    ArrayList<TaggedWord> tagged = new ArrayList<TaggedWord>();
    Reader reader = new StringReader(text);
    for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
        tagged = lp.apply(sentence).taggedYield();
    }
    return tagged;
}
public static void main(String args[]) { // String sentence1 = "A large bird standing on a table picks up a plastic glass // containing liquid and places it in a bowl of something."; // String sentence2 = "A bird picks up a plastic cup containing a liquid with it's beak // and puts the cup into a bowl."; // LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); // LeskWSD tm = new LeskWSD(lp); // WordNetSimilarity ws = new WordNetSimilarity(); // // System.out.println(LexicalSimilarityScoreWordNet(sentence1, sentence2, tm, lp, ws)); String sentence = "The broader Standard & Poor's 500 Index <.SPX> shed 2.38 points, or 0.24 percent, at 995.10."; LexicalizedParser lp = new LexicalizedParser("englishPCFG.ser.gz"); Tree parse = lp.apply(sentence); ArrayList<TaggedWord> taggedWords = parse.taggedYield(); taggedWords = Preprocess(taggedWords); for (int i = 0; i < taggedWords.size(); i++) System.out.println(taggedWords.get(i).word()); }
public static double LexicalSimilarity2Level( String sentence1, String sentence2, DISCOSimilarity discoRAM, LexicalizedParser lp) { Tree parse1 = lp.apply(sentence1); Tree parse2 = lp.apply(sentence2); int phraseSizeLimit = 2; ArrayList<ArrayList<TaggedWord>> phrasesList1 = getPhrases(parse1, phraseSizeLimit); ArrayList<ArrayList<TaggedWord>> phrasesList2 = getPhrases(parse2, phraseSizeLimit); int length1 = phrasesList1.size(); int length2 = phrasesList2.size(); int arrSize = Math.max(length1, length2); double[][] array = new double[arrSize][arrSize]; for (int i = 0; i < arrSize; i++) { for (int j = 0; j < arrSize; j++) { array[i][j] = 0; } } for (int i = 0; i < length1; i++) { for (int j = 0; j < length2; j++) { double edgeWeight = 0; ArrayList<TaggedWord> taggedWords1 = phrasesList1.get(i); ArrayList<TaggedWord> taggedWords2 = phrasesList2.get(j); // edgeWeight = LexicalSimilarityScore(taggedWords1, taggedWords2, discoRAM, lp)/5.0; edgeWeight = BestWordMatchEdgeWeight(taggedWords1, taggedWords2, discoRAM); array[i][j] = edgeWeight; } } // System.out.println("Hungarian starts " + arrSize); double finalScore; String sumType = "max"; // int minLength = Math.min(length1, length2); // finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType)/minLength * 5; if (arrSize == 0) finalScore = 0; else finalScore = HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5; return finalScore; }
/**
 * Parses the given text and returns its typed dependencies.
 *
 * <p>NOTE: for multi-sentence input only the LAST sentence's dependencies are
 * returned; returns {@code null} when the language pack does not support
 * grammatical structures or the input yields no sentences.
 *
 * @param text raw text to parse
 * @return typed dependencies of the last sentence, or null
 */
private Collection<TypedDependency> parseSentenceTDL(String text) {
    System.out.println("Parsing sentence...");
    Collection<TypedDependency> dependencies = null;

    TreebankLanguagePack tlp = lp.treebankLanguagePack();
    GrammaticalStructureFactory gsf =
        tlp.supportsGrammaticalStructures() ? tlp.grammaticalStructureFactory() : null;

    Reader reader = new StringReader(text);
    for (List<HasWord> sentence : new DocumentPreprocessor(reader)) {
        Tree parse = lp.apply(sentence);
        if (gsf != null) {
            dependencies = gsf.newGrammaticalStructure(parse).allTypedDependencies();
        }
    }
    return dependencies;
}
private static List<TypedDependency> getDependencies(String sentence) { if (pipeline == null) { loadModels(); } TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sentence)); List<CoreLabel> rawWords2 = tok.tokenize(); Tree parse = lp.apply(rawWords2); // parse.pennPrint(); // // System.out.println(parse.toString()); TreebankLanguagePack tlp = lp.treebankLanguagePack(); // PennTreebankLanguagePack for English GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); return tdl; }
/**
 * Tokenizes the raw sentence and returns the parser's constituency tree for it.
 *
 * @param sentence raw sentence text
 * @return parse tree produced by {@code parser}
 */
private Tree getPosTree(String sentence) {
    final List<CoreLabel> tokens =
        tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    return parser.apply(tokens);
}
/**
 * Tokenizes the input string and parses the token sequence into a tree.
 *
 * @param str raw text to parse
 * @return the parse tree
 */
public static Tree parse(String str) {
    return parser.apply(tokenize(str));
}
public HashMap<String,ArrayList<TreeData>> parseAllDocs() throws IOException{ String grammar = "./jsan_resources/englishPCFG.ser.gz"; String[] options = { "-maxLength", "120", "-retainTmpSubcategories" }; // LexicalizedParser lp = new LexicalizedParser(grammar, options); LexicalizedParser lp = new LexicalizedParser() TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); Iterable<List<? extends HasWord>> sentences; ArrayList<HashMap<String,ArrayList<String>>> everything = new ArrayList<HashMap<String,ArrayList<String>>>(3); everything.add(0,otherSampleStrings); everything.add(1,authorSampleStrings); everything.add(2,toModifyStrings); Iterator<HashMap<String,ArrayList<String>>> everythingIter = everything.iterator(); int docTypeNumber = -1; // 0 for otherSampleStrings, 1 for authorSampleStrings, 2 for toModifyStrings int numLoaded = 0; while(everythingIter.hasNext()){ docTypeNumber++; HashMap<String,ArrayList<String>> currentSampleStrings = docPathFinder(); Set<String> currentDocStrings = currentSampleStrings.keySet(); Iterator<String> docStrIter = currentDocStrings.iterator(); String docID; ArrayList<String> sentenceTokens; allTreeProcessors[docTypeNumber] = new TreeProcessor(); allTreeProcessors[docTypeNumber].clearLoadedTreeDataMaps(); numLoaded=0; while(docStrIter.hasNext()){ docID = docStrIter.next(); sentenceTokens = currentSampleStrings.get(docID); if(sentenceTokens == null){ allTreeProcessors[docTypeNumber].loadTreeDataMap(docID, GRAMMAR_DIR, false); numLoaded++; continue; } //System.out.println(sentenceTokens.size()+", strIter.hasNext? -> "+strIter.hasNext()); numSentences = sentenceTokens.size(); //initialize(numSentences); Iterator<String> sentIter = sentenceTokens.iterator(); List<List<? extends HasWord>> tmp = new ArrayList<List<? extends HasWord>>(); String tempSent; while(sentIter.hasNext()){ tempSent = sentIter.next(); Tokenizer<? 
extends HasWord> toke = tlp.getTokenizerFactory().getTokenizer(new StringReader(tempSent)); List<? extends HasWord> sentenceTokenized = toke.tokenize(); tmp.add(sentenceTokenized); } sentences = tmp; //int numDone = 0; TreeProcessor.singleDocMap.clear(); boolean willSaveResults = true; for (List<? extends HasWord> sentence : sentences) { Tree parse = lp.apply(sentence); //parse.pennPrint(); //System.out.println(parse.treeSkeletonCopy().toString()); //System.out.println(parse.taggedYield()); //System.out.println(); //printSubTrees(parse); //TreeContainer.recurseTree(parse,"breadth"); allTreeProcessors[docTypeNumber].processTree(parse, 0, willSaveResults); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //numDone++; //System.out.println("sent "+numDone+" of "+numSentences+" done "); //System.out.println(tc.processedTrees.toString()); //in.nextLine(); //TreeContainer.recurseTree(parse, "depth"); //in.nextLine(); //addTree(parse); //GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);//TODO: LOOK AT THIS //Collection tdl = gs.typedDependenciesCCprocessed(true); //System.out.println(tdl); //System.out.println(); } if(willSaveResults == true) ObjectIO.writeObject(TreeProcessor.singleDocMap,docID, GRAMMAR_DIR); //System.out.println("After all sents: "); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //String sent3 = "This is one last test!"; //Tree parse3 = lp.apply(sent3); //parse3.pennPrint(); //System.out.println("After sorting and writing:"); //System.out.println(tc.processedTrees.toString().replaceAll("\\]\\], \\(","\\]\\]\n\\(")); //Scanner in = new Scanner(System.in); //System.out.println("First one done."); //in.nextLine(); //viewTrees(); } //TreeProcessor.writeTreeDataToCSV(sortedTD,docID); allTreeProcessors[docTypeNumber].unmergedMaps = new ArrayList<HashMap<String,TreeData>>(numLoaded+1); } int i= 0; allParsedAndOrdered.clear(); String[] docTypes = new 
String[]{"otherSample","authorSample","toModify"}; for(i=0; i < 3; i++){ allTreeProcessors[i].unmergedMaps.add(allTreeProcessors[i].processedTrees); allTreeProcessors[i].unmergedMaps.addAll(allTreeProcessors[i].loadedTreeDataMaps); allTreeProcessors[i].mergeTreeDataLists(allTreeProcessors[i].unmergedMaps); allParsedAndOrdered.put(docTypes[i],allTreeProcessors[i].sortTreeData(allTreeProcessors[i].mergedMap)); } //ArrayList<TreeData> sortedTD = TreeContainer.sortTreeData(TreeContainer.allProcessedTrees); //TreeContainer.writeTreeDataToCSV(sortedTD,"ALL_AUTHORS"); return allParsedAndOrdered; }
public ArrayList<String> getKeyWordsDependency(String sentence, String keyword) { LexicalizedParser lp = LexicalizedParser.loadModel( "/home/mingrui/Desktop/englishPCFG.ser.gz", "-maxLength", "80", "-retainTmpSubcategories"); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); // Uncomment the following line to obtain original Stanford Dependencies // tlp.setGenerateOriginalDependencies(true); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); String[] array = sentence.split("\\s+"); Tree parse = lp.apply(Sentence.toWordList(array)); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); ArrayList<String> keywordsDependency = new ArrayList<String>(); ArrayList<String> keywordsDependencyWithLemmatization = new ArrayList<String>(); // String lemmatizedKeyword = lemmatize(keyword); for (TypedDependency t : tdl) { String d = t.toString(); String dependencyType = d.substring(0, d.indexOf("(")); String pair = d.substring(d.indexOf("(") + 1, d.indexOf("(")); String[] terms = pair.split(","); String term1 = terms[0].trim(); String term2 = terms[1].trim(); // Match keywords with the terms in the tuples, if matched, add the // tuple into the arraylist String[] wordsplitted = keyword.split(" "); for (String key : wordsplitted) { if (term1.equals(key)) { keywordsDependency.add(t.toString()); } if (term2.equals(key)) { keywordsDependency.add(t.toString()); } } } String lemmatizedKeywords = lemmatize(keyword); int lbefore = keyword.split(" ").length; int lafter = lemmatizedKeywords.split(" ").length; if (lbefore == lafter) { return keywordsDependency; } else { String[] split = keyword.split(" "); for (String s : split) { String[] lemmas = lemmatize(s).split(" "); boolean sameLength = lemmas.length == s.split(" ").length; if (sameLength) { // Compare the length of one key_word or key_phrase before and after // lemmatization continue; } else { for (String tuple : 
keywordsDependency) { if (getTupleTerms(tuple)[0].equals( s)) { // Find the tuple that contains the original keyword/key_phrase String dependent = getTupleTerms(tuple)[1]; // String[] } } // for(String l : lemma) } } return keywordsDependencyWithLemmatization; } }