/**
 * Word-level similarity between two tagged-word sequences. Builds a complete
 * bipartite graph whose edge weights are DISCO word similarities, solves the
 * maximum-weight assignment with the Hungarian algorithm, and normalizes by
 * the shorter sequence, scaling to a 0-5 score.
 */
public static double LexicalSimilarityScoreMin(ArrayList<TaggedWord> taggedWords1,
                                               ArrayList<TaggedWord> taggedWords2,
                                               DISCOSimilarity discoRAM,
                                               LexicalizedParser lp) {
  // Array of edge weights; Java zero-initializes, so unmatched cells stay 0.
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();

      double edgeWeight = 0;
      try {
        // Identical words (case-insensitive) get full weight; otherwise use
        // the DISCO distributional similarity.
        // Alternative: edgeWeight = LSASimilarity.LSAWordSimilarity(word1, word2);
        if (word1.compareToIgnoreCase(word2) == 0) {
          edgeWeight = 1;
        } else {
          edgeWeight = discoRAM.similarity2(word1, word2);
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Guard against empty input, mirroring the arrSize == 0 checks in the
  // sibling methods; without it the division below would yield NaN.
  int minLength = Math.min(length1, length2);
  if (minLength == 0) return 0;

  // Maximum-weight bipartite matching, normalized by the shorter length and
  // scaled to [0, 5].
  // Alternative normalization: HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / minLength * 5;
}
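// Hedged usage sketch for the method above. The TaggedWord values are
// illustrative; discoRAM and lp are assumed to be initialized elsewhere
// (DISCOSimilarity construction is project-specific and not shown in this
// file). With edge weights in [0, 1], the "max" assignment sum is at most
// min(length1, length2), so the returned score lies in [0, 5]. The helper
// name demoScoreMin is hypothetical.
public static void demoScoreMin(DISCOSimilarity discoRAM, LexicalizedParser lp) {
  ArrayList<TaggedWord> a = new ArrayList<TaggedWord>();
  a.add(new TaggedWord("dog", "NN"));
  a.add(new TaggedWord("barks", "VBZ"));
  ArrayList<TaggedWord> b = new ArrayList<TaggedWord>();
  b.add(new TaggedWord("hound", "NN"));
  b.add(new TaggedWord("howls", "VBZ"));
  double score = LexicalSimilarityScoreMin(a, b, discoRAM, lp);
  System.out.println("Word-level score (0-5): " + score);
}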
/**
 * Multiplies out the tallied rule/history counts into scored unary and
 * binary rules, and packages them as a (UnaryGrammar, BinaryGrammar) pair.
 */
public Object formResult() {
  Set brs = new HashSet();
  Set urs = new HashSet();

  // Scan each rule / history pair.
  int ruleCount = 0;
  for (Iterator pairI = rulePairs.keySet().iterator(); pairI.hasNext(); ) {
    if (ruleCount % 100 == 0) {
      System.err.println("Rules multiplied: " + ruleCount);
    }
    ruleCount++;
    Pair rulePair = (Pair) pairI.next();
    Rule baseRule = (Rule) rulePair.first;
    String baseLabel = (String) ruleToLabel.get(baseRule);
    List history = (List) rulePair.second;

    double totalProb = 0;
    for (int depth = 1; depth <= HISTORY_DEPTH() && depth <= history.size(); depth++) {
      List subHistory = history.subList(0, depth);
      double c_label = labelPairs.getCount(new Pair(baseLabel, subHistory));
      double c_rule = rulePairs.getCount(new Pair(baseRule, subHistory));
      // Uniform interpolation over history depths:
      // each depth contributes (1 / HISTORY_DEPTH) * c_rule / c_label.
      double prob = (1.0 / HISTORY_DEPTH()) * (c_rule) / (c_label);
      totalProb += prob;
      for (int childDepth = 0; childDepth <= Math.min(HISTORY_DEPTH() - 1, depth); childDepth++) {
        Rule rule = specifyRule(baseRule, subHistory, childDepth);
        rule.score = (float) Math.log(totalProb);
        if (rule instanceof UnaryRule) {
          urs.add(rule);
        } else {
          brs.add(rule);
        }
      }
    }
  }
  System.out.println("Total states: " + stateNumberer.total());

  BinaryGrammar bg = new BinaryGrammar(stateNumberer.total());
  UnaryGrammar ug = new UnaryGrammar(stateNumberer.total());
  for (Iterator brI = brs.iterator(); brI.hasNext(); ) {
    BinaryRule br = (BinaryRule) brI.next();
    bg.addRule(br);
  }
  for (Iterator urI = urs.iterator(); urI.hasNext(); ) {
    UnaryRule ur = (UnaryRule) urI.next();
    ug.addRule(ur);
  }
  return new Pair(ug, bg);
}
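// Hedged sketch of consuming formResult(). The (UnaryGrammar, BinaryGrammar)
// ordering matches the return statement above; the casts are assumptions
// forced by the Object return type, and the field access mirrors how
// rulePair.first / rulePair.second are used in this class. The helper name
// demoFormResult is hypothetical.
public void demoFormResult() {
  Pair grammars = (Pair) formResult();
  UnaryGrammar ug = (UnaryGrammar) grammars.first;
  BinaryGrammar bg = (BinaryGrammar) grammars.second;
  // ug and bg would then be handed to the parser's grammar tables.
  System.out.println("Grammars built: " + (ug != null) + ", " + (bg != null));
}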
/**
 * Phrase-aware variant. Proper-noun entries (tags starting with "NNP") are
 * compared as a whole; other entries are split on spaces into single words.
 * Each pair of expanded lists is scored with LexicalSimilarityScoreMin (note:
 * those scores are already on the 0-5 scale), and the resulting matrix feeds
 * one more Hungarian assignment, normalized by the longer length.
 */
public static double LexicalSimilarityScoreMax(ArrayList<TaggedWord> taggedWords1,
                                               ArrayList<TaggedWord> taggedWords2,
                                               DISCOSimilarity discoRAM,
                                               LexicalizedParser lp) {
  // Array of edge weights; Java zero-initializes, so unmatched cells stay 0.
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String posTag1 = taggedWords1.get(i).tag();
      String word2 = taggedWords2.get(j).word();
      String posTag2 = taggedWords2.get(j).tag();

      // Keep proper-noun chunks intact; split everything else into tokens.
      ArrayList<TaggedWord> newList1 = new ArrayList<TaggedWord>();
      if (posTag1.length() >= 3 && posTag1.substring(0, 3).equals("NNP")) {
        newList1.add(taggedWords1.get(i));
      } else {
        String[] words = word1.split(" ");
        for (int k = 0; k < words.length; k++) {
          newList1.add(new TaggedWord(words[k], posTag1));
        }
      }

      ArrayList<TaggedWord> newList2 = new ArrayList<TaggedWord>();
      if (posTag2.length() >= 3 && posTag2.substring(0, 3).equals("NNP")) {
        newList2.add(taggedWords2.get(j));
      } else {
        String[] words = word2.split(" ");
        for (int k = 0; k < words.length; k++) {
          newList2.add(new TaggedWord(words[k], posTag2));
        }
      }

      array[i][j] = LexicalSimilarityScoreMin(newList1, newList2, discoRAM, lp);
    }
  }

  // Guard added for consistency with the sibling methods: an empty matrix
  // would otherwise cause a division by zero below.
  if (arrSize == 0) return 0;

  // Alternative normalization (by the shorter length):
  // HungarianAlgorithm.hgAlgorithm(array, sumType) / Math.min(length1, length2) * 5;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
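// Small sketch of the per-cell preprocessing above: proper-noun chunks
// (tags starting with "NNP") are kept whole, anything else is split on
// spaces into one TaggedWord per token. The helper name demoExpand and the
// example values are illustrative only.
public static ArrayList<TaggedWord> demoExpand(TaggedWord entry) {
  ArrayList<TaggedWord> expanded = new ArrayList<TaggedWord>();
  if (entry.tag().startsWith("NNP")) {
    expanded.add(entry); // e.g. ("New York", "NNP") stays a single node
  } else {
    for (String t : entry.word().split(" ")) {
      expanded.add(new TaggedWord(t, entry.tag())); // e.g. "very fast" -> two nodes
    }
  }
  return expanded;
}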
protected void tallyInternalNode(Tree lt, List parents) {
  // form base rule
  String label = lt.label().value();
  Rule baseR = ltToRule(lt);
  ruleToLabel.put(baseR, label);
  // act on each history depth
  for (int depth = 0, maxDepth = Math.min(HISTORY_DEPTH(), parents.size());
       depth <= maxDepth; depth++) {
    List history = new ArrayList(parents.subList(0, depth));
    // tally each history level / rewrite pair
    rulePairs.incrementCount(new Pair(baseR, history), 1);
    labelPairs.incrementCount(new Pair(label, history), 1);
  }
}
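// Hedged sketch of a caller for tallyInternalNode: walk the tree top-down,
// pushing each node's label onto the parent-history list before recursing,
// so parents is ordered nearest-ancestor-first. The traversal helper itself
// is hypothetical (assumes java.util.LinkedList is imported); only
// lt.label().value() and the Tree accessors below are standard Stanford
// Tree API.
private void tallyTree(Tree t, LinkedList<String> parents) {
  if (t.isLeaf() || t.isPreTerminal()) {
    return; // only internal (phrasal) nodes yield rewrite rules
  }
  tallyInternalNode(t, parents);
  parents.addFirst(t.label().value()); // nearest ancestor first
  for (Tree child : t.children()) {
    tallyTree(child, parents);
  }
  parents.removeFirst();
}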
/**
 * Two-level similarity: parses both sentences, chunks them into phrases of
 * at most phraseSizeLimit words, scores every phrase pair by best-word
 * matching, and normalizes the phrase-level assignment to a 0-5 score.
 */
public static double LexicalSimilarity2Level(String sentence1, String sentence2,
                                             DISCOSimilarity discoRAM, LexicalizedParser lp) {
  Tree parse1 = lp.apply(sentence1);
  Tree parse2 = lp.apply(sentence2);

  int phraseSizeLimit = 2;
  ArrayList<ArrayList<TaggedWord>> phrasesList1 = getPhrases(parse1, phraseSizeLimit);
  ArrayList<ArrayList<TaggedWord>> phrasesList2 = getPhrases(parse2, phraseSizeLimit);

  // Array of edge weights; Java zero-initializes, so unmatched cells stay 0.
  int length1 = phrasesList1.size();
  int length2 = phrasesList2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      ArrayList<TaggedWord> taggedWords1 = phrasesList1.get(i);
      ArrayList<TaggedWord> taggedWords2 = phrasesList2.get(j);
      // Alternative: LexicalSimilarityScore(taggedWords1, taggedWords2, discoRAM, lp) / 5.0;
      array[i][j] = BestWordMatchEdgeWeight(taggedWords1, taggedWords2, discoRAM);
    }
  }

  // Empty parses produce an empty matrix; return 0 rather than divide by zero.
  if (arrSize == 0) return 0;

  // Alternative normalization (by the shorter length):
  // HungarianAlgorithm.hgAlgorithm(array, sumType) / Math.min(length1, length2) * 5;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
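// End-to-end usage sketch. LexicalizedParser.loadModel and the englishPCFG
// model path follow the standard Stanford parser API; DISCOSimilarity setup
// is project-specific, so its construction is assumed to happen elsewhere.
// The helper name demoTwoLevel and the sentences are illustrative.
public static void demoTwoLevel(DISCOSimilarity discoRAM) {
  LexicalizedParser lp =
      LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
  double sim = LexicalSimilarity2Level(
      "A man is playing a guitar.", "Someone plays the guitar.", discoRAM, lp);
  System.out.println("Phrase-level similarity (0-5): " + sim);
}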
/**
 * WordNet-based variant: disambiguates each word with Lesk WSD, keeps only
 * nouns (NN*) and verbs (VB*), and scores sense pairs with Lin similarity
 * before the Hungarian assignment.
 */
public static double LexicalSimilarityScoreWordNet(String sentence1, String sentence2,
                                                   LeskWSD tm, LexicalizedParser lp,
                                                   WordNetSimilarity ws) {
  ArrayList<TaggedWord> taggedWordsPrev1 = Preprocess(StanfordParse(sentence1, lp));
  ArrayList<TaggedWord> taggedWordsPrev2 = Preprocess(StanfordParse(sentence2, lp));
  ArrayList<TaggedWord> taggedWords1 = new ArrayList<TaggedWord>();
  ArrayList<TaggedWord> taggedWords2 = new ArrayList<TaggedWord>();

  WordNetSense[] sensesPrev1 = tm.LeskJWI(sentence1);
  WordNetSense[] sensesPrev2 = tm.LeskJWI(sentence2);
  ArrayList<WordNetSense> senses1 = new ArrayList<WordNetSense>();
  ArrayList<WordNetSense> senses2 = new ArrayList<WordNetSense>();

  // Keep only nouns and verbs, collapsing the tag to NN/VB and carrying the
  // disambiguated sense along at the same index.
  for (int i = 0; i < taggedWordsPrev1.size(); i++) {
    String word = taggedWordsPrev1.get(i).word();
    String posTag = taggedWordsPrev1.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords1.add(new TaggedWord(word, "NN"));
      senses1.add(sensesPrev1[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords1.add(new TaggedWord(word, "VB"));
      senses1.add(sensesPrev1[i]);
    }
  }
  for (int i = 0; i < taggedWordsPrev2.size(); i++) {
    String word = taggedWordsPrev2.get(i).word();
    String posTag = taggedWordsPrev2.get(i).tag();
    if (posTag.length() >= 2 && posTag.substring(0, 2).equals("NN")) {
      taggedWords2.add(new TaggedWord(word, "NN"));
      senses2.add(sensesPrev2[i]);
    } else if (posTag.length() >= 2 && posTag.substring(0, 2).equals("VB")) {
      taggedWords2.add(new TaggedWord(word, "VB"));
      senses2.add(sensesPrev2[i]);
    }
  }

  // Array of edge weights; Java zero-initializes, so unmatched cells stay 0.
  int length1 = taggedWords1.size();
  int length2 = taggedWords2.size();
  int arrSize = Math.max(length1, length2);
  double[][] array = new double[arrSize][arrSize];

  for (int i = 0; i < length1; i++) {
    for (int j = 0; j < length2; j++) {
      String word1 = taggedWords1.get(i).word();
      String word2 = taggedWords2.get(j).word();

      double edgeWeight = 0;
      try {
        // Identical words (case-insensitive) get full weight; otherwise use
        // the Lin similarity of the disambiguated senses.
        // Alternative: edgeWeight = ws.wuPalmerSimilarity(senses1.get(i), senses2.get(j));
        if (word1.compareToIgnoreCase(word2) == 0) {
          edgeWeight = 1;
        } else {
          edgeWeight = ws.linSimilarity(senses1.get(i), senses2.get(j));
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }
      array[i][j] = edgeWeight;
    }
  }

  // Empty input produces an empty matrix; return 0 rather than divide by zero.
  if (arrSize == 0) return 0;

  // Alternative normalization (by the shorter length):
  // HungarianAlgorithm.hgAlgorithm(array, sumType) / Math.min(length1, length2) * 5;
  String sumType = "max";
  return HungarianAlgorithm.hgAlgorithm(array, sumType) / arrSize * 5;
}
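// Hedged sketch for the WordNet-based variant. LeskWSD and WordNetSimilarity
// are project classes whose constructors are not visible in this file; the
// ones used below are assumptions for illustration only, as is the helper
// name demoWordNetScore.
public static void demoWordNetScore(LexicalizedParser lp) {
  LeskWSD tm = new LeskWSD(lp);                   // assumed constructor
  WordNetSimilarity ws = new WordNetSimilarity(); // assumed constructor
  double sim = LexicalSimilarityScoreWordNet(
      "A man is playing a guitar.", "Someone plays the guitar.", tm, lp, ws);
  System.out.println("WordNet-based similarity (0-5): " + sim);
}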