/** Checks {@code StringDistance.levenshtein} against expected values for fixed string pairs. */
@Test
public void testLevenshtein() {
  // Row i of `pairs` holds the two arguments; `expected[i]` is the value asserted for that row.
  final String[][] pairs = {
    {"GUMBO", "GAMBOL"},
    {"", "GAMBOL"},
    {"Windows", "XP Windows"},
    {"XP Windows", "Windows"},
    {"Windows", "Windows"},
  };
  final int[] expected = {2, 6, 3, 3, 0};

  for (int idx = 0; idx < pairs.length; idx++) {
    Assert.assertEquals(expected[idx], StringDistance.levenshtein(pairs[idx][0], pairs[idx][1]));
  }
}
/** Checks {@code StringDistance.countSimilar} against expected values for fixed string pairs. */
@Test
public void testCountSimilar() {
  // Row i of `pairs` holds the two arguments; `expected[i]` is the value asserted for that row.
  final String[][] pairs = {
    {"GUMBO", "GAMBOL"},
    {"", "GAMBOL"},
    {"Windows", "XP Windows"},
    {"XP Windows", "Windows"},
    {"Windows", "Windows"},
    {"CMD", "CMD"},
  };
  final int[] expected = {3, 0, 7, 7, 7, 3};

  for (int idx = 0; idx < pairs.length; idx++) {
    Assert.assertEquals(expected[idx], StringDistance.countSimilar(pairs[idx][0], pairs[idx][1]));
  }
}
/** * @param object: we want to rerank this * @param contexts: all contexts * @param context_table: related contexts to the object * @return */ public Annotation rank( ArrayList<Annotation> object, ArrayList<ArrayList<Annotation>> contexts, Hashtable<String, ArrayList<String>> context_table) { float max_rank_score = -1; float final_co_occur = 0; float final_is_related = 0; Annotation topRank = new Annotation("", "", 0); if (object.size() > 0) topRank = object.get(0); for (int i = 0; i < object.size(); i++) { Annotation currentAnnotation = object.get(i); String url1 = currentAnnotation.getAnnotation(); String url1_name = Utils.getDBpediaURLName(url1).toLowerCase(); if (url1_name.startsWith(".")) continue; // String url_name_for_sim = Utils.getDBpediaURLName(url1).toLowerCase(); String mention = currentAnnotation.getKeyword().replaceAll("\\(.*\\)", ""); float defaultNameSimilarity = (float) computeNameSimilarity(url1, mention, "defaultLabel"); float urlNameSimilarity = (float) JaccardSimilarity.computeSimilarity(url1_name, mention); float labelNameSimilarity = (float) computeNameSimilarity(url1, mention, "label"); // float nameStringSimilarity = (float)StringDistance.getSim(url1_name, // currentAnnotation.getKeyword()); // System.out.println(url1+" vs "+currentAnnotation.getKeyword()+": "+defaultNameSimilarity+" // "+urlNameSimilarity+" "+urlNameSimilarity); float nameSimilarity = (defaultNameSimilarity + urlNameSimilarity + urlNameSimilarity) / 3; if (defaultNameSimilarity == 1 || urlNameSimilarity == 1 || labelNameSimilarity == 1) { nameSimilarity = 1; } if (nameSimilarity == 0) { nameSimilarity = (float) StringDistance.getSim(url1_name, currentAnnotation.getKeyword()); } System.out.println("processing " + currentAnnotation + " nameSim: " + nameSimilarity); ArrayList<String> related_context = context_table.get(currentAnnotation.getKeyword()); // System.out.println("related: context:"+related_context); float co_occur_score = 0; float is_relate_score = 0; // 
System.out.println("all context: "+contexts); float base = contexts.size() - 1; for (int j = 0; j < contexts.size(); j++) { ArrayList<Annotation> currentContext = contexts.get(j); float max_co_occur = 0; boolean is_relate_to_the_context = false; for (int k = 0; k < currentContext.size(); k++) { if (currentContext.get(k).getKeyword().equals(currentAnnotation.getKeyword())) continue; if (!related_context.contains(currentContext.get(k).getKeyword())) { // System.out.println("not relate: "+currentContext.get(k).getKeyword()); continue; } String url2 = currentContext.get(k).getAnnotation(); // in case both url1 and url2 are used to annotate some entity mentions: biology, cell // (Cell_biology) // if(url1.equals(url2)){ // base--; // continue; // } String url2_name = Utils.getDBpediaURLName(url2).toLowerCase(); float co_occur_count = getRelationCount(url1_name, url2_name); // float co_occur_count = getRelationCount(url1, url2); if (!is_relate_to_the_context) { // is_relate_to_the_context = isURLDirectRelated(url1, url2); // if(!is_relate_to_the_context){ // is_relate_to_the_context = isURLDirectRelated(url2, url1); // } is_relate_to_the_context = isDirectRelated(url1_name, url2_name); } // System.out.println("max: "+max_co_occur+" curr: "+co_occur_count); if (co_occur_count > max_co_occur) { max_co_occur = co_occur_count; // System.out.println("max: "+max_co_occur+" curr: // "+currentContext.get(k).getAnnotation()); } if (max_co_occur == max_co_occur_threshold && is_relate_to_the_context) { break; } } if (max_co_occur > 0) { co_occur_score += (max_co_occur / (double) max_co_occur_threshold); // currentScore += 1+ (max_co_occur/1000 * // Math.log(1+max_co_occur_annotation.getScore())); // System.out.println(); // System.out.println(currentAnnotation.getAnnotation()+" "+max_co_occur+" new score: // "+co_occur_score); } if (is_relate_to_the_context) { is_relate_score++; System.out.println("watching is relate score: " + is_relate_score); } 
System.out.println("-------------"); // System.out.println("max_co_occur: "+currentAnnotation.getAnnotation()+" // "+max_co_occur_annotation.getAnnotation()+" " +currentScore); } // currentScore = (float) (1+Math.log(currentScore)); // System.out.println("processing "+currentAnnotation); // is_relate_score = is_relate_score / contexts.size(); // System.out.println("base: "+contexts.size()+"-1="+base); // System.out.println("co occur score: // "+co_occur_score+"^2/"+base+"^2="+((co_occur_score*co_occur_score) / (base * base))); // System.out.println("is relate score: // "+is_relate_score+"^2/"+base+"^2="+((is_relate_score*is_relate_score) / (base * base))); // System.out.println("name score: "+nameSimilarity); // is_relate_score = (is_relate_score*is_relate_score) / (base * base); // co_occur_score = (co_occur_score*co_occur_score) / (base * base); // nameSimilarity = nameSimilarity * nameSimilarity; float currentScore = 0; if (base == 0) { currentScore = (float) (nameSimilarity * Math.log(1 + currentAnnotation.getScore())); } else { if (is_relate_score > final_is_related) { final_is_related = is_relate_score; } if (co_occur_score > final_co_occur) { final_co_occur = co_occur_score; } is_relate_score = (float) Math.log10( 1 + (is_relate_score / base) * 9); // (is_relate_score/ base) * (is_relate_score/ base); co_occur_score = (float) Math.log10( 1 + (co_occur_score / base) * 9); // (co_occur_score/ base) * (co_occur_score/ base); nameSimilarity = (float) Math.log10(1 + nameSimilarity * 9); // nameSimilarity * nameSimilarity; // is_relate_score = Math.log(arg0) System.out.println("co occur score: " + co_occur_score); System.out.println("is relate score: " + is_relate_score); System.out.println("name score: " + nameSimilarity); currentScore = (float) (((co_occur_score + nameSimilarity + is_relate_score) / 3) * Math.log(1 + currentAnnotation.getScore())); System.out.println("avg: " + ((co_occur_score + nameSimilarity + is_relate_score) / 3)); // currentScore += 
currentAnnotation.getScore(); } System.out.println("processing " + currentAnnotation + " -> new: " + currentScore); if (currentScore > max_rank_score) { max_rank_score = currentScore; topRank = currentAnnotation; System.out.println("top: " + currentAnnotation + " new: " + currentScore); } } topRank.setScore(max_rank_score); return topRank; }