コード例 #1
0
  /**
   * Returns the best Jaccard similarity between a mention and the labels stored for a URL.
   *
   * <p>Labels are fetched with {@code ReadIndex.readIndexByTerm(labelSearcher, "url", url,
   * field)}. Labels matching {@code .*u\d+.*\s+.*} are skipped (presumably labels carrying
   * leftover unicode escapes such as "u00e9" - TODO confirm). For every other label only the text
   * before the first {@code '|'} is compared with the mention.
   *
   * @param url value looked up in the label index
   * @param mention text the labels are compared against
   * @param field name of the index field whose values are used as labels (callers pass e.g.
   *     "defaultLabel" or "label")
   * @return the highest similarity over all usable labels; 0 when there are none or when the
   *     lookup or comparison throws
   */
  public float computeNameSimilarity(String url, String mention, String field) {
    try {
      ArrayList<String> defaultLabels = ReadIndex.readIndexByTerm(labelSearcher, "url", url, field);
      if (defaultLabels == null) {
        return 0;
      }

      // Compiled once per call; String.matches would recompile the regex for every label.
      java.util.regex.Pattern skippedLabel = java.util.regex.Pattern.compile(".*u\\d+.*\\s+.*");

      float maxScore = 0;
      for (String label : defaultLabels) {
        if (skippedLabel.matcher(label).matches()) {
          continue;
        }

        // Text before the first '|'. Unlike split("\\|")[0], this does not throw for a label
        // consisting only of '|' characters (split drops trailing empty strings and returns an
        // empty array), which used to abort the loop and discard the best score found so far.
        int separator = label.indexOf('|');
        String defaultLabel = separator < 0 ? label : label.substring(0, separator);

        float score = (float) JaccardSimilarity.computeSimilarity(defaultLabel, mention);
        if (score > maxScore) {
          maxScore = score;
        }
      }

      return maxScore;
    } catch (Exception e) {
      // Best effort: a failing lookup is reported and treated as "no similarity".
      e.printStackTrace();
    }
    return 0;
  }
コード例 #2
0
  /**
   * Re-ranks the candidate annotations of one mention and returns the best one.
   *
   * <p>Each candidate is scored from three components:
   *
   * <ul>
   *   <li>name similarity between the mention keyword and the candidate's default label, URL name
   *       and label;
   *   <li>a co-occurrence score: for every context, the largest {@code getRelationCount} against
   *       a related annotation of that context, divided by {@code max_co_occur_threshold};
   *   <li>a relatedness score: the number of contexts holding at least one related annotation for
   *       which {@code isDirectRelated} is true.
   * </ul>
   *
   * <p>When {@code contexts.size() - 1} is 0 only the name similarity is used. Otherwise the three
   * components are each mapped through {@code log10(1 + 9x)} and averaged. In both cases the result
   * is multiplied by {@code log(1 + candidate score)}. Candidates whose lower-cased DBpedia URL
   * name starts with "." are skipped. Progress is printed to standard output.
   *
   * @param object candidate annotations to re-rank
   * @param contexts all contexts, each a list of annotations
   * @param context_table maps a mention keyword to the keywords considered related to it; a missing
   *     entry is treated as "no related keywords"
   * @return the highest-scoring candidate, with its score overwritten by the rank score. If {@code
   *     object} is empty a placeholder {@code Annotation("", "", 0)} is returned; if no candidate
   *     was scored the first candidate is returned. In those two cases the score is set to -1.
   */
  public Annotation rank(
      ArrayList<Annotation> object,
      ArrayList<ArrayList<Annotation>> contexts,
      Hashtable<String, ArrayList<String>> context_table) {
    float max_rank_score = -1;

    // Fallback result: a placeholder, or the first candidate when there is one.
    Annotation topRank = new Annotation("", "", 0);
    if (object.size() > 0) topRank = object.get(0);

    // Normaliser for the per-context scores (presumably "all contexts except the mention's own"
    // - TODO confirm). It does not depend on the candidate, so it is computed once.
    float base = contexts.size() - 1;

    for (Annotation currentAnnotation : object) {
      String url1 = currentAnnotation.getAnnotation();
      String url1_name = Utils.getDBpediaURLName(url1).toLowerCase();
      if (url1_name.startsWith(".")) continue;

      String keyword = currentAnnotation.getKeyword();
      // Drop any parenthesised part of the keyword before comparing names.
      String mention = keyword.replaceAll("\\(.*\\)", "");

      float defaultNameSimilarity = computeNameSimilarity(url1, mention, "defaultLabel");
      float urlNameSimilarity = (float) JaccardSimilarity.computeSimilarity(url1_name, mention);
      float labelNameSimilarity = computeNameSimilarity(url1, mention, "label");

      // Average of the three distinct measures. The previous code added urlNameSimilarity twice
      // and never used labelNameSimilarity here.
      float nameSimilarity =
          (defaultNameSimilarity + urlNameSimilarity + labelNameSimilarity) / 3;
      // An exact match on any single measure counts as a perfect name match.
      if (defaultNameSimilarity == 1 || urlNameSimilarity == 1 || labelNameSimilarity == 1) {
        nameSimilarity = 1;
      }
      // No token overlap at all: fall back to the string-distance similarity.
      if (nameSimilarity == 0) {
        nameSimilarity = (float) StringDistance.getSim(url1_name, keyword);
      }

      System.out.println("processing " + currentAnnotation + " nameSim: " + nameSimilarity);

      // May be null when the table has no entry for this keyword.
      ArrayList<String> related_context = context_table.get(keyword);

      float co_occur_score = 0;
      float is_relate_score = 0;

      for (ArrayList<Annotation> currentContext : contexts) {
        float max_co_occur = 0;
        boolean is_relate_to_the_context = false;

        for (Annotation contextAnnotation : currentContext) {
          String contextKeyword = contextAnnotation.getKeyword();
          // Skip annotations of the mention itself.
          if (contextKeyword.equals(keyword)) continue;
          // Only annotations whose keyword is listed as related contribute.
          if (related_context == null || !related_context.contains(contextKeyword)) continue;

          String url2 = contextAnnotation.getAnnotation();
          String url2_name = Utils.getDBpediaURLName(url2).toLowerCase();

          float co_occur_count = getRelationCount(url1_name, url2_name);
          // Once one direct relation is found in this context there is no need to look again.
          if (!is_relate_to_the_context) {
            is_relate_to_the_context = isDirectRelated(url1_name, url2_name);
          }

          if (co_occur_count > max_co_occur) {
            max_co_occur = co_occur_count;
          }
          // Both per-context signals are settled; the rest of the context cannot change them.
          if (max_co_occur == max_co_occur_threshold && is_relate_to_the_context) {
            break;
          }
        }

        if (max_co_occur > 0) {
          co_occur_score += (max_co_occur / (double) max_co_occur_threshold);
        }
        if (is_relate_to_the_context) {
          is_relate_score++;

          System.out.println("watching is relate score: " + is_relate_score);
        }
        System.out.println("-------------");
      }

      float currentScore = 0;
      if (base == 0) {
        // No normaliser available: rank on name similarity alone.
        currentScore = (float) (nameSimilarity * Math.log(1 + currentAnnotation.getScore()));
      } else {
        // log10(1 + 9x) maps 0 to 0 and 1 to 1.
        is_relate_score = (float) Math.log10(1 + (is_relate_score / base) * 9);
        co_occur_score = (float) Math.log10(1 + (co_occur_score / base) * 9);
        nameSimilarity = (float) Math.log10(1 + nameSimilarity * 9);

        System.out.println("co occur score: " + co_occur_score);
        System.out.println("is relate score: " + is_relate_score);
        System.out.println("name score: " + nameSimilarity);

        currentScore =
            (float)
                (((co_occur_score + nameSimilarity + is_relate_score) / 3)
                    * Math.log(1 + currentAnnotation.getScore()));
        System.out.println("avg: " + ((co_occur_score + nameSimilarity + is_relate_score) / 3));
      }

      System.out.println("processing " + currentAnnotation + " -> new: " + currentScore);
      if (currentScore > max_rank_score) {
        max_rank_score = currentScore;
        topRank = currentAnnotation;
        System.out.println("top: " + currentAnnotation + " new: " + currentScore);
      }
    }

    // The winner carries the rank score from here on (this mutates the returned annotation).
    topRank.setScore(max_rank_score);
    return topRank;
  }