예제 #1
0
  /**
   * Gets the similarity of the two strings using JaccardSimilarity.
   *
   * <p>Both strings are tokenised, duplicate tokens are discarded, and the result is
   * {@code |A & B| / |A or B|}, where A and B are the sets of distinct tokens of each string.
   *
   * <p>If neither string yields any token the union is empty and the ratio would be 0/0; the two
   * (empty) token sets are identical, so 1 is returned rather than NaN.
   *
   * @param string1 the first string to compare
   * @param string2 the second string to compare
   * @return a value between 0-1 of the similarity: 0 when no token is shared, 1 when the token
   *     sets are identical
   */
  public float getSimilarity(final String string1, final String string2) {
    /*
     * Seen as binary term vectors X and Y, the Jaccard (Tanimoto) coefficient is
     *
     * (X*Y) / (|X|^2 + |Y|^2 - (X*Y))
     *
     * where (X*Y) is the inner product of X and Y, and |X| = (X*X)^1/2,
     * i.e. the Euclidean norm of X.
     *
     * For token sets this is simply ( |X & Y| ) / ( | X or Y | )
     */
    final Set<String> firstTokens = new HashSet<String>(tokeniser.tokenizeToArrayList(string1));
    final Set<String> secondTokens = new HashSet<String>(tokeniser.tokenizeToArrayList(string2));

    final Set<String> allTokens = new HashSet<String>(firstTokens);
    allTokens.addAll(secondTokens);

    // No tokens on either side: avoid 0/0 (NaN) and treat the identical empty sets as a full match.
    if (allTokens.isEmpty()) {
      return 1.0f;
    }

    // Inclusion-exclusion: |A & B| = |A| + |B| - |A or B|
    final int commonTerms = (firstTokens.size() + secondTokens.size()) - allTokens.size();

    // return JaccardSimilarity
    return (float) commonTerms / (float) allTokens.size();
  }
 /**
  * Estimates, in milliseconds, how long a similarity comparison of the two strings takes.
  *
  * <p>The estimate is the product of the two strings' token counts multiplied by
  * {@code ESTIMATEDTIMINGCONST}.
  *
  * @param string1 string 1
  * @param string2 string 2
  * @return the estimated time in milliseconds taken to perform the similarity measure
  */
 public float getSimilarityTimingEstimated(final String string1, final String string2) {
   // Measured times in milliseconds, for string lengths starting at 1 and growing by 50 at each
   // increment:
   // 0	0.01	0.03	0.05	0.07	0.11	0.14	0.18	0.23	0.27	0.33	0.38	0.46	0.51	0.59	0.67	0.75	0.86	0.94
   //	1.01	1.15	1.22	1.5	1.45	1.93	1.7	2.28	1.95	2.42	2.21	2.99	2.54	3.34	2.86	3.76	3.17	4.06	3.5
   //	4.32	3.9	5.23	4.32	5.34	4.83	6.15	5.07	6.34	5.64	7.29	5.97	8.12	6.55	8.46	7	8.83	7.52	9.71
   //	8.12	10.68	8.46
   final float firstTokenCount = tokeniser.tokenizeToArrayList(string1).size();
   final float secondTokenCount = tokeniser.tokenizeToArrayList(string2).size();

   // Cost is modelled as proportional to the number of token pairs.
   final float tokenPairs = firstTokenCount * secondTokenCount;
   return tokenPairs * ESTIMATEDTIMINGCONST;
 }
예제 #3
0
 /**
  * Estimates, in milliseconds, how long a similarity comparison of the two strings takes.
  *
  * <p>The token counts of both strings are multiplied together and the product is scaled by
  * {@code ESTIMATEDTIMINGCONST}.
  *
  * @param string1 string 1
  * @param string2 string 2
  * @return the estimated time in milliseconds taken to perform the similarity measure
  */
 public float getSimilarityTimingEstimated(final String string1, final String string2) {
   // Measured times in milliseconds, for string lengths starting at 1 and growing by 50 at each
   // increment:
   // 0 0.02 0.03 0.05 0.07 0.11 0.14 0.18 0.23 0.27 0.34 0.38 0.45 0.51
   // 0.59 0.67 0.75 0.83 0.94 1 1.15 1.22 1.49 1.46 1.93 1.69 2.11 1.95
   // 2.42 2.21 2.87 2.51 3.27 2.86 3.69 3.22 3.9 3.5 4.74 3.9 4.95 4.23
   // 5.49 4.72 5.8 5.21 6.38 5.64 7.25 5.97 7.81 6.55 8.46 7 9.27 7.52
   // 10.15 8.12 10.15 8.46
   final float tokensInFirst = tokeniser.tokenizeToArrayList(string1).size();
   final float tokensInSecond = tokeniser.tokenizeToArrayList(string2).size();

   // Cost is modelled as proportional to the number of token pairs.
   final float pairCount = tokensInFirst * tokensInSecond;
   return pairCount * ESTIMATEDTIMINGCONST;
 }
  /**
   * Gets the similarity of the two strings using OverlapCoefficient.
   *
   * <p>overlap_coefficient(q,r) = ( | q & r | ) / min{ | q | , | r | }, where q and r are the sets
   * of distinct tokens of each string.
   *
   * <p>When a string yields no tokens the denominator is zero and the ratio would be 0/0. In that
   * case 1 is returned if both token sets are empty (they are identical) and 0 if only one is
   * (nothing can be shared), so the result is never NaN.
   *
   * @param string1 the first string to compare
   * @param string2 the second string to compare
   * @return a value between 0-1 of the similarity
   */
  public float getSimilarity(final String string1, final String string2) {
    final Set<String> firstTokens = new HashSet<String>(tokeniser.tokenizeToArrayList(string1));
    final Set<String> secondTokens = new HashSet<String>(tokeniser.tokenizeToArrayList(string2));

    // Guard the zero denominator before dividing.
    if (firstTokens.isEmpty() || secondTokens.isEmpty()) {
      return (firstTokens.isEmpty() && secondTokens.isEmpty()) ? 1.0f : 0.0f;
    }

    final int termsInString1 = firstTokens.size();
    final int termsInString2 = secondTokens.size();

    // now combine the sets
    final Set<String> allTokens = new HashSet<String>(firstTokens);
    allTokens.addAll(secondTokens);

    // Inclusion-exclusion: |q & r| = |q| + |r| - |q or r|
    final int commonTerms = (termsInString1 + termsInString2) - allTokens.size();

    // return overlap_coefficient
    return (float) commonTerms / (float) Math.min(termsInString1, termsInString2);
  }