Пример #1
0
  /**
   * Checks whether the two entries represent the same publication.
   *
   * <p>Entries of different types are never duplicates. Otherwise the required fields of the
   * entry type are compared first. If the resulting score lies further than {@code doubtRange}
   * away from {@code duplicateThreshold}, that score alone decides. Only when the score falls
   * inside the doubt range are the optional fields compared as well; the decision is then based
   * on the weighted mean of both scores, with the required-field part additionally scaled by
   * {@code reqWeight}.
   *
   * @param one the first entry to compare
   * @param two the second entry to compare
   * @return {@code true} if the entries are considered duplicates, {@code false} otherwise
   */
  public static boolean isDuplicate(BibtexEntry one, BibtexEntry two) {
    // Same entry type is a necessary condition. The types are compared by reference.
    // NOTE(review): presumably entry types are shared instances - confirm.
    if (one.getType() != two.getType()) {
      return false;
    }

    // toArray(T[]) never returns null, so the field arrays need no null check.
    String[] requiredFields = one.getType().getRequiredFields().toArray(new String[0]);
    double[] req = DuplicateCheck.compareFieldSet(requiredFields, one, two);

    if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.doubtRange) {
      // Far from the threshold value, so the required fields alone decide.
      return req[0] >= DuplicateCheck.duplicateThreshold;
    }

    // Close to the threshold value, so the optional fields are taken into account too.
    String[] optionalFields = one.getType().getOptionalFields().toArray(new String[0]);
    double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two);

    // Index 0 holds the score of a field set, index 1 the total weight behind that score.
    // If neither field set carried any weight the quotient is 0/0 = NaN, and the comparison
    // below is then false (not a duplicate).
    double totValue =
        ((DuplicateCheck.reqWeight * req[0] * req[1]) + (opt[0] * opt[1]))
            / ((req[1] * DuplicateCheck.reqWeight) + opt[1]);
    return totValue >= DuplicateCheck.duplicateThreshold;
  }
Пример #2
0
  /**
   * Computes a correlation score for two strings.
   *
   * <p>Trivial cases are decided directly: with truncation enabled and a shortest length of one,
   * only the first characters are compared; two one-character strings are compared for equality;
   * and if at least one string is empty the score is 1.0 only when both are empty. All other
   * inputs are converted to number arrays, brought to the same length, and passed to
   * {@code DuplicateCheck.corrCoef}.
   *
   * @param s1 the first string
   * @param s2 the second string
   * @param truncate if true, the longer string is cut to the length of the shorter one; if false,
   *     the shorter number array is stretched to the length of the longer one
   * @return the correlation score of the two strings
   */
  private static double correlateStrings(String s1, String s2, boolean truncate) {
    final int shorter = Math.min(s1.length(), s2.length());

    if (truncate && shorter == 1) {
      // After truncation only the first character of each string would remain.
      if (s1.charAt(0) == s2.charAt(0)) {
        return 1.0;
      }
      return 0.0;
    }
    if (s1.length() == 1 && s2.length() == 1) {
      return s1.equals(s2) ? 1.0 : 0.0;
    }
    if (shorter == 0) {
      // At least one string is empty; they only match if both are.
      boolean bothEmpty = s1.isEmpty() && s2.isEmpty();
      return bothEmpty ? 1.0 : 0.0;
    }

    double[] first;
    double[] second;
    if (truncate) {
      // Harmonize length by truncation before the conversion to numbers.
      first = DuplicateCheck.numberizeString(s1.substring(0, shorter));
      second = DuplicateCheck.numberizeString(s2.substring(0, shorter));
    } else {
      // Harmonize length by stretching the shorter number array after the conversion.
      first = DuplicateCheck.numberizeString(s1);
      second = DuplicateCheck.numberizeString(s2);
      if (first.length < second.length) {
        first = DuplicateCheck.stretchArray(first, second.length);
      } else if (second.length < first.length) {
        second = DuplicateCheck.stretchArray(second, first.length);
      }
    }
    return DuplicateCheck.corrCoef(first, second);
  }
Пример #3
0
  /**
   * Compares the value of a single field in two entries.
   *
   * <p>Name fields ({@code author}, {@code editor}), {@code pages} and {@code journal} get
   * field-specific normalisation before the comparison; every other field is lower-cased and
   * compared word by word. Lower-casing uses {@code Locale.ROOT} so that the outcome does not
   * depend on the default locale of the running JVM.
   *
   * @param field the name of the field to compare
   * @param one the first entry
   * @param two the second entry
   * @return {@code EMPTY_IN_BOTH} if neither entry has the field set, {@code EMPTY_IN_ONE} if only
   *     the first entry lacks it, {@code EMPTY_IN_TWO} if only the second entry lacks it,
   *     otherwise {@code EQUAL} or {@code NOT_EQUAL}
   */
  private static int compareSingleField(String field, BibtexEntry one, BibtexEntry two) {
    String s1 = one.getField(field);
    String s2 = two.getField(field);
    if (s1 == null) {
      return s2 == null ? EMPTY_IN_BOTH : EMPTY_IN_ONE;
    }
    if (s2 == null) {
      return EMPTY_IN_TWO;
    }

    if (field.equals("pages")) {
      // Page ranges come with a variety of delimiters ("-", "--", " - ", " -- "). Every run of
      // dashes and spaces is collapsed to a single "-"; after that exact equality is required.
      String pages1 = s1.replaceAll("[- ]+", "-");
      String pages2 = s2.replaceAll("[- ]+", "-");
      return pages1.equals(pages2) ? EQUAL : NOT_EQUAL;
    }

    double similarity;
    if (field.equals("author") || field.equals("editor")) {
      // Name fields: normalise through AuthorList, drop the " and " separators and harmonise
      // case before comparing word by word.
      String auth1 =
          AuthorList.fixAuthor_lastNameOnlyCommas(s1, false)
              .replaceAll(" and ", " ")
              .toLowerCase(java.util.Locale.ROOT);
      String auth2 =
          AuthorList.fixAuthor_lastNameOnlyCommas(s2, false)
              .replaceAll(" and ", " ")
              .toLowerCase(java.util.Locale.ROOT);
      similarity = DuplicateCheck.correlateByWords(auth1, auth2, false);
    } else if (field.equals("journal")) {
      // No attempt is made to harmonise the abbreviation state of journal names, but periods
      // are removed so that names abbreviated with and without dots compare alike.
      String journal1 = s1.replaceAll("\\.", "").toLowerCase(java.util.Locale.ROOT);
      String journal2 = s2.replaceAll("\\.", "").toLowerCase(java.util.Locale.ROOT);
      similarity = DuplicateCheck.correlateByWords(journal1, journal2, true);
    } else {
      similarity =
          DuplicateCheck.correlateByWords(
              s1.toLowerCase(java.util.Locale.ROOT), s2.toLowerCase(java.util.Locale.ROOT), false);
    }
    // More than 80% of the compared words have to match for the values to count as equal.
    return similarity > 0.8 ? EQUAL : NOT_EQUAL;
  }
Пример #4
0
 /**
  * Searches the given database for an entry that is a duplicate of the given entry, as decided
  * by {@code DuplicateCheck.isDuplicate(BibtexEntry, BibtexEntry)}. The search stops at the
  * first duplicate encountered.
  *
  * @param database The database to search.
  * @param entry The entry for which a duplicate is sought.
  * @return The first duplicate entry found, or null if the database holds no duplicate.
  */
 public static BibtexEntry containsDuplicate(BibtexDatabase database, BibtexEntry entry) {
   BibtexEntry duplicate = null;
   for (BibtexEntry candidate : database.getEntries()) {
     if (DuplicateCheck.isDuplicate(entry, candidate)) {
       // First hit wins; no need to look at the remaining entries.
       duplicate = candidate;
       break;
     }
   }
   return duplicate;
 }
Пример #5
0
 /**
  * Compare two strings on the basis of word-by-word correlation analysis.
  *
  * <p>Both strings are split at every single whitespace character and the words are compared
  * pairwise by position. Only as many pairs as the shorter word list provides are compared;
  * surplus words of the longer list are ignored. A pair counts as a miss when its correlation
  * is below 0.75.
  *
  * <p>If at least one string yields no words at all (for example because it consists of
  * whitespace only), the result is 1.0 when both strings yield no words and 0.0 otherwise.
  *
  * @param s1 The first string
  * @param s2 The second string
  * @param truncate if true, always truncate the longer of two words to be compared to harmonize
  *     their length. If false, use interpolation to harmonize the strings.
  * @return a value in the interval [0, 1] indicating the degree of match.
  */
 static double correlateByWords(String s1, String s2, boolean truncate) {
   String[] w1 = s1.split("\\s");
   String[] w2 = s2.split("\\s");
   int n = Math.min(w1.length, w2.length);
   if (n == 0) {
     // split() drops trailing empty strings, so a whitespace-only input yields an empty array.
     // Without this guard the miss rate would be 0/0 = NaN, outside the documented interval.
     return (w1.length == w2.length) ? 1.0 : 0.0;
   }
   int misses = 0;
   for (int i = 0; i < n; i++) {
     double corr = DuplicateCheck.correlateStrings(w1[i], w2[i], truncate);
     if (corr < 0.75) {
       misses++;
     }
   }
   double missRate = (double) misses / (double) n;
   return 1 - missRate;
 }
Пример #6
0
  /**
   * Creates the change object for a deleted entry, labelled "Deleted entry", and prepares a
   * scrollable preview of the in-memory entry.
   *
   * @param memEntry the entry as held in memory; this is the entry shown in the preview
   * @param tmpEntry the entry as stored in the temporary file
   */
  public EntryDeleteChange(BibtexEntry memEntry, BibtexEntry tmpEntry) {
    super("Deleted entry");
    this.tmpEntry = tmpEntry;
    this.memEntry = memEntry;

    // Compare the deleted entry in memory with the one in the tmpfile. The entry could have
    // been removed in memory.
    double tmpMatchScore = DuplicateCheck.compareEntriesStrictly(memEntry, tmpEntry);

    // The entry counts as modified locally since the last tempfile save unless the score
    // exceeds 1. Neither value is read again within this constructor.
    boolean locallyModified = !(tmpMatchScore > 1);

    sp =
        new JScrollPane(
            new PreviewPanel(
                null,
                memEntry,
                null,
                new MetaData(),
                Globals.prefs.get(JabRefPreferences.PREVIEW_0)));
  }
Пример #7
0
 /**
  * Compares a set of fields between two entries and condenses the outcome into a weighted
  * score.
  *
  * <p>Each field contributes its weight from {@code DuplicateCheck.fieldWeights}, or 1.0 when
  * no weight is registered for it. Fields that are empty in both entries do not count towards
  * the total weight.
  *
  * @param fields the names of the fields to compare
  * @param one the first entry
  * @param two the second entry
  * @return a two-element array: the weight of the equal fields divided by the total weight,
  *     followed by the total weight; {@code {0.5, 0.0}} when the total weight is not positive
  */
 private static double[] compareFieldSet(String[] fields, BibtexEntry one, BibtexEntry two) {
   double matchedWeight = 0;
   double weightSum = 0.;
   for (String name : fields) {
     double weight =
         DuplicateCheck.fieldWeights.containsKey(name)
             ? DuplicateCheck.fieldWeights.get(name)
             : 1.0;
     weightSum += weight;

     int outcome = DuplicateCheck.compareSingleField(name, one, two);
     if (outcome == EQUAL) {
       matchedWeight += weight;
     } else if (outcome == EMPTY_IN_BOTH) {
       // A field missing from both entries carries no information; take its weight back out.
       weightSum -= weight;
     }
   }
   return weightSum > 0
       ? new double[] {matchedWeight / weightSum, weightSum}
       : new double[] {0.5, 0.0};
 }