/** * Checks if the two entries represent the same publication. * * @param one BibtexEntry * @param two BibtexEntry * @return boolean */ public static boolean isDuplicate(BibtexEntry one, BibtexEntry two) { // First check if they are of the same type - a necessary condition: if (one.getType() != two.getType()) { return false; } // The check if they have the same required fields: String[] fields = one.getType().getRequiredFields().toArray(new String[0]); double[] req; if (fields == null) { req = new double[] {0., 0.}; } else { req = DuplicateCheck.compareFieldSet(fields, one, two); } if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.doubtRange) { // Far from the threshold value, so we base our decision on the req. fields only return req[0] >= DuplicateCheck.duplicateThreshold; } // Close to the threshold value, so we take a look at the optional fields, if any: fields = one.getType().getOptionalFields().toArray(new String[0]); if (fields != null) { double[] opt = DuplicateCheck.compareFieldSet(fields, one, two); double totValue = ((DuplicateCheck.reqWeight * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.reqWeight) + opt[1]); return totValue >= DuplicateCheck.duplicateThreshold; } return req[0] >= DuplicateCheck.duplicateThreshold; }
private static double correlateStrings(String s1, String s2, boolean truncate) { int minLength = Math.min(s1.length(), s2.length()); if (truncate && (minLength == 1)) { return s1.charAt(0) == s2.charAt(0) ? 1.0 : 0.0; } else if ((s1.length() == 1) && (s2.length() == 1)) { return s1.equals(s2) ? 1.0 : 0.0; } else if (minLength == 0) { return s1.isEmpty() && s2.isEmpty() ? 1.0 : 0; } // Convert strings to numbers and harmonize length in a method dependent on truncate: if (truncate) { // Harmonize length by truncation: if (s1.length() > minLength) { s1 = s1.substring(0, minLength); } if (s2.length() > minLength) { s2 = s2.substring(0, minLength); } } double[] n1 = DuplicateCheck.numberizeString(s1); double[] n2 = DuplicateCheck.numberizeString(s2); // If truncation is disabled, harmonize length by interpolation: if (!truncate) { if (n1.length < n2.length) { n1 = DuplicateCheck.stretchArray(n1, n2.length); } else if (n2.length < n1.length) { n2 = DuplicateCheck.stretchArray(n2, n1.length); } } return DuplicateCheck.corrCoef(n1, n2); }
private static int compareSingleField(String field, BibtexEntry one, BibtexEntry two) { String s1 = one.getField(field); String s2 = two.getField(field); if (s1 == null) { if (s2 == null) { return EMPTY_IN_BOTH; } return EMPTY_IN_ONE; } else if (s2 == null) { return EMPTY_IN_TWO; } if (field.equals("author") || field.equals("editor")) { // Specific for name fields. // Harmonise case: String auth1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false).replaceAll(" and ", " ").toLowerCase(); String auth2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false).replaceAll(" and ", " ").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(auth1, auth2, false); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else if (field.equals("pages")) { // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ". // We do a replace to harmonize these to a simple "-": // After this, a simple test for equality should be enough: s1 = s1.replaceAll("[- ]+", "-"); s2 = s2.replaceAll("[- ]+", "-"); if (s1.equals(s2)) { return EQUAL; } return NOT_EQUAL; } else if (field.equals("journal")) { // We do not attempt to harmonize abbreviation state of the journal names, // but we remove periods from the names in case they are abbreviated with // and without dots: s1 = s1.replaceAll("\\.", "").toLowerCase(); s2 = s2.replaceAll("\\.", "").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(s1, s2, true); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else { s1 = s1.toLowerCase(); s2 = s2.toLowerCase(); double similarity = DuplicateCheck.correlateByWords(s1, s2, false); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } }
/** * Goes through all entries in the given database, and if at least one of them is a duplicate of * the given entry, as per Util.isDuplicate(BibtexEntry, BibtexEntry), the duplicate is returned. * The search is terminated when the first duplicate is found. * * @param database The database to search. * @param entry The entry of which we are looking for duplicates. * @return The first duplicate entry found. null if no duplicates are found. */ public static BibtexEntry containsDuplicate(BibtexDatabase database, BibtexEntry entry) { for (BibtexEntry other : database.getEntries()) { if (DuplicateCheck.isDuplicate(entry, other)) { return other; // Duplicate found. } } return null; // No duplicate found. }
/**
 * Compare two strings on the basis of word-by-word correlation analysis.
 *
 * <p>Both strings are split at single whitespace characters and the words are compared
 * pairwise by position. Only as many pairs as the shorter word list provides are compared;
 * surplus words of the longer list are ignored. A pair counts as a miss when its
 * correlation is below 0.75.
 *
 * <p>If at least one string yields no words at all (for example a whitespace-only string),
 * there is nothing to compare: the result is 1.0 when both strings yield no words and
 * 0.0 otherwise.
 *
 * @param s1 The first string
 * @param s2 The second string
 * @param truncate if true, always truncate the longer of two words to be compared to harmonize
 *                 their length. If false, use interpolation to harmonize the strings.
 * @return a value in the interval [0, 1] indicating the degree of match.
 */
static double correlateByWords(String s1, String s2, boolean truncate) {
    String[] w1 = s1.split("\\s");
    String[] w2 = s2.split("\\s");
    int n = Math.min(w1.length, w2.length);

    if (n == 0) {
        // split() discards trailing empty tokens, so whitespace-only input produces an
        // empty array. Without this guard the miss rate would be 0/0 = NaN.
        return (w1.length == w2.length) ? 1.0 : 0.0;
    }

    int misses = 0;
    for (int i = 0; i < n; i++) {
        double corr = DuplicateCheck.correlateStrings(w1[i], w2[i], truncate);
        if (corr < 0.75) {
            misses++;
        }
    }
    double missRate = (double) misses / (double) n;
    return 1 - missRate;
}
public EntryDeleteChange(BibtexEntry memEntry, BibtexEntry tmpEntry) { super("Deleted entry"); this.memEntry = memEntry; this.tmpEntry = tmpEntry; // Compare the deleted entry in memory with the one in the tmpfile. The // entry could have been removed in memory. double matchWithTmp = DuplicateCheck.compareEntriesStrictly(memEntry, tmpEntry); // Check if it has been modified locally, since last tempfile was saved. boolean isModifiedLocally = !(matchWithTmp > 1); // Util.pr("Modified entry: "+memEntry.getCiteKey()+"\n Modified locally: "+isModifiedLocally // +" Modifications agree: "+modificationsAgree); PreviewPanel pp = new PreviewPanel( null, memEntry, null, new MetaData(), Globals.prefs.get(JabRefPreferences.PREVIEW_0)); sp = new JScrollPane(pp); }
/**
 * Compares a set of fields in two entries and condenses the outcome into a weighted score.
 *
 * <p>Each field contributes its weight from {@code fieldWeights} (1.0 if the field has no
 * entry there). Fields that are empty in both entries are taken out of the total again;
 * fields that compare as {@code EQUAL} add their weight to the matched sum.
 *
 * @param fields names of the fields to compare
 * @param one the first entry
 * @param two the second entry
 * @return a two-element array: the matched weight divided by the total weight, and the
 *         total weight itself; {@code {0.5, 0.0}} if the total weight is not positive
 */
private static double[] compareFieldSet(String[] fields, BibtexEntry one, BibtexEntry two) {
    double matchedWeight = 0;
    double totalWeight = 0.;

    for (int i = 0; i < fields.length; i++) {
        String field = fields[i];
        double weight = DuplicateCheck.fieldWeights.containsKey(field)
                ? DuplicateCheck.fieldWeights.get(field)
                : 1.0;

        // The weight is counted up front and withdrawn again if the field turns out
        // to be empty in both entries.
        totalWeight += weight;

        int outcome = DuplicateCheck.compareSingleField(field, one, two);
        if (outcome == EQUAL) {
            matchedWeight += weight;
        } else if (outcome == EMPTY_IN_BOTH) {
            totalWeight -= weight;
        }
    }

    // Without any usable field there is no evidence either way.
    return (totalWeight > 0)
            ? new double[] {matchedWeight / totalWeight, totalWeight}
            : new double[] {0.5, 0.0};
}