/** * Checks if the two entries represent the same publication. * * @param one BibEntry * @param two BibEntry * @return boolean */ public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) { // First check if they are of the same type - a necessary condition: if (!one.getType().equals(two.getType())) { return false; } EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode); // The check if they have the same required fields: java.util.List<String> var = type.getRequiredFieldsFlat(); String[] fields = var.toArray(new String[var.size()]); double[] req; if (fields == null) { req = new double[] {0., 0.}; } else { req = DuplicateCheck.compareFieldSet(fields, one, two); } if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) { // Far from the threshold value, so we base our decision on the req. fields only return req[0] >= DuplicateCheck.duplicateThreshold; } // Close to the threshold value, so we take a look at the optional fields, if any: java.util.List<String> optionalFields = type.getOptionalFields(); fields = optionalFields.toArray(new String[optionalFields.size()]); if (fields != null) { double[] opt = DuplicateCheck.compareFieldSet(fields, one, two); double totValue = ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1])) / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]); return totValue >= DuplicateCheck.duplicateThreshold; } return req[0] >= DuplicateCheck.duplicateThreshold; }
private static int compareSingleField(String field, BibEntry one, BibEntry two) { String s1 = one.getField(field); String s2 = two.getField(field); if (s1 == null) { if (s2 == null) { return EMPTY_IN_BOTH; } return EMPTY_IN_ONE; } else if (s2 == null) { return EMPTY_IN_TWO; } if ("author".equals(field) || "editor".equals(field)) { // Specific for name fields. // Harmonise case: String auth1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false).replace(" and ", " ").toLowerCase(); String auth2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false).replace(" and ", " ").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(auth1, auth2); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else if ("pages".equals(field)) { // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ". // We do a replace to harmonize these to a simple "-": // After this, a simple test for equality should be enough: s1 = s1.replaceAll("[- ]+", "-"); s2 = s2.replaceAll("[- ]+", "-"); if (s1.equals(s2)) { return EQUAL; } return NOT_EQUAL; } else if ("journal".equals(field)) { // We do not attempt to harmonize abbreviation state of the journal names, // but we remove periods from the names in case they are abbreviated with // and without dots: s1 = s1.replace(".", "").toLowerCase(); s2 = s2.replace(".", "").toLowerCase(); double similarity = DuplicateCheck.correlateByWords(s1, s2); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } else { s1 = s1.toLowerCase(); s2 = s2.toLowerCase(); double similarity = DuplicateCheck.correlateByWords(s1, s2); if (similarity > 0.8) { return EQUAL; } return NOT_EQUAL; } }
/** * Goes through all entries in the given database, and if at least one of them is a duplicate of * the given entry, as per Util.isDuplicate(BibEntry, BibEntry), the duplicate is returned. The * search is terminated when the first duplicate is found. * * @param database The database to search. * @param entry The entry of which we are looking for duplicates. * @return The first duplicate entry found. null if no duplicates are found. */ public static Optional<BibEntry> containsDuplicate( BibDatabase database, BibEntry entry, BibDatabaseMode bibDatabaseMode) { for (BibEntry other : database.getEntries()) { if (DuplicateCheck.isDuplicate(entry, other, bibDatabaseMode)) { return Optional.of(other); // Duplicate found. } } return Optional.empty(); // No duplicate found. }
/**
 * Compares a set of fields of two entries and computes a weighted similarity score.
 *
 * <p>Each field is weighted by its entry in {@code DuplicateCheck.FIELD_WEIGHTS}, or by 1.0 if
 * it has none. Fields that are empty in both entries are left out of the computation.
 *
 * @param fields the names of the fields to compare
 * @param one the first entry
 * @param two the second entry
 * @return a two-element array: index 0 holds the summed weight of the equal fields divided by
 *         the total weight of the compared fields, index 1 holds that total weight. If no
 *         field could be compared, {@code {0.5, 0.0}} is returned.
 */
private static double[] compareFieldSet(String[] fields, BibEntry one, BibEntry two) {
    double matchedWeight = 0.0;
    double comparedWeight = 0.0;
    for (String field : fields) {
        int result = DuplicateCheck.compareSingleField(field, one, two);
        if (result == EMPTY_IN_BOTH) {
            // Nothing to compare, so this field must not influence the score at all.
            continue;
        }
        double weight = DuplicateCheck.FIELD_WEIGHTS.getOrDefault(field, 1.0);
        comparedWeight += weight;
        if (result == EQUAL) {
            matchedWeight += weight;
        }
    }
    if (comparedWeight > 0) {
        return new double[] {matchedWeight / comparedWeight, comparedWeight};
    }
    // No comparable field at all: report a neutral score with zero weight.
    return new double[] {0.5, 0.0};
}