Ejemplo n.º 1
0
  /**
   * Checks if the two entries represent the same publication.
   *
   * <p>Entries of different types are never considered duplicates. Otherwise the required fields
   * of the entry type are compared first. Only when that score lies within {@code DOUBT_RANGE} of
   * the duplicate threshold are the optional fields compared as well, with the required-field
   * score weighted by {@code REQUIRED_WEIGHT}.
   *
   * @param one the first entry to compare
   * @param two the second entry to compare
   * @param bibDatabaseMode the database mode used to look up the entry type definition
   * @return {@code true} if the entries are considered duplicates, {@code false} otherwise
   */
  public static boolean isDuplicate(BibEntry one, BibEntry two, BibDatabaseMode bibDatabaseMode) {
    // First check if they are of the same type - a necessary condition:
    if (!one.getType().equals(two.getType())) {
      return false;
    }
    EntryType type = EntryTypes.getTypeOrDefault(one.getType(), bibDatabaseMode);

    // Then check if they have the same required fields. toArray never returns null, so no null
    // check is needed; an empty field set is handled inside compareFieldSet.
    String[] requiredFields = type.getRequiredFieldsFlat().toArray(new String[0]);
    double[] req = DuplicateCheck.compareFieldSet(requiredFields, one, two);

    if (Math.abs(req[0] - DuplicateCheck.duplicateThreshold) > DuplicateCheck.DOUBT_RANGE) {
      // Far from the threshold value, so we base our decision on the required fields only.
      return req[0] >= DuplicateCheck.duplicateThreshold;
    }

    // Close to the threshold value, so we also take the optional fields into account.
    String[] optionalFields = type.getOptionalFields().toArray(new String[0]);
    double[] opt = DuplicateCheck.compareFieldSet(optionalFields, one, two);

    // Weighted mean of both scores. compareFieldSet reports a total weight of 0.0 when no field
    // could be compared; if that happens for both sets the quotient is NaN and the comparison
    // below yields false.
    double totValue =
        ((DuplicateCheck.REQUIRED_WEIGHT * req[0] * req[1]) + (opt[0] * opt[1]))
            / ((req[1] * DuplicateCheck.REQUIRED_WEIGHT) + opt[1]);
    return totValue >= DuplicateCheck.duplicateThreshold;
  }
Ejemplo n.º 2
0
  /**
   * Compares the value of a single field in two entries.
   *
   * <p>The {@code pages} field is compared for exact equality after its range delimiters have been
   * normalised. All other fields are lower-cased and considered equal when their word correlation
   * exceeds 0.8; beforehand, name fields ({@code author}, {@code editor}) are reduced to last
   * names and {@code journal} has its periods removed.
   *
   * @param field name of the field to compare
   * @param one the first entry
   * @param two the second entry
   * @return {@code EMPTY_IN_BOTH} if neither entry has a value for the field, {@code EMPTY_IN_ONE}
   *     if only the first entry lacks it, {@code EMPTY_IN_TWO} if only the second entry lacks it,
   *     otherwise {@code EQUAL} or {@code NOT_EQUAL}
   */
  private static int compareSingleField(String field, BibEntry one, BibEntry two) {
    String s1 = one.getField(field);
    String s2 = two.getField(field);
    if (s1 == null) {
      return (s2 == null) ? EMPTY_IN_BOTH : EMPTY_IN_ONE;
    }
    if (s2 == null) {
      return EMPTY_IN_TWO;
    }

    if ("pages".equals(field)) {
      // Pages can be given with a variety of delimiters, "-", "--", " - ", " -- ".
      // Harmonize these to a simple "-"; after that a plain equality test is enough.
      String pages1 = s1.replaceAll("[- ]+", "-");
      String pages2 = s2.replaceAll("[- ]+", "-");
      return pages1.equals(pages2) ? EQUAL : NOT_EQUAL;
    }

    if ("author".equals(field) || "editor".equals(field)) {
      // Specific for name fields: compare last names only, without the " and " separators.
      s1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false).replace(" and ", " ");
      s2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false).replace(" and ", " ");
    } else if ("journal".equals(field)) {
      // We do not attempt to harmonize abbreviation state of the journal names,
      // but we remove periods from the names in case they are abbreviated with
      // and without dots.
      s1 = s1.replace(".", "");
      s2 = s2.replace(".", "");
    }

    // Harmonise case with a fixed locale so the result does not depend on the JVM's default
    // locale (e.g. the Turkish dotless i would otherwise make "TITLE" and "title" differ).
    s1 = s1.toLowerCase(java.util.Locale.ROOT);
    s2 = s2.toLowerCase(java.util.Locale.ROOT);
    return DuplicateCheck.correlateByWords(s1, s2) > 0.8 ? EQUAL : NOT_EQUAL;
  }
Ejemplo n.º 3
0
 /**
  * Searches the given database for an entry that is a duplicate of the given entry, as decided
  * by {@code DuplicateCheck.isDuplicate(BibEntry, BibEntry, BibDatabaseMode)}. The search stops
  * at the first duplicate encountered.
  *
  * @param database The database to search.
  * @param entry The entry of which we are looking for duplicates.
  * @param bibDatabaseMode The database mode passed on to the duplicate check.
  * @return The first duplicate entry found, or an empty Optional if there is none.
  */
 public static Optional<BibEntry> containsDuplicate(
     BibDatabase database, BibEntry entry, BibDatabaseMode bibDatabaseMode) {
   return database.getEntries().stream()
       .filter(candidate -> DuplicateCheck.isDuplicate(entry, candidate, bibDatabaseMode))
       .findFirst();
 }
Ejemplo n.º 4
0
 /**
  * Compares a set of fields between two entries and computes a weighted score.
  *
  * <p>Each field contributes its weight from {@code FIELD_WEIGHTS}, or 1.0 if it has no entry
  * there. Fields that are empty in both entries are left out of the calculation entirely.
  *
  * @param fields names of the fields to compare
  * @param one the first entry
  * @param two the second entry
  * @return a two-element array: the weighted fraction of the compared fields that are equal,
  *     followed by the total weight of the compared fields; {@code {0.5, 0.0}} if the total
  *     weight is not positive
  */
 private static double[] compareFieldSet(String[] fields, BibEntry one, BibEntry two) {
   double equalWeight = 0.0;
   double totWeights = 0.0;
   for (String field : fields) {
     int result = DuplicateCheck.compareSingleField(field, one, two);
     if (result == EMPTY_IN_BOTH) {
       // Skip the field instead of adding and then subtracting its weight, which could leave
       // floating-point rounding residue in the total.
       continue;
     }
     double weight = DuplicateCheck.FIELD_WEIGHTS.getOrDefault(field, 1.0);
     totWeights += weight;
     if (result == EQUAL) {
       equalWeight += weight;
     }
   }
   if (totWeights > 0) {
     return new double[] {equalWeight / totWeights, totWeights};
   }
   // Nothing could be compared: report an undecided score with zero weight.
   return new double[] {0.5, 0.0};
 }