Esempio n. 1
0
  /**
   * Sanity-checks this configuration and rejects setups that are bound to fail at runtime.
   *
   * <p>Three conditions are verified, in this order: at least one property is defined, the
   * maximum probability obtainable by combining the properties is not below the threshold, and
   * at least one identity property exists.
   *
   * @throws DukeConfigException if any of the three checks fails
   */
  public void validate() {
    // a configuration without any properties is rejected outright
    if (properties == null || properties.isEmpty()) {
      throw new DukeConfigException("Configuration has no properties at all");
    }

    // Combine the high probabilities of the properties, starting from 0.5, to
    // obtain the maximum possible probability. findLookupProperties() performs
    // a similar computation; repeating it here is preferred over keeping the
    // result in an attribute.
    double best = 0.5;
    for (Property property : properties.values()) {
      double high = property.getHighProbability();
      if (high != 0.0) {
        // properties whose high probability is zero are left out entirely
        best = Utils.computeBayes(best, high);
      }
    }

    if (best < threshold) {
      throw new DukeConfigException(
          "Maximum possible probability is "
              + best
              + ", which is below threshold ("
              + threshold
              + "), which means no duplicates will ever be found");
    }

    // there must be at least one ID property
    if (getIdentityProperties().isEmpty()) {
      throw new DukeConfigException("No ID properties.");
    }
  }
Esempio n. 2
0
  /**
   * Recomputes {@code lookups}, the collection of properties selected for lookup.
   *
   * <p>Candidates are the properties that are not ID properties, are not ignored, do not have
   * their lookup behaviour set to {@code Property.Lookup.FALSE}, and have a non-zero high
   * probability. The candidates are sorted with {@code HighComparator} and their high
   * probabilities are combined one at a time, starting from 0.5, until the combined value
   * reaches the threshold; the candidates consumed up to and including that point are selected.
   * If the threshold is never reached, no candidate is selected.
   *
   * <p>Afterwards every property in {@code proplist} whose lookup behaviour is
   * {@code Property.Lookup.TRUE} or {@code Property.Lookup.REQUIRED} is appended, unless it is
   * already present.
   */
  private void findLookupProperties() {
    List<Property> candidates = new ArrayList<Property>();
    for (Property prop : properties.values()) {
      // leave out properties that are either not used for comparisons,
      // or which have lookup turned off explicitly
      if (!prop.isIdProperty()
          && !prop.isIgnoreProperty()
          && prop.getLookupBehaviour() != Property.Lookup.FALSE
          && prop.getHighProbability() != 0.0) {
        candidates.add(prop);
      }
    }

    // HighComparator is expected to order from lowest to highest high probability
    Collections.sort(candidates, new HighComparator());

    // run over the candidates and find the index at which the combined
    // probability first reaches the threshold; -1 means it never does
    int last = -1;
    double prob = 0.5;
    for (int ix = 0; ix < candidates.size(); ix++) {
      prob = Utils.computeBayes(prob, candidates.get(ix).getHighProbability());
      if (prob >= threshold) {
        last = ix;
        break;
      }
    }

    // when last is -1 this is subList(0, 0), i.e. an empty selection
    lookups = new ArrayList<Property>(candidates.subList(0, last + 1));

    // properties with lookup TRUE or REQUIRED are always included
    for (Property p : proplist) {
      if (p.getLookupBehaviour() != Property.Lookup.TRUE
          && p.getLookupBehaviour() != Property.Lookup.REQUIRED) {
        continue;
      }

      if (!lookups.contains(p)) {
        lookups.add(p);
      }
    }
  }