/** * Validates the configuration to verify that it makes sense. Rejects configurations that will * fail during runtime. */ public void validate() { // verify that we do have properties if (properties == null || properties.isEmpty()) throw new DukeConfigException("Configuration has no properties at all"); // check if max prob is below threshold // this code duplicates code in findLookupProperties(), but prefer // that to creating an attribute double prob = 0.5; for (Property prop : properties.values()) { if (prop.getHighProbability() == 0.0) // if the probability is zero we ignore the property entirely continue; prob = Utils.computeBayes(prob, prop.getHighProbability()); } if (prob < threshold) throw new DukeConfigException( "Maximum possible probability is " + prob + ", which is below threshold (" + threshold + "), which means no duplicates will ever " + "be found"); // check that we have at least one ID property if (getIdentityProperties().isEmpty()) throw new DukeConfigException("No ID properties."); }
private void findLookupProperties() { List<Property> candidates = new ArrayList(); for (Property prop : properties.values()) // leave out properties that are either not used for comparisons, // or which have lookup turned off explicitly if (!prop.isIdProperty() && !prop.isIgnoreProperty() && prop.getLookupBehaviour() != Property.Lookup.FALSE && prop.getHighProbability() != 0.0) candidates.add(prop); // sort them, lowest high prob to highest high prob Collections.sort(candidates, new HighComparator()); // run over and find all those needed to get above the threshold int last = -1; double prob = 0.5; for (int ix = 0; ix < candidates.size(); ix++) { Property prop = candidates.get(ix); prob = Utils.computeBayes(prob, prop.getHighProbability()); if (prob >= threshold) { last = ix; break; } } if (last == -1) lookups = new ArrayList(); else lookups = new ArrayList(candidates.subList(0, last + 1)); // need to also add TRUE and REQUIRED for (Property p : proplist) { if (p.getLookupBehaviour() != Property.Lookup.TRUE && p.getLookupBehaviour() != Property.Lookup.REQUIRED) continue; if (lookups.contains(p)) continue; lookups.add(p); } }