/**
 * Compares two strings token by token and returns a similarity score.
 *
 * <p>Both strings are tokenized with {@code StringUtils.split}. Every token of the shorter
 * token list is scored against every token of the longer list using {@code subcomp}; the best
 * score for each token is summed, and the result is
 * {@code (sum * 2) / (total number of tokens in both lists)}.
 *
 * @param s1 the first string; a null value causes a {@code NullPointerException}
 * @param s2 the second string
 * @return 1.0 if the strings are equal, 0.0 if either string yields no tokens, otherwise the
 *     score described above
 */
public double compare(String s1, String s2) {
    if (s1.equals(s2)) return 1.0;

    // tokenize
    String[] t1 = StringUtils.split(s1);
    String[] t2 = StringUtils.split(s2);

    // Nothing to match against. When only one side is empty the formula below already
    // yields 0.0; when both are empty it would be 0.0 / 0 == NaN, so answer 0.0 here.
    if (t1.length == 0 || t2.length == 0) return 0.0;

    // ensure that t1 is shorter than or same length as t2
    if (t1.length > t2.length) {
      String[] tmp = t2;
      t2 = t1;
      t1 = tmp;
    }

    // find best matches for each token in t1
    double sum = 0;
    for (String shortToken : t1) {
      double highest = 0;
      for (String longToken : t2) {
        highest = Math.max(highest, subcomp.compare(shortToken, longToken));
      }
      sum += highest;
    }

    // unmatched tokens in the longer list lower the score via the denominator
    return (sum * 2) / (t1.length + t2.length);
  }
// Example #2
// 0
    /**
     * Builds a cleaner from a string of cleaner names.
     *
     * <p>The value is split with {@code StringUtils.split} and each resulting name is passed to
     * {@code _makeCleaner}. A single name yields that cleaner directly; any other number of
     * names yields a {@code ChainedCleaner} wrapping all of them in order.
     *
     * @param value the cleaner name(s), or null
     * @return null if {@code value} is null, otherwise the cleaner described above
     */
    private Cleaner makeCleaner(String value) {
      if (value == null) {
        return null;
      }

      String[] cleanerNames = StringUtils.split(value);
      Cleaner[] chain = new Cleaner[cleanerNames.length];
      int pos = 0;
      while (pos < chain.length) {
        chain[pos] = _makeCleaner(cleanerNames[pos]);
        pos++;
      }

      // a lone cleaner needs no chaining wrapper
      if (chain.length == 1) {
        return chain[0];
      }
      return new ChainedCleaner(chain);
    }
  /**
   * Adds the record to the index.
   *
   * <p>A new record ID is allocated from the store and the record is registered under it. Each
   * value of each identity property is then registered as an ID for the record, and each value
   * of each lookup property is split into tokens that are registered for the record under the
   * property's name. Properties for which the record returns a null value collection are
   * skipped, matching the null check done in {@code lookup}.
   *
   * @param record the record to index
   */
  public void index(Record record) {
    // FIXME: check if record is already indexed

    // allocate an ID for this record
    long id = store.makeNewRecordId();
    store.registerRecord(id, record);

    // go through ID properties and register them
    for (Property p : config.getIdentityProperties()) {
      Iterable<String> extids = record.getValues(p.getName());
      if (extids == null) continue; // record has no values for this property

      for (String extid : extids) store.registerId(id, extid);
    }

    // go through lookup properties and register those
    for (Property p : config.getLookupProperties()) {
      String propname = p.getName();
      Iterable<String> values = record.getValues(propname);
      if (values == null) continue; // record has no values for this property

      for (String value : values) {
        for (String token : StringUtils.split(value)) store.registerToken(id, propname, token);
      }
    }
  }
  /**
   * Tokenizes lookup fields and returns all matching buckets in the index.
   *
   * <p>For every lookup property in the configuration, each of the record's values for that
   * property is split into tokens and each token is looked up in the store. Properties whose
   * value collection is null are skipped, as are tokens whose bucket is null or has a null
   * {@code records} array. No de-duplication is performed: a bucket is added once for every
   * token that resolves to it.
   *
   * @param record the record whose lookup property values are tokenized
   * @return the matching buckets in the order they were found; never null
   */
  private List<Bucket> lookup(Record record) {
    List<Bucket> buckets = new ArrayList<Bucket>();
    for (Property p : config.getLookupProperties()) {
      String propname = p.getName();
      Collection<String> values = record.getValues(propname);
      if (values == null) continue;

      for (String value : values) {
        for (String token : StringUtils.split(value)) {
          Bucket b = store.lookupToken(propname, token);
          if (b == null || b.records == null) continue;
          if (DEBUG)
            System.out.println(
                propname + ", " + token + ": " + b.nextfree + " (" + b.getScore() + ")");
          buckets.add(b);
        }
      }
    }

    return buckets;
  }