Ejemplo n.º 1
0
 /**
  * Boosts the scores of already-known candidates using the buckets from position {@code ix}
  * onwards. Every candidate whose id is contained in a bucket gets that bucket's score added to
  * its own score. The candidate set itself is never extended here.
  *
  * <p>Scanning stops at the first bucket whose {@code nextfree} exceeds
  * {@code CUTOFF_FACTOR_2 * candidates.size()}; that bucket and all later ones are left alone.
  *
  * @param candidates the candidates collected so far, keyed by record id
  * @param buckets the buckets to scan
  * @param ix index of the first bucket to examine
  */
 private void bumpScores(Map<Long, Score> candidates, List<Bucket> buckets, int ix) {
   int position = ix;
   while (position < buckets.size()) {
     Bucket bucket = buckets.get(position);
     boolean overCutoff = bucket.nextfree > CUTOFF_FACTOR_2 * candidates.size();
     if (overCutoff) {
       return;
     }

     double bonus = bucket.getScore();
     for (Score candidate : candidates.values()) {
       if (bucket.contains(candidate.id)) {
         candidate.score += bonus;
       }
     }
     position++;
   }
 }
Ejemplo n.º 2
0
  /**
   * Collects candidate records from the leading buckets and accumulates their scores. Each record
   * id stored in a processed bucket becomes a candidate (unless it already is one) and has the
   * bucket's score added to its own.
   *
   * <p>Buckets are taken in list order until either {@code threshold} buckets have been handled
   * or the number of candidates is no longer below {@code CUTOFF_FACTOR_1 * max_search_hits}.
   *
   * @param candidates map from record id to score; populated by this method
   * @param buckets the buckets to draw candidates from
   * @param threshold exclusive upper bound on the bucket index to process
   * @return the index of the first bucket we did not process
   */
  private int collectCandidates(Map<Long, Score> candidates, List<Bucket> buckets, int threshold) {
    int position = 0;
    while (position < threshold && candidates.size() < (CUTOFF_FACTOR_1 * max_search_hits)) {
      Bucket bucket = buckets.get(position);
      long[] recordIds = bucket.records;
      double bucketScore = bucket.getScore();

      // only the first nextfree slots of the record array are read
      for (int slot = 0; slot < bucket.nextfree; slot++) {
        long id = recordIds[slot];
        Score entry = candidates.get(id);
        if (entry == null) {
          entry = new Score(id);
          candidates.put(id, entry);
        }
        entry.score += bucketScore;
      }

      if (DEBUG) {
        System.out.println("Bucket " + bucket.nextfree + " -> " + candidates.size());
      }
      position++;
    }
    return position;
  }
Ejemplo n.º 3
0
  /**
   * Tokenizes the record's values for every lookup property and returns the index buckets that
   * match those tokens.
   *
   * <p>Properties for which the record has no values are skipped, as are tokens for which the
   * store returns no bucket or a bucket without a record array. No de-duplication is performed:
   * a bucket is appended once for every token that resolves to it.
   *
   * @param record the record whose lookup-property values are tokenized
   * @return the matching buckets in lookup order; never {@code null}, possibly empty
   */
  private List<Bucket> lookup(Record record) {
    List<Bucket> buckets = new ArrayList<Bucket>();
    for (Property p : config.getLookupProperties()) {
      String propname = p.getName();
      Collection<String> values = record.getValues(propname);
      if (values == null) {
        continue;
      }

      for (String value : values) {
        for (String token : StringUtils.split(value)) {
          Bucket b = store.lookupToken(propname, token);
          if (b == null || b.records == null) {
            continue;
          }
          if (DEBUG) {
            System.out.println(
                propname + ", " + token + ": " + b.nextfree + " (" + b.getScore() + ")");
          }
          buckets.add(b);
        }
      }
    }

    return buckets;
  }
Ejemplo n.º 4
0
  /**
   * Look up potentially matching records.
   *
   * <p>The record's lookup tokens are resolved to index buckets, which are then sorted. Walking
   * backwards from the end of the sorted list, bucket scores are accumulated until their share of
   * the total score reaches {@code min_relevance}; this fixes a threshold index. Buckets before
   * the threshold contribute new candidates, and any buckets left over are only used to raise the
   * scores of candidates that were already found.
   *
   * <p>If {@code min_relevance} is 0.0 and there are fewer candidates than
   * {@code max_search_hits}, all candidates are returned. Otherwise at most
   * {@code max_search_hits} candidates are taken from a priority queue built on the scores, and
   * only those whose score is at least {@code min_relevance} are returned.
   *
   * @param record the record to find candidate matches for
   * @return the candidate records as returned by {@code store.findRecordById}; never
   *     {@code null}, possibly empty
   */
  public Collection<Record> findCandidateMatches(Record record) {
    if (DEBUG) {
      System.out.println(
          "---------------------------------------------------------------------------");
    }

    // do lookup on all tokens from all lookup properties
    // (we only identify the buckets for now. later we decide how to process
    // them)
    List<Bucket> buckets = lookup(record);

    // preprocess the list of buckets
    Collections.sort(buckets);
    double score_sum = 0.0;
    for (Bucket b : buckets) {
      score_sum += b.getScore();
    }

    double score_so_far = 0.0;
    int threshold = buckets.size() - 1;
    // The threshold >= 0 guard stops the walk once every bucket has been consumed. Without it,
    // a min_relevance that the accumulated fraction never reaches (a value above 1.0, or 1.0
    // combined with floating-point rounding) would lead to buckets.get(-1).
    for (; threshold >= 0 && (score_so_far / score_sum) < min_relevance; threshold--) {
      score_so_far += buckets.get(threshold).getScore();
      if (DEBUG) {
        System.out.println("score_so_far: " + (score_so_far / score_sum) + " (" + threshold + ")");
      }
    }
    // bucket.get(threshold) made us go over the limit, so we need to step
    // one back
    threshold++;
    if (DEBUG) {
      System.out.println("Threshold: " + threshold);
    }

    // the collection of candidates
    Map<Long, Score> candidates = new HashMap<Long, Score>();

    // go through the buckets that we're going to collect candidates from
    int next_bucket = collectCandidates(candidates, buckets, threshold);

    // there might still be some buckets left below the threshold. for
    // these we go through the existing candidates and check if we can
    // find them in the buckets.
    bumpScores(candidates, buckets, next_bucket);

    if (DEBUG) {
      System.out.println("candidates: " + candidates.size());
    }

    // if the cutoff properties are not set we can stop right here
    // FIXME: it's possible to make this a lot cleaner
    if (max_search_hits > candidates.size() && min_relevance == 0.0) {
      Collection<Record> cands = new ArrayList<Record>(candidates.size());
      for (Long id : candidates.keySet()) {
        cands.add(store.findRecordById(id));
      }
      if (DEBUG) {
        System.out.println("final: " + cands.size());
      }
      return cands;
    }

    // flatten candidates into an array, prior to sorting etc
    Score[] scores = candidates.values().toArray(new Score[candidates.size()]);

    // allow map to be GC-ed
    candidates = null;

    // filter candidates with min_relevance and max_search_hits. do
    // this by turning the scores[] array into a priority queue (on
    // .score), then retrieving the best candidates. (gives a big
    // performance improvement over sorting the array.)
    PriorityQueue pq = new PriorityQueue(scores);
    int count = Math.min(scores.length, max_search_hits);
    Collection<Record> records = new ArrayList<Record>(count);
    for (int ix = 0; ix < count; ix++) {
      Score s = pq.next();
      if (s.score >= min_relevance) {
        records.add(store.findRecordById(s.id));
      }
    }

    if (DEBUG) {
      System.out.println("final: " + records.size());
    }
    return records;
  }