Example #1
 public static <E> List<E> sortedKeys(Counter<E> counter) {
   List<E> sortedKeyList = new ArrayList<E>();
   PriorityQueue<E> pq = counter.asPriorityQueue();
   // drain the queue: next() yields keys from highest to lowest count,
   // so the resulting list is sorted by the counter's values
   while (pq.hasNext()) {
     sortedKeyList.add(pq.next());
   }
   return sortedKeyList;
 }
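For comparison, a minimal standalone sketch of the same idea using only java.util: the keys of a plain Map sorted by descending count. The class name, map contents, and counts are invented for illustration and are not part of the Counter/PriorityQueue API used above.

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 public class SortedKeysSketch {
   public static void main(String[] args) {
     // made-up counts, standing in for a Counter<String>
     Map<String, Integer> counts = new HashMap<String, Integer>();
     counts.put("a", 3);
     counts.put("b", 7);
     counts.put("c", 1);

     // sort the keys by descending count, mirroring sortedKeys() above
     List<String> keys = new ArrayList<String>(counts.keySet());
     keys.sort((x, y) -> Integer.compare(counts.get(y), counts.get(x)));
     System.out.println(keys);  // [b, a, c]
   }
 }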
Example #2
 public static <E> String toBiggestValuesFirstString(Counter<E> c, int k) {
   PriorityQueue<E> pq = c.asPriorityQueue();
   PriorityQueue<E> largestK = new FastPriorityQueue<E>();
   // take at most k entries (pq yields the biggest values first); the score
   // is read before next(), since next() moves the queue past that element
   while (largestK.size() < k && pq.hasNext()) {
     double firstScore = pq.getPriority();
     E first = pq.next();
     largestK.setPriority(first, firstScore);
   }
   return largestK.toString();
 }
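A standalone sketch of the same top-k selection using only java.util: a bounded min-heap of size k, whose head is always the weakest entry kept so far and gets evicted when something larger arrives. The class name, map contents, and k are invented for illustration; this is not the Counter/FastPriorityQueue API used above.

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;

 public class TopKSketch {
   public static void main(String[] args) {
     // made-up scores, standing in for a Counter<String>
     Map<String, Double> scores = new HashMap<String, Double>();
     scores.put("a", 3.0);
     scores.put("b", 7.0);
     scores.put("c", 1.0);
     scores.put("d", 5.0);
     int k = 2;

     // min-heap on score: the head is the smallest of the entries kept so far
     PriorityQueue<Map.Entry<String, Double>> heap =
         new PriorityQueue<Map.Entry<String, Double>>(
             (x, y) -> Double.compare(x.getValue(), y.getValue()));
     for (Map.Entry<String, Double> e : scores.entrySet()) {
       heap.offer(e);
       if (heap.size() > k)
         heap.poll();  // drop the smallest, keeping only the k largest
     }

     List<String> topK = new ArrayList<String>();
     for (Map.Entry<String, Double> e : heap)
       topK.add(e.getKey());
     System.out.println(topK);  // the two largest keys, b and d (heap order)
   }
 }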
  /** Look up potentially matching records. */
  public Collection<Record> findCandidateMatches(Record record) {
    if (DEBUG)
      System.out.println(
          "---------------------------------------------------------------------------");

    // do lookup on all tokens from all lookup properties
    // (we only identify the buckets for now. later we decide how to process
    // them)
    List<Bucket> buckets = lookup(record);

    // preprocess the list of buckets
    Collections.sort(buckets);
    double score_sum = 0.0;
    for (Bucket b : buckets) score_sum += b.getScore();

    double score_so_far = 0.0;
    int threshold = buckets.size() - 1;
    for (; (score_so_far / score_sum) < min_relevance; threshold--) {
      score_so_far += buckets.get(threshold).getScore();
      if (DEBUG)
        System.out.println("score_so_far: " + (score_so_far / score_sum) + " (" + threshold + ")");
    }
    // buckets.get(threshold) made us go over the limit, so we need to step
    // one back
    threshold++;
    if (DEBUG) System.out.println("Threshold: " + threshold);

    // the collection of candidates
    Map<Long, Score> candidates = new HashMap<Long, Score>();

    // go through the buckets that we're going to collect candidates from
    int next_bucket = collectCandidates(candidates, buckets, threshold);

    // there might still be some buckets left below the threshold. for
    // these we go through the existing candidates and check if we can
    // find them in the buckets.
    bumpScores(candidates, buckets, next_bucket);

    if (DEBUG) System.out.println("candidates: " + candidates.size());

    // if the cutoff properties are not set we can stop right here
    // FIXME: it's possible to make this a lot cleaner
    if (max_search_hits > candidates.size() && min_relevance == 0.0) {
      Collection<Record> cands = new ArrayList<Record>(candidates.size());
      for (Long id : candidates.keySet()) cands.add(store.findRecordById(id));
      if (DEBUG) System.out.println("final: " + cands.size());
      return cands;
    }

    // flatten candidates into an array, prior to sorting etc
    int ix = 0;
    Score[] scores = new Score[candidates.size()];
    double max_score = 0.0;
    for (Score s : candidates.values()) {
      scores[ix++] = s;
      if (s.score > max_score) max_score = s.score;
      if (DEBUG && false) System.out.println("" + s.id + ": " + s.score);
    }

    // allow map to be GC-ed
    candidates = null;

    // filter candidates with min_relevance and max_search_hits. do
    // this by turning the scores[] array into a priority queue (on
    // .score), then retrieving the best candidates. (gives a big
    // performance improvement over sorting the array.)
    PriorityQueue pq = new PriorityQueue(scores);
    int count = Math.min(scores.length, max_search_hits);
    Collection<Record> records = new ArrayList<Record>(count);
    for (ix = 0; ix < count; ix++) {
      Score s = pq.next();
      if (s.score >= min_relevance) records.add(store.findRecordById(s.id));
    }

    if (DEBUG) System.out.println("final: " + records.size());
    return records;
  }
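The filtering step at the end builds a priority queue over the score array and pops only the best entries instead of sorting everything. Below is a self-contained sketch of that pattern using java.util.PriorityQueue; the Hit class, the scores, and both cutoffs are invented stand-ins (the method above uses the project's own Score and PriorityQueue classes, which may behave differently).

 import java.util.ArrayList;
 import java.util.List;
 import java.util.PriorityQueue;

 public class TopHitsSketch {
   // stand-in for the Score objects used in the method above
   static class Hit {
     final long id;
     final double score;
     Hit(long id, double score) { this.id = id; this.score = score; }
   }

   public static void main(String[] args) {
     Hit[] hits = {
       new Hit(1, 0.9), new Hit(2, 0.2), new Hit(3, 0.7), new Hit(4, 0.4)
     };
     int maxSearchHits = 3;      // invented cutoff, like max_search_hits
     double minRelevance = 0.3;  // invented cutoff, like min_relevance

     // max-heap on score; only the best min(n, maxSearchHits) entries are
     // popped, so the remaining entries are never fully sorted
     PriorityQueue<Hit> pq =
         new PriorityQueue<Hit>((x, y) -> Double.compare(y.score, x.score));
     for (Hit h : hits) pq.offer(h);

     List<Long> ids = new ArrayList<Long>();
     int count = Math.min(hits.length, maxSearchHits);
     for (int i = 0; i < count; i++) {
       Hit h = pq.poll();
       if (h.score >= minRelevance) ids.add(h.id);
     }
     System.out.println(ids);  // [1, 3, 4]
   }
 }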