/**
 * Returns the keys of the counter ordered by priority, obtained by
 * draining the counter's priority queue.
 *
 * @param counter the counter whose keys should be listed
 * @return the keys in the order the priority queue yields them
 */
public static <E> List<E> sortedKeys(Counter<E> counter) {
  PriorityQueue<E> queue = counter.asPriorityQueue();
  List<E> keys = new ArrayList<E>();
  for (; queue.hasNext(); )
    keys.add(queue.next());
  return keys;
}
/**
 * Builds a string showing the (up to) k biggest-valued entries of the
 * counter, biggest first.
 *
 * @param c the counter to summarize
 * @param k the maximum number of entries to include
 * @return the string form of a priority queue holding the top k entries
 */
public static <E> String toBiggestValuesFirstString(Counter<E> c, int k) {
  PriorityQueue<E> source = c.asPriorityQueue();
  PriorityQueue<E> top = new FastPriorityQueue<E>();
  while (top.size() < k && source.hasNext()) {
    // read the head's priority before next() advances past it
    double priority = source.getPriority();
    top.setPriority(source.next(), priority);
  }
  return top.toString();
}
/** Look up potentially matching records. */ public Collection<Record> findCandidateMatches(Record record) { if (DEBUG) System.out.println( "---------------------------------------------------------------------------"); // do lookup on all tokens from all lookup properties // (we only identify the buckets for now. later we decide how to process // them) List<Bucket> buckets = lookup(record); // preprocess the list of buckets Collections.sort(buckets); double score_sum = 0.0; for (Bucket b : buckets) score_sum += b.getScore(); double score_so_far = 0.0; int threshold = buckets.size() - 1; for (; (score_so_far / score_sum) < min_relevance; threshold--) { score_so_far += buckets.get(threshold).getScore(); if (DEBUG) System.out.println("score_so_far: " + (score_so_far / score_sum) + " (" + threshold + ")"); } // bucket.get(threshold) made us go over the limit, so we need to step // one back threshold++; if (DEBUG) System.out.println("Threshold: " + threshold); // the collection of candidates Map<Long, Score> candidates = new HashMap(); // go through the buckets that we're going to collect candidates from int next_bucket = collectCandidates(candidates, buckets, threshold); // there might still be some buckets left below the threshold. for // these we go through the existing candidates and check if we can // find them in the buckets. 
bumpScores(candidates, buckets, next_bucket); if (DEBUG) System.out.println("candidates: " + candidates.size()); // if the cutoff properties are not set we can stop right here // FIXME: it's possible to make this a lot cleaner if (max_search_hits > candidates.size() && min_relevance == 0.0) { Collection<Record> cands = new ArrayList(candidates.size()); for (Long id : candidates.keySet()) cands.add(store.findRecordById(id)); if (DEBUG) System.out.println("final: " + cands.size()); return cands; } // flatten candidates into an array, prior to sorting etc int ix = 0; Score[] scores = new Score[candidates.size()]; double max_score = 0.0; for (Score s : candidates.values()) { scores[ix++] = s; if (s.score > max_score) max_score = s.score; if (DEBUG && false) System.out.println("" + s.id + ": " + s.score); } // allow map to be GC-ed candidates = null; // filter candidates with min_relevance and max_search_hits. do // this by turning the scores[] array into a priority queue (on // .score), then retrieving the best candidates. (gives a big // performance improvement over sorting the array.) PriorityQueue pq = new PriorityQueue(scores); int count = Math.min(scores.length, max_search_hits); Collection<Record> records = new ArrayList(count); for (ix = 0; ix < count; ix++) { Score s = pq.next(); if (s.score >= min_relevance) records.add(store.findRecordById(s.id)); } if (DEBUG) System.out.println("final: " + records.size()); return records; }