/** * Search for pages matching a query, eliminating excessive hits with matching values for a named * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining * hits have {@link Hit#moreFromDupExcluded()} set. * * <p>If maxHitsPerDup is zero then all hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @param dedupField field name to check for duplicates * @param sortField Field to sort on (or null if no sorting). * @param reverse True if we are to reverse sort by <code>sortField</code>. * @return Hits the matching hits * @throws IOException */ public Hits search( Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse) throws IOException { if (maxHitsPerDup <= 0) // disable dup checking return search(query, numHits, dedupField, sortField, reverse); final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); int numHitsRaw = (int) (numHits * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("searching for " + numHitsRaw + " raw hits"); } Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse); final long total = hits.getTotal(); final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>(); final List<Hit> resultList = new ArrayList<Hit>(); final Set<Hit> seen = new HashSet<Hit>(); final List<String> excludedValues = new ArrayList<String>(); boolean totalIsExact = true; for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { // get the next raw hit if (rawHitNum >= hits.getLength()) { // optimize query by prohibiting more matches on some excluded values final Query optQuery = (Query) query.clone(); for (int i = 0; i < excludedValues.size(); i++) { if (i == MAX_PROHIBITED_TERMS) break; optQuery.addProhibitedTerm(excludedValues.get(i), dedupField); } numHitsRaw = (int) (numHitsRaw * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery); } hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse); if (LOG.isInfoEnabled()) { LOG.info("found " + hits.getTotal() + " raw hits"); } rawHitNum = -1; continue; } final Hit hit = hits.getHit(rawHitNum); if (seen.contains(hit)) continue; seen.add(hit); // get dup hits for its value final String value = hit.getDedupValue(); DupHits dupHits = dupToHits.get(value); if (dupHits == null) dupToHits.put(value, dupHits = new DupHits()); // does this hit exceed maxHitsPerDup? if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit if (!dupHits.maxSizeExceeded) { // mark prior hits with moreFromDupExcluded for (int i = 0; i < dupHits.size(); i++) { dupHits.get(i).setMoreFromDupExcluded(true); } dupHits.maxSizeExceeded = true; excludedValues.add(value); // exclude dup } totalIsExact = false; } else { // no -- collect the hit resultList.add(hit); dupHits.add(hit); // are we done? // we need to find one more than asked for, so that we can tell if // there are more hits to be shown if (resultList.size() > numHits) break; } } final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()])); results.setTotalIsExact(totalIsExact); return results; }
/** * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 * * @return Hit */ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { DictSegment ds = matchedHit.getMatchedDictSegment(); return ds.match(charArray, currentIndex, 1, matchedHit); }