/** For debugging. */ public static void main(String[] args) throws Exception { final String usage = "NutchBean query"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } final Configuration conf = NutchConfiguration.create(); final NutchBean bean = new NutchBean(conf); try { final Query query = Query.parse(args[0], conf); final Hits hits = bean.search(query, 10); System.out.println("Total hits: " + hits.getTotal()); final int length = (int) Math.min(hits.getTotal(), 10); final Hit[] show = hits.getHits(0, length); final HitDetails[] details = bean.getDetails(show); final Summary[] summaries = bean.getSummary(details, query); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); } } catch (Throwable t) { LOG.error("Exception occured while executing search: " + t, t); System.exit(1); } System.exit(0); }
/** * Search for pages matching a query, eliminating excessive hits with matching values for a named * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining * hits have {@link Hit#moreFromDupExcluded()} set. * * <p>If maxHitsPerDup is zero then all hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @param dedupField field name to check for duplicates * @param sortField Field to sort on (or null if no sorting). * @param reverse True if we are to reverse sort by <code>sortField</code>. * @return Hits the matching hits * @throws IOException */ public Hits search( Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse) throws IOException { if (maxHitsPerDup <= 0) // disable dup checking return search(query, numHits, dedupField, sortField, reverse); final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); int numHitsRaw = (int) (numHits * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("searching for " + numHitsRaw + " raw hits"); } Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse); final long total = hits.getTotal(); final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>(); final List<Hit> resultList = new ArrayList<Hit>(); final Set<Hit> seen = new HashSet<Hit>(); final List<String> excludedValues = new ArrayList<String>(); boolean totalIsExact = true; for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { // get the next raw hit if (rawHitNum >= hits.getLength()) { // optimize query by prohibiting more matches on some excluded values final Query optQuery = (Query) query.clone(); for (int i = 0; i < excludedValues.size(); i++) { if (i == MAX_PROHIBITED_TERMS) break; optQuery.addProhibitedTerm(excludedValues.get(i), dedupField); } numHitsRaw = (int) (numHitsRaw * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery); } hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse); if (LOG.isInfoEnabled()) { LOG.info("found " + hits.getTotal() + " raw hits"); } rawHitNum = -1; continue; } final Hit hit = hits.getHit(rawHitNum); if (seen.contains(hit)) continue; seen.add(hit); // get dup hits for its value final String value = hit.getDedupValue(); DupHits dupHits = dupToHits.get(value); if (dupHits == null) dupToHits.put(value, dupHits = new DupHits()); // does this hit exceed maxHitsPerDup? if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit if (!dupHits.maxSizeExceeded) { // mark prior hits with moreFromDupExcluded for (int i = 0; i < dupHits.size(); i++) { dupHits.get(i).setMoreFromDupExcluded(true); } dupHits.maxSizeExceeded = true; excludedValues.add(value); // exclude dup } totalIsExact = false; } else { // no -- collect the hit resultList.add(hit); dupHits.add(hit); // are we done? // we need to find one more than asked for, so that we can tell if // there are more hits to be shown if (resultList.size() > numHits) break; } } final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()])); results.setTotalIsExact(totalIsExact); return results; }