Ejemplo n.º 1
0
  /** For debugging. */
  public static void main(String[] args) throws Exception {
    final String usage = "NutchBean query";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    final Configuration conf = NutchConfiguration.create();
    final NutchBean bean = new NutchBean(conf);
    try {
      final Query query = Query.parse(args[0], conf);
      final Hits hits = bean.search(query, 10);
      System.out.println("Total hits: " + hits.getTotal());
      final int length = (int) Math.min(hits.getTotal(), 10);
      final Hit[] show = hits.getHits(0, length);
      final HitDetails[] details = bean.getDetails(show);
      final Summary[] summaries = bean.getSummary(details, query);

      for (int i = 0; i < hits.getLength(); i++) {
        System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
      }
    } catch (Throwable t) {
      LOG.error("Exception occured while executing search: " + t, t);
      System.exit(1);
    }
    System.exit(0);
  }
Ejemplo n.º 2
0
  /**
   * Search for pages matching a query, eliminating excessive hits with matching values for a named
   * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining
   * hits have {@link Hit#moreFromDupExcluded()} set.
   *
   * <p>If maxHitsPerDup is zero then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(
      Query query,
      int numHits,
      int maxHitsPerDup,
      String dedupField,
      String sortField,
      boolean reverse)
      throws IOException {
    if (maxHitsPerDup <= 0) // disable dup checking
    return search(query, numHits, dedupField, sortField, reverse);

    final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
    int numHitsRaw = (int) (numHits * rawHitsFactor);
    if (LOG.isInfoEnabled()) {
      LOG.info("searching for " + numHitsRaw + " raw hits");
    }
    Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse);
    final long total = hits.getTotal();
    final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
    final List<Hit> resultList = new ArrayList<Hit>();
    final Set<Hit> seen = new HashSet<Hit>();
    final List<String> excludedValues = new ArrayList<String>();
    boolean totalIsExact = true;
    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
      // get the next raw hit
      if (rawHitNum >= hits.getLength()) {
        // optimize query by prohibiting more matches on some excluded values
        final Query optQuery = (Query) query.clone();
        for (int i = 0; i < excludedValues.size(); i++) {
          if (i == MAX_PROHIBITED_TERMS) break;
          optQuery.addProhibitedTerm(excludedValues.get(i), dedupField);
        }
        numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
          LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
        }
        hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse);
        if (LOG.isInfoEnabled()) {
          LOG.info("found " + hits.getTotal() + " raw hits");
        }
        rawHitNum = -1;
        continue;
      }

      final Hit hit = hits.getHit(rawHitNum);
      if (seen.contains(hit)) continue;
      seen.add(hit);

      // get dup hits for its value
      final String value = hit.getDedupValue();
      DupHits dupHits = dupToHits.get(value);
      if (dupHits == null) dupToHits.put(value, dupHits = new DupHits());

      // does this hit exceed maxHitsPerDup?
      if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
        if (!dupHits.maxSizeExceeded) {

          // mark prior hits with moreFromDupExcluded
          for (int i = 0; i < dupHits.size(); i++) {
            dupHits.get(i).setMoreFromDupExcluded(true);
          }
          dupHits.maxSizeExceeded = true;

          excludedValues.add(value); // exclude dup
        }
        totalIsExact = false;
      } else { // no -- collect the hit
        resultList.add(hit);
        dupHits.add(hit);

        // are we done?
        // we need to find one more than asked for, so that we can tell if
        // there are more hits to be shown
        if (resultList.size() > numHits) break;
      }
    }

    final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()]));
    results.setTotalIsExact(totalIsExact);
    return results;
  }