Beispiel #1
0
  /** For debugging. */
  public static void main(String[] args) throws Exception {
    final String usage = "NutchBean query";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    final Configuration conf = NutchConfiguration.create();
    final NutchBean bean = new NutchBean(conf);
    try {
      final Query query = Query.parse(args[0], conf);
      final Hits hits = bean.search(query, 10);
      System.out.println("Total hits: " + hits.getTotal());
      final int length = (int) Math.min(hits.getTotal(), 10);
      final Hit[] show = hits.getHits(0, length);
      final HitDetails[] details = bean.getDetails(show);
      final Summary[] summaries = bean.getSummary(details, query);

      for (int i = 0; i < hits.getLength(); i++) {
        System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
      }
    } catch (Throwable t) {
      LOG.error("Exception occured while executing search: " + t, t);
      System.exit(1);
    }
    System.exit(0);
  }
Beispiel #2
0
 /** appends a sorted list of hits to an existing set of hits. */
 public void appendSingleHits(SingleHit[] hits, String prefix, int chrom) throws IOException {
   if (hits.length == 0) {
     return;
   }
   int s = getPositionsBuffer().limit() - 1;
   SingleHit last =
       new SingleHit(
           chrom,
           getPositionsBuffer().get(s),
           getWeightsBuffer().get(s),
           Hits.getStrandOne(getLASBuffer().get(s)),
           Hits.getLengthOne(getLASBuffer().get(s)));
   if (hits[0].compareTo(last) > 0) {
     append(hits, prefix, chrom);
   } else {
     merge(hits, prefix, chrom);
   }
 }
Beispiel #3
0
  private void merge(SingleHit[] hits, String prefix, int chrom) throws IOException {
    String postmp = getPositionsFname(prefix, chrom) + ".tmp";
    String weightstmp = getWeightsFname(prefix, chrom) + ".tmp";
    String lastmp = getLaSFname(prefix, chrom) + ".tmp";
    RandomAccessFile positionsRAF = new RandomAccessFile(postmp, "rw");
    RandomAccessFile weightsRAF = new RandomAccessFile(weightstmp, "rw");
    RandomAccessFile lasRAF = new RandomAccessFile(lastmp, "rw");
    int newsize = getPositionsBuffer().limit() + hits.length;
    IntBP posfile =
        new IntBP(positionsRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4));
    FloatBP weightfile =
        new FloatBP(weightsRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4));
    IntBP lasfile =
        new IntBP(lasRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4));

    int oldp = 0;
    int newp = 0;
    int pos = 0;
    IntBP oldpositions = getPositionsBuffer();
    FloatBP oldweights = getWeightsBuffer();
    IntBP oldlas = getLASBuffer();
    while (oldp < oldpositions.limit() || newp < hits.length) {
      while (newp < hits.length
          && (oldp == oldpositions.limit() || hits[newp].pos <= oldpositions.get(oldp))) {
        posfile.put(pos, hits[newp].pos);
        weightfile.put(pos, hits[newp].weight);
        lasfile.put(pos, Hits.makeLAS(hits[newp].length, hits[newp].strand));
        newp++;
        pos++;
      }
      while (oldp < oldpositions.limit()
          && (newp == hits.length || oldpositions.get(oldp) <= hits[newp].pos)) {
        posfile.put(pos, oldpositions.get(oldp));
        weightfile.put(pos, oldweights.get(oldp));
        lasfile.put(pos, oldlas.get(oldp));
        oldp++;
        pos++;
      }
      //            System.err.println(String.format("%d %d %d", pos, newp, oldp));
    }
    posfile = null;
    weightfile = null;
    lasfile = null;
    oldpositions = null;
    oldweights = null;
    oldlas = null;
    positionsRAF.close();
    weightsRAF.close();
    lasRAF.close();
    /* ideally this part with the renames would atomic... */
    (new File(postmp)).renameTo(new File(getPositionsFname(prefix, chrom)));
    (new File(weightstmp)).renameTo(new File(getWeightsFname(prefix, chrom)));
    (new File(lastmp)).renameTo(new File(getLaSFname(prefix, chrom)));
  }
Beispiel #4
0
  /**
   * Search for pages matching a query, eliminating excessive hits with matching values for a named
   * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining
   * hits have {@link Hit#moreFromDupExcluded()} set.
   *
   * <p>If maxHitsPerDup is zero then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(
      Query query,
      int numHits,
      int maxHitsPerDup,
      String dedupField,
      String sortField,
      boolean reverse)
      throws IOException {
    if (maxHitsPerDup <= 0) // disable dup checking
    return search(query, numHits, dedupField, sortField, reverse);

    final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
    int numHitsRaw = (int) (numHits * rawHitsFactor);
    if (LOG.isInfoEnabled()) {
      LOG.info("searching for " + numHitsRaw + " raw hits");
    }
    Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse);
    final long total = hits.getTotal();
    final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
    final List<Hit> resultList = new ArrayList<Hit>();
    final Set<Hit> seen = new HashSet<Hit>();
    final List<String> excludedValues = new ArrayList<String>();
    boolean totalIsExact = true;
    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
      // get the next raw hit
      if (rawHitNum >= hits.getLength()) {
        // optimize query by prohibiting more matches on some excluded values
        final Query optQuery = (Query) query.clone();
        for (int i = 0; i < excludedValues.size(); i++) {
          if (i == MAX_PROHIBITED_TERMS) break;
          optQuery.addProhibitedTerm(excludedValues.get(i), dedupField);
        }
        numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
          LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
        }
        hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse);
        if (LOG.isInfoEnabled()) {
          LOG.info("found " + hits.getTotal() + " raw hits");
        }
        rawHitNum = -1;
        continue;
      }

      final Hit hit = hits.getHit(rawHitNum);
      if (seen.contains(hit)) continue;
      seen.add(hit);

      // get dup hits for its value
      final String value = hit.getDedupValue();
      DupHits dupHits = dupToHits.get(value);
      if (dupHits == null) dupToHits.put(value, dupHits = new DupHits());

      // does this hit exceed maxHitsPerDup?
      if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
        if (!dupHits.maxSizeExceeded) {

          // mark prior hits with moreFromDupExcluded
          for (int i = 0; i < dupHits.size(); i++) {
            dupHits.get(i).setMoreFromDupExcluded(true);
          }
          dupHits.maxSizeExceeded = true;

          excludedValues.add(value); // exclude dup
        }
        totalIsExact = false;
      } else { // no -- collect the hit
        resultList.add(hit);
        dupHits.add(hit);

        // are we done?
        // we need to find one more than asked for, so that we can tell if
        // there are more hits to be shown
        if (resultList.size() > numHits) break;
      }
    }

    final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()]));
    results.setTotalIsExact(totalIsExact);
    return results;
  }
Beispiel #5
0
 public static void main(String args[]) throws IOException, ParseException {
   Hits hits = new Hits();
   hits.computeHits();
 }