/** For debugging. */ public static void main(String[] args) throws Exception { final String usage = "NutchBean query"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } final Configuration conf = NutchConfiguration.create(); final NutchBean bean = new NutchBean(conf); try { final Query query = Query.parse(args[0], conf); final Hits hits = bean.search(query, 10); System.out.println("Total hits: " + hits.getTotal()); final int length = (int) Math.min(hits.getTotal(), 10); final Hit[] show = hits.getHits(0, length); final HitDetails[] details = bean.getDetails(show); final Summary[] summaries = bean.getSummary(details, query); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); } } catch (Throwable t) { LOG.error("Exception occured while executing search: " + t, t); System.exit(1); } System.exit(0); }
/** appends a sorted list of hits to an existing set of hits. */ public void appendSingleHits(SingleHit[] hits, String prefix, int chrom) throws IOException { if (hits.length == 0) { return; } int s = getPositionsBuffer().limit() - 1; SingleHit last = new SingleHit( chrom, getPositionsBuffer().get(s), getWeightsBuffer().get(s), Hits.getStrandOne(getLASBuffer().get(s)), Hits.getLengthOne(getLASBuffer().get(s))); if (hits[0].compareTo(last) > 0) { append(hits, prefix, chrom); } else { merge(hits, prefix, chrom); } }
private void merge(SingleHit[] hits, String prefix, int chrom) throws IOException { String postmp = getPositionsFname(prefix, chrom) + ".tmp"; String weightstmp = getWeightsFname(prefix, chrom) + ".tmp"; String lastmp = getLaSFname(prefix, chrom) + ".tmp"; RandomAccessFile positionsRAF = new RandomAccessFile(postmp, "rw"); RandomAccessFile weightsRAF = new RandomAccessFile(weightstmp, "rw"); RandomAccessFile lasRAF = new RandomAccessFile(lastmp, "rw"); int newsize = getPositionsBuffer().limit() + hits.length; IntBP posfile = new IntBP(positionsRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4)); FloatBP weightfile = new FloatBP(weightsRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4)); IntBP lasfile = new IntBP(lasRAF.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, newsize * 4)); int oldp = 0; int newp = 0; int pos = 0; IntBP oldpositions = getPositionsBuffer(); FloatBP oldweights = getWeightsBuffer(); IntBP oldlas = getLASBuffer(); while (oldp < oldpositions.limit() || newp < hits.length) { while (newp < hits.length && (oldp == oldpositions.limit() || hits[newp].pos <= oldpositions.get(oldp))) { posfile.put(pos, hits[newp].pos); weightfile.put(pos, hits[newp].weight); lasfile.put(pos, Hits.makeLAS(hits[newp].length, hits[newp].strand)); newp++; pos++; } while (oldp < oldpositions.limit() && (newp == hits.length || oldpositions.get(oldp) <= hits[newp].pos)) { posfile.put(pos, oldpositions.get(oldp)); weightfile.put(pos, oldweights.get(oldp)); lasfile.put(pos, oldlas.get(oldp)); oldp++; pos++; } // System.err.println(String.format("%d %d %d", pos, newp, oldp)); } posfile = null; weightfile = null; lasfile = null; oldpositions = null; oldweights = null; oldlas = null; positionsRAF.close(); weightsRAF.close(); lasRAF.close(); /* ideally this part with the renames would atomic... */ (new File(postmp)).renameTo(new File(getPositionsFname(prefix, chrom))); (new File(weightstmp)).renameTo(new File(getWeightsFname(prefix, chrom))); (new File(lastmp)).renameTo(new File(getLaSFname(prefix, chrom))); }
/** * Search for pages matching a query, eliminating excessive hits with matching values for a named * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining * hits have {@link Hit#moreFromDupExcluded()} set. * * <p>If maxHitsPerDup is zero then all hits are returned. * * @param query query * @param numHits number of requested hits * @param maxHitsPerDup the maximum hits returned with matching values, or zero * @param dedupField field name to check for duplicates * @param sortField Field to sort on (or null if no sorting). * @param reverse True if we are to reverse sort by <code>sortField</code>. * @return Hits the matching hits * @throws IOException */ public Hits search( Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse) throws IOException { if (maxHitsPerDup <= 0) // disable dup checking return search(query, numHits, dedupField, sortField, reverse); final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f); int numHitsRaw = (int) (numHits * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("searching for " + numHitsRaw + " raw hits"); } Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse); final long total = hits.getTotal(); final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>(); final List<Hit> resultList = new ArrayList<Hit>(); final Set<Hit> seen = new HashSet<Hit>(); final List<String> excludedValues = new ArrayList<String>(); boolean totalIsExact = true; for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { // get the next raw hit if (rawHitNum >= hits.getLength()) { // optimize query by prohibiting more matches on some excluded values final Query optQuery = (Query) query.clone(); for (int i = 0; i < excludedValues.size(); i++) { if (i == MAX_PROHIBITED_TERMS) break; optQuery.addProhibitedTerm(excludedValues.get(i), dedupField); } numHitsRaw = (int) (numHitsRaw * rawHitsFactor); if (LOG.isInfoEnabled()) { LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery); } hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse); if (LOG.isInfoEnabled()) { LOG.info("found " + hits.getTotal() + " raw hits"); } rawHitNum = -1; continue; } final Hit hit = hits.getHit(rawHitNum); if (seen.contains(hit)) continue; seen.add(hit); // get dup hits for its value final String value = hit.getDedupValue(); DupHits dupHits = dupToHits.get(value); if (dupHits == null) dupToHits.put(value, dupHits = new DupHits()); // does this hit exceed maxHitsPerDup? if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit if (!dupHits.maxSizeExceeded) { // mark prior hits with moreFromDupExcluded for (int i = 0; i < dupHits.size(); i++) { dupHits.get(i).setMoreFromDupExcluded(true); } dupHits.maxSizeExceeded = true; excludedValues.add(value); // exclude dup } totalIsExact = false; } else { // no -- collect the hit resultList.add(hit); dupHits.add(hit); // are we done? // we need to find one more than asked for, so that we can tell if // there are more hits to be shown if (resultList.size() > numHits) break; } } final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()])); results.setTotalIsExact(totalIsExact); return results; }
public static void main(String args[]) throws IOException, ParseException { Hits hits = new Hits(); hits.computeHits(); }