/** generate a bundle from the given list of items and simultaneously flush it * */ void generateABundle(long domainFP, List<SegmentGeneratorItem> items, Reporter reporter) throws IOException { SegmentGeneratorItemBundle bundle = getBundleForDomain(domainFP); // LOG.info("Generating Bundle:" + currentBundleId + " for DH:" + domainFP); float maxPageRank = 0.0f; for (SegmentGeneratorItem item : items) { // LOG.info("URL:" + item.getUrl() + " Status:" + // CrawlDatum.getStatusName(item.getStatus()) +" PR:" + // item.getMetadata().getPageRank()); bundle.getUrls().add(item); currentDomainURLCount++; maxPageRank = Math.max(maxPageRank, item.getPageRank()); if (currentDomainURLCount <= 200) { urlDebugURLWriter.append( item.getUrl() + "\t" + item.getModifiedStatus() + "\t" + item.getPageRank() + "\n"); } } // LOG.info("Done Generating Bunlde - PR is:" + maxPageRank); // set page rank for bundle bundle.setMaxPageRank(maxPageRank); flushCurrentBundle(reporter); }
void emitLastRecord(Reporter reporter) throws IOException { if (_flags != 0) { if (_domainStats == null) { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1); } else { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1); } if (_crawlStatus != null) { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1); } else { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1); } } if (_contextURLBytes.getLength() >= 4097) { reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1); } else { GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString()); if (!skipRecord(urlObject, reporter)) { if (urlObject.has_query()) { reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1); } URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject); if (fp != null) { if (_emittedURLSFilter.isPresent(fp)) { reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1); } else { _emittedURLSFilter.add(fp); _emittedURLSInFilter++; SegmentGeneratorItem itemValue = new SegmentGeneratorItem(); itemValue.setDomainFP(fp.getDomainHash()); itemValue.setRootDomainFP(fp.getRootDomainHash()); itemValue.setUrlFP(fp.getUrlHash()); itemValue.setUrl(urlObject.getCanonicalURL()); itemValue.setPageRank(0); itemValue.setModifiedStatus((byte) 0); items.add(itemValue); if (items.size() >= SPILL_THRESHOLD) spillItems(reporter); } } else { reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1); } } } // reset stuff _flags = 0; _crawlStatus = null; _contextURLBytes.clear(); _blogURLSkipFlag.set(true); }