void emitLastRecord(Reporter reporter) throws IOException { if (_flags != 0) { if (_domainStats == null) { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1); } else { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1); } if (_crawlStatus != null) { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1); } else { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1); } } if (_contextURLBytes.getLength() >= 4097) { reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1); } else { GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString()); if (!skipRecord(urlObject, reporter)) { if (urlObject.has_query()) { reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1); } URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject); if (fp != null) { if (_emittedURLSFilter.isPresent(fp)) { reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1); } else { _emittedURLSFilter.add(fp); _emittedURLSInFilter++; SegmentGeneratorItem itemValue = new SegmentGeneratorItem(); itemValue.setDomainFP(fp.getDomainHash()); itemValue.setRootDomainFP(fp.getRootDomainHash()); itemValue.setUrlFP(fp.getUrlHash()); itemValue.setUrl(urlObject.getCanonicalURL()); itemValue.setPageRank(0); itemValue.setModifiedStatus((byte) 0); items.add(itemValue); if (items.size() >= SPILL_THRESHOLD) spillItems(reporter); } } else { reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1); } } } // reset stuff _flags = 0; _crawlStatus = null; _contextURLBytes.clear(); _blogURLSkipFlag.set(true); }
/**
 * Installs the stats object for the domain about to be processed and derives the
 * domain-level skip decisions from it.
 *
 * <p>Keys read from {@code domainStats}:
 * <ul>
 *   <li>{@code "dR"} - domain rank; treated as {@code 0.0} when absent</li>
 *   <li>{@code "urls"} - URL count; when absent no skip decision is made at all</li>
 *   <li>{@code "crawled"} - crawled count; treated as {@code 0} when absent</li>
 *   <li>{@code "200"} - HTTP 200 count; treated as {@code 0} when absent</li>
 * </ul>
 *
 * <p>Decisions, evaluated in this order:
 * <ol>
 *   <li>If the domain has URLs and crawl attempts but no HTTP 200 responses,
 *       {@code _skipEverythingButHomepage} is set.</li>
 *   <li>Otherwise the domain must reach a minimum rank that grows with its URL count, or
 *       {@code _skipDomain} is set: more than 25,000 URLs requires rank 3.0, more than
 *       250,000 requires 4.0, more than 1,000,000 requires 5.0.</li>
 * </ol>
 *
 * <p>This method only ever sets the skip flags; it never clears them.
 *
 * <p>Finally, once {@code _emittedURLSInFilter} has reached {@code FLUSH_THRESHOLD}, the
 * emitted-URL filter is cleared and its entry count reset.
 *
 * @param domainStats stats for the new domain; must not be {@code null}
 * @param reporter sink for the counters incremented here
 * @throws IOException part of the existing signature; nothing visible in this method throws it
 */
void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {
  _domainStats = domainStats;

  // A missing rank is treated as the lowest possible rank.
  final double domainRank = _domainStats.has("dR") ? _domainStats.get("dR").getAsDouble() : 0.0;
  _domainRank = domainRank;

  if (_domainStats.has("urls")) {
    final int urlCount = _domainStats.get("urls").getAsInt();
    // "crawled" is optional just like "200": JsonObject.get returns null for a missing member,
    // so reading it unguarded would fail with a NullPointerException.
    final int crawledCount =
        _domainStats.has("crawled") ? _domainStats.get("crawled").getAsInt() : 0;
    final int http200Count = _domainStats.has("200") ? _domainStats.get("200").getAsInt() : 0;

    if (urlCount != 0 && crawledCount != 0 && http200Count == 0) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
      LOG.info(
          "Skipping Everything But Homepage for Domain:"
              + _newDomainBytes.toString()
              + " CrawledCount:"
              + crawledCount
              + " HTTP200Count:"
              + http200Count
              + " URLCount:"
              + urlCount);
      _skipEverythingButHomepage = true;
    } else {
      // Minimum rank required for the domain's size. The tiers are checked largest first so
      // that every count above 25,000 lands in exactly one tier; boundary values fall into
      // the lower (more lenient) tier. 0.0 means the domain is too small to be rank-checked.
      final double requiredRank;
      if (urlCount > 1000000) {
        requiredRank = 5.0;
      } else if (urlCount > 250000) {
        requiredRank = 4.0;
      } else if (urlCount > 25000) {
        requiredRank = 3.0;
      } else {
        requiredRank = 0.0;
      }

      if (requiredRank > 0.0 && domainRank < requiredRank) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    }
  }

  // Start over with an empty emitted-URL filter once it has absorbed FLUSH_THRESHOLD entries.
  if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
    _emittedURLSFilter.clear();
    _emittedURLSInFilter = 0;
    reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
  }
}