boolean skipRecord(GoogleURL urlObject, Reporter reporter) { if (_skipDomain) { reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1); return true; } if (!urlObject.isValid()) { reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1); return true; } else if (urlObject.has_query()) { reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1); if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) == 0) { reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1); return true; } } else { // if redirect ... skip if ((_flags & HAS_REDIRECT_DATA) != 0) { reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1); return true; } if ((_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0) { if (!_skipEverythingButHomepage || ((_flags & HAS_HOMEPAGE_URLDATA) != 0)) { reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1); return false; } } if (_skipEverythingButHomepage) { reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1); return true; } if (_crawlStatus != null) { if (_crawlStatus.has("crawl_status")) { JsonObject realCrawlStatus = _crawlStatus.get("crawl_status").getAsJsonObject(); if (realCrawlStatus.has("http_result")) { int httpResult = realCrawlStatus.get("http_result").getAsInt(); if (httpResult == 200 || httpResult == 404) { if ((_flags & HAS_BLOGPROBE_URLDATA) != 0) { if (_blogURLSkipFlag.get()) { reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1); return true; } else { reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1); return false; } } else { reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1); return true; } } } } } } return false; }
void emitLastRecord(Reporter reporter) throws IOException { if (_flags != 0) { if (_domainStats == null) { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1); } else { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1); } if (_crawlStatus != null) { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1); } else { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1); } } if (_contextURLBytes.getLength() >= 4097) { reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1); } else { GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString()); if (!skipRecord(urlObject, reporter)) { if (urlObject.has_query()) { reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1); } URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject); if (fp != null) { if (_emittedURLSFilter.isPresent(fp)) { reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1); } else { _emittedURLSFilter.add(fp); _emittedURLSInFilter++; SegmentGeneratorItem itemValue = new SegmentGeneratorItem(); itemValue.setDomainFP(fp.getDomainHash()); itemValue.setRootDomainFP(fp.getRootDomainHash()); itemValue.setUrlFP(fp.getUrlHash()); itemValue.setUrl(urlObject.getCanonicalURL()); itemValue.setPageRank(0); itemValue.setModifiedStatus((byte) 0); items.add(itemValue); if (items.size() >= SPILL_THRESHOLD) spillItems(reporter); } } else { reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1); } } } // reset stuff _flags = 0; _crawlStatus = null; _contextURLBytes.clear(); _blogURLSkipFlag.set(true); }