void emitLastRecord(Reporter reporter) throws IOException { if (_flags != 0) { if (_domainStats == null) { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1); } else { reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1); } if (_crawlStatus != null) { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1); } else { reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1); } } if (_contextURLBytes.getLength() >= 4097) { reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1); } else { GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString()); if (!skipRecord(urlObject, reporter)) { if (urlObject.has_query()) { reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1); } URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject); if (fp != null) { if (_emittedURLSFilter.isPresent(fp)) { reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1); } else { _emittedURLSFilter.add(fp); _emittedURLSInFilter++; SegmentGeneratorItem itemValue = new SegmentGeneratorItem(); itemValue.setDomainFP(fp.getDomainHash()); itemValue.setRootDomainFP(fp.getRootDomainHash()); itemValue.setUrlFP(fp.getUrlHash()); itemValue.setUrl(urlObject.getCanonicalURL()); itemValue.setPageRank(0); itemValue.setModifiedStatus((byte) 0); items.add(itemValue); if (items.size() >= SPILL_THRESHOLD) spillItems(reporter); } } else { reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1); } } } // reset stuff _flags = 0; _crawlStatus = null; _contextURLBytes.clear(); _blogURLSkipFlag.set(true); }
/**
 * Decodes a raw record value into {@code tempTextBuffer} and parses that text as a JSON object.
 *
 * @param dataBuffer buffer holding the raw value bytes
 * @param stream reusable input buffer used to read the value
 * @param tempTextBuffer scratch text buffer; holds the decoded text on return
 * @return the parsed JSON object
 * @throws IOException if the value cannot be decoded, or if the text is not a JSON object; in the
 *     latter case the original parser exception is attached as the cause
 */
private JsonObject rawValueToJsonObject(
    DataOutputBuffer dataBuffer, DataInputBuffer stream, TextBytes tempTextBuffer)
    throws IOException {
  rawValueToTextBytes(dataBuffer, stream, tempTextBuffer);
  try {
    return parser.parse(tempTextBuffer.toString()).getAsJsonObject();
  } catch (Exception e) {
    // keep the underlying failure as the cause so the real parse error is not lost
    throw new IOException(
        "Exception Building Json from String:" + tempTextBuffer.toString(), e);
  }
}
/**
 * Parses a block of sample records and stores each one in {@code incoming}.
 *
 * <p>Each record is read as {@code <decimal long> TAB <url text> LF}. Parsing stops at the first
 * record that lacks either the tab or the terminating line feed, so a trailing unterminated
 * record is ignored.
 *
 * @param inputData text buffer containing zero or more records
 */
void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
  final byte[] tab = {0x9};
  final byte[] lineFeed = {0xA};

  final byte[] raw = inputData.getBytes();
  final int limit = inputData.getOffset() + inputData.getLength();
  TextBytes urlScratch = new TextBytes();

  int cursor = inputData.getOffset();
  while (cursor != limit) {
    int tabAt = ByteArrayUtils.indexOf(raw, cursor, limit - cursor, tab);
    if (tabAt == -1) {
      break;
    }

    int urlStart = tabAt + 1;
    int lineFeedAt = ByteArrayUtils.indexOf(raw, urlStart, limit - urlStart, lineFeed);
    if (lineFeedAt == -1) {
      break;
    }

    // bytes before the tab are the base-10 hash, bytes after it (up to the LF) are the url
    long sourceDomainHash = ByteArrayUtils.parseLong(raw, cursor, tabAt - cursor, 10);
    urlScratch.set(raw, urlStart, lineFeedAt - urlStart);
    incoming.put(sourceDomainHash, urlScratch.toString());

    cursor = lineFeedAt + 1;
  }
}
/**
 * Installs the stats object for the current domain and derives the skip decisions from it.
 *
 * <p>Reads the optional members {@code "dR"} (domain rank, default 0.0), {@code "urls"},
 * {@code "crawled"} and {@code "200"} (both default 0 when absent). When {@code "urls"} is
 * present:
 *
 * <ul>
 *   <li>a domain with URLs and crawl attempts but zero HTTP 200 results sets
 *       {@code _skipEverythingButHomepage};
 *   <li>otherwise a domain whose URL count falls in one of the bands below sets
 *       {@code _skipDomain} when it has no rank or its rank is below the band's minimum.
 * </ul>
 *
 * <p>Finally, the emitted-URL filter is cleared once {@code _emittedURLSInFilter} has reached
 * {@code FLUSH_THRESHOLD}.
 *
 * @param domainStats stats object for the domain; must not be null
 * @param reporter sink for the progress counters
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {
  _domainStats = domainStats;

  boolean hasDomainRank = _domainStats.has("dR");
  _domainRank = hasDomainRank ? _domainStats.get("dR").getAsDouble() : 0.0;

  if (_domainStats.has("urls")) {
    int urlCount = _domainStats.get("urls").getAsInt();
    // "crawled" is treated as optional, like "200", so a missing member does not throw
    int crawledCount = _domainStats.has("crawled") ? _domainStats.get("crawled").getAsInt() : 0;
    int http200Count = _domainStats.has("200") ? _domainStats.get("200").getAsInt() : 0;

    if (urlCount != 0 && crawledCount != 0 && http200Count == 0) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
      LOG.info(
          "Skipping Everything But Homepage for Domain:"
              + _newDomainBytes.toString()
              + " CrawledCount:"
              + crawledCount
              + " HTTP200Count:"
              + http200Count
              + " URLCount:"
              + urlCount);
      _skipEverythingButHomepage = true;
    } else {
      // Minimum domain rank required for this URL count; 0.0 means "no requirement".
      // NOTE(review): the bands leave 100000..250000 and exactly 1000000 uncovered; they are
      // reproduced unchanged here -- confirm whether those gaps are intentional.
      double requiredRank = 0.0;
      if (urlCount > 25000 && urlCount < 100000) {
        requiredRank = 3.0;
      } else if (urlCount > 250000 && urlCount < 1000000) {
        requiredRank = 4.0;
      } else if (urlCount > 1000000) {
        requiredRank = 5.0;
      }

      if (requiredRank > 0.0 && (!hasDomainRank || _domainRank < requiredRank)) {
        LOG.info("Skipping Domain:" + _newDomainBytes.toString());
        reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
        _skipDomain = true;
      }
    }
  }

  if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
    _emittedURLSFilter.clear();
    _emittedURLSInFilter = 0;
    reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
  }
}
/**
 * Reads a length-prefixed text value out of {@code dataBuffer} into {@code textOut}.
 *
 * <p>The value is read as a variable-length int giving the text length, followed by the text
 * bytes themselves.
 *
 * @param dataBuffer buffer holding the raw value
 * @param inputBuffer reusable input buffer; reset to wrap {@code dataBuffer}'s contents
 * @param textOut receives the text bytes
 * @throws IOException if the length prefix cannot be read
 */
private static void rawValueToTextBytes(
    DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer, TextBytes textOut)
    throws IOException {
  byte[] rawBytes = dataBuffer.getData();
  int rawLength = dataBuffer.getLength();
  inputBuffer.reset(rawBytes, rawLength);

  // after the vint is consumed, the buffer position marks where the text starts
  int textLength = WritableUtils.readVInt(inputBuffer);
  int textStart = inputBuffer.getPosition();
  textOut.set(inputBuffer.getData(), textStart, textLength);
}
@Test public void testSourceInputOutputWriters() throws IOException { _sourceInputsBuffer = new DataOutputBuffer(16348 * 4); _sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS); String sourceDomainURL = "http://sourcedomain.com/foo"; URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL); String urls[] = {"http://somedomain.com/foo", "http://someother.com/bar"}; for (String url : urls) { URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url); // double insert and validate actual single insertion trackPotentialLinkSource(fp, url, sourceFP); trackPotentialLinkSource(fp, url, sourceFP); } // validate data ... TextBytes firstVersion = new TextBytes(); firstVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength()); StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n"); int itemIndex = 0; while (tokenizer.hasMoreElements()) { String nextLine = tokenizer.nextToken(); String splits[] = nextLine.split("\t"); // validate fp URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]); Assert.assertEquals(fp.getDomainHash(), Long.parseLong(splits[0])); // validate actual url ... Assert.assertEquals(splits[1], urls[itemIndex]); itemIndex++; } // reset output buffer ... _sourceInputsBuffer = new DataOutputBuffer(16348 * 4); // and source bloom filter ... _sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS); importLinkSourceData(sourceFP, firstVersion); // second text should match first .. TextBytes secondVersion = new TextBytes(); secondVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength()); Assert.assertEquals(firstVersion, secondVersion); }
/**
 * Folds one crawl-status record into this model.
 *
 * <p>Marks the model as having crawl status, builds a crawl-detail record from the status JSON,
 * updates the attempt/crawl timestamps and counters, and appends the detail record to
 * {@code details}. For 2xx results that are the latest crawl, {@code parsed_as} and
 * {@code ext_urls} are replaced; for 3xx results {@code redirect_url} is set (or nulled when
 * the detail record has no redirect URL).
 *
 * @param statusJSON serialized crawl-status JSON object
 * @return a newly built redirect record when the status JSON has a {@code "redirect_from"}
 *     member, otherwise {@code null}
 * @throws IOException declared by the signature; presumably raised by the helpers called here
 */
JsonObject updateModelGivenCrawlStatus(TextBytes statusJSON) throws IOException {
  has_crawl_status = true;

  JsonObject statusRecord = new JsonParser().parse(statusJSON.toString()).getAsJsonObject();
  if (source_url == null) {
    source_url = statusRecord.get("source_url").getAsString();
  }

  HashSet<String> externalLinks = new HashSet<String>();
  JsonObject detail =
      crawlDetailRecordFromCrawlStatusRecord(
          statusRecord, fp, externalLinks, new MockCollectorReporter());

  long attemptTime = safeGetLong(detail, CRAWLDETAIL_ATTEMPT_TIME_PROPERTY);
  latest_attempt_time = Math.max(attemptTime, latest_attempt_time);
  attempt_count++;

  int httpResult = safeGetInteger(detail, CRAWLDETAIL_HTTPRESULT_PROPERTY);
  if (httpResult != -1) {
    // only the most recent attempt decides the summary http result
    if (latest_attempt_time == attemptTime) {
      this.http_result = httpResult;
    }

    boolean isSuccess = httpResult >= 200 && httpResult <= 299;
    boolean isRedirect = httpResult >= 300 && httpResult <= 399;
    if (isSuccess) {
      latest_crawl_time = Math.max(attemptTime, latest_crawl_time);
      crawl_count++;
      if (latest_crawl_time == attemptTime) {
        this.parsed_as = detail.get(CRAWLDETAIL_PARSEDAS_PROPERTY).getAsString();
        this.ext_urls.clear();
        this.ext_urls.addAll(externalLinks);
      }
    } else if (isRedirect) {
      if (detail.get(CRAWLDETAIL_REDIRECT_URL) != null) {
        this.redirect_url = detail.get(CRAWLDETAIL_REDIRECT_URL).getAsString();
      } else {
        this.redirect_url = null;
      }
    }
  }
  this.details.add(detail);

  if (!statusRecord.has("redirect_from")) {
    return null;
  }

  // describe the redirect hop that led to this url
  JsonObject redirectSource = statusRecord.get("redirect_from").getAsJsonObject();
  JsonObject redirectRecord = new JsonObject();
  int redirectHttpResult = redirectSource.get("http_result").getAsInt();
  redirectRecord.addProperty("disposition", "SUCCESS");
  redirectRecord.addProperty("http_result", redirectHttpResult);
  redirectRecord.addProperty("server_ip", redirectSource.get("server_ip").getAsString());
  redirectRecord.addProperty("attempt_time", statusRecord.get("attempt_time").getAsLong());
  redirectRecord.addProperty("target_url", statusRecord.get("source_url").getAsString());
  redirectRecord.addProperty("source_url", redirectSource.get("source_url").getAsString());
  return redirectRecord;
}
/**
 * Folds one link record into this model.
 *
 * <p>If {@code source_url} is still unset it is taken from the record's {@code "href"} member.
 * The record's {@code "source_url"} is then registered in {@code incoming}, keyed by its root
 * domain hash, but only when it fingerprints successfully, belongs to a different root domain
 * than {@code fp}, and that root domain has no entry yet.
 *
 * @param linkJSON serialized link JSON object
 */
public void updateModelGivenLinkRecord(TextBytes linkJSON) {
  JsonObject linkRecord = new JsonParser().parse(linkJSON.toString()).getAsJsonObject();
  if (source_url == null) {
    source_url = linkRecord.get("href").getAsString();
  }

  String linkingURL = linkRecord.get("source_url").getAsString();
  URLFPV2 linkingFP = URLUtils.getURLFPV2FromURL(linkingURL);
  if (linkingFP == null) {
    return;
  }
  // links from the same root domain are not recorded
  if (linkingFP.getRootDomainHash() == fp.getRootDomainHash()) {
    return;
  }
  // the first url seen for a root domain wins
  if (incoming.containsKey(linkingFP.getRootDomainHash())) {
    return;
  }
  incoming.put(linkingFP.getRootDomainHash(), linkingURL);
}
/**
 * Loads this model's state from a previously merged record.
 *
 * <p>{@code source_url} is always read. The remaining fields are read only when the record has
 * a summary object; optional summary members fall back to 0 ({@code crawl_count}), -1
 * ({@code http_result}) or {@code null} ({@code parsed_as}, {@code redirect_url}).
 *
 * @param mergedJSON serialized merged JSON object
 * @throws IOException declared by the signature; presumably raised by the helpers called here
 */
void updateModelGivenMergedRecord(TextBytes mergedJSON) throws IOException {
  JsonParser jsonParser = new JsonParser();
  JsonObject merged = jsonParser.parse(mergedJSON.toString()).getAsJsonObject();

  source_url = merged.get(TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
  has_crawl_status = merged.has(TOPLEVEL_SUMMARYRECORD_PROPRETY);
  if (!has_crawl_status) {
    return;
  }

  JsonObject summary = merged.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);

  latest_attempt_time = summary.get(SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY).getAsLong();
  latest_crawl_time = safeGetLong(summary, SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY);
  attempt_count = summary.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt();

  if (summary.has(SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) {
    crawl_count = summary.get(SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt();
  } else {
    crawl_count = 0;
  }

  if (summary.has(SUMMARYRECORD_PARSEDAS_PROPERTY)) {
    parsed_as = summary.get(SUMMARYRECORD_PARSEDAS_PROPERTY).getAsString();
  } else {
    parsed_as = null;
  }

  if (summary.has(SUMMARYRECORD_HTTP_RESULT_PROPERTY)) {
    http_result = summary.get(SUMMARYRECORD_HTTP_RESULT_PROPERTY).getAsInt();
  } else {
    http_result = -1;
  }

  if (summary.has(SUMMARYRECORD_REDIRECT_URL_PROPERTY)) {
    redirect_url = summary.get(SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString();
  } else {
    redirect_url = null;
  }

  safeJsonArrayToStringCollection(summary, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, ext_urls);

  if (summary.has(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
    for (JsonElement detailElement :
        summary.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
      details.add(detailElement.getAsJsonObject());
    }
  }
}
/**
 * Records a key/value pair by appending fresh copies of both to {@code items}.
 *
 * <p>New {@link TextBytes} instances are built from the string form of each argument, so the
 * stored pair does not share the argument objects (presumably because callers reuse them --
 * confirm against the caller).
 */
@Override
public void collect(TextBytes key, TextBytes value) throws IOException {
  TextBytes keyCopy = new TextBytes(key.toString());
  TextBytes valueCopy = new TextBytes(value.toString());
  items.add(new Pair<TextBytes, TextBytes>(keyCopy, valueCopy));
}
void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter) throws IOException { Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null; int iterationCount = 0; while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) { reporter.incrCounter(Counters.GOT_RECORD, 1); int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject); PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes); PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes); if (_newURLBytes.compareTo(_contextURLBytes) != 0) { emitLastRecord(reporter); } long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString()); if (newDomainFP != _currentDomainId) { reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1); domainTransition(newDomainFP, _newDomainBytes.toString(), reporter); } RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null); switch (type) { case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS: { reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1); setDomainStats( rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter); } break; case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL: { reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1); rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes); _flags |= HAS_HOMEPAGE_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL: { reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1); rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag); _contextURLBytes.set(_newURLBytes); _flags |= HAS_BLOGPROBE_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_FEED_URL: { reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1); rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes); _flags |= HAS_FEED_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD: { reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1); _contextURLBytes.set(_newURLBytes); 
_flags |= HAS_REDIRECT_DATA; } break; case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA: { reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1); _contextURLBytes.set(_newURLBytes); _crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer); _flags |= HAS_CRAWL_STATUS; } break; } } // flush trailing record ... emitLastRecord(reporter); flushDomain(reporter); }