void emitLastRecord(Reporter reporter) throws IOException {
  if (_flags != 0) {
    if (_domainStats == null) {
      reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS, 1);
    } else {
      reporter.incrCounter(Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS, 1);
    }

    if (_crawlStatus != null) {
      reporter.incrCounter(Counters.EMITTED_RECORD_HAD_CRAWLSTATUS, 1);
    } else {
      reporter.incrCounter(Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS, 1);
    }
  }

  // skip URLs whose byte length exceeds the limit
  if (_contextURLBytes.getLength() >= 4097) {
    reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
  } else {
    GoogleURL urlObject = new GoogleURL(_contextURLBytes.toString());

    if (!skipRecord(urlObject, reporter)) {
      if (urlObject.has_query()) {
        reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
      }

      // fingerprint the canonical URL
      URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);

      if (fp != null) {
        if (_emittedURLSFilter.isPresent(fp)) {
          reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
        } else {
          // first time we have seen this URL - remember it and emit a segment item
          _emittedURLSFilter.add(fp);
          _emittedURLSInFilter++;

          SegmentGeneratorItem itemValue = new SegmentGeneratorItem();
          itemValue.setDomainFP(fp.getDomainHash());
          itemValue.setRootDomainFP(fp.getRootDomainHash());
          itemValue.setUrlFP(fp.getUrlHash());
          itemValue.setUrl(urlObject.getCanonicalURL());
          itemValue.setPageRank(0);
          itemValue.setModifiedStatus((byte) 0);

          items.add(itemValue);

          if (items.size() >= SPILL_THRESHOLD) {
            spillItems(reporter);
          }
        }
      } else {
        reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
      }
    }
  }

  // reset stuff
  _flags = 0;
  _crawlStatus = null;
  _contextURLBytes.clear();
  _blogURLSkipFlag.set(true);
}
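// Illustrative sketch (not part of the original code): the emit path above suppresses
// duplicate URLs with a check-then-add pass over _emittedURLSFilter. The helper below shows
// that pattern in isolation, using only calls that already appear in this file; the method
// name markURLIfNew and the parameterized filter are hypothetical.
static boolean markURLIfNew(URLFPBloomFilter emittedFilter, String canonicalURL) {
  URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(canonicalURL);
  if (fp == null) {
    // mirrors the NULL_FP_FOR_URL case above: nothing to dedupe on
    return false;
  }
  if (emittedFilter.isPresent(fp)) {
    // already emitted (or a bloom filter false positive) - skip
    return false;
  }
  // first sighting: remember it so later occurrences are skipped
  emittedFilter.add(fp);
  return true;
}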
@Test
public void testSourceInputOutputWriters() throws IOException {
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  _sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);

  String sourceDomainURL = "http://sourcedomain.com/foo";
  URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);

  String[] urls = { "http://somedomain.com/foo", "http://someother.com/bar" };

  for (String url : urls) {
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
    // double insert and validate actual single insertion
    trackPotentialLinkSource(fp, url, sourceFP);
    trackPotentialLinkSource(fp, url, sourceFP);
  }

  // validate data ...
  TextBytes firstVersion = new TextBytes();
  firstVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());

  StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
  int itemIndex = 0;
  while (tokenizer.hasMoreElements()) {
    String nextLine = tokenizer.nextToken();
    String[] splits = nextLine.split("\t");
    // validate fp
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
    Assert.assertEquals(fp.getDomainHash(), Long.parseLong(splits[0]));
    // validate actual url ...
    Assert.assertEquals(splits[1], urls[itemIndex]);
    itemIndex++;
  }

  // reset output buffer ...
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  // and source bloom filter ...
  _sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);

  importLinkSourceData(sourceFP, firstVersion);

  // second text should match first ..
  TextBytes secondVersion = new TextBytes();
  secondVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());

  Assert.assertEquals(firstVersion, secondVersion);
}
/**
 * update the model from the raw (generated) tuples
 *
 * @param tuple
 * @throws Exception
 */
void updateModelFromInputTuple(Pair<TextBytes, TextBytes> tuple) throws Exception {
  URLFPV2 fp = new URLFPV2();

  // get key ...
  fp.setRootDomainHash(
      CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
  fp.setDomainHash(
      CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
  fp.setUrlHash(
      CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));

  long recordType =
      CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

  if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()
      || recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {

    // update model given key ...
    URLStateModel urlModel = fpToModelMap.get(fp);
    if (urlModel == null) {
      urlModel = new URLStateModel();
      urlModel.fp = fp;
      fpToModelMap.put(fp, urlModel);
    }

    if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
      JsonObject redirectJSON = urlModel.updateModelGivenCrawlStatus(tuple.e1);
      // a crawl status may describe a redirect; if so, re-process it as a
      // crawl-status tuple keyed by the redirect source
      if (redirectJSON != null) {
        URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectJSON.get("source_url").getAsString());
        TextBytes key = CrawlDBKey.generateKey(
            redirectFP,
            CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,
            redirectJSON.get("attempt_time").getAsLong());
        Pair<TextBytes, TextBytes> redirectTuple =
            new Pair<TextBytes, TextBytes>(key, new TextBytes(redirectJSON.toString()));
        updateModelFromInputTuple(redirectTuple);
      }
    } else if (recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {
      urlModel.updateModelGivenLinkRecord(tuple.e1);
    }
  }
}
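// Illustrative sketch (not part of the original code): updateModelFromInputTuple relies on
// CrawlDBKey packing the URL fingerprint and record type into the tuple key. The snippet
// below round-trips a crawl-status key using only the CrawlDBKey calls with the signatures
// shown above; the method name and example URL are hypothetical.
void demonstrateKeyRoundTrip() throws Exception {
  URLFPV2 fp = URLUtils.getURLFPV2FromURL("http://example.com/page");
  if (fp != null) {
    TextBytes key = CrawlDBKey.generateKey(fp, CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS, 0L);
    long rootDomainHash =
        CrawlDBKey.getLongComponentFromKey(key, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
    long recordType =
        CrawlDBKey.getLongComponentFromKey(key, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
    // the key carries back the same root domain hash and the crawl-status type ordinal,
    // which is exactly what the model update above branches on
    assert rootDomainHash == fp.getRootDomainHash();
    assert recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal();
  }
}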
public void updateModelGivenLinkRecord(TextBytes linkJSON) {
  JsonParser parser = new JsonParser();
  JsonObject jsonObj = parser.parse(linkJSON.toString()).getAsJsonObject();

  // initialize source_url from the link's href if it has not been set yet
  if (source_url == null) {
    source_url = jsonObj.get("href").getAsString();
  }

  String sourceURL = jsonObj.get("source_url").getAsString();
  URLFPV2 urlfp = URLUtils.getURLFPV2FromURL(sourceURL);

  if (urlfp != null) {
    // only track links from a different root domain, and only remember the
    // first source url seen per root domain
    if (urlfp.getRootDomainHash() != fp.getRootDomainHash()) {
      if (!incoming.containsKey(urlfp.getRootDomainHash())) {
        incoming.put(urlfp.getRootDomainHash(), sourceURL);
      }
    }
  }
}
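// Illustrative sketch (not part of the original code): updateModelGivenLinkRecord expects a
// JSON link record carrying at least the "href" and "source_url" fields it reads above. The
// example below builds such a record with Gson and feeds it to the method; the URLs and the
// method name are hypothetical.
void demonstrateLinkRecordShape() {
  JsonObject link = new JsonObject();
  link.addProperty("href", "http://targetdomain.com/page");
  link.addProperty("source_url", "http://linkingdomain.com/referrer");
  // the first link from linkingdomain.com's root domain is recorded in `incoming`;
  // a second record from the same root domain is ignored by the containsKey check above
  updateModelGivenLinkRecord(new TextBytes(link.toString()));
}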