void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
  int curpos = inputData.getOffset();
  int endpos = inputData.getOffset() + inputData.getLength();

  byte[] lfPattern = { 0xA };
  byte[] tabPattern = { 0x9 };

  TextBytes urlText = new TextBytes();

  while (curpos != endpos) {
    // locate the tab separating the domain hash from the url ...
    int tabIndex = ByteArrayUtils.indexOf(
        inputData.getBytes(), curpos, endpos - curpos, tabPattern);
    if (tabIndex == -1) {
      break;
    } else {
      // locate the line feed terminating this record ...
      int lfIndex = ByteArrayUtils.indexOf(
          inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
      if (lfIndex == -1) {
        break;
      } else {
        // parse the source domain hash preceding the tab ...
        long sourceDomainHash = ByteArrayUtils.parseLong(
            inputData.getBytes(), curpos, tabIndex - curpos, 10);
        // capture the url between the tab and the line feed ...
        urlText.set(inputData.getBytes(), tabIndex + 1, lfIndex - (tabIndex + 1));
        incoming.put(sourceDomainHash, urlText.toString());
        // advance past the line feed to the next record ...
        curpos = lfIndex + 1;
      }
    }
  }
}
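The record format consumed by this loop is one `<domainHash>\t<url>\n` entry per record. For clarity, here is an equivalent parse in plain Java over a decoded string; `incoming` is sketched as an ordinary `Map<Long, String>`, which is an assumption about the surrounding class, not something the snippet above confirms:

import java.util.HashMap;
import java.util.Map;

public class SampleRecordParseSketch {
  // hypothetical stand-in for the 'incoming' member used above
  private final Map<Long, String> incoming = new HashMap<>();

  void updateModelGivenUrlsSampleRecord(String inputData) {
    // each record is "<domainHash>\t<url>" terminated by a line feed
    for (String line : inputData.split("\n")) {
      int tab = line.indexOf('\t');
      if (tab == -1) {
        break; // mirror the byte-level loop: stop at the first malformed record
      }
      long sourceDomainHash = Long.parseLong(line.substring(0, tab));
      incoming.put(sourceDomainHash, line.substring(tab + 1));
    }
  }
}

The byte-level version above does the same work without ever materializing intermediate Strings, which matters when this runs per-record inside a MapReduce task.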
private static void rawValueToTextBytes(
    DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer, TextBytes textOut)
    throws IOException {
  // rewind the reusable input buffer over the raw serialized bytes ...
  inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
  // values are framed as a vint length prefix followed by the utf-8 bytes ...
  int newLength = WritableUtils.readVInt(inputBuffer);
  textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}
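A quick round trip illustrates the framing this helper expects: a VInt length prefix followed by the raw UTF-8 bytes. This sketch uses Hadoop's `Text` in place of `TextBytes` (assuming `TextBytes.set(byte[], int, int)` behaves the same way):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class VIntFramingSketch {
  public static void main(String[] args) throws IOException {
    byte[] payload = "http://example.com/".getBytes("UTF-8");

    // write side: vint length prefix, then the raw bytes ...
    DataOutputBuffer out = new DataOutputBuffer();
    WritableUtils.writeVInt(out, payload.length);
    out.write(payload);

    // read side: mirrors rawValueToTextBytes above ...
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    int length = WritableUtils.readVInt(in);
    Text text = new Text();
    text.set(in.getData(), in.getPosition(), length);

    System.out.println(text); // http://example.com/
  }
}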
@Test
public void testSourceInputOutputWriters() throws IOException {
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  _sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);

  String sourceDomainURL = "http://sourcedomain.com/foo";
  URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);

  String[] urls = { "http://somedomain.com/foo", "http://someother.com/bar" };

  for (String url : urls) {
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
    // double insert and validate actual single insertion
    trackPotentialLinkSource(fp, url, sourceFP);
    trackPotentialLinkSource(fp, url, sourceFP);
  }

  // validate data ...
  TextBytes firstVersion = new TextBytes();
  firstVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());

  StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
  int itemIndex = 0;
  while (tokenizer.hasMoreElements()) {
    String nextLine = tokenizer.nextToken();
    String[] splits = nextLine.split("\t");
    // validate fp
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
    Assert.assertEquals(fp.getDomainHash(), Long.parseLong(splits[0]));
    // validate actual url ...
    Assert.assertEquals(splits[1], urls[itemIndex]);
    itemIndex++;
  }

  // reset output buffer ...
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  // and source bloom filter ...
  _sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);

  importLinkSourceData(sourceFP, firstVersion);

  // second text should match first ..
  TextBytes secondVersion = new TextBytes();
  secondVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());
  Assert.assertEquals(firstVersion, secondVersion);
}
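The double insert in the loop works because `trackPotentialLinkSource` presumably consults `_sourceInputsTrackingFilter` before appending to `_sourceInputsBuffer`. A minimal sketch of that guard pattern, using Guava's `BloomFilter` rather than the project's `URLFPBloomFilter` (class and member names here are illustrative, not taken from the codebase):

import java.nio.charset.StandardCharsets;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class DedupTrackingSketch {
  private final BloomFilter<CharSequence> seen =
      BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 100_000);
  private final StringBuilder buffer = new StringBuilder();

  // append "<domainHash>\t<url>\n" only on first sight of the url ...
  void track(long domainHash, String url) {
    if (!seen.mightContain(url)) {
      seen.put(url);
      buffer.append(domainHash).append('\t').append(url).append('\n');
    }
  }
}

Since a Bloom filter can report false positives but never false negatives, this trade-off can (rarely) drop a genuinely new URL but never emits a duplicate, which is the invariant the test's double-insert asserts.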
void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter)
    throws IOException {
  Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
  int iterationCount = 0;

  while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
    reporter.incrCounter(Counters.GOT_RECORD, 1);

    // decompose the composite partition key into type, domain, and url ...
    int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject);
    PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes);
    PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes);

    // flush accumulated state when the url changes ...
    if (_newURLBytes.compareTo(_contextURLBytes) != 0) {
      emitLastRecord(reporter);
    }

    long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString());
    if (newDomainFP != _currentDomainId) {
      reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1);
      domainTransition(newDomainFP, _newDomainBytes.toString(), reporter);
    }

    RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null);

    // accumulate per-url state based on the record type ...
    switch (type) {
      case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS: {
        reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1);
        setDomainStats(
            rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter);
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL: {
        reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1);
        rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
        _flags |= HAS_HOMEPAGE_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL: {
        reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1);
        rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag);
        _contextURLBytes.set(_newURLBytes);
        _flags |= HAS_BLOGPROBE_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_FEED_URL: {
        reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1);
        rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
        _flags |= HAS_FEED_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD: {
        reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1);
        _contextURLBytes.set(_newURLBytes);
        _flags |= HAS_REDIRECT_DATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA: {
        reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1);
        _contextURLBytes.set(_newURLBytes);
        _crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer);
        _flags |= HAS_CRAWL_STATUS;
      } break;
    }
  }

  // flush trailing record ...
  emitLastRecord(reporter);
  flushDomain(reporter);
}
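The `_flags` bookkeeping above is a standard bitmask accumulator: each key type contributes one bit, and the emit path can later test exactly which record types arrived for the current URL. A small self-contained illustration of the pattern (the constant names mirror the ones used above, but the values here are assumptions made for the sketch):

public class FlagsSketch {
  static final int HAS_HOMEPAGE_URLDATA  = 1 << 0;
  static final int HAS_BLOGPROBE_URLDATA = 1 << 1;
  static final int HAS_FEED_URLDATA      = 1 << 2;
  static final int HAS_REDIRECT_DATA     = 1 << 3;
  static final int HAS_CRAWL_STATUS      = 1 << 4;

  public static void main(String[] args) {
    int flags = 0;

    // records for the same url OR their bits in as they arrive ...
    flags |= HAS_HOMEPAGE_URLDATA;
    flags |= HAS_CRAWL_STATUS;

    // the emit path can then test what it saw for this url ...
    System.out.println((flags & HAS_HOMEPAGE_URLDATA) != 0); // true
    System.out.println((flags & HAS_REDIRECT_DATA) != 0);    // false
  }
}

Presumably `emitLastRecord` clears `_flags` (and the related `_contextURLBytes` / `_crawlStatus` state) after each flush so that accumulation starts fresh for the next URL, though that reset is not shown in this excerpt.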