void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter) throws IOException { Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null; int iterationCount = 0; while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) { reporter.incrCounter(Counters.GOT_RECORD, 1); int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject); PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes); PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes); if (_newURLBytes.compareTo(_contextURLBytes) != 0) { emitLastRecord(reporter); } long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString()); if (newDomainFP != _currentDomainId) { reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1); domainTransition(newDomainFP, _newDomainBytes.toString(), reporter); } RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null); switch (type) { case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS: { reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1); setDomainStats( rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter); } break; case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL: { reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1); rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes); _flags |= HAS_HOMEPAGE_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL: { reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1); rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag); _contextURLBytes.set(_newURLBytes); _flags |= HAS_BLOGPROBE_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_FEED_URL: { reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1); rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes); _flags |= HAS_FEED_URLDATA; } break; case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD: { reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1); _contextURLBytes.set(_newURLBytes); 
_flags |= HAS_REDIRECT_DATA; } break; case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA: { reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1); _contextURLBytes.set(_newURLBytes); _crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer); _flags |= HAS_CRAWL_STATUS; } break; } } // flush trailing record ... emitLastRecord(reporter); flushDomain(reporter); }
/**
 * Rebuilds the analytics tables managed by {@code tableManager}, reporting progress through
 * {@code notifier} and timing each phase with a {@link Clock}.
 *
 * <p>If {@code tableManager.validState()} returns a non-null message, that message is sent to the
 * notifier and nothing else is done. Otherwise the phases run in this fixed order: pre-create
 * work, create tables, populate, apply aggregation levels, create indexes, swap tables, clear
 * partition caches.
 *
 * @param lastYears passed to {@code PartitionUtils.getEarliestDate} to determine the earliest
 *     date for which tables are requested from the table manager
 * @param taskId identifies the task on whose behalf progress notifications are sent
 */
@Override
public void update(Integer lastYears, TaskId taskId) {
  int processes = getProcessNo();
  int orgUnitLevels = organisationUnitService.getNumberOfOrganisationalLevels();

  Clock clock =
      new Clock(log)
          .startClock()
          .logTime(
              "Starting update, processes: " + processes + ", org unit levels: " + orgUnitLevels);

  // Guard: a non-null state message means the update must not proceed.
  String invalidStateMessage = tableManager.validState();
  if (invalidStateMessage != null) {
    notifier.notify(taskId, invalidStateMessage);
    return;
  }

  Date earliestDate = PartitionUtils.getEarliestDate(lastYears);
  List<AnalyticsTable> analyticsTables = tableManager.getTables(earliestDate);
  String baseTableName = tableManager.getTableName();

  String startMessage =
      "Table update start: "
          + baseTableName
          + ", processes: "
          + processes
          + ", partitions: "
          + analyticsTables
          + ", last years: "
          + lastYears
          + ", earliest: "
          + earliestDate;
  clock.logTime(startMessage);

  // Phase 1: preparatory work before any table is created.
  String preCreateMessage =
      "Performing pre-create table work, processes: "
          + processes
          + ", org unit levels: "
          + orgUnitLevels;
  notifier.notify(taskId, preCreateMessage);
  tableManager.preCreateTables();
  clock.logTime("Performed pre-create table work");

  // Phase 2: create the tables.
  notifier.notify(taskId, "Creating analytics tables");
  createTables(analyticsTables);
  clock.logTime("Created analytics tables");

  // Phase 3: populate the tables.
  notifier.notify(taskId, "Populating analytics tables");
  populateTables(analyticsTables);
  clock.logTime("Populated analytics tables");

  // Phase 4: aggregation levels.
  notifier.notify(taskId, "Applying aggregation levels");
  applyAggregationLevels(analyticsTables);
  clock.logTime("Applied aggregation levels");

  // Phase 5: indexes.
  notifier.notify(taskId, "Creating indexes");
  createIndexes(analyticsTables);
  clock.logTime("Created indexes");

  // Phase 6: swap the tables in.
  notifier.notify(taskId, "Swapping analytics tables");
  swapTables(analyticsTables, clock, taskId);
  clock.logTime("Swapped tables");

  // Phase 7: clear partition caches and report completion.
  notifier.notify(taskId, "Clearing caches");
  partitionManager.clearCaches();
  clock.logTime("Table update done: " + baseTableName);
  notifier.notify(taskId, "Table update done");
}