/**
 * Emit a FetchedDatum (with the given status appended to its tuple) for every URL being
 * skipped, and bump the skip counters.
 */
private void skipUrls(List<ScoredUrlDatum> urls, UrlStatus status, String traceMsg) {
    for (ScoredUrlDatum datum : urls) {
        FetchedDatum result = new FetchedDatum(datum);

        Tuple tuple = result.getTuple();
        tuple.add(status.toString());
        _collector.add(BixoPlatform.clone(tuple, _flowProcess));
    }

    _flowProcess.increment(FetchCounters.URLS_SKIPPED, urls.size());
    if (status == UrlStatus.SKIPPED_PER_SERVER_LIMIT) {
        _flowProcess.increment(FetchCounters.URLS_SKIPPED_PER_SERVER_LIMIT, urls.size());
    }

    if ((traceMsg != null) && LOGGER.isTraceEnabled()) {
        LOGGER.trace(String.format(traceMsg, urls.size()));
    }
}
@Override
public void cleanup(FlowProcess process, OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FetchBuffer");

    terminate();

    _flowProcess.dumpCounters();
    super.cleanup(process, operationCall);
}
@Override
public void cleanup(FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");

    terminate();

    _flowProcess.dumpCounters();
    super.cleanup(flowProcess, operationCall);
}
@SuppressWarnings("unchecked") @Override public void prepare( FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) { _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT); // FUTURE KKr - use Cascading process vs creating our own, once it // supports logging in local mode, and a setStatus() call. _flowProcess = new LoggingFlowProcess(flowProcess); _flowProcess.addReporter(new LoggingFlowReporter()); }
@Override
public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Queue that spills to disk once the group holds more than MAX_URLS_IN_MEMORY entries.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
        urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
        Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher, _parser,
                bufferCall.getOutputCollector(), _flowProcess);
        _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
        // should never happen.
        LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);

        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY,
                bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
        LOGGER.error("Caught an unexpected throwable - robots handling rejected our request for "
                + protocolAndDomain, t);

        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY,
                bufferCall.getOutputCollector(), flowProcess);
    }
}
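// Illustrative sketch only (plain java.util.concurrent, not Bixo's ThreadedExecutor): operate()
// above hands each domain group to a bounded executor and, if the pool rejects the task, empties
// the queue itself under a deferred grouping key. The pool sizes and sleep below are hypothetical;
// the point is that a SynchronousQueue-backed pool throws RejectedExecutionException once all
// workers are busy, which is the condition the catch block above handles.
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class BoundedExecutorSketch {
    public static void main(String[] args) throws InterruptedException {
        // Two worker threads and no task backlog: a submission is rejected as soon as both
        // workers are busy, rather than queuing up unbounded work.
        ThreadPoolExecutor executor = new ThreadPoolExecutor(2, 2, 0L, TimeUnit.MILLISECONDS,
                new SynchronousQueue<Runnable>());

        for (int i = 0; i < 5; i++) {
            final int taskId = i;
            try {
                executor.execute(new Runnable() {
                    public void run() {
                        try {
                            Thread.sleep(500); // stand-in for fetching/parsing robots.txt
                        } catch (InterruptedException e) {
                            Thread.currentThread().interrupt();
                        }
                    }
                });
            } catch (RejectedExecutionException e) {
                // The caller still owns the work it couldn't hand off - analogous to calling
                // ProcessRobotsTask.emptyQueue() with the deferred grouping key above.
                System.err.println("Task " + taskId + " rejected; deferring its URLs");
            }
        }

        executor.shutdown();
        executor.awaitTermination(5, TimeUnit.SECONDS);
    }
}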
@SuppressWarnings({"unchecked"}) @Override public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) { super.prepare(flowProcess, operationCall); _flowProcess = new LoggingFlowProcess(flowProcess); _flowProcess.addReporter(new LoggingFlowReporter()); _executor = new ThreadedExecutor( _fetcher.getMaxThreads(), _fetcher.getFetcherPolicy().getRequestTimeout()); _refLock = new Object(); _pendingRefs = new ConcurrentHashMap<String, Long>(); _activeRefs = new ConcurrentHashMap<String, Long>(); _keepCollecting = new AtomicBoolean(true); }