import java.util.Iterator;
import java.util.concurrent.RejectedExecutionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.TupleEntry;

// Project-local imports (NullContext, ScoredUrlDatum, GroupedUrlDatum, BaseFetcher,
// BaseRobotsParser, BaseScoreGenerator, RobotUtils, ProcessRobotsTask, DiskQueue,
// GroupingKey, FetchCounters, ThreadedExecutor, LoggingFlowProcess,
// LoggingFlowReporter, UserAgent) are omitted, as their packages depend on the
// project layout.

/**
 * Filter out URLs either by domain (not popular enough) or because they're
 * blocked by robots.txt.
 */
@SuppressWarnings({"serial", "rawtypes"})
public class FilterAndScoreByUrlAndRobots extends BaseOperation<NullContext> implements Buffer<NullContext> {
    private static final Logger LOGGER = LoggerFactory.getLogger(FilterAndScoreByUrlAndRobots.class);

    private static final long COMMAND_TIMEOUT = RobotUtils.getMaxFetchTime();
    private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT;

    private static final int MAX_URLS_IN_MEMORY = 100;

    private BaseScoreGenerator _scorer;
    private BaseFetcher _fetcher;
    private BaseRobotsParser _parser;

    private transient ThreadedExecutor _executor;
    private transient LoggingFlowProcess _flowProcess;

    public FilterAndScoreByUrlAndRobots(UserAgent userAgent, int maxThreads, BaseRobotsParser parser,
                    BaseScoreGenerator scorer) {
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = RobotUtils.createFetcher(userAgent, maxThreads);
    }

    public FilterAndScoreByUrlAndRobots(BaseFetcher fetcher, BaseRobotsParser parser, BaseScoreGenerator scorer) {
        // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
        super(ScoredUrlDatum.FIELDS);

        _scorer = scorer;
        _parser = parser;
        _fetcher = fetcher;
    }

    @Override
    public boolean isSafe() {
        // We only want to fetch robots once.
        return false;
    }

    @SuppressWarnings("unchecked")
    @Override
    public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

        // FUTURE KKr - use the Cascading process vs. creating our own, once it
        // supports logging in local mode, and a setStatus() call.
        _flowProcess = new LoggingFlowProcess(flowProcess);
        _flowProcess.addReporter(new LoggingFlowReporter());
    }

    private synchronized void terminate() {
        if (_executor == null) {
            return;
        }

        try {
            if (!_executor.terminate(TERMINATE_TIMEOUT)) {
                LOGGER.warn("Had to do a hard shutdown of robots fetching");
            }
        } catch (InterruptedException e) {
            // FUTURE What's the right thing to do here? E.g. do I need to worry about
            // losing URLs still to be processed?
            LOGGER.warn("Interrupted while waiting for termination");
            Thread.currentThread().interrupt();
        } finally {
            _executor = null;
        }
    }

    @Override
    public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        LOGGER.info("Flushing FilterAndScoreByUrlAndRobots");
        terminate();

        super.flush(flowProcess, operationCall);
    }

    @Override
    public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
        LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");
        terminate();

        _flowProcess.dumpCounters();
        super.cleanup(flowProcess, operationCall);
    }

    @Override
    public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
        TupleEntry group = bufferCall.getGroup();
        String protocolAndDomain = group.getString(0);
        LOGGER.info("Processing tuple group: " + group);

        // Queue spills to disk past MAX_URLS_IN_MEMORY entries, so a huge
        // domain can't blow out memory.
        DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
        Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
        while (values.hasNext()) {
            urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
        }

        try {
            Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher, _parser,
                            bufferCall.getOutputCollector(), _flowProcess);
            _executor.execute(doRobots);
        } catch (RejectedExecutionException e) {
            // Should never happen.
            LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
            _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
            ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(),
                            flowProcess);
        } catch (Throwable t) {
            LOGGER.error("Caught an unexpected throwable - robots handling rejected our request for "
                            + protocolAndDomain, t);
            _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
            _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
            ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(),
                            flowProcess);
        }
    }
}
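// Usage sketch (illustrative, not from the original source): this Buffer expects
// its input tuples to arrive grouped by protocol+domain, and emits ScoredUrlDatum
// tuples for a downstream FetcherBuffer. The pipe and field names below ("urls",
// "groupingKey") are hypothetical placeholders; only the Every/GroupBy wiring
// follows the standard Cascading pattern.
//
//     Pipe urlPipe = new Pipe("urls");
//     urlPipe = new GroupBy(urlPipe, new Fields("groupingKey"));
//     urlPipe = new Every(urlPipe,
//             new FilterAndScoreByUrlAndRobots(userAgent, maxThreads, parser, scorer),
//             Fields.RESULTS);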