/**
 * Filters out URLs either by domain (not popular enough) or because they are blocked by
 * robots.txt.
 */
@SuppressWarnings({"serial", "rawtypes"})
public class FilterAndScoreByUrlAndRobots extends BaseOperation<NullContext>
    implements Buffer<NullContext> {
  private static final Logger LOGGER = LoggerFactory.getLogger(FilterAndScoreByUrlAndRobots.class);

  // Timeout handed to the ThreadedExecutor built in prepare(). The unit is whatever
  // RobotUtils.getMaxFetchTime() returns (presumably milliseconds - TODO confirm).
  private static final long COMMAND_TIMEOUT = RobotUtils.getMaxFetchTime();
  // Timeout handed to the executor's terminate() call during shutdown; same value as above.
  private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT;

  // Constructor argument for the DiskQueue that buffers each group's URLs in operate().
  private static final int MAX_URLS_IN_MEMORY = 100;

  // Collaborators supplied through (or created by) the constructors.
  private BaseScoreGenerator _scorer;
  private BaseFetcher _fetcher;
  private BaseRobotsParser _parser;

  // Runtime-only state assigned in prepare(); transient, so excluded from Java serialization.
  private transient ThreadedExecutor _executor;
  private transient LoggingFlowProcess _flowProcess;

  /**
   * Creates the operation with a robots fetcher built by {@code RobotUtils.createFetcher}.
   *
   * @param userAgent user agent passed to {@code RobotUtils.createFetcher}
   * @param maxThreads thread count passed to {@code RobotUtils.createFetcher}
   * @param parser robots parser stored for later use
   * @param scorer score generator stored for later use
   */
  public FilterAndScoreByUrlAndRobots(
      UserAgent userAgent, int maxThreads, BaseRobotsParser parser, BaseScoreGenerator scorer) {
    // Delegate to the fetcher-based constructor so field setup lives in one place.
    this(RobotUtils.createFetcher(userAgent, maxThreads), parser, scorer);
  }

  /**
   * Creates the operation with a caller-supplied robots fetcher.
   *
   * @param fetcher fetcher stored for later use; its {@code getMaxThreads()} value sizes the
   *     executor created in {@code prepare}
   * @param parser robots parser stored for later use
   * @param scorer score generator stored for later use
   */
  public FilterAndScoreByUrlAndRobots(
      BaseFetcher fetcher, BaseRobotsParser parser, BaseScoreGenerator scorer) {
    // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
    // NOTE(review): no super(...) call declaring those output fields is present here - confirm
    // that the superclass default constructor gives the intended field declaration.

    _scorer = scorer;
    _parser = parser;
    _fetcher = fetcher;
  }

  public boolean isSafe() {
    // We only want to fetch robots once.
    return false;

  public void prepare(
      FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
    _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

    // FUTURE KKr - use Cascading process vs creating our own, once it
    // supports logging in local mode, and a setStatus() call.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

  private synchronized void terminate() {
    if (_executor == null) {

    try {
      if (!_executor.terminate(TERMINATE_TIMEOUT)) {
        LOGGER.warn("Had to do a hard shutdown of robots fetching");
    } catch (InterruptedException e) {
      // FUTURE What's the right thing to do here? E.g. do I need to worry about
      // losing URLs still to be processed?
      LOGGER.warn("Interrupted while waiting for termination");
    } finally {
      _executor = null;

  public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    LOGGER.info("Flushing FilterAndScoreByUrlAndRobots");


    super.flush(flowProcess, operationCall);

  public void cleanup(
      FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");


    super.cleanup(flowProcess, operationCall);

  /**
   * Buffers the URLs of one group and sets up robots processing for them.
   *
   * <p>Field 0 of the group tuple is read as the "protocol and domain" string used in the log
   * messages. Each argument tuple is wrapped in a {@code GroupedUrlDatum} and added to a
   * {@code DiskQueue} constructed with {@code MAX_URLS_IN_MEMORY}. On failure the
   * DOMAINS_REJECTED counter is incremented by one and URLS_REJECTED by the queue size.
   *
   * <p>NOTE(review): this method looks truncated - the {@code ProcessRobotsTask} constructor
   * call has no arguments or terminator, the statement that would hand the task to the executor
   * is absent, two argument lists below have no visible call expression, and no closing braces
   * are present. Restore the missing statements before relying on this description.
   *
   * @param flowProcess flow process passed along in the failure-handling argument lists
   * @param bufferCall supplies the group tuple, the argument iterator and the output collector
   */
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    // Field 0 of the grouping tuple; in the visible code it is only used in log messages.
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Collect every value of the group. Each one is passed through the TupleEntry constructor
    // first (presumably a defensive copy because the iterator reuses entries - TODO confirm).
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));

    try {
      // NOTE(review): constructor arguments and the statement that runs the task are missing.
      Runnable doRobots =
          new ProcessRobotsTask(
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      // NOTE(review): the call these arguments belong to is missing.
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
      // NOTE(review): the logging call that takes this message is missing.
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      // NOTE(review): the call these arguments belong to is missing.
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);