// NOTE: package and imports reconstructed from the class body; the bixo.* and
// com.bixolabs.* paths assume Bixo's standard package layout.
package bixo.operations;

import java.util.Iterator;
import java.util.concurrent.RejectedExecutionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import bixo.config.UserAgent;
import bixo.datum.GroupedUrlDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.fetcher.BaseFetcher;
import bixo.hadoop.FetchCounters;
import bixo.robots.BaseRobotsParser;
import bixo.robots.RobotUtils;
import bixo.utils.DiskQueue;
import bixo.utils.GroupingKey;
import bixo.utils.ThreadedExecutor;

import com.bixolabs.cascading.LoggingFlowProcess;
import com.bixolabs.cascading.LoggingFlowReporter;
import com.bixolabs.cascading.NullContext;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Buffer;
import cascading.operation.BufferCall;
import cascading.operation.OperationCall;
import cascading.tuple.TupleEntry;

/**
 * Filter out URLs, either by domain (not popular enough) or because they're blocked by
 * robots.txt. Runs as a Buffer over URLs grouped by protocol+domain, so that robots.txt
 * only needs to be fetched once per group.
 */
@SuppressWarnings({"serial", "rawtypes"})
public class FilterAndScoreByUrlAndRobots extends BaseOperation<NullContext>
    implements Buffer<NullContext> {
  private static final Logger LOGGER = LoggerFactory.getLogger(FilterAndScoreByUrlAndRobots.class);

  // A robots fetch is bounded by the max fetch time, so use the same value when
  // waiting on the executor and when shutting it down.
  private static final long COMMAND_TIMEOUT = RobotUtils.getMaxFetchTime();
  private static final long TERMINATE_TIMEOUT = COMMAND_TIMEOUT;

  // Limit how many URLs from a group are held in memory; the rest are spilled
  // to disk by the DiskQueue created in operate().
  private static final int MAX_URLS_IN_MEMORY = 100;

  private BaseScoreGenerator _scorer;
  private BaseFetcher _fetcher;
  private BaseRobotsParser _parser;

  // Set up in prepare() on the task side, so they must be transient.
  private transient ThreadedExecutor _executor;
  private transient LoggingFlowProcess _flowProcess;

  public FilterAndScoreByUrlAndRobots(
      UserAgent userAgent, int maxThreads, BaseRobotsParser parser, BaseScoreGenerator scorer) {
    super(ScoredUrlDatum.FIELDS);

    _scorer = scorer;
    _parser = parser;
    _fetcher = RobotUtils.createFetcher(userAgent, maxThreads);
  }

  public FilterAndScoreByUrlAndRobots(
      BaseFetcher fetcher, BaseRobotsParser parser, BaseScoreGenerator scorer) {
    // We're going to output a ScoredUrlDatum (what FetcherBuffer expects).
    super(ScoredUrlDatum.FIELDS);

    _scorer = scorer;
    _parser = parser;
    _fetcher = fetcher;
  }

  @Override
  public boolean isSafe() {
    // We only want to fetch robots once.
    return false;
  }

  @SuppressWarnings("unchecked")
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
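    // One worker per fetcher thread, so robots processing gets the same
    // parallelism as the fetcher itself.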
    _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

    // FUTURE KKr - use Cascading process vs creating our own, once it
    // supports logging in local mode, and a setStatus() call.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());
  }

  private synchronized void terminate() {
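    // flush() and cleanup() both call this; _executor is nulled out below so
    // the second call becomes a no-op.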
    if (_executor == null) {
      return;
    }

    try {
      if (!_executor.terminate(TERMINATE_TIMEOUT)) {
        LOGGER.warn("Had to do a hard shutdown of robots fetching");
      }
    } catch (InterruptedException e) {
      // FUTURE What's the right thing to do here? E.g. do I need to worry about
      // losing URLs still to be processed?
      LOGGER.warn("Interrupted while waiting for termination");
      Thread.currentThread().interrupt();
    } finally {
      _executor = null;
    }
  }

  @Override
  public void flush(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    LOGGER.info("Flushing FilterAndScoreByUrlAndRobots");

    terminate();

    super.flush(flowProcess, operationCall);
  }

  @Override
  public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");

    terminate();

    _flowProcess.dumpCounters();
    super.cleanup(flowProcess, operationCall);
  }

  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
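    // Every URL in this group shares the same protocol+domain, so a single
    // robots.txt fetch covers the whole group.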
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Spool the group's URLs into a disk-backed queue, so a domain with a huge
    // number of URLs can't exhaust memory.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      // Copy the entry, since Cascading reuses the TupleEntry it hands us.
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    // Process the group in a background task: fetch and parse robots.txt once,
    // then score each URL and filter out the ones that robots.txt blocks.
    try {
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
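
      // execute() blocks when all workers are busy (for up to COMMAND_TIMEOUT),
      // which throttles how quickly we pull in new groups.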
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // Should never happen, since execute() waits for a free worker thread. If
      // it somehow does, defer every URL in the group rather than losing them.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    }
  }
}