Example #1
  private void skipUrls(List<ScoredUrlDatum> urls, UrlStatus status, String traceMsg) {
    // Emit a FetchedDatum for each skipped URL, with the final status
    // appended to its tuple, so skipped URLs still flow downstream.
    for (ScoredUrlDatum datum : urls) {
      FetchedDatum result = new FetchedDatum(datum);
      Tuple tuple = result.getTuple();
      tuple.add(status.toString());
      _collector.add(BixoPlatform.clone(tuple, _flowProcess));
    }

    _flowProcess.increment(FetchCounters.URLS_SKIPPED, urls.size());
    if (status == UrlStatus.SKIPPED_PER_SERVER_LIMIT) {
      _flowProcess.increment(FetchCounters.URLS_SKIPPED_PER_SERVER_LIMIT, urls.size());
    }

    // traceMsg is a String.format template; its only argument is the URL count.
    if ((traceMsg != null) && LOGGER.isTraceEnabled()) {
      LOGGER.trace(String.format(traceMsg, urls.size()));
    }
  }
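
Example #1 appends the final UrlStatus to each skipped URL's tuple and re-emits it, so skipped URLs still flow downstream with their status while the counters record the totals. Below is a tiny self-contained demo of the traceMsg format-template contract noted in the code; the message text is made up, not from Bixo:

import java.util.Locale;

public class TraceMsgDemo {
  public static void main(String[] args) {
    // skipUrls() would supply urls.size() as the %d argument.
    String traceMsg = "Skipping %d URLs due to the per-server limit";
    System.out.println(String.format(Locale.ROOT, traceMsg, 42));
    // prints: Skipping 42 URLs due to the per-server limit
  }
}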
Example #2
  @Override
  public void cleanup(FlowProcess process, OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FetchBuffer");

    terminate();

    _flowProcess.dumpCounters();
    super.cleanup(process, operationCall);
  }
  @Override
  public void cleanup(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    LOGGER.info("Cleaning up FilterAndScoreByUrlAndRobots");

    terminate();

    _flowProcess.dumpCounters();
    super.cleanup(flowProcess, operationCall);
  }
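
The two cleanup() methods above come from different operations (FetchBuffer and FilterAndScoreByUrlAndRobots) but share one teardown pattern: log, terminate() the worker pool, dump the accumulated counters, then delegate to super.cleanup(). Cascading calls cleanup() once per task, after the last operate() call. A minimal sketch of what a terminate() with a bounded wait might look like over a plain JDK ExecutorService; Bixo's ThreadedExecutor is its own class, so this is an assumed analogue, not its API:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

public class TerminateSketch {
  private final ExecutorService _executor;

  public TerminateSketch(ExecutorService executor) {
    _executor = executor;
  }

  public void terminate() throws InterruptedException {
    _executor.shutdown(); // stop accepting new tasks
    if (!_executor.awaitTermination(10, TimeUnit.SECONDS)) {
      _executor.shutdownNow(); // force-cancel tasks that outlived the grace period
    }
  }
}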
  @SuppressWarnings("unchecked")
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

    // FUTURE KKr - use Cascading process vs creating our own, once it
    // supports logging in local mode, and a setStatus() call.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());
  }
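
prepare() is the mirror image of cleanup(): Cascading invokes it once per task before any operate() calls, so this is where the worker pool and the LoggingFlowProcess wrapper get built. The wrapper is essentially a decorator over the real FlowProcess that mirrors counter traffic to a log; here is a generic, self-contained sketch of that decorator idea, with illustrative names rather than the Bixo classes:

import java.util.function.BiConsumer;

public class LoggingCounterSketch {
  interface Counters {
    void increment(String counter, long amount);
  }

  // Log each increment, then forward it to the real counters.
  static Counters logging(Counters delegate, BiConsumer<String, Long> log) {
    return (counter, amount) -> {
      log.accept(counter, amount);
      delegate.increment(counter, amount);
    };
  }

  public static void main(String[] args) {
    Counters real = (counter, amount) -> {}; // stand-in for Hadoop counters
    Counters logged = logging(real, (c, a) -> System.out.println(c + " += " + a));
    logged.increment("URLS_SKIPPED", 10);
  }
}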
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Queue that spills to disk once MAX_URLS_IN_MEMORY entries are in memory.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
      // One ProcessRobotsTask per protocol+domain group handles robots.txt
      // fetching plus filtering and scoring of the queued URLs.
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    }
  }
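
operate() spools the whole group into the DiskQueue and submits one ProcessRobotsTask per protocol-plus-domain; both catch blocks apply identical recovery, draining the queue under GroupingKey.DEFERRED_GROUPING_KEY so no URL is silently dropped when the pool rejects work. A self-contained sketch of that "never lose queued work on rejection" pattern, with purely illustrative names:

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;

public class DrainOnFailureSketch {
  public static void main(String[] args) {
    Queue<String> urls = new ArrayDeque<>();
    urls.add("http://domain.com/a");
    urls.add("http://domain.com/b");

    ExecutorService executor = Executors.newFixedThreadPool(1);
    executor.shutdown(); // simulate a pool that rejects new work

    try {
      executor.execute(urls::clear); // would process the whole batch
    } catch (RejectedExecutionException e) {
      // Fallback: re-emit every queued element with a marker tag
      // instead of losing it, as the catch blocks above do.
      String url;
      while ((url = urls.poll()) != null) {
        System.out.println("deferred\t" + url);
      }
    }
  }
}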
Example #6
  @SuppressWarnings({"unchecked"})
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    super.prepare(flowProcess, operationCall);

    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

    _executor =
        new ThreadedExecutor(
            _fetcher.getMaxThreads(), _fetcher.getFetcherPolicy().getRequestTimeout());

    // Lock plus the pending/active reference maps it coordinates.
    _refLock = new Object();
    _pendingRefs = new ConcurrentHashMap<String, Long>();
    _activeRefs = new ConcurrentHashMap<String, Long>();

    _keepCollecting = new AtomicBoolean(true);
  }
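
Example #6's prepare() follows the same lifecycle but also initializes shared state: a lock object, two ConcurrentHashMaps of pending and active references, and an AtomicBoolean gate. How those fields are used isn't shown in this snippet; below is a small, self-contained sketch of one plausible pending-to-active handoff under the shared lock, an assumption about usage rather than Bixo code:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class RefTrackingSketch {
  private final Object _refLock = new Object();
  private final ConcurrentMap<String, Long> _pendingRefs = new ConcurrentHashMap<>();
  private final ConcurrentMap<String, Long> _activeRefs = new ConcurrentHashMap<>();

  public void makePending(String ref) {
    synchronized (_refLock) {
      _pendingRefs.put(ref, System.currentTimeMillis());
    }
  }

  // Move a reference from pending to active, keeping its original timestamp.
  public void makeActive(String ref) {
    synchronized (_refLock) {
      Long since = _pendingRefs.remove(ref);
      _activeRefs.put(ref, since != null ? since : System.currentTimeMillis());
    }
  }

  public static void main(String[] args) {
    RefTrackingSketch tracker = new RefTrackingSketch();
    tracker.makePending("domain.com");
    tracker.makeActive("domain.com");
    System.out.println("active: " + tracker._activeRefs.keySet());
  }
}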