Exemple #1
0
    /**
     * Appends a fetch set to the queue, bumping the queued fetch-set counter by one and the
     * queued URL counter by the number of URLs in the set. A null datum is ignored.
     *
     * @param datum fetch set to enqueue; may be null
     */
    private void addToQueue(FetchSetDatum datum) {
      if (datum == null) {
        // Nothing to enqueue; counters stay untouched.
        return;
      }

      _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, 1);
      _flowProcess.increment(FetchCounters.URLS_QUEUED, datum.getUrls().size());
      _queue.add(datum);
    }
Exemple #2
0
 /**
  * Hands back the next available fetch set without consulting mode/state: queued entries are
  * returned first, and once the queue is empty the remaining values are pulled straight from
  * the {@code _values} iterator.
  *
  * @return the next fetch set, or null when both the queue and the iterator are exhausted
  */
 public FetchSetDatum drain() {
   // Anything already buffered takes priority over unread input.
   if (!_queue.isEmpty()) {
     return removeFromQueue();
   }

   if (!safeHasNext()) {
     return null;
   }

   return new FetchSetDatum(new TupleEntry(_values.next()));
 }
Exemple #3
0
    /**
     * Polls the head of the queue. When an entry is removed, the queued fetch-set counter is
     * decremented by one and the queued URL counter by the number of URLs in that entry.
     *
     * @return the fetch set removed from the queue, or null if the queue is empty
     */
    private FetchSetDatum removeFromQueue() {
      FetchSetDatum head = _queue.poll();
      if (head == null) {
        // Empty queue: nothing was removed, so the counters must not change.
        return null;
      }

      _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, -1);
      _flowProcess.increment(FetchCounters.URLS_QUEUED, -head.getUrls().size());
      return head;
    }
  /**
   * Handles one tuple group: reads the protocol+domain from the first field of the grouping
   * key, loads every argument tuple of the group into a {@link DiskQueue} of
   * {@link GroupedUrlDatum}, and submits a {@link ProcessRobotsTask} for that queue to the
   * executor.
   *
   * <p>If the task cannot be submitted (the executor rejects it, or anything else is thrown
   * while building/submitting it), the failure is logged, the rejection counters are
   * incremented, and the queue is handed to {@code ProcessRobotsTask.emptyQueue} with
   * {@code GroupingKey.DEFERRED_GROUPING_KEY}.
   *
   * @param flowProcess the flow process supplied by the caller for this invocation
   * @param bufferCall supplies the group key, the argument tuples and the output collector
   */
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Load the whole group up front; each value is wrapped in a fresh TupleEntry before it is
    // stored in the queue.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      deferRejectedUrls(urls, bufferCall, flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      deferRejectedUrls(urls, bufferCall, flowProcess);
    }
  }

  /**
   * Shared failure path for {@code operate}: records one rejected domain plus the number of
   * URLs currently in the queue, then passes the queue to {@code ProcessRobotsTask.emptyQueue}
   * with the deferred grouping key.
   *
   * @param urls the URLs of the group whose robots task could not be submitted
   * @param bufferCall source of the output collector given to {@code emptyQueue}
   * @param flowProcess the flow process that was passed to {@code operate}
   */
  private void deferRejectedUrls(
      DiskQueue<GroupedUrlDatum> urls, BufferCall<NullContext> bufferCall, FlowProcess flowProcess) {
    _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
    // Read the size before emptyQueue is called on the queue.
    _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
    // NOTE(review): counters go through the _flowProcess field while emptyQueue receives the
    // flowProcess argument, exactly as before - confirm this split is intentional.
    ProcessRobotsTask.emptyQueue(
        urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
  }
Exemple #5
0
 /**
  * Reports whether there is nothing left to hand out.
  *
  * @return true only when the queue is empty and {@code safeHasNext()} reports no further
  *     input
  */
 public boolean isEmpty() {
   // The queue is checked first; safeHasNext() is consulted only when the queue is empty.
   if (!_queue.isEmpty()) {
     return false;
   }
   return !safeHasNext();
 }