private synchronized void terminate() {
    if (_executor == null) {
      return;
    }

    try {
      if (!_executor.terminate(TERMINATE_TIMEOUT)) {
        LOGGER.warn("Had to do a hard shutdown of robots fetching");
      }
    } catch (InterruptedException e) {
      // FUTURE What's the right thing to do here? E.g. do I need to worry about
      // losing URLs still to be processed?
      LOGGER.warn("Interrupted while waiting for termination");
      Thread.currentThread().interrupt();
    } finally {
      _executor = null;
    }
  }
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
      _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
      ProcessRobotsTask.emptyQueue(
          urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
    }
  }
Exemplo n.º 3
0
  private synchronized void terminate() {
    if (_executor == null) {
      return;
    }

    try {
      // We don't know worst-case for amount of time a worker thread will effectively
      // "sleep" waiting for a FetchTask to be queued up, but we'll add in a bit of
      // slop to represent that amount of time.
      long pollTime = ThreadedExecutor.MAX_POLL_TIME;
      Thread.sleep(pollTime);

      long requestTimeout = _fetcher.getFetcherPolicy().getRequestTimeout();
      if (!_executor.terminate(requestTimeout)) {
        LOGGER.warn("Had to do a hard termination of general fetching");

        // Abort any active connections, which should give the FetchTasks a chance
        // to clean things up.
        _fetcher.abort();

        // Now give everybody who had to be interrupted some time to
        // actually write out their remaining URLs.
        Thread.sleep(HARD_TERMINATION_CLEANUP_DURATION);
      }

      // Now stop collecting results. If somebody is in the middle of the collect() call,
      // we want them to finish before we set it to false and drop out of this method.
      synchronized (_keepCollecting) {
        _keepCollecting.set(false);
      }
    } catch (InterruptedException e) {
      // FUTURE What's the right thing to do here? E.g. do I need to worry about
      // losing URLs still to be processed?
      LOGGER.warn("Interrupted while waiting for termination");
    } finally {
      _executor = null;
    }
  }
Exemplo n.º 4
0
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a PreFetchedDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        Thread.currentThread().interrupt();
      }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      UrlStatus status =
          Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d urls from %s (e.g. %s) ",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }