Example #1
  public FetchBuffer(BaseFetcher fetcher) {
    // We're going to output a tuple that contains a FetchedDatum, plus meta-data,
    // plus a result that could be a string, a status, or an exception
    super(FetchedDatum.FIELDS.append(FETCH_RESULT_FIELD));

    _fetcher = fetcher;
    _fetcherMode = _fetcher.getFetcherPolicy().getFetcherMode();
  }
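
The constructor's only real work is declaring the operation's output scheme: the fetched-datum fields plus one extra result column. Below is a minimal sketch of the same Fields.append() pattern from Cascading's tuple API; the field names are hypothetical stand-ins for FetchedDatum.FIELDS and FETCH_RESULT_FIELD, not bixo's actual constants.

import cascading.tuple.Fields;

public class DeclaredFieldsSketch {
  // Stand-ins for FetchedDatum.FIELDS and FETCH_RESULT_FIELD.
  private static final Fields DATUM_FIELDS = new Fields("url", "content");
  private static final Fields RESULT_FIELD = new Fields("fetch-result");

  public static void main(String[] args) {
    // append() returns a new Fields instance and leaves the receiver
    // untouched, so shared constants like FetchedDatum.FIELDS stay safe to reuse.
    Fields declared = DATUM_FIELDS.append(RESULT_FIELD);
    System.out.println(declared); // prints ['url', 'content', 'fetch-result']
  }
}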
Example #2
  @SuppressWarnings({"unchecked"})
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    super.prepare(flowProcess, operationCall);

    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

    _executor =
        new ThreadedExecutor(
            _fetcher.getMaxThreads(), _fetcher.getFetcherPolicy().getRequestTimeout());

    _refLock = new Object();
    _pendingRefs = new ConcurrentHashMap<String, Long>();
    _activeRefs = new ConcurrentHashMap<String, Long>();

    _keepCollecting = new AtomicBoolean(true);
  }
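
Beyond wrapping the flow process for logging and sizing the thread pool from the fetcher policy, prepare() sets up thread-safe bookkeeping for grouping refs. Here is a standalone sketch of that bookkeeping using plain java.util.concurrent types; the class and the move-from-pending-to-active semantics are inferred from the field names above and the makeActive()/finished() calls in Example #5, not taken from bixo's source.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;

public class RefBookkeepingSketch {
  // Grouping refs move from pending to active; the Long value is the
  // earliest time the next fetch for that ref may start.
  private final ConcurrentMap<String, Long> pendingRefs = new ConcurrentHashMap<>();
  private final ConcurrentMap<String, Long> activeRefs = new ConcurrentHashMap<>();

  // Flipped to false during termination, so late results get dropped
  // instead of being emitted to a collector that is shutting down.
  private final AtomicBoolean keepCollecting = new AtomicBoolean(true);

  public void makeActive(String ref, long nextFetchTime) {
    pendingRefs.remove(ref);
    activeRefs.put(ref, nextFetchTime);
  }

  public boolean isCollecting() {
    return keepCollecting.get();
  }
}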
Example #3
  @SuppressWarnings("unchecked")
  @Override
  public void prepare(
      FlowProcess flowProcess, cascading.operation.OperationCall<NullContext> operationCall) {
    _executor = new ThreadedExecutor(_fetcher.getMaxThreads(), COMMAND_TIMEOUT);

    // FUTURE KKr - use Cascading process vs creating our own, once it
    // supports logging in local mode, and a setStatus() call.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());
  }
Example #4
  private synchronized void terminate() {
    if (_executor == null) {
      return;
    }

    try {
      // We don't know the worst-case amount of time a worker thread will effectively
      // "sleep" waiting for a FetchTask to be queued up, so we add in a bit of
      // slop to cover that amount of time.
      long pollTime = ThreadedExecutor.MAX_POLL_TIME;
      Thread.sleep(pollTime);

      long requestTimeout = _fetcher.getFetcherPolicy().getRequestTimeout();
      if (!_executor.terminate(requestTimeout)) {
        LOGGER.warn("Had to do a hard termination of general fetching");

        // Abort any active connections, which should give the FetchTasks a chance
        // to clean things up.
        _fetcher.abort();

        // Now give everybody who had to be interrupted some time to
        // actually write out their remaining URLs.
        Thread.sleep(HARD_TERMINATION_CLEANUP_DURATION);
      }

      // Now stop collecting results. If somebody is in the middle of the collect() call,
      // we want them to finish before we set it to false and drop out of this method.
      synchronized (_keepCollecting) {
        _keepCollecting.set(false);
      }
    } catch (InterruptedException e) {
      // FUTURE What's the right thing to do here? E.g. do I need to worry about
      // losing URLs still to be processed?
      LOGGER.warn("Interrupted while waiting for termination");
    } finally {
      _executor = null;
    }
  }
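
terminate() is a two-phase, soft-then-hard shutdown: wait out the worker poll window, try a graceful stop, and only then abort connections and give interrupted tasks time to flush their remaining URLs. ThreadedExecutor is bixo's own class; the same shape expressed with the standard ExecutorService API looks roughly like this:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

public class ShutdownSketch {
  // Soft-then-hard termination, analogous to terminate() above.
  static void terminate(ExecutorService executor, long timeoutMs) {
    executor.shutdown(); // stop accepting new tasks
    try {
      if (!executor.awaitTermination(timeoutMs, TimeUnit.MILLISECONDS)) {
        // Hard termination: interrupt the running workers, playing the
        // role of _fetcher.abort() plus the cleanup sleep above.
        executor.shutdownNow();
      }
    } catch (InterruptedException e) {
      executor.shutdownNow();
      Thread.currentThread().interrupt();
    }
  }
}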
Example #5
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a FetchSetDatum that contains a set of URLs to fetch in one request
    // from a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // This should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        Thread.currentThread().interrupt();
      }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      UrlStatus status =
          Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d URLs from %s (e.g. %s)",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }
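
The main loop's interrupt handling is easy to miss: Thread.interrupted() both tests and clears the flag, and the catch block re-asserts it so the while condition can see it on the next pass. Here is a runnable sketch of just that idiom, with a 100 ms sleep standing in for NOTHING_TO_FETCH_SLEEP_TIME:

public class PollLoopSketch {
  public static void main(String[] args) throws InterruptedException {
    Thread worker = new Thread(() -> {
      // Same shape as operate(): poll until interrupted.
      while (!Thread.interrupted()) {
        try {
          Thread.sleep(100L); // stand-in for NOTHING_TO_FETCH_SLEEP_TIME
        } catch (InterruptedException e) {
          // Thread.sleep() clears the interrupt flag when it throws; restore
          // it so the loop condition sees it and exits cleanly.
          Thread.currentThread().interrupt();
        }
      }
      System.out.println("worker exited cleanly");
    });

    worker.start();
    Thread.sleep(300L);
    worker.interrupt(); // triggers the same exit path operate() relies on
    worker.join();
  }
}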