Example #1
    /**
     * Add a fetch set to the queue, and bump the counters that track how many
     * fetch sets and URLs are queued. Safe to call with a null datum, which is
     * ignored.
     *
     * @param datum fetch set to queue, or null
     */
    private void addToQueue(FetchSetDatum datum) {
      if (datum != null) {
        _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, 1);
        _flowProcess.increment(FetchCounters.URLS_QUEUED, datum.getUrls().size());

        _queue.add(datum);
      }
    }
Example #2
    /**
     * Return the top-most item from the queue, or null if the queue is empty.
     *
     * @return fetch set from queue
     */
    private FetchSetDatum removeFromQueue() {
      FetchSetDatum result = _queue.poll();
      if (result != null) {
        _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, -1);
        _flowProcess.increment(FetchCounters.URLS_QUEUED, -result.getUrls().size());
      }

      return result;
    }
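Examples #1 and #2 are mirror images: every add bumps the FETCHSETS_QUEUED and URLS_QUEUED counters, and every successful poll undoes exactly that bump, so the counters always reflect the queue's contents. Below is a minimal, self-contained sketch of the same bookkeeping pattern; SimpleFetchSet and the counter map are stand-ins for the project's FetchSetDatum and Cascading's FlowProcess counters, not the real classes.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

// Hypothetical stand-in for FetchSetDatum: just a named list of URLs.
class SimpleFetchSet {
  final String groupingRef;
  final List<String> urls;

  SimpleFetchSet(String groupingRef, List<String> urls) {
    this.groupingRef = groupingRef;
    this.urls = urls;
  }
}

public class QueueCountersSketch {
  private final Deque<SimpleFetchSet> queue = new ArrayDeque<>();
  // Stand-in for the Cascading FlowProcess counters.
  private final ConcurrentHashMap<String, AtomicLong> counters = new ConcurrentHashMap<>();

  private void increment(String counter, long amount) {
    counters.computeIfAbsent(counter, k -> new AtomicLong()).addAndGet(amount);
  }

  // Mirrors addToQueue(): bump both counters, then enqueue. Null is ignored.
  void addToQueue(SimpleFetchSet datum) {
    if (datum != null) {
      increment("FETCHSETS_QUEUED", 1);
      increment("URLS_QUEUED", datum.urls.size());
      queue.add(datum);
    }
  }

  // Mirrors removeFromQueue(): dequeue, then undo exactly that bookkeeping.
  SimpleFetchSet removeFromQueue() {
    SimpleFetchSet result = queue.poll();
    if (result != null) {
      increment("FETCHSETS_QUEUED", -1);
      increment("URLS_QUEUED", -result.urls.size());
    }
    return result;
  }

  public static void main(String[] args) {
    QueueCountersSketch sketch = new QueueCountersSketch();
    sketch.addToQueue(new SimpleFetchSet("example.com",
        Arrays.asList("http://example.com/a", "http://example.com/b")));
    sketch.removeFromQueue();
    // Both counters are back to zero once the queue is drained.
    System.out.println(sketch.counters);
  }
}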
Example #3
    @Override
    public int compare(FetchSetDatum o1, FetchSetDatum o2) {
      long o1FetchTime = getFetchTime(o1.getGroupingRef());
      long o2FetchTime = getFetchTime(o2.getGroupingRef());

      // The entry that's ready sooner sorts sooner. If both are ready at the
      // same time, the one with the bigger fetch set sorts sooner.
      if (o1FetchTime < o2FetchTime) {
        return -1;
      } else if (o1FetchTime > o2FetchTime) {
        return 1;
      } else if (o1.getUrls().size() > o2.getUrls().size()) {
        return -1;
      } else if (o1.getUrls().size() < o2.getUrls().size()) {
        return 1;
      } else {
        return 0;
      }
    }
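If this comparator is used to order a priority queue (an assumption; the snippet doesn't show how _queue is built), then poll() always hands back the fetch set that becomes fetchable first, with ties broken in favor of bigger sets. A self-contained sketch, using a hypothetical FetchSet record that carries its fetch time directly instead of looking it up via getFetchTime():

import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;

public class FetchSetOrderingSketch {
  // Hypothetical stand-in: fetch time and set size are stored on the record
  // rather than resolved from the grouping ref.
  record FetchSet(String ref, long fetchTime, int size) {}

  public static void main(String[] args) {
    // Same ordering as the compare() above: earlier fetch time first,
    // then larger fetch set first.
    Comparator<FetchSet> readySoonestBiggestFirst =
        Comparator.comparingLong(FetchSet::fetchTime)
            .thenComparing(Comparator.comparingInt(FetchSet::size).reversed());

    PriorityQueue<FetchSet> queue = new PriorityQueue<>(readySoonestBiggestFirst);
    queue.addAll(Arrays.asList(
        new FetchSet("a.com", 2000L, 10),
        new FetchSet("b.com", 1000L, 5),
        new FetchSet("c.com", 1000L, 50)));

    // Polls c.com (ready at 1000, bigger set), then b.com, then a.com.
    while (!queue.isEmpty()) {
      System.out.println(queue.poll().ref());
    }
  }
}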
Example #4
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a FetchSetDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        Thread.currentThread().interrupt();
      }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      UrlStatus status =
          Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d urls from %s (e.g. %s) ",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }
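The RejectedExecutionException handler above is a safety net for a saturated fetch pool: rather than losing the fetch set, FetchBuffer marks the domain finished and records its URLs as SKIPPED_DEFERRED. A sketch of how such a rejection arises with a bounded ThreadPoolExecutor (the pool and queue sizes here are assumptions; the snippet doesn't show how _executor is configured):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class RejectedFetchSketch {
  public static void main(String[] args) throws InterruptedException {
    // One worker thread and a one-slot work queue; AbortPolicy (the default
    // rejection handler) throws RejectedExecutionException once both are full.
    ThreadPoolExecutor executor = new ThreadPoolExecutor(
        1, 1, 0L, TimeUnit.MILLISECONDS, new ArrayBlockingQueue<>(1));

    Runnable slowFetch = () -> {
      try {
        Thread.sleep(1000L);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    };

    executor.execute(slowFetch); // runs on the single worker
    executor.execute(slowFetch); // waits in the one queue slot

    try {
      executor.execute(slowFetch); // no room left: rejected
    } catch (RejectedExecutionException e) {
      // FetchBuffer reacts by calling finished(ref) and skipping the URLs
      // with SKIPPED_DEFERRED rather than dropping them silently.
      System.out.println("fetch list rejected: " + e);
    }

    executor.shutdown();
    executor.awaitTermination(5, TimeUnit.SECONDS);
  }
}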
Example #5
    public FetchSetDatum nextOrNull(FetcherMode mode) {

      int fetchSetsQueued = 0;

      // Loop until we have something to return, there's nothing left that we can return, or
      // we've queued up as many fetch sets as we're willing to without any delay.
      while (!isEmpty() && (fetchSetsQueued < MAX_FETCHSETS_TO_QUEUE_PER_DELAY)) {
        // First see if we've got something in the queue, and if so, then check if it's ready
        // to be processed.
        final FetchSetDatum queueDatum = removeFromQueue();

        if (queueDatum != null) {
          String ref = queueDatum.getGroupingRef();
          if (readyToFetch(ref) || (mode == FetcherMode.IMPOLITE)) {
            List<ScoredUrlDatum> urls = queueDatum.getUrls();
            trace(
                "Returning %d urls via queue from %s (e.g. %s)",
                urls.size(), ref, urls.get(0).getUrl());
            return queueDatum;
          }
        }

        // Nothing in the queue, or the top of the queue isn't ready; see if the iterator
        // has anything.
        if (safeHasNext()) {
          // Re-add the datum from the top of the queue, since we want to keep it around.
          // This is safe to call with a null datum.
          addToQueue(queueDatum);

          // Now get our next FetchSet from the Hadoop iterator.
          FetchSetDatum iterDatum = new FetchSetDatum(new TupleEntry(_values.next()));
          List<ScoredUrlDatum> urls = iterDatum.getUrls();
          String ref = iterDatum.getGroupingRef();

          if (iterDatum.isSkipped()) {
            trace(
                "Skipping %d urls via iterator from %s (e.g. %s)",
                urls.size(), ref, urls.get(0).getUrl());
            skipUrls(urls, UrlStatus.SKIPPED_PER_SERVER_LIMIT, null);
            continue;
          }

          if ((mode == FetcherMode.IMPOLITE) || readyToFetch(ref)) {
            trace(
                "Returning %d urls via iterator from %s (e.g. %s)",
                urls.size(), ref, urls.get(0).getUrl());
            return iterDatum;
          }

          // We've got a datum from the iterator that's not ready to be processed, so we'll stuff it
          // into the queue.
          trace(
              "Queuing %d urls via iterator from %s (e.g. %s)",
              urls.size(), iterDatum.getGroupingRef(), urls.get(0).getUrl());
          addToQueue(iterDatum);
          fetchSetsQueued += 1;
          continue;
        }

        // Nothing ready from the top of the queue, and the iterator is empty too. If we had
        // something from the top of the queue (which then must not be ready), decide what to
        // do based on our FetcherMode.
        if (queueDatum != null) {
          List<ScoredUrlDatum> urls = queueDatum.getUrls();

          switch (mode) {
            case COMPLETE:
              // Re-add the datum, since we don't want to skip it, and return immediately;
              // otherwise we'd be trapped in this loop instead of giving FetchBuffer time
              // to delay.
              trace(
                  "Blocked on %d urls via queue from %s (e.g. %s)",
                  urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
              addToQueue(queueDatum);
              return null;

            case IMPOLITE:
              trace(
                  "Impolitely returning %d urls via queue from %s (e.g. %s)",
                  urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
              return queueDatum;

            case EFFICIENT:
              // In efficient fetching, we punt on items that aren't ready, and return
              // immediately so that FetchBuffer's loop has time to delay. Otherwise we'd
              // likely skip everything in the in-memory queue, since the item we're
              // skipping is the "best" in terms of when it's going to be ready.
              trace(
                  "Efficiently skipping %d urls via queue from %s (e.g. %s)",
                  urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
              skipUrls(urls, UrlStatus.SKIPPED_INEFFICIENT, null);
              return null;
          }
        }
      }

      // Either we're all out of FetchSets to process (nothing left in the iterator or queue),
      // or we've queued up lots of sets and want to give FetchBuffer a chance to sleep.
      return null;
    }
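Stripped of the queue and iterator plumbing, nextOrNull() ends with a three-way policy decision for a fetch set whose server isn't ready yet: COMPLETE re-queues it and waits, IMPOLITE fetches it anyway, and EFFICIENT skips it. A condensed sketch of just that decision; the Action enum and onNotReady() helper are hypothetical, not part of the original code:

public class FetcherModeSketch {
  enum FetcherMode { COMPLETE, IMPOLITE, EFFICIENT }

  enum Action { REQUEUE_AND_WAIT, FETCH_NOW, SKIP }

  // What nextOrNull() does with a queued fetch set whose server isn't
  // ready, once the iterator is exhausted.
  static Action onNotReady(FetcherMode mode) {
    switch (mode) {
      case COMPLETE:
        // Put it back and return null, so FetchBuffer can sleep and retry.
        return Action.REQUEUE_AND_WAIT;
      case IMPOLITE:
        // Ignore politeness and fetch it immediately.
        return Action.FETCH_NOW;
      case EFFICIENT:
      default:
        // Punt: mark the URLs SKIPPED_INEFFICIENT and move on.
        return Action.SKIP;
    }
  }

  public static void main(String[] args) {
    for (FetcherMode mode : FetcherMode.values()) {
      System.out.println(mode + " -> " + onNotReady(mode));
    }
  }
}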