private void addToQueue(FetchSetDatum datum) {
    if (datum != null) {
        _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, 1);
        _flowProcess.increment(FetchCounters.URLS_QUEUED, datum.getUrls().size());

        _queue.add(datum);
    }
}
/**
 * Return the top-most item from the queue, or null if the queue is empty.
 *
 * @return fetch set from queue
 */
private FetchSetDatum removeFromQueue() {
    FetchSetDatum result = _queue.poll();
    if (result != null) {
        _flowProcess.increment(FetchCounters.FETCHSETS_QUEUED, -1);
        _flowProcess.increment(FetchCounters.URLS_QUEUED, -result.getUrls().size());
    }

    return result;
}
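// A minimal sketch of the state the two helpers above assume. The field declaration
// isn't part of this excerpt, so the java.util.PriorityQueue choice and the
// FetchSetComparator name are assumptions; any Queue whose poll() hands back the best
// candidate first would work. Pairing a priority queue with the comparator defined
// below means poll() always returns the fetch set that becomes fetchable soonest,
// and the FETCHSETS_QUEUED/URLS_QUEUED counters stay exactly in sync with the queue's
// contents, because every add and every successful poll adjusts them by matching amounts.
private final Queue<FetchSetDatum> _queue =
        new PriorityQueue<FetchSetDatum>(11, new FetchSetComparator());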
@Override
public int compare(FetchSetDatum o1, FetchSetDatum o2) {
    long o1FetchTime = getFetchTime(o1.getGroupingRef());
    long o2FetchTime = getFetchTime(o2.getGroupingRef());

    // The entry that's ready sooner sorts sooner. If both are ready at the same
    // time, the one with the bigger fetch set sorts sooner.
    if (o1FetchTime < o2FetchTime) {
        return -1;
    } else if (o1FetchTime > o2FetchTime) {
        return 1;
    } else if (o1.getUrls().size() > o2.getUrls().size()) {
        return -1;
    } else if (o1.getUrls().size() < o2.getUrls().size()) {
        return 1;
    } else {
        return 0;
    }
}
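// Quick sanity check of the ordering (a sketch: newFetchSet(ref, urlCount) is a
// hypothetical helper that builds a FetchSetDatum, and getFetchTime() is assumed to
// report the same fetch time for both domains here).
FetchSetDatum bigSet = newFetchSet("domain-a.com", 10);
FetchSetDatum smallSet = newFetchSet("domain-b.com", 3);

// Same fetch time, so the bigger set sorts (and gets polled) first; this way each
// polite-crawl window for a server moves as many URLs as possible.
assert compare(bigSet, smallSet) < 0;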
@Override
public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a FetchSetDatum that contains a set of URLs to fetch in one request
    // from a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
        FetchSetDatum datum = values.nextOrNull(_fetcherMode);

        try {
            if (datum == null) {
                trace("Nothing ready to fetch, sleeping...");
                process.keepAlive();
                Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
            } else {
                List<ScoredUrlDatum> urls = datum.getUrls();
                String ref = datum.getGroupingRef();
                trace("Processing %d URLs for %s", urls.size(), ref);

                Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
                if (datum.isLastList()) {
                    makeActive(ref, 0L);
                    trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
                } else {
                    long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
                    makeActive(ref, nextFetchTime);
                    trace("Executing fetch of %d URLs from %s (next fetch time %d)",
                            urls.size(), ref, nextFetchTime);
                }

                long startTime = System.currentTimeMillis();

                try {
                    _executor.execute(doFetch);
                } catch (RejectedExecutionException e) {
                    // Should never happen if the executor blocks until a fetch thread
                    // frees up (see the sketch below), but handle it defensively.
                    LOGGER.error("Fetch pool rejected our fetch list for " + ref);

                    finished(ref);
                    skipUrls(urls, UrlStatus.SKIPPED_DEFERRED,
                            String.format("Execution rejection skipped %d URLs", urls.size()));
                }

                // Adjust for how long it took to get the request queued.
                adjustActive(ref, System.currentTimeMillis() - startTime);
            }
        } catch (InterruptedException e) {
            LOGGER.warn("FetchBuffer interrupted!");
            Thread.currentThread().interrupt();
        }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
        trace("Found unprocessed URLs");

        UrlStatus status =
                Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

        while (!values.isEmpty()) {
            FetchSetDatum datum = values.drain();
            List<ScoredUrlDatum> urls = datum.getUrls();
            trace("Skipping %d urls from %s (e.g. %s)",
                    urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
            skipUrls(urls, status, null);
        }
    }
}
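// One way to make the RejectedExecutionException above genuinely rare (an assumption;
// this excerpt doesn't show how _executor is built): a java.util.concurrent pool over a
// SynchronousQueue whose rejection handler re-offers the task with a timeout, so
// execute() effectively blocks until a fetcher thread frees up, and only a prolonged
// stall surfaces as a rejection. The maxThreads and REQUEST_TIMEOUT_MS values are
// hypothetical.
final int maxThreads = 10;
final long REQUEST_TIMEOUT_MS = 100 * 1000L;

ThreadPoolExecutor executor = new ThreadPoolExecutor(
        maxThreads, maxThreads, 0L, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>(),
        new RejectedExecutionHandler() {
            @Override
            public void rejectedExecution(Runnable r, ThreadPoolExecutor pool) {
                try {
                    // Block until an idle worker takes the task, or give up after a bound.
                    if (!pool.getQueue().offer(r, REQUEST_TIMEOUT_MS, TimeUnit.MILLISECONDS)) {
                        throw new RejectedExecutionException("Timed out waiting for a fetch thread");
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new RejectedExecutionException(e);
                }
            }
        });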
public FetchSetDatum nextOrNull(FetcherMode mode) {
    int fetchSetsQueued = 0;

    // Loop until we have something to return, or there's nothing that we can return,
    // or we've queued up as many fetchsets as we want without any delay.
    while (!isEmpty() && (fetchSetsQueued < MAX_FETCHSETS_TO_QUEUE_PER_DELAY)) {
        // First see if we've got something in the queue, and if so, then check if it's
        // ready to be processed.
        final FetchSetDatum queueDatum = removeFromQueue();

        if (queueDatum != null) {
            String ref = queueDatum.getGroupingRef();
            if (readyToFetch(ref) || (mode == FetcherMode.IMPOLITE)) {
                List<ScoredUrlDatum> urls = queueDatum.getUrls();
                trace("Returning %d urls via queue from %s (e.g. %s)",
                        urls.size(), ref, urls.get(0).getUrl());
                return queueDatum;
            }
        }

        // Nothing ready from the top of the queue, or nothing in the queue, so let's
        // see about the iterator.
        if (safeHasNext()) {
            // Re-add the datum from the top of the queue, since we're going to want to
            // keep it around. This is safe to call with a null datum.
            addToQueue(queueDatum);

            // Now get our next FetchSet from the Hadoop iterator.
            FetchSetDatum iterDatum = new FetchSetDatum(new TupleEntry(_values.next()));
            List<ScoredUrlDatum> urls = iterDatum.getUrls();
            String ref = iterDatum.getGroupingRef();

            if (iterDatum.isSkipped()) {
                trace("Skipping %d urls via iterator from %s (e.g. %s)",
                        urls.size(), ref, urls.get(0).getUrl());
                skipUrls(urls, UrlStatus.SKIPPED_PER_SERVER_LIMIT, null);
                continue;
            }

            if ((mode == FetcherMode.IMPOLITE) || readyToFetch(ref)) {
                trace("Returning %d urls via iterator from %s (e.g. %s)",
                        urls.size(), ref, urls.get(0).getUrl());
                return iterDatum;
            }

            // We've got a datum from the iterator that's not ready to be processed, so
            // we'll stuff it into the queue.
            trace("Queuing %d urls via iterator from %s (e.g. %s)",
                    urls.size(), ref, urls.get(0).getUrl());
            addToQueue(iterDatum);
            fetchSetsQueued += 1;
            continue;
        }

        // Nothing ready from the top of the queue, and the iterator is empty too. If we
        // had something from the top of the queue (which then must not be ready), decide
        // what to do based on our FetcherMode.
        if (queueDatum != null) {
            List<ScoredUrlDatum> urls = queueDatum.getUrls();

            switch (mode) {
                case COMPLETE:
                    // Re-add the datum, since we don't want to skip it. And immediately
                    // return, as otherwise we're trapped in this loop, versus giving
                    // FetchBuffer time to delay.
                    trace("Blocked on %d urls via queue from %s (e.g. %s)",
                            urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
                    addToQueue(queueDatum);
                    return null;

                case IMPOLITE:
                    trace("Impolitely returning %d urls via queue from %s (e.g. %s)",
                            urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
                    return queueDatum;

                case EFFICIENT:
                    // In efficient fetching, we punt on items that aren't ready. And
                    // immediately return, so that FetchBuffer's loop has time to delay,
                    // as otherwise we'd likely skip everything that's in the in-memory
                    // queue (since the item we're skipping is the "best" in terms of
                    // when it's going to be ready).
                    trace("Efficiently skipping %d urls via queue from %s (e.g. %s)",
                            urls.size(), queueDatum.getGroupingRef(), urls.get(0).getUrl());
                    skipUrls(urls, UrlStatus.SKIPPED_INEFFICIENT, null);
                    return null;
            }
        }
    }

    // Either we're all out of FetchSets to process (nothing left in the iterator or the
    // queue), or we've queued up lots of sets, and we want to give FetchBuffer a chance
    // to sleep.
    return null;
}
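// For reference, the three policies nextOrNull() switches on, as this excerpt exercises
// them (a sketch; the real FetcherMode enum may carry additional members or javadoc):
public enum FetcherMode {
    COMPLETE,   // Never drop a fetch set: re-queue it and return null so FetchBuffer
                // sleeps until the set's domain is ready.
    IMPOLITE,   // Ignore per-domain crawl delays and return sets as soon as they appear.
    EFFICIENT   // Skip sets that aren't ready (SKIPPED_INEFFICIENT) rather than stalling
                // the fetch pipeline waiting for them.
}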