public FetchBuffer(BaseFetcher fetcher) {
    // We're going to output a tuple that contains a FetchedDatum, plus
    // meta-data, plus a result that could be a String, a status, or an
    // exception.
    super(FetchedDatum.FIELDS.append(FETCH_RESULT_FIELD));

    _fetcher = fetcher;
    _fetcherMode = _fetcher.getFetcherPolicy().getFetcherMode();
}
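// FETCH_RESULT_FIELD isn't defined in this excerpt. A hypothetical
// declaration (the actual field name in the Bixo source may differ) would be
// a single-entry Cascading Fields constant, appended to FetchedDatum.FIELDS
// in the constructor above:
private static final Fields FETCH_RESULT_FIELD = new Fields("fetch-result");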
@SuppressWarnings("unchecked")
@Override
public void prepare(FlowProcess flowProcess, OperationCall<NullContext> operationCall) {
    super.prepare(flowProcess, operationCall);

    // FUTURE KKr - use the Cascading process vs. creating our own, once it
    // supports logging in local mode, and a setStatus() call.
    _flowProcess = new LoggingFlowProcess(flowProcess);
    _flowProcess.addReporter(new LoggingFlowReporter());

    _executor = new ThreadedExecutor(_fetcher.getMaxThreads(),
                                     _fetcher.getFetcherPolicy().getRequestTimeout());

    _refLock = new Object();
    _pendingRefs = new ConcurrentHashMap<String, Long>();
    _activeRefs = new ConcurrentHashMap<String, Long>();

    _keepCollecting = new AtomicBoolean(true);
}
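// The field declarations are omitted from this excerpt. The following sketch
// is inferred from usage in the methods shown; names, types, and the
// placeholder constant values are assumptions, not copied from the Bixo
// source. The runtime fields would be transient because Cascading serializes
// operations; prepare() rebuilds this state on each task.
private static final Logger LOGGER = Logger.getLogger(FetchBuffer.class);

// Placeholder values; the real constants are not shown in this excerpt.
private static final long NOTHING_TO_FETCH_SLEEP_TIME = 1000L;          // ms (assumed)
private static final long HARD_TERMINATION_CLEANUP_DURATION = 10000L;   // ms (assumed)

private BaseFetcher _fetcher;
private FetcherMode _fetcherMode;

private transient ThreadedExecutor _executor;
private transient LoggingFlowProcess _flowProcess;
private transient TupleEntryCollector _collector;
private transient Object _refLock;
private transient ConcurrentHashMap<String, Long> _pendingRefs;
private transient ConcurrentHashMap<String, Long> _activeRefs;
private transient AtomicBoolean _keepCollecting;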
private synchronized void terminate() {
    if (_executor == null) {
        return;
    }

    try {
        // A worker thread can effectively "sleep" for up to
        // ThreadedExecutor.MAX_POLL_TIME while waiting for a FetchTask to be
        // queued up, so wait out one full poll interval before starting the
        // real shutdown.
        long pollTime = ThreadedExecutor.MAX_POLL_TIME;
        Thread.sleep(pollTime);

        long requestTimeout = _fetcher.getFetcherPolicy().getRequestTimeout();
        if (!_executor.terminate(requestTimeout)) {
            LOGGER.warn("Had to do a hard termination of general fetching");

            // Abort any active connections, which should give the FetchTasks
            // a chance to clean things up.
            _fetcher.abort();

            // Now give everybody who had to be interrupted some time to
            // actually write out their remaining URLs.
            Thread.sleep(HARD_TERMINATION_CLEANUP_DURATION);
        }

        // Now stop collecting results. If somebody is in the middle of a
        // collect() call, we want them to finish before we set the flag to
        // false and drop out of this method.
        synchronized (_keepCollecting) {
            _keepCollecting.set(false);
        }
    } catch (InterruptedException e) {
        // FUTURE What's the right thing to do here? E.g., do we need to
        // worry about losing URLs still to be processed?
        LOGGER.warn("Interrupted while waiting for termination");
    } finally {
        _executor = null;
    }
}
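// terminate() refers to a collect() call guarded by _keepCollecting. A
// minimal sketch of how that guard could work (an assumption inferred from
// the synchronized block above; the real Bixo implementation may differ):
public void collect(Tuple tuple) {
    // Synchronize on the flag so terminate() can't flip it to false while a
    // fetch thread is mid-collect; results arriving after shutdown are
    // dropped rather than written to a closed collector.
    synchronized (_keepCollecting) {
        if (_keepCollecting.get()) {
            _collector.add(tuple);
        } else {
            LOGGER.warn("Lost fetch result - collection disabled during termination");
        }
    }
}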
@Override
public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());
    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a FetchSetDatum that contains a set of URLs to fetch in
    // one request from a single server, plus other values needed to set
    // state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
        FetchSetDatum datum = values.nextOrNull(_fetcherMode);

        try {
            if (datum == null) {
                trace("Nothing ready to fetch, sleeping...");
                process.keepAlive();
                Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
            } else {
                List<ScoredUrlDatum> urls = datum.getUrls();
                String ref = datum.getGroupingRef();
                trace("Processing %d URLs for %s", urls.size(), ref);

                Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
                if (datum.isLastList()) {
                    makeActive(ref, 0L);
                    trace("Executing fetch of %d URLs from %s (last batch)",
                          urls.size(), ref);
                } else {
                    Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
                    makeActive(ref, nextFetchTime);
                    trace("Executing fetch of %d URLs from %s (next fetch time %d)",
                          urls.size(), ref, nextFetchTime);
                }

                long startTime = System.currentTimeMillis();

                try {
                    _executor.execute(doFetch);
                } catch (RejectedExecutionException e) {
                    // Should never happen.
                    LOGGER.error("Fetch pool rejected our fetch list for " + ref);

                    finished(ref);
                    skipUrls(urls, UrlStatus.SKIPPED_DEFERRED,
                             String.format("Execution rejection skipped %d URLs", urls.size()));
                }

                // Adjust for how long it took to get the request queued.
                adjustActive(ref, System.currentTimeMillis() - startTime);
            }
        } catch (InterruptedException e) {
            LOGGER.warn("FetchBuffer interrupted!");
            Thread.currentThread().interrupt();
        }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
        trace("Found unprocessed URLs");

        UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED
                                                : UrlStatus.SKIPPED_TIME_LIMIT;

        while (!values.isEmpty()) {
            FetchSetDatum datum = values.drain();
            List<ScoredUrlDatum> urls = datum.getUrls();
            trace("Skipping %d urls from %s (e.g. %s)",
                  urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
            skipUrls(urls, status, null);
        }
    }
}
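// A minimal sketch of the ref-tracking helpers used by operate(); these are
// assumptions inferred from the _pendingRefs/_activeRefs maps above, not the
// actual Bixo implementations. A grouping ref moves from "pending" to
// "active" when its fetch set is queued; the stored Long is the earliest time
// the next fetch set for that ref should run (0 means no follow-on fetch).
private void makeActive(String ref, Long nextFetchTime) {
    synchronized (_refLock) {
        _pendingRefs.remove(ref);
        _activeRefs.put(ref, nextFetchTime);
    }
}

// Push out the next fetch time to account for the delay in queuing the
// request (called with the elapsed queuing time, per operate() above).
private void adjustActive(String ref, long deltaMillis) {
    synchronized (_refLock) {
        Long nextFetchTime = _activeRefs.get(ref);
        if ((nextFetchTime != null) && (nextFetchTime != 0)) {
            _activeRefs.put(ref, nextFetchTime + deltaMillis);
        }
    }
}

// Called when all URLs for a ref have been processed (e.g. by FetchTask, or
// by operate() when an execution is rejected).
public void finished(String ref) {
    synchronized (_refLock) {
        _activeRefs.remove(ref);
    }
}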