@Override
public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    if (_useFiledsNone) {
        // No argument fields were selected - just count the tuples in the group and emit the count.
        int count = 0;
        Iterator<TupleEntry> iter = bufferCall.getArgumentsIterator();
        while (iter.hasNext()) {
            iter.next();
            count += 1;
        }

        bufferCall.getOutputCollector().add(new Tuple(count));
    } else {
        // Otherwise emit the grouping tuple as-is.
        bufferCall.getOutputCollector().add(bufferCall.getGroup());
    }
}
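For context, a Buffer whose operate() looks like the one above runs once per group, so it has to sit behind a grouping pipe. A minimal sketch of the wiring, assuming a hypothetical CountBuffer class that wraps this operate() (the pipe and field names are illustrative, not from the original code):

Pipe pipe = new Pipe("urls");
// Group tuples so the Buffer sees one iterator of values per key.
pipe = new GroupBy(pipe, new Fields("domain"));
// Fields.NONE passes no argument fields, matching the _useFiledsNone branch above.
pipe = new Every(pipe, Fields.NONE, new CountBuffer(), Fields.RESULTS);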
@Override
public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // Spill the group's URLs to disk if they won't all fit in memory.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
        urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
        Runnable doRobots = new ProcessRobotsTask(protocolAndDomain, _scorer, urls, _fetcher,
                        _parser, bufferCall.getOutputCollector(), _flowProcess);
        _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
        // should never happen.
        LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);

        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY,
                        bufferCall.getOutputCollector(), flowProcess);
    } catch (Throwable t) {
        LOGGER.error("Caught an unexpected throwable - robots handling rejected our request for "
                        + protocolAndDomain, t);

        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
        ProcessRobotsTask.emptyQueue(urls, GroupingKey.DEFERRED_GROUPING_KEY,
                        bufferCall.getOutputCollector(), flowProcess);
    }
}
@Override
public void operate(FlowProcess flowProcess, BufferCall bufferCall) {
    Iterator<TupleEntry> it = bufferCall.getArgumentsIterator();
    HyperLogLog merged = null;

    try {
        while (it.hasNext()) {
            TupleEntry tupleEntry = it.next();

            // Each tuple carries a serialized HyperLogLog sketch in field 0.
            byte[] serialized = (byte[]) tupleEntry.getObject(0);
            HyperLogLog hll = HyperLogLog.Builder.build(serialized);

            if (merged == null) {
                merged = hll;
            } else {
                merged = (HyperLogLog) merged.merge(hll);
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // NOTE: as written, the merged sketch is never emitted. A typical completion (assumed,
    // not part of the original snippet) would add it to the output collector, e.g.:
    //     bufferCall.getOutputCollector().add(new Tuple(merged.cardinality()));
}
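The buffer above expects each incoming tuple to carry a serialized HyperLogLog sketch in field 0. A minimal sketch of the producer side, assuming the stream-lib HyperLogLog used in the snippet (the class and method names here are illustrative, not from the original code):

import com.clearspring.analytics.stream.cardinality.HyperLogLog;
import java.io.IOException;

public class HllSketchProducer {
    // Builds the serialized sketch that the merging buffer expects in field 0.
    public static byte[] buildSketch(Iterable<String> values) throws IOException {
        HyperLogLog hll = new HyperLogLog(12); // log2m = 12, roughly 1.6% standard error
        for (String value : values) {
            hll.offer(value);
        }
        return hll.getBytes();
    }
}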
public void operate(FlowProcess flowProcess, BufferCall<TupleEntryCollector> bufferCall) {
    if (bufferCall.getJoinerClosure() != null)
        throw new IllegalStateException("joiner closure should be null");

    if (insertHeader)
        bufferCall.getOutputCollector().add(new Tuple(value));

    Iterator<TupleEntry> iterator = bufferCall.getArgumentsIterator();

    while (iterator.hasNext()) {
        TupleEntry arguments = iterator.next(); // must be called

        if (expectedSize != -1 && arguments.size() != expectedSize)
            throw new RuntimeException("arguments wrong size");

        if (path != null)
            bufferCall.getContext().add(arguments);

        if (value != null)
            bufferCall.getOutputCollector().add(new Tuple(value));
        else
            bufferCall.getOutputCollector().add(arguments); // copy
    }

    if (insertFooter)
        bufferCall.getOutputCollector().add(new Tuple(value));

    iterator.hasNext(); // regression
}
@Override
public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a FetchSetDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
        FetchSetDatum datum = values.nextOrNull(_fetcherMode);

        try {
            if (datum == null) {
                trace("Nothing ready to fetch, sleeping...");
                process.keepAlive();
                Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
            } else {
                List<ScoredUrlDatum> urls = datum.getUrls();
                String ref = datum.getGroupingRef();
                trace("Processing %d URLs for %s", urls.size(), ref);

                Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
                if (datum.isLastList()) {
                    makeActive(ref, 0L);
                    trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
                } else {
                    Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
                    makeActive(ref, nextFetchTime);
                    trace("Executing fetch of %d URLs from %s (next fetch time %d)", urls.size(),
                                    ref, nextFetchTime);
                }

                long startTime = System.currentTimeMillis();

                try {
                    _executor.execute(doFetch);
                } catch (RejectedExecutionException e) {
                    // should never happen.
                    LOGGER.error("Fetch pool rejected our fetch list for " + ref);

                    finished(ref);
                    skipUrls(urls, UrlStatus.SKIPPED_DEFERRED,
                                    String.format("Execution rejection skipped %d URLs", urls.size()));
                }

                // Adjust for how long it took to get the request queued.
                adjustActive(ref, System.currentTimeMillis() - startTime);
            }
        } catch (InterruptedException e) {
            LOGGER.warn("FetchBuffer interrupted!");
            Thread.currentThread().interrupt();
        }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
        trace("Found unprocessed URLs");

        UrlStatus status = Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

        while (!values.isEmpty()) {
            FetchSetDatum datum = values.drain();
            List<ScoredUrlDatum> urls = datum.getUrls();
            trace("Skipping %d urls from %s (e.g. %s) ", urls.size(), datum.getGroupingRef(),
                            urls.get(0).getUrl());
            skipUrls(urls, status, null);
        }
    }
}