@Override
    public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
      // When not configured for Fields.NONE, simply pass the grouping tuple through.
      if (!_useFiledsNone) {
        bufferCall.getOutputCollector().add(bufferCall.getGroup());
        return;
      }

      // Fields.NONE mode: emit a single tuple holding the number of values in the group.
      // The iterator must still be fully consumed even though the values are unused.
      int tupleCount = 0;
      Iterator<TupleEntry> arguments = bufferCall.getArgumentsIterator();
      while (arguments.hasNext()) {
        arguments.next();
        tupleCount++;
      }

      bufferCall.getOutputCollector().add(new Tuple(tupleCount));
    }
  @Override
  public void operate(FlowProcess flowProcess, BufferCall<NullContext> bufferCall) {
    // One group per protocol+domain: spool its URLs to disk, then hand the whole
    // set to the robots-processing thread pool as a single task.
    TupleEntry group = bufferCall.getGroup();
    String protocolAndDomain = group.getString(0);
    LOGGER.info("Processing tuple group: " + group);

    // DiskQueue keeps at most MAX_URLS_IN_MEMORY entries in RAM, spilling the rest.
    DiskQueue<GroupedUrlDatum> urls = new DiskQueue<GroupedUrlDatum>(MAX_URLS_IN_MEMORY);
    Iterator<TupleEntry> values = bufferCall.getArgumentsIterator();
    while (values.hasNext()) {
      // Copy the entry — Cascading reuses the TupleEntry returned by the iterator.
      urls.add(new GroupedUrlDatum(new TupleEntry(values.next())));
    }

    try {
      Runnable doRobots =
          new ProcessRobotsTask(
              protocolAndDomain,
              _scorer,
              urls,
              _fetcher,
              _parser,
              bufferCall.getOutputCollector(),
              _flowProcess);
      _executor.execute(doRobots);
    } catch (RejectedExecutionException e) {
      // should never happen.
      LOGGER.error("Robots handling pool rejected our request for " + protocolAndDomain);
      deferRejectedUrls(urls, bufferCall, flowProcess);
    } catch (Throwable t) {
      LOGGER.error(
          "Caught an unexpected throwable - robots handling rejected our request for "
              + protocolAndDomain,
          t);
      deferRejectedUrls(urls, bufferCall, flowProcess);
    }
  }

  /**
   * Records rejection counters and re-routes every queued URL to the deferred grouping
   * key, so the URLs are retried later instead of being dropped.
   *
   * <p>NOTE(review): counters use the {@code _flowProcess} field while emptyQueue is
   * handed the {@code flowProcess} parameter — preserved from the original, but confirm
   * both refer to the same process.
   */
  private void deferRejectedUrls(
      DiskQueue<GroupedUrlDatum> urls,
      BufferCall<NullContext> bufferCall,
      FlowProcess flowProcess) {
    _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
    _flowProcess.increment(FetchCounters.URLS_REJECTED, urls.size());
    ProcessRobotsTask.emptyQueue(
        urls, GroupingKey.DEFERRED_GROUPING_KEY, bufferCall.getOutputCollector(), flowProcess);
  }
  @Override
  public void operate(FlowProcess flowProcess, BufferCall bufferCall) {
    // Merges all serialized HyperLogLog sketches in this group into one estimator.
    // Each argument tuple carries one sketch as a byte[] in field 0.
    Iterator<TupleEntry> it = bufferCall.getArgumentsIterator();
    HyperLogLog merged = null;

    try {
      while (it.hasNext()) {
        TupleEntry tupleEntry = it.next();
        byte[] serialized = (byte[]) tupleEntry.getObject(0);

        // Rehydrate the sketch, then fold it into the running merge.
        HyperLogLog hll = HyperLogLog.Builder.build(serialized);
        if (merged == null) {
          merged = hll;
        } else {
          merged = (HyperLogLog) merged.merge(hll);
        }
      }
    } catch (Exception e) {
      // merge() can throw CardinalityMergeException; wrap as unchecked with cause.
      throw new RuntimeException(e);
    }
    // NOTE(review): `merged` is never written to the output collector, so this buffer
    // emits nothing — looks like a missing
    // bufferCall.getOutputCollector().add(...) step; confirm against the original source.
  }
// Example #4
  public void operate(FlowProcess flowProcess, BufferCall<TupleEntryCollector> bufferCall) {
    // This buffer runs ungrouped, so a joiner closure must never be present.
    if (bufferCall.getJoinerClosure() != null) {
      throw new IllegalStateException("joiner closure should be null");
    }

    if (insertHeader) {
      bufferCall.getOutputCollector().add(new Tuple(value));
    }

    Iterator<TupleEntry> arguments = bufferCall.getArgumentsIterator();

    while (arguments.hasNext()) {
      TupleEntry entry = arguments.next(); // must be called

      // Guard: every argument tuple is expected to have exactly expectedSize fields.
      if (expectedSize != -1 && entry.size() != expectedSize) {
        throw new RuntimeException("arguments wrong size");
      }

      // When a sink path is configured, tee each argument into the context collector.
      if (path != null) {
        bufferCall.getContext().add(entry);
      }

      if (value != null) {
        bufferCall.getOutputCollector().add(new Tuple(value));
      } else {
        bufferCall.getOutputCollector().add(entry); // copy
      }
    }

    if (insertFooter) {
      bufferCall.getOutputCollector().add(new Tuple(value));
    }

    arguments.hasNext(); // regression
  }
// Example #5
  @Override
  public void operate(FlowProcess process, BufferCall<NullContext> buffCall) {
    // Drains the grouped fetch sets, dispatching each batch of URLs to the fetch
    // thread pool until interrupted, the fetch policy terminates, or the queue is empty.
    QueuedValues values = new QueuedValues(buffCall.getArgumentsIterator());

    _collector = buffCall.getOutputCollector();
    FetcherPolicy fetcherPolicy = _fetcher.getFetcherPolicy();

    // Each value is a PreFetchedDatum that contains a set of URLs to fetch in one request from
    // a single server, plus other values needed to set state properly.
    while (!Thread.interrupted() && !fetcherPolicy.isTerminateFetch() && !values.isEmpty()) {
      // May return null when no domain is currently eligible (e.g. crawl-delay pending).
      FetchSetDatum datum = values.nextOrNull(_fetcherMode);

      try {
        if (datum == null) {
          trace("Nothing ready to fetch, sleeping...");
          // keepAlive() tells Hadoop we're still making progress before we block.
          process.keepAlive();
          Thread.sleep(NOTHING_TO_FETCH_SLEEP_TIME);
        } else {
          List<ScoredUrlDatum> urls = datum.getUrls();
          String ref = datum.getGroupingRef();
          trace("Processing %d URLs for %s", urls.size(), ref);

          Runnable doFetch = new FetchTask(this, _fetcher, urls, ref);
          if (datum.isLastList()) {
            // Last batch for this domain: no future fetch time needs to be reserved.
            makeActive(ref, 0L);
            trace("Executing fetch of %d URLs from %s (last batch)", urls.size(), ref);
          } else {
            // Reserve the next fetch slot, honoring the per-domain fetch delay.
            Long nextFetchTime = System.currentTimeMillis() + datum.getFetchDelay();
            makeActive(ref, nextFetchTime);
            trace(
                "Executing fetch of %d URLs from %s (next fetch time %d)",
                urls.size(), ref, nextFetchTime);
          }

          long startTime = System.currentTimeMillis();

          try {
            _executor.execute(doFetch);
          } catch (RejectedExecutionException e) {
            // should never happen.
            LOGGER.error("Fetch pool rejected our fetch list for " + ref);

            finished(ref);
            skipUrls(
                urls,
                UrlStatus.SKIPPED_DEFERRED,
                String.format("Execution rejection skipped %d URLs", urls.size()));
          }

          // Adjust for how long it took to get the request queued.
          adjustActive(ref, System.currentTimeMillis() - startTime);
        }
      } catch (InterruptedException e) {
        LOGGER.warn("FetchBuffer interrupted!");
        // Restore the interrupt flag so the outer while-loop condition sees it and exits.
        Thread.currentThread().interrupt();
      }
    }

    // Skip all URLs that we've got left.
    if (!values.isEmpty()) {
      trace("Found unprocessed URLs");

      // Thread.interrupted() also clears the flag; distinguishes interrupt vs time limit.
      UrlStatus status =
          Thread.interrupted() ? UrlStatus.SKIPPED_INTERRUPTED : UrlStatus.SKIPPED_TIME_LIMIT;

      while (!values.isEmpty()) {
        FetchSetDatum datum = values.drain();
        List<ScoredUrlDatum> urls = datum.getUrls();
        trace(
            "Skipping %d urls from %s (e.g. %s) ",
            urls.size(), datum.getGroupingRef(), urls.get(0).getUrl());
        skipUrls(urls, status, null);
      }
    }
  }