コード例 #1
0
  /**
   * Fills the bundle returned by {@code getBundleForDomain(domainFP)} with the given items and
   * then flushes it via {@code flushCurrentBundle(reporter)}.
   *
   * <p>For each item, in list order: the item is appended to the bundle's URL list,
   * {@code currentDomainURLCount} is incremented, and the running maximum page rank (which starts
   * at {@code 0.0f}) is updated. While the running count is at most 200, a tab-separated line of
   * URL, modified status and page rank is appended to {@code urlDebugURLWriter}.
   *
   * @param domainFP fingerprint used to look up the bundle to populate
   * @param items items to add to the bundle
   * @param reporter passed through to {@code flushCurrentBundle}
   * @throws IOException declared for the debug writer append and the flush call
   */
  void generateABundle(long domainFP, List<SegmentGeneratorItem> items, Reporter reporter)
      throws IOException {

    final SegmentGeneratorItemBundle domainBundle = getBundleForDomain(domainFP);

    // running maximum over all items; never drops below its 0.0f starting value
    float highestRank = 0.0f;

    for (SegmentGeneratorItem entry : items) {
      domainBundle.getUrls().add(entry);
      currentDomainURLCount++;
      highestRank = Math.max(highestRank, entry.getPageRank());

      // debug output is limited to the first 200 counted URLs
      if (currentDomainURLCount > 200) {
        continue;
      }

      String debugLine =
          entry.getUrl() + "\t" + entry.getModifiedStatus() + "\t" + entry.getPageRank() + "\n";
      urlDebugURLWriter.append(debugLine);
    }

    // record the highest rank seen on the bundle before flushing it
    domainBundle.setMaxPageRank(highestRank);

    flushCurrentBundle(reporter);
  }
コード例 #2
0
  /**
   * Emits the currently accumulated URL record (if it qualifies) into {@code items} and then
   * resets the per-record state.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>When {@code _flags} is non-zero, bumps one counter for whether {@code _domainStats} is
   *       null and one for whether {@code _crawlStatus} is null.
   *   <li>If {@code _contextURLBytes.getLength()} is 4097 or more, the URL is counted as invalid
   *       length and skipped.
   *   <li>Otherwise the URL is parsed; unless {@code skipRecord} rejects it, its fingerprint is
   *       computed. A null fingerprint or a fingerprint already present in
   *       {@code _emittedURLSFilter} is only counted. A new fingerprint is added to the filter and
   *       a {@code SegmentGeneratorItem} (page rank 0, modified status 0) is appended to
   *       {@code items}, spilling via {@code spillItems} once {@code SPILL_THRESHOLD} is reached.
   *   <li>Finally {@code _flags}, {@code _crawlStatus}, {@code _contextURLBytes} and
   *       {@code _blogURLSkipFlag} are reset.
   * </ol>
   *
   * @param reporter receives the counter increments and is passed to the helper calls
   * @throws IOException declared for the helper calls made here (e.g. {@code spillItems})
   */
  void emitLastRecord(Reporter reporter) throws IOException {

    if (_flags != 0) {
      reporter.incrCounter(
          _domainStats != null
              ? Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS
              : Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS,
          1);

      reporter.incrCounter(
          _crawlStatus == null
              ? Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS
              : Counters.EMITTED_RECORD_HAD_CRAWLSTATUS,
          1);
    }

    if (_contextURLBytes.getLength() < 4097) {
      GoogleURL parsedURL = new GoogleURL(_contextURLBytes.toString());

      if (!skipRecord(parsedURL, reporter)) {

        if (parsedURL.has_query()) {
          reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
        }

        URLFPV2 fingerprint = URLUtils.getURLFPV2FromURLObject(parsedURL);

        if (fingerprint == null) {
          reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
        } else if (_emittedURLSFilter.isPresent(fingerprint)) {
          reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
        } else {
          // remember this fingerprint so the same URL is not emitted again
          _emittedURLSFilter.add(fingerprint);
          _emittedURLSInFilter++;

          SegmentGeneratorItem newItem = new SegmentGeneratorItem();
          newItem.setDomainFP(fingerprint.getDomainHash());
          newItem.setRootDomainFP(fingerprint.getRootDomainHash());
          newItem.setUrlFP(fingerprint.getUrlHash());
          newItem.setUrl(parsedURL.getCanonicalURL());
          newItem.setPageRank(0);
          newItem.setModifiedStatus((byte) 0);

          items.add(newItem);

          // spill once the in-memory buffer has reached the threshold
          if (items.size() >= SPILL_THRESHOLD) {
            spillItems(reporter);
          }
        }
      }
    } else {
      reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
    }

    // clear per-record state so the next record starts clean
    _flags = 0;
    _crawlStatus = null;
    _contextURLBytes.clear();
    _blogURLSkipFlag.set(true);
  }