コード例 #1
0
  /**
   * Decides whether the URL record currently being accumulated should be dropped
   * instead of emitted. Every decision path bumps a dedicated counter on
   * {@code reporter} so the outcome distribution can be inspected afterwards.
   *
   * <p>Checks are applied in this order:
   * <ol>
   *   <li>the {@code _skipDomain} flag is set: skip;</li>
   *   <li>the URL is not valid: skip;</li>
   *   <li>the URL has a query part: skip unless homepage or feed data was seen;</li>
   *   <li>redirect data was seen: skip;</li>
   *   <li>homepage/feed data was seen: keep, unless only homepages are allowed and
   *       this is not a homepage;</li>
   *   <li>only homepages are allowed: skip;</li>
   *   <li>the crawl status reports an HTTP result of 200 or 404: skip, except for
   *       blog-probe URLs whose {@code _blogURLSkipFlag} is false, which are kept;</li>
   *   <li>otherwise: keep.</li>
   * </ol>
   *
   * @param urlObject parsed form of the URL under consideration
   * @param reporter  sink for the per-decision counters
   * @return {@code true} if the record must be skipped, {@code false} if it may be emitted
   */
  boolean skipRecord(GoogleURL urlObject, Reporter reporter) {
    if (_skipDomain) {
      reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_URL, 1);
      return true;
    }

    if (!urlObject.isValid()) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_URL, 1);
      return true;
    }

    final boolean sawHomepageOrFeed = (_flags & (HAS_HOMEPAGE_URLDATA | HAS_FEED_URLDATA)) != 0;

    // Query URLs are only let through when homepage or feed data backs them.
    if (urlObject.has_query()) {
      reporter.incrCounter(Counters.HIT_QUERY_CHECK_CONDITION, 1);
      if (sawHomepageOrFeed) {
        return false;
      }
      reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
      return true;
    }

    // From here on the URL is valid and has no query part.
    if ((_flags & HAS_REDIRECT_DATA) != 0) {
      reporter.incrCounter(Counters.SKIPPING_REDIRECTED_URL, 1);
      return true;
    }

    final boolean sawHomepage = (_flags & HAS_HOMEPAGE_URLDATA) != 0;
    if (sawHomepageOrFeed && (sawHomepage || !_skipEverythingButHomepage)) {
      reporter.incrCounter(Counters.ALLOWING_HOMEPAGE_OR_FEEDURL, 1);
      return false;
    }

    if (_skipEverythingButHomepage) {
      reporter.incrCounter(Counters.SKIPPING_EVERYTHING_BUT_HOMEPAGE_URL, 1);
      return true;
    }

    // Without a recorded HTTP result there is no reason to skip.
    if (_crawlStatus == null || !_crawlStatus.has("crawl_status")) {
      return false;
    }
    JsonObject crawlDetails = _crawlStatus.get("crawl_status").getAsJsonObject();
    if (!crawlDetails.has("http_result")) {
      return false;
    }

    int resultCode = crawlDetails.get("http_result").getAsInt();
    if (resultCode != 200 && resultCode != 404) {
      return false;
    }

    // A 200/404 result is already on record: skip unless this is a blog-probe URL.
    if ((_flags & HAS_BLOGPROBE_URLDATA) == 0) {
      reporter.incrCounter(Counters.SKIPPING_ALREADY_FETCHED, 1);
      return true;
    }

    if (_blogURLSkipFlag.get()) {
      reporter.incrCounter(Counters.SKIPPING_BLOGPROBE_URL, 1);
      return true;
    }

    reporter.incrCounter(Counters.RECRAWLING_BLOGPROBE_URL, 1);
    return false;
  }
コード例 #2
0
  /**
   * Finishes processing of the URL accumulated in {@code _contextURLBytes}: records
   * bookkeeping counters, queues the URL for output when it qualifies, and then
   * resets the per-URL state ({@code _flags}, {@code _crawlStatus},
   * {@code _contextURLBytes}, {@code _blogURLSkipFlag}) for the next record.
   *
   * <p>The reset runs only if the emit step completes normally; an
   * {@link IOException} propagates to the caller before the state is cleared.
   *
   * @param reporter sink for the counters incremented along the way
   * @throws IOException if spilling the buffered items fails
   */
  void emitLastRecord(Reporter reporter) throws IOException {
    // Bookkeeping is recorded only when at least one flag was set for this URL.
    if (_flags != 0) {
      reporter.incrCounter(
          _domainStats == null
              ? Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS
              : Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS,
          1);
      reporter.incrCounter(
          _crawlStatus == null
              ? Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS
              : Counters.EMITTED_RECORD_HAD_CRAWLSTATUS,
          1);
    }

    emitContextUrlIfEligible(reporter);

    // Clear per-URL state so the next record starts from a clean slate.
    _flags = 0;
    _crawlStatus = null;
    _contextURLBytes.clear();
    _blogURLSkipFlag.set(true);
  }

  /**
   * Queues the URL held in {@code _contextURLBytes} into {@code items} unless it is
   * too long, rejected by {@link #skipRecord}, has no fingerprint, or was already
   * emitted; spills {@code items} once {@code SPILL_THRESHOLD} is reached.
   */
  private void emitContextUrlIfEligible(Reporter reporter) throws IOException {
    // URLs whose reported length is 4097 or more are rejected outright.
    if (_contextURLBytes.getLength() >= 4097) {
      reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
      return;
    }

    GoogleURL parsedUrl = new GoogleURL(_contextURLBytes.toString());
    if (skipRecord(parsedUrl, reporter)) {
      return;
    }

    if (parsedUrl.has_query()) {
      reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
    }

    URLFPV2 fingerprint = URLUtils.getURLFPV2FromURLObject(parsedUrl);
    if (fingerprint == null) {
      reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
      return;
    }

    // The filter guards against queuing the same fingerprint twice.
    if (_emittedURLSFilter.isPresent(fingerprint)) {
      reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
      return;
    }
    _emittedURLSFilter.add(fingerprint);
    _emittedURLSInFilter++;

    SegmentGeneratorItem entry = new SegmentGeneratorItem();
    entry.setDomainFP(fingerprint.getDomainHash());
    entry.setRootDomainFP(fingerprint.getRootDomainHash());
    entry.setUrlFP(fingerprint.getUrlHash());
    entry.setUrl(parsedUrl.getCanonicalURL());
    entry.setPageRank(0);
    entry.setModifiedStatus((byte) 0);
    items.add(entry);

    if (items.size() >= SPILL_THRESHOLD) {
      spillItems(reporter);
    }
  }