コード例 #1
0
  /**
   * Emits a {@link SegmentGeneratorItem} for the URL currently held in {@code _contextURLBytes},
   * provided it survives every filter, and then resets the per-URL accumulation state.
   *
   * <p>A URL is dropped (and a counter bumped) when its byte length is 4097 or more, when
   * {@code skipRecord} rejects it, when no fingerprint can be derived for it, or when the
   * emitted-URL filter already contains its fingerprint. Accepted items are appended to
   * {@code items}, which is spilled once it reaches {@code SPILL_THRESHOLD}.
   *
   * @param reporter sink for the bookkeeping counters
   * @throws IOException declared for the helpers invoked here (e.g. {@code spillItems})
   */
  void emitLastRecord(Reporter reporter) throws IOException {

    // Diagnostic counters are only recorded when at least one flag was set for this URL.
    if (_flags != 0) {
      reporter.incrCounter(
          (_domainStats != null)
              ? Counters.EMITTING_URL_RECORD_WITH_DOMINSTATS
              : Counters.EMITTING_URL_RECORD_WITH_NULL_DOMAINSTATS,
          1);
      reporter.incrCounter(
          (_crawlStatus == null)
              ? Counters.EMITTED_RECORD_HAD_NULL_CRAWLSTATUS
              : Counters.EMITTED_RECORD_HAD_CRAWLSTATUS,
          1);
    }

    if (_contextURLBytes.getLength() < 4097) {
      GoogleURL parsedURL = new GoogleURL(_contextURLBytes.toString());

      if (!skipRecord(parsedURL, reporter)) {

        if (parsedURL.has_query()) {
          reporter.incrCounter(Counters.LET_THROUGH_QUERY_URL, 1);
        }

        URLFPV2 fingerprint = URLUtils.getURLFPV2FromURLObject(parsedURL);
        if (fingerprint == null) {
          reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
        } else if (_emittedURLSFilter.isPresent(fingerprint)) {
          reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
        } else {
          // First sighting: remember the fingerprint, then queue the item.
          _emittedURLSFilter.add(fingerprint);
          _emittedURLSInFilter++;

          SegmentGeneratorItem generatorItem = new SegmentGeneratorItem();
          generatorItem.setDomainFP(fingerprint.getDomainHash());
          generatorItem.setRootDomainFP(fingerprint.getRootDomainHash());
          generatorItem.setUrlFP(fingerprint.getUrlHash());
          generatorItem.setUrl(parsedURL.getCanonicalURL());
          generatorItem.setPageRank(0);
          generatorItem.setModifiedStatus((byte) 0);

          items.add(generatorItem);

          if (items.size() >= SPILL_THRESHOLD) {
            spillItems(reporter);
          }
        }
      }
    } else {
      reporter.incrCounter(Counters.SKIPPING_INVALID_LENGTH_URL, 1);
    }

    // Reset the per-URL state regardless of whether anything was emitted.
    _flags = 0;
    _crawlStatus = null;
    _contextURLBytes.clear();
    _blogURLSkipFlag.set(true);
  }
コード例 #2
0
 /**
  * Decodes the raw value held in {@code dataBuffer} into text and parses that text as a JSON
  * object.
  *
  * @param dataBuffer buffer holding the raw value bytes
  * @param stream reusable input buffer used while decoding
  * @param tempTextBuffer reusable text buffer that receives the decoded text
  * @return the parsed JSON object
  * @throws IOException if decoding fails, or if the text cannot be parsed as a JSON object; in
  *     the latter case the underlying parse failure is attached as the cause
  */
 private JsonObject rawValueToJsonObject(
     DataOutputBuffer dataBuffer, DataInputBuffer stream, TextBytes tempTextBuffer)
     throws IOException {
   rawValueToTextBytes(dataBuffer, stream, tempTextBuffer);
   try {
     return parser.parse(tempTextBuffer.toString()).getAsJsonObject();
   } catch (Exception e) {
     // Chain the original exception so the real parse error (and its stack trace) is not lost.
     throw new IOException(
         "Exception Building Json from String:" + tempTextBuffer.toString(), e);
   }
 }
コード例 #3
0
    /**
     * Scans {@code inputData} as a sequence of {@code <domain hash> TAB <url> LF} entries and
     * stores each url in {@code incoming}, keyed by the parsed (base 10) domain hash.
     *
     * <p>Scanning stops at the first position from which no further tab, or no line feed after
     * that tab, can be found; a trailing entry without a terminating line feed is therefore
     * ignored.
     *
     * @param inputData buffer holding the tab/line-feed delimited sample records
     */
    void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
      final byte[] tabDelimiter = {0x9};
      final byte[] lineDelimiter = {0xA};

      final byte[] buffer = inputData.getBytes();
      int cursor = inputData.getOffset();
      final int limit = inputData.getOffset() + inputData.getLength();

      TextBytes urlHolder = new TextBytes();

      while (cursor != limit) {
        int tabAt = ByteArrayUtils.indexOf(buffer, cursor, limit - cursor, tabDelimiter);
        if (tabAt == -1) {
          break;
        }

        int urlStart = tabAt + 1;
        int lineEnd = ByteArrayUtils.indexOf(buffer, urlStart, limit - urlStart, lineDelimiter);
        if (lineEnd == -1) {
          break;
        }

        // Bytes before the tab are the decimal domain hash; bytes after it, up to the LF, the url.
        long sourceDomainHash = ByteArrayUtils.parseLong(buffer, cursor, tabAt - cursor, 10);
        urlHolder.set(buffer, urlStart, lineEnd - urlStart);
        incoming.put(sourceDomainHash, urlHolder.toString());

        cursor = lineEnd + 1;
      }
    }
コード例 #4
0
  /**
   * Installs the statistics record for the current domain and derives the skip decisions from
   * it.
   *
   * <ul>
   *   <li>{@code _domainRank} is taken from the {@code "dR"} property, or 0.0 when absent.
   *   <li>When the record has a {@code "urls"} count: a domain that has URLs and crawl attempts
   *       but no HTTP 200 responses is restricted to its homepage; otherwise, large domains
   *       must meet a minimum domain rank for their size band or the whole domain is skipped.
   *   <li>Finally, the emitted-URL filter is cleared once it has absorbed
   *       {@code FLUSH_THRESHOLD} entries.
   * </ul>
   *
   * <p>The {@code "crawled"} and {@code "200"} properties are optional and default to 0.
   *
   * @param domainStats JSON statistics for the domain; must not be null
   * @param reporter sink for the bookkeeping counters
   * @throws IOException declared for interface compatibility with callers
   */
  void setDomainStats(JsonObject domainStats, Reporter reporter) throws IOException {

    _domainStats = domainStats;

    double domainRank = _domainStats.has("dR") ? _domainStats.get("dR").getAsDouble() : 0.0;
    _domainRank = domainRank;

    if (_domainStats.has("urls")) {
      int urlCount = _domainStats.get("urls").getAsInt();
      // "crawled" may be missing from sparse records; treat it like "200" and default to zero
      // instead of failing with a NullPointerException.
      int crawledCount = _domainStats.has("crawled") ? _domainStats.get("crawled").getAsInt() : 0;
      int http200Count = _domainStats.has("200") ? _domainStats.get("200").getAsInt() : 0;

      if (urlCount != 0 && crawledCount != 0 && http200Count == 0) {
        reporter.incrCounter(Counters.SKIPPING_BAD_DOMAIN_BASED_ON_CRAWL_HISTORY, 1);
        LOG.info(
            "Skipping Everything But Homepage for Domain:"
                + _newDomainBytes.toString()
                + " CrawledCount:"
                + crawledCount
                + " HTTP200Count:"
                + http200Count
                + " URLCount:"
                + urlCount);
        _skipEverythingButHomepage = true;
      } else {
        // Minimum domain rank demanded by the domain's size band. Negative infinity means
        // "no requirement", because no rank compares below it.
        // NOTE(review): the bands leave 100000..250000 and exactly 1000000 ungated, which looks
        // unintentional -- confirm the intended thresholds before changing them.
        double requiredRank = Double.NEGATIVE_INFINITY;
        if (urlCount > 25000 && urlCount < 100000) {
          requiredRank = 3.0;
        } else if (urlCount > 250000 && urlCount < 1000000) {
          requiredRank = 4.0;
        } else if (urlCount > 1000000) {
          requiredRank = 5.0;
        }

        // A missing "dR" yields rank 0.0, which is below every band's requirement.
        if (domainRank < requiredRank) {
          LOG.info("Skipping Domain:" + _newDomainBytes.toString());
          reporter.incrCounter(Counters.SKIPPING_DOMAIN_EXCEEDED_URL_COUNT_AND_LOW_DR, 1);
          _skipDomain = true;
        }
      }
    }

    if (_emittedURLSInFilter >= FLUSH_THRESHOLD) {
      _emittedURLSFilter.clear();
      _emittedURLSInFilter = 0;
      reporter.incrCounter(Counters.FLUSHED_BLOOMFILTER, 1);
    }
  }
コード例 #5
0
 /**
  * Reads a vint-length-prefixed byte sequence out of {@code dataBuffer} into {@code textOut}.
  *
  * @param dataBuffer buffer holding the length-prefixed bytes
  * @param inputBuffer reusable input buffer; it is reset onto {@code dataBuffer}'s contents
  * @param textOut receives the bytes that follow the length prefix
  * @throws IOException if the length prefix cannot be read
  */
 private static void rawValueToTextBytes(
     DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer, TextBytes textOut)
     throws IOException {
   // Re-point the reusable input buffer at the bytes written so far.
   inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());

   // The payload length comes first; the position after reading it is where the payload starts.
   final int payloadLength = WritableUtils.readVInt(inputBuffer);
   final int payloadStart = inputBuffer.getPosition();

   textOut.set(inputBuffer.getData(), payloadStart, payloadLength);
 }
コード例 #6
0
  /**
   * Round-trips link-source tracking data: tracks each URL twice, verifies that exactly one
   * line per URL was written (domain hash, tab, url), then re-imports the written text into a
   * fresh buffer and filter and checks that the output is reproduced unchanged.
   */
  @Test
  public void testSourceInputOutputWriters() throws IOException {
    _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
    _sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);

    String sourceDomainURL = "http://sourcedomain.com/foo";
    URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);

    String[] urls = {"http://somedomain.com/foo", "http://someother.com/bar"};

    for (String url : urls) {
      URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
      // double insert and validate actual single insertion
      trackPotentialLinkSource(fp, url, sourceFP);
      trackPotentialLinkSource(fp, url, sourceFP);
    }

    //  validate data ...
    TextBytes firstVersion = new TextBytes();
    firstVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());

    StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
    int itemIndex = 0;
    while (tokenizer.hasMoreElements()) {
      String nextLine = tokenizer.nextToken();
      // A duplicate insertion would surface as an extra line; fail with a clear message
      // instead of an ArrayIndexOutOfBoundsException.
      Assert.assertTrue("more lines written than urls tracked", itemIndex < urls.length);
      String[] splits = nextLine.split("\t");
      // validate fp
      URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
      Assert.assertEquals(fp.getDomainHash(), Long.parseLong(splits[0]));
      // validate actual url ... (expected value first, actual second)
      Assert.assertEquals(urls[itemIndex], splits[1]);
      itemIndex++;
    }
    // Every url must have produced exactly one line; without this check an empty buffer
    // would pass the loop above vacuously.
    Assert.assertEquals(urls.length, itemIndex);

    // reset output buffer ...
    _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
    // and source bloom filter ...
    _sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);
    importLinkSourceData(sourceFP, firstVersion);
    // second text should match first ..
    TextBytes secondVersion = new TextBytes();
    secondVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());
    Assert.assertEquals(firstVersion, secondVersion);
  }
コード例 #7
0
    /**
     * Folds one crawl-status record into this model.
     *
     * <p>The record is converted into a crawl-detail record which is appended to
     * {@code details}; attempt and crawl counters and timestamps are advanced, and the
     * summary fields ({@code http_result}, {@code parsed_as}, {@code ext_urls},
     * {@code redirect_url}) are refreshed from it as applicable.
     *
     * @param statusJSON JSON text of the crawl-status record
     * @return a synthesized JSON record describing the redirect source when the status record
     *     carries a {@code "redirect_from"} object, otherwise {@code null}
     * @throws IOException declared for the helpers invoked here
     */
    JsonObject updateModelGivenCrawlStatus(TextBytes statusJSON) throws IOException {
      has_crawl_status = true;

      JsonObject statusRecord = new JsonParser().parse(statusJSON.toString()).getAsJsonObject();
      if (source_url == null) {
        source_url = statusRecord.get("source_url").getAsString();
      }

      HashSet<String> externalHrefs = new HashSet<String>();
      JsonObject detail =
          crawlDetailRecordFromCrawlStatusRecord(
              statusRecord, fp, externalHrefs, new MockCollectorReporter());

      long attemptTime = safeGetLong(detail, CRAWLDETAIL_ATTEMPT_TIME_PROPERTY);
      latest_attempt_time = Math.max(attemptTime, latest_attempt_time);
      attempt_count++;

      int httpResult = safeGetInteger(detail, CRAWLDETAIL_HTTPRESULT_PROPERTY);
      if (httpResult != -1) {
        boolean isSuccess = httpResult >= 200 && httpResult <= 299;
        boolean isRedirect = httpResult >= 300 && httpResult <= 399;

        // Only the most recent attempt decides the summary http result.
        if (latest_attempt_time == attemptTime) {
          this.http_result = httpResult;
        }

        if (isSuccess) {
          latest_crawl_time = Math.max(attemptTime, latest_crawl_time);
          crawl_count++;
          // Likewise, only the most recent successful crawl decides parse type and ext urls.
          if (latest_crawl_time == attemptTime) {
            this.parsed_as = detail.get(CRAWLDETAIL_PARSEDAS_PROPERTY).getAsString();
            this.ext_urls.clear();
            this.ext_urls.addAll(externalHrefs);
          }
        } else if (isRedirect) {
          if (detail.get(CRAWLDETAIL_REDIRECT_URL) != null) {
            this.redirect_url = detail.get(CRAWLDETAIL_REDIRECT_URL).getAsString();
          } else {
            this.redirect_url = null;
          }
        }
      }
      this.details.add(detail);

      if (!statusRecord.has("redirect_from")) {
        return null;
      }

      // Synthesize a record for the url that redirected to this one.
      JsonObject redirectSource = statusRecord.get("redirect_from").getAsJsonObject();
      JsonObject redirectRecord = new JsonObject();
      int redirectHttpResult = redirectSource.get("http_result").getAsInt();

      redirectRecord.addProperty("disposition", "SUCCESS");
      redirectRecord.addProperty("http_result", redirectHttpResult);
      redirectRecord.addProperty("server_ip", redirectSource.get("server_ip").getAsString());
      redirectRecord.addProperty("attempt_time", statusRecord.get("attempt_time").getAsLong());
      redirectRecord.addProperty("target_url", statusRecord.get("source_url").getAsString());
      redirectRecord.addProperty("source_url", redirectSource.get("source_url").getAsString());

      return redirectRecord;
    }
コード例 #8
0
    /**
     * Folds one link record into this model.
     *
     * <p>If {@code source_url} is still unset it is taken from the record's {@code "href"}.
     * The record's {@code "source_url"} (the linking page) is then registered in
     * {@code incoming} under its root-domain hash, unless it shares this model's root domain
     * or that root domain is already registered.
     *
     * @param linkJSON JSON text of the link record
     */
    public void updateModelGivenLinkRecord(TextBytes linkJSON) {

      JsonObject linkRecord = new JsonParser().parse(linkJSON.toString()).getAsJsonObject();
      if (source_url == null) {
        source_url = linkRecord.get("href").getAsString();
      }

      String linkingURL = linkRecord.get("source_url").getAsString();
      URLFPV2 linkingFP = URLUtils.getURLFPV2FromURL(linkingURL);
      if (linkingFP == null) {
        return;
      }

      // Links from within the same root domain do not count as incoming links.
      if (linkingFP.getRootDomainHash() == fp.getRootDomainHash()) {
        return;
      }

      // Keep the first url seen for each linking root domain.
      long linkingRootDomain = linkingFP.getRootDomainHash();
      if (!incoming.containsKey(linkingRootDomain)) {
        incoming.put(linkingRootDomain, linkingURL);
      }
    }
コード例 #9
0
    /**
     * Loads this model from a previously merged JSON record.
     *
     * <p>{@code source_url} is always read. When the record carries a summary object, the
     * crawl summary fields are read from it (optional ones falling back to 0, -1 or
     * {@code null}), externally referenced urls are copied into {@code ext_urls}, and every
     * crawl-detail entry is appended to {@code details}.
     *
     * @param mergedJSON JSON text of the merged record
     * @throws IOException declared for the helpers invoked here
     */
    void updateModelGivenMergedRecord(TextBytes mergedJSON) throws IOException {
      JsonObject merged = new JsonParser().parse(mergedJSON.toString()).getAsJsonObject();

      source_url = merged.get(TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
      has_crawl_status = merged.has(TOPLEVEL_SUMMARYRECORD_PROPRETY);
      if (!has_crawl_status) {
        return;
      }

      JsonObject summary = merged.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);

      latest_attempt_time = summary.get(SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY).getAsLong();
      latest_crawl_time = safeGetLong(summary, SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY);
      attempt_count = summary.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt();

      if (summary.has(SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) {
        crawl_count = summary.get(SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt();
      } else {
        crawl_count = 0;
      }

      if (summary.has(SUMMARYRECORD_PARSEDAS_PROPERTY)) {
        parsed_as = summary.get(SUMMARYRECORD_PARSEDAS_PROPERTY).getAsString();
      } else {
        parsed_as = null;
      }

      if (summary.has(SUMMARYRECORD_HTTP_RESULT_PROPERTY)) {
        http_result = summary.get(SUMMARYRECORD_HTTP_RESULT_PROPERTY).getAsInt();
      } else {
        http_result = -1;
      }

      if (summary.has(SUMMARYRECORD_REDIRECT_URL_PROPERTY)) {
        redirect_url = summary.get(SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString();
      } else {
        redirect_url = null;
      }

      safeJsonArrayToStringCollection(summary, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, ext_urls);

      if (summary.has(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
        for (JsonElement detailEntry :
            summary.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
          details.add(detailEntry.getAsJsonObject());
        }
      }
    }
コード例 #10
0
 /**
  * Records a key/value pair in {@code items}.
  *
  * <p>Fresh {@link TextBytes} instances are built from the string form of each argument, so
  * the stored pair does not reference the caller's objects.
  */
 @Override
 public void collect(TextBytes key, TextBytes value) throws IOException {
   TextBytes keyCopy = new TextBytes(key.toString());
   TextBytes valueCopy = new TextBytes(value.toString());
   items.add(new Pair<TextBytes, TextBytes>(keyCopy, valueCopy));
 }
コード例 #11
0
  /**
   * Drains {@code multiFileInputReader}, dispatching each record by the type encoded in its
   * partition key and accumulating per-URL state until the URL changes.
   *
   * <p>For every record: when the URL in the key differs from the current context URL the
   * pending record is emitted via {@code emitLastRecord}; when the domain fingerprint differs
   * from {@code _currentDomainId} a domain transition is performed; then the record's first
   * raw value is handled according to its type. After the reader is exhausted the trailing
   * record is emitted and the domain is flushed.
   *
   * @param multiFileInputReader source of key / raw-value groups
   * @param reporter sink for the bookkeeping counters
   * @throws IOException if reading, decoding, emitting or flushing fails
   */
  void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter)
      throws IOException {

    Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

    // NOTE(review): iterationCount is never read or updated in this method.
    int iterationCount = 0;

    while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {

      reporter.incrCounter(Counters.GOT_RECORD, 1);

      // Decompose the partition key into record type, domain and URL.
      int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject);
      PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes);
      PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes);

      // A different URL means the previously accumulated record is complete.
      if (_newURLBytes.compareTo(_contextURLBytes) != 0) {
        emitLastRecord(reporter);
      }

      long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString());

      if (newDomainFP != _currentDomainId) {
        reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1);
        domainTransition(newDomainFP, _newDomainBytes.toString(), reporter);
      }

      // Only the first value of the group is used.
      // NOTE(review): valueRaw is null when the group is empty, and most cases below
      // dereference it without a check -- confirm the reader never yields empty groups.
      RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null);

      // Types not listed below are ignored (there is no default case).
      switch (type) {
        case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS:
          {
            // Domain-level statistics: parsed as JSON and handed to setDomainStats.
            reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1);
            setDomainStats(
                rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter);
          }
          break;

        case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL:
          {
            // The context URL is taken from the record's value rather than from the key.
            reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1);
            rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
            _flags |= HAS_HOMEPAGE_URLDATA;
          }
          break;

        case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL:
          {
            // The value is decoded into _blogURLSkipFlag; the context URL comes from the key.
            reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1);
            rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag);
            _contextURLBytes.set(_newURLBytes);
            _flags |= HAS_BLOGPROBE_URLDATA;
          }
          break;

        case CrawlListGeneratorTask.KEY_TYPE_FEED_URL:
          {
            // The context URL is taken from the record's value rather than from the key.
            reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1);
            rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
            _flags |= HAS_FEED_URLDATA;
          }
          break;

        case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD:
          {
            // The value is not read for this type; only the key's URL and the flag are kept.
            reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1);
            _contextURLBytes.set(_newURLBytes);
            _flags |= HAS_REDIRECT_DATA;
          }
          break;

        case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA:
          {
            // Crawl status: context URL from the key, status parsed from the value as JSON.
            reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1);
            _contextURLBytes.set(_newURLBytes);
            _crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer);
            _flags |= HAS_CRAWL_STATUS;
          }
          break;
      }
    }
    // flush trailing record ...
    emitLastRecord(reporter);
    flushDomain(reporter);
  }