コード例 #1
0
    /**
     * Updates the per-URL state model from a single raw (generated) tuple.
     *
     * <p>The URL fingerprint (root domain hash, domain hash, URL hash) and the record type are
     * decoded from the tuple's key ({@code tuple.e0}). Only crawl-status and HTML-link records
     * are processed; every other record type is ignored. The model for the fingerprint is
     * created and registered in {@code fpToModelMap} the first time the fingerprint is seen.
     *
     * <p>When a crawl-status update returns a redirect JSON object, a synthetic crawl-status
     * tuple is built from that object's {@code source_url} and {@code attempt_time} and fed back
     * through this method. If no fingerprint can be derived from {@code source_url}, the
     * redirect record is skipped.
     *
     * @param tuple key/value pair; {@code e0} is the CrawlDB key, {@code e1} is the record value
     *     handed to the model
     * @throws Exception if any of the key-decoding or model-update helpers invoked here fails
     */
    void updateModelFromInputTuple(Pair<TextBytes, TextBytes> tuple) throws Exception {
      // Rebuild the URL fingerprint from the components encoded in the key.
      URLFPV2 fp = new URLFPV2();
      fp.setRootDomainHash(
          CrawlDBKey.getLongComponentFromKey(
              tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
      fp.setDomainHash(
          CrawlDBKey.getLongComponentFromKey(
              tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
      fp.setUrlHash(
          CrawlDBKey.getLongComponentFromKey(
              tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));

      long recordType =
          CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

      // The key carries the record type as the ordinal of CrawlDBKey.Type.
      boolean isCrawlStatus = recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal();
      boolean isHtmlLink = recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal();
      if (!isCrawlStatus && !isHtmlLink) {
        // Not a record type this model consumes.
        return;
      }

      // Look up (or lazily create) the model for this fingerprint.
      URLStateModel urlModel = fpToModelMap.get(fp);
      if (urlModel == null) {
        urlModel = new URLStateModel();
        urlModel.fp = fp;
        fpToModelMap.put(fp, urlModel);
      }

      if (isCrawlStatus) {
        JsonObject redirectJSON = urlModel.updateModelGivenCrawlStatus(tuple.e1);

        if (redirectJSON != null) {
          URLFPV2 redirectFP =
              URLUtils.getURLFPV2FromURL(redirectJSON.get("source_url").getAsString());
          // getURLFPV2FromURL may yield null (other callers guard against it); without a
          // fingerprint no key can be generated, so the redirect record is skipped.
          if (redirectFP != null) {
            TextBytes key =
                CrawlDBKey.generateKey(
                    redirectFP,
                    CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,
                    redirectJSON.get("attempt_time").getAsLong());
            Pair<TextBytes, TextBytes> redirectTuple =
                new Pair<TextBytes, TextBytes>(key, new TextBytes(redirectJSON.toString()));
            // Feed the synthetic crawl-status record back through the same update path.
            updateModelFromInputTuple(redirectTuple);
          }
        }
      } else {
        // Only the HTML-link record type can reach this branch.
        urlModel.updateModelGivenLinkRecord(tuple.e1);
      }
    }
コード例 #2
0
    /**
     * Folds a single link record into this model.
     *
     * <p>The record is parsed as a JSON object. If this model has no {@code source_url} yet, it
     * is taken from the record's {@code href}. The record's own {@code source_url} is then
     * fingerprinted; when a fingerprint is obtained and its root domain hash differs from this
     * model's, the URL is stored in {@code incoming} under that root domain hash unless an entry
     * for the hash is already present.
     *
     * @param linkJSON link record whose string form is a JSON object with {@code href} and
     *     {@code source_url} members
     */
    public void updateModelGivenLinkRecord(TextBytes linkJSON) {
      JsonObject linkRecord = new JsonParser().parse(linkJSON.toString()).getAsJsonObject();

      // First record seen for this model supplies its URL.
      if (source_url == null) {
        source_url = linkRecord.get("href").getAsString();
      }

      String linkingURL = linkRecord.get("source_url").getAsString();
      URLFPV2 linkingFP = URLUtils.getURLFPV2FromURL(linkingURL);

      // Nothing to record without a fingerprint, or when the link comes from the same root domain.
      if (linkingFP == null || linkingFP.getRootDomainHash() == fp.getRootDomainHash()) {
        return;
      }

      // Keep only the first URL observed per linking root domain.
      if (!incoming.containsKey(linkingFP.getRootDomainHash())) {
        incoming.put(linkingFP.getRootDomainHash(), linkingURL);
      }
    }