/** * update the model from the raw (generated tuples) * * @param tuple * @throws Exception */ void updateModelFromInputTuple(Pair<TextBytes, TextBytes> tuple) throws Exception { URLFPV2 fp = new URLFPV2(); // get key ... fp.setRootDomainHash( CrawlDBKey.getLongComponentFromKey( tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); fp.setDomainHash( CrawlDBKey.getLongComponentFromKey( tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); fp.setUrlHash( CrawlDBKey.getLongComponentFromKey( tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID)); long recordType = CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID); if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal() || recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) { // update model given key ... URLStateModel urlModel = fpToModelMap.get(fp); if (urlModel == null) { urlModel = new URLStateModel(); urlModel.fp = fp; fpToModelMap.put(fp, urlModel); } if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { JsonObject redirectJSON = urlModel.updateModelGivenCrawlStatus(tuple.e1); if (redirectJSON != null) { URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectJSON.get("source_url").getAsString()); TextBytes key = CrawlDBKey.generateKey( redirectFP, CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS, redirectJSON.get("attempt_time").getAsLong()); Pair<TextBytes, TextBytes> redirectTuple = new Pair<TextBytes, TextBytes>(key, new TextBytes(redirectJSON.toString())); updateModelFromInputTuple(redirectTuple); } } else if (recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) { urlModel.updateModelGivenLinkRecord(tuple.e1); } } }
/**
 * Applies a link record to this model.
 *
 * <p>If this model's {@code source_url} is still unset, it is taken from the record's
 * {@code href} member. The record's {@code source_url} member is then fingerprinted; when a
 * fingerprint is obtained and its root domain hash differs from this model's, the linking URL
 * is stored in {@code incoming} under that root domain hash, unless an entry for that hash is
 * already present.
 *
 * @param linkJSON link record whose text form is a JSON object
 */
public void updateModelGivenLinkRecord(TextBytes linkJSON) {
  JsonObject linkRecord = new JsonParser().parse(linkJSON.toString()).getAsJsonObject();

  // The first link record seen supplies this model's own URL.
  if (source_url == null) {
    source_url = linkRecord.get("href").getAsString();
  }

  String linkingURL = linkRecord.get("source_url").getAsString();
  URLFPV2 linkingFP = URLUtils.getURLFPV2FromURL(linkingURL);

  // Nothing to record without a fingerprint, or when the link comes from the same root domain.
  if (linkingFP == null || linkingFP.getRootDomainHash() == fp.getRootDomainHash()) {
    return;
  }

  // Keep only the first linking URL seen per root domain.
  if (!incoming.containsKey(linkingFP.getRootDomainHash())) {
    incoming.put(linkingFP.getRootDomainHash(), linkingURL);
  }
}