/** Used for redirections or when discovering sitemap URLs * */ private void handleOutlink(Tuple t, URL sURL, String newUrl, Metadata sourceMetadata) { // build an absolute URL try { URL tmpURL = URLUtil.resolveURL(sURL, newUrl); newUrl = tmpURL.toExternalForm(); } catch (MalformedURLException e) { LOG.debug("MalformedURLException on {} or {}: {}", sURL.toExternalForm(), newUrl, e); return; } // apply URL filters if (this.urlFilters != null) { newUrl = this.urlFilters.filter(sURL, sourceMetadata, newUrl); } // filtered if (newUrl == null) { return; } Metadata metadata = metadataTransfer.getMetaForOutlink(newUrl, sURL.toExternalForm(), sourceMetadata); // TODO check that hasn't exceeded max number of redirections _collector.emit( com.digitalpebble.storm.crawler.Constants.StatusStreamName, t, new Values(newUrl, metadata, Status.DISCOVERED)); }
private List<Outlink> toOutlinks( String url, Metadata metadata, Map<String, List<String>> slinks) { List<Outlink> outlinks = new LinkedList<>(); URL sourceUrl; try { sourceUrl = new URL(url); } catch (MalformedURLException e) { // we would have known by now as previous components check whether // the URL is valid LOG.error("MalformedURLException on {}", url); eventCounter.scope("error_invalid_source_url").incrBy(1); return outlinks; } Map<String, List<String>> linksKept = new HashMap<>(); for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) { String targetURL = linkEntry.getKey(); // filter the urls if (urlFilters != null) { targetURL = urlFilters.filter(sourceUrl, metadata, targetURL); if (targetURL == null) { eventCounter.scope("outlink_filtered").incr(); continue; } } // the link has survived the various filters if (targetURL != null) { List<String> anchors = linkEntry.getValue(); linksKept.put(targetURL, anchors); eventCounter.scope("outlink_kept").incr(); } } for (String outlink : linksKept.keySet()) { // configure which metadata gets inherited from parent Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata); Outlink ol = new Outlink(outlink); // add the anchors to the metadata? if (trackAnchors) { List<String> anchors = linksKept.get(outlink); if (anchors.size() > 0) { linkMetadata.addValues(ANCHORS_KEY_NAME, anchors); // sets the first anchor ol.setAnchor(anchors.get(0)); } } ol.setMetadata(linkMetadata); outlinks.add(ol); } return outlinks; }