/** Used for redirections or when discovering sitemap URLs * */ private void handleOutlink(Tuple t, URL sURL, String newUrl, Metadata sourceMetadata) { // build an absolute URL try { URL tmpURL = URLUtil.resolveURL(sURL, newUrl); newUrl = tmpURL.toExternalForm(); } catch (MalformedURLException e) { LOG.debug("MalformedURLException on {} or {}: {}", sURL.toExternalForm(), newUrl, e); return; } // apply URL filters if (this.urlFilters != null) { newUrl = this.urlFilters.filter(sURL, sourceMetadata, newUrl); } // filtered if (newUrl == null) { return; } Metadata metadata = metadataTransfer.getMetaForOutlink(newUrl, sURL.toExternalForm(), sourceMetadata); // TODO check that hasn't exceeded max number of redirections _collector.emit( com.digitalpebble.storm.crawler.Constants.StatusStreamName, t, new Values(newUrl, metadata, Status.DISCOVERED)); }
private List<Outlink> toOutlinks( String url, Metadata metadata, Map<String, List<String>> slinks) { List<Outlink> outlinks = new LinkedList<>(); URL sourceUrl; try { sourceUrl = new URL(url); } catch (MalformedURLException e) { // we would have known by now as previous components check whether // the URL is valid LOG.error("MalformedURLException on {}", url); eventCounter.scope("error_invalid_source_url").incrBy(1); return outlinks; } Map<String, List<String>> linksKept = new HashMap<>(); for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) { String targetURL = linkEntry.getKey(); // filter the urls if (urlFilters != null) { targetURL = urlFilters.filter(sourceUrl, metadata, targetURL); if (targetURL == null) { eventCounter.scope("outlink_filtered").incr(); continue; } } // the link has survived the various filters if (targetURL != null) { List<String> anchors = linkEntry.getValue(); linksKept.put(targetURL, anchors); eventCounter.scope("outlink_kept").incr(); } } for (String outlink : linksKept.keySet()) { // configure which metadata gets inherited from parent Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata); Outlink ol = new Outlink(outlink); // add the anchors to the metadata? if (trackAnchors) { List<String> anchors = linksKept.get(outlink); if (anchors.size() > 0) { linkMetadata.addValues(ANCHORS_KEY_NAME, anchors); // sets the first anchor ol.setAnchor(anchors.get(0)); } } ol.setMetadata(linkMetadata); outlinks.add(ol); } return outlinks; }
@SuppressWarnings({"rawtypes", "unchecked"})
@Override
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
    this.collector = collector;

    eventCounter =
            context.registerMetric(this.getClass().getSimpleName(), new MultiCountMetric(), 10);

    // load the parse filters, failing fast if their configuration is broken
    parseFilters = ParseFilters.emptyParseFilter;
    String parseFilterConfig =
            ConfUtils.getString(conf, "parsefilters.config.file", "parsefilters.json");
    if (parseFilterConfig != null) {
        try {
            parseFilters = new ParseFilters(conf, parseFilterConfig);
        } catch (IOException e) {
            LOG.error("Exception caught while loading the ParseFilters");
            throw new RuntimeException("Exception caught while loading the ParseFilters", e);
        }
    }

    // URL filters only matter when outlinks are going to be emitted
    urlFilters = URLFilters.emptyURLFilters;
    emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);
    if (emitOutlinks) {
        String urlFilterConfig =
                ConfUtils.getString(conf, "urlfilters.config.file", "urlfilters.json");
        if (urlFilterConfig != null) {
            try {
                urlFilters = new URLFilters(conf, urlFilterConfig);
            } catch (IOException e) {
                LOG.error("Exception caught while loading the URLFilters");
                throw new RuntimeException("Exception caught while loading the URLFilters", e);
            }
        }
    }

    trackAnchors = ConfUtils.getBoolean(conf, "track.anchors", true);

    robots_noFollow_strict =
            ConfUtils.getBoolean(conf, RobotsTags.ROBOTS_NO_FOLLOW_STRICT, true);

    metadataTransfer = MetadataTransfer.getInstance(conf);
}
@SuppressWarnings({"rawtypes", "unchecked"}) @Override public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { _collector = collector; this.conf = new Config(); this.conf.putAll(stormConf); checkConfiguration(); this.taskIndex = context.getThisTaskIndex(); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH); long start = System.currentTimeMillis(); LOG.info("[Fetcher #{}] : starting at {}", taskIndex, sdf.format(start)); // Register a "MultiCountMetric" to count different events in this bolt // Storm will emit the counts every n seconds to a special bolt via a // system stream // The data can be accessed by registering a "MetricConsumer" in the // topology this.eventCounter = context.registerMetric("fetcher_counter", new MultiCountMetric(), 10); this.averagedMetrics = context.registerMetric("fetcher_average", new MultiReducedMetric(new MeanReducer()), 10); this.perSecMetrics = context.registerMetric( "fetcher_average_persec", new MultiReducedMetric(new PerSecondReducer()), 10); protocolFactory = new ProtocolFactory(conf); String urlconfigfile = ConfUtils.getString(conf, "urlfilters.config.file", "urlfilters.json"); if (urlconfigfile != null) try { urlFilters = new URLFilters(conf, urlconfigfile); } catch (IOException e) { LOG.error("Exception caught while loading the URLFilters"); throw new RuntimeException("Exception caught while loading the URLFilters", e); } metadataTransfer = MetadataTransfer.getInstance(stormConf); allowRedirs = ConfUtils.getBoolean( stormConf, com.digitalpebble.storm.crawler.Constants.AllowRedirParamName, true); sitemapsAutoDiscovery = ConfUtils.getBoolean(stormConf, "sitemap.discovery", false); queueMode = ConfUtils.getString(conf, "fetcher.queue.mode", QUEUE_MODE_HOST); // check that the mode is known if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN) && !queueMode.equals(QUEUE_MODE_HOST)) { LOG.error("Unknown partition mode : {} - forcing to byHost", 
queueMode); queueMode = QUEUE_MODE_HOST; } LOG.info("Using queue mode : {}", queueMode); this.crawlDelay = (long) (ConfUtils.getFloat(conf, "fetcher.server.delay", 1.0f) * 1000); this.maxCrawlDelay = (long) ConfUtils.getInt(conf, "fetcher.max.crawl.delay", 30) * 1000; }