public static void main(String[] args) throws Exception {
    HttpProtocol protocol = new HttpProtocol();
    Config conf = new Config();

    // first argument: the URL to fetch; second argument: a configuration file
    String url = args[0];
    ConfUtils.loadConf(args[1], conf);
    protocol.configure(conf);

    if (!protocol.skipRobots) {
        BaseRobotRules rules = protocol.getRobotRules(url);
        System.out.println("is allowed : " + rules.isAllowed(url));
    }

    Metadata md = new Metadata();
    ProtocolResponse response = protocol.getProtocolOutput(url, md);
    System.out.println(url);
    System.out.println(response.getMetadata());
    System.out.println(response.getStatusCode());
    System.out.println(response.getContent().length);
}
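This main method is a small command-line harness: it checks the robots.txt rules for the URL, fetches it and prints the metadata, status code and content length. The same check can be driven entirely from code; a minimal sketch, assuming the http.agent.name key that the HTTP protocol implementations typically require (the key and its value are assumptions, not taken from the snippet above):

Config conf = new Config();
// instead of loading a YAML file, set the properties programmatically;
// http.agent.name is assumed to be the mandatory user-agent setting
conf.put("http.agent.name", "test-crawler");

HttpProtocol protocol = new HttpProtocol();
protocol.configure(conf);

ProtocolResponse response =
        protocol.getProtocolOutput("http://www.example.com/", new Metadata());
System.out.println(response.getStatusCode());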
@Override
public void execute(Tuple input) {
    String urlString = input.getStringByField("url");
    if (StringUtils.isBlank(urlString)) {
        LOG.info("[Fetcher #{}] Missing value for field url in tuple {}",
                taskIndex, input);
        // ignore silently
        _collector.ack(input);
        return;
    }

    Metadata metadata = null;
    if (input.contains("metadata"))
        metadata = (Metadata) input.getValueByField("metadata");
    if (metadata == null)
        metadata = Metadata.empty;

    URL url;
    try {
        url = new URL(urlString);
    } catch (MalformedURLException e) {
        LOG.error("{} is a malformed URL", urlString);
        // Report to status stream and ack
        if (metadata == Metadata.empty) {
            metadata = new Metadata();
        }
        metadata.setValue("error.cause", "malformed URL");
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input, new Values(urlString, metadata, Status.ERROR));
        _collector.ack(input);
        return;
    }

    // check when we are allowed to process it
    String key = getPolitenessKey(url);
    Long timeAllowed = throttler.getIfPresent(key);
    if (timeAllowed != null) {
        long now = System.currentTimeMillis();
        long timeToWait = timeAllowed - now;
        if (timeToWait > 0) {
            try {
                Thread.sleep(timeToWait);
            } catch (InterruptedException e) {
                LOG.error("[Fetcher #{}] InterruptedException caught while waiting",
                        taskIndex);
            }
        }
    }

    long delay = this.crawlDelay;

    try {
        Protocol protocol = protocolFactory.getProtocol(url);

        BaseRobotRules rules = protocol.getRobotRules(urlString);

        // autodiscovery of sitemaps
        // the sitemaps will be sent down the topology
        // as many times as there is a URL for a given host
        // the status updater will certainly cache things
        // but we could also have a simple cache mechanism here
        // as well
        if (sitemapsAutoDiscovery) {
            for (String sitemapURL : rules.getSitemaps()) {
                handleOutlink(input, url, sitemapURL, metadata);
            }
        }

        if (!rules.isAllowed(urlString)) {
            LOG.info("Denied by robots.txt: {}", urlString);
            metadata.setValue("error.cause", "robots.txt");
            // Report to status stream and ack
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input, new Values(urlString, metadata, Status.ERROR));
            _collector.ack(input);
            return;
        }

        // get the delay from robots
        // value is negative when not set
        long robotsDelay = rules.getCrawlDelay();
        if (robotsDelay > 0) {
            // cap the value to a maximum
            // as some sites specify ridiculous values
            if (robotsDelay > maxCrawlDelay) {
                LOG.debug("Delay from robots capped at {} for {}", robotsDelay, url);
                delay = maxCrawlDelay;
            } else {
                delay = robotsDelay;
            }
        }

        long start = System.currentTimeMillis();
        ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata);
        long timeFetching = System.currentTimeMillis() - start;

        averagedMetrics.scope("fetch_time").update(timeFetching);
        averagedMetrics.scope("bytes_fetched").update(response.getContent().length);
        eventCounter.scope("fetched").incrBy(1);
        perSecMetrics.scope("bytes_fetched_perSec").update(response.getContent().length);
        perSecMetrics.scope("fetched_perSec").update(1);

        LOG.info("[Fetcher #{}] Fetched {} with status {} in {}", taskIndex,
                urlString, response.getStatusCode(), timeFetching);

        response.getMetadata().setValue("fetch.statusCode",
                Integer.toString(response.getStatusCode()));
        response.getMetadata().putAll(metadata);

        // determine the status based on the status code
        Status status = Status.fromHTTPCode(response.getStatusCode());

        // if the status is OK emit on default stream
        if (status.equals(Status.FETCHED)) {
            _collector.emit(Utils.DEFAULT_STREAM_ID, input,
                    new Values(urlString, response.getContent(), response.getMetadata()));
        }
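        // everything which is not a plain FETCHED outcome is routed to the
        // status stream below: redirections are reported (and optionally
        // followed as outlinks) and errors are reported so that the status
        // updater can reschedule or discard the URL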
        else if (status.equals(Status.REDIRECTION)) {
            // Mark URL as redirected
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input, new Values(urlString, response.getMetadata(), status));

            // find the URL it redirects to
            String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
            if (allowRedirs && redirection != null && StringUtils.isNotBlank(redirection)) {
                handleOutlink(input, url, redirection, response.getMetadata());
            }
        } else {
            // Error
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input, new Values(urlString, response.getMetadata(), status));
        }
    } catch (Exception exece) {
        String message = exece.getMessage();
        if (message == null)
            message = "";

        // common exceptions for which we log only a short message
        if (exece.getCause() instanceof java.util.concurrent.TimeoutException
                || message.contains(" timed out"))
            LOG.error("Socket timeout fetching {}", urlString);
        else if (exece.getCause() instanceof java.net.UnknownHostException)
            LOG.error("Unknown host {}", urlString);
        // log the full stacktrace for anything else
        else
            LOG.error("Exception while fetching {}", urlString, exece);

        eventCounter.scope("exception").incrBy(1);

        // could be an empty, immutable Metadata
        if (metadata.size() == 0) {
            metadata = new Metadata();
        }
        // add the reason of the failure in the metadata
        metadata.setValue("fetch.exception", message);
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input, new Values(urlString, metadata, Status.FETCH_ERROR));
    }

    // update the throttler
    throttler.put(key, System.currentTimeMillis() + delay);
    _collector.ack(input);
}
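The execute() method relies on two members that are not shown here: the throttler cache and the getPolitenessKey() helper. A minimal sketch of what they could look like, assuming Guava's cache and host-based politeness; the key choice and the eviction settings are assumptions, not taken from the original class:

import java.net.URL;
import java.util.Locale;
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

public class ThrottlingSupport {

    // maps a politeness key to the earliest time (epoch ms) at which the
    // next fetch for that key is allowed; entries expire so that idle
    // hosts do not accumulate (the expiry value is illustrative)
    private final Cache<String, Long> throttler = CacheBuilder.newBuilder()
            .expireAfterWrite(5, TimeUnit.MINUTES).build();

    // keyed on the host name here; an implementation could equally key on
    // the domain or the IP address, depending on the configuration
    private String getPolitenessKey(URL u) {
        return u.getHost().toLowerCase(Locale.ROOT);
    }
}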