private List<Outlink> toOutlinks(String url, Metadata metadata, Map<String, List<String>> slinks) {
    List<Outlink> outlinks = new LinkedList<>();
    URL sourceUrl;
    try {
        sourceUrl = new URL(url);
    } catch (MalformedURLException e) {
        // we would have known by now as previous components check whether
        // the URL is valid
        LOG.error("MalformedURLException on {}", url);
        eventCounter.scope("error_invalid_source_url").incrBy(1);
        return outlinks;
    }

    Map<String, List<String>> linksKept = new HashMap<>();

    for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
        String targetURL = linkEntry.getKey();

        // filter the URLs
        if (urlFilters != null) {
            targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
            if (targetURL == null) {
                eventCounter.scope("outlink_filtered").incr();
                continue;
            }
        }

        // the link has survived the various filters
        List<String> anchors = linkEntry.getValue();
        linksKept.put(targetURL, anchors);
        eventCounter.scope("outlink_kept").incr();
    }

    for (String outlink : linksKept.keySet()) {
        // configure which metadata gets inherited from the parent
        Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata);
        Outlink ol = new Outlink(outlink);
        // add the anchors to the metadata?
        if (trackAnchors) {
            List<String> anchors = linksKept.get(outlink);
            if (!anchors.isEmpty()) {
                linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
                // sets the first anchor
                ol.setAnchor(anchors.get(0));
            }
        }
        ol.setMetadata(linkMetadata);
        outlinks.add(ol);
    }
    return outlinks;
}
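// Usage sketch (not part of the production class): the slinks map consumed
// above is keyed by target URL, so repeated links to the same target share
// one entry and their anchor texts accumulate. The URLs and anchors below
// are invented for illustration.
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class SlinksExample {
    public static void main(String[] args) {
        Map<String, List<String>> slinks = new HashMap<>();
        // two <a> elements pointing at the same target collapse into one entry
        slinks.computeIfAbsent("http://example.com/foo", k -> new LinkedList<>()).add("Foo");
        slinks.computeIfAbsent("http://example.com/foo", k -> new LinkedList<>()).add("More foo");
        // toOutlinks would emit a single Outlink for this entry, with "Foo"
        // set as its anchor and both values stored under ANCHORS_KEY_NAME
        System.out.println(slinks); // {http://example.com/foo=[Foo, More foo]}
    }
}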
private void handleException(
        String url,
        Throwable e,
        Metadata metadata,
        Tuple tuple,
        String errorSource,
        String errorMessage) {
    LOG.error(errorMessage);
    // send to the status stream in case another component wants to update
    // its status
    metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
    metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
    collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
    collector.ack(tuple);
    // increment the context-specific metric
    String s = "error_" + errorSource.replaceAll(" ", "_") + "_";
    eventCounter.scope(s + e.getClass().getSimpleName()).incrBy(1);
    // increment the general metric
    eventCounter.scope("parse exception").incrBy(1);
}
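// A minimal standalone sketch (assumption: mirrors the string handling
// above, not taken from the production class) of how the context-specific
// metric name is derived: the error source and the exception class are
// combined into one counter scope.
public class MetricNameExample {
    public static void main(String[] args) {
        String errorSource = "content parsing";
        Throwable e = new RuntimeException("boom");
        String scope = "error_" + errorSource.replaceAll(" ", "_") + "_"
                + e.getClass().getSimpleName();
        System.out.println(scope); // error_content_parsing_RuntimeException
    }
}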
@Override
public void execute(Tuple input) {
    String urlString = input.getStringByField("url");
    if (StringUtils.isBlank(urlString)) {
        LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskIndex, input);
        // ignore silently
        _collector.ack(input);
        return;
    }

    Metadata metadata = null;
    if (input.contains("metadata")) {
        metadata = (Metadata) input.getValueByField("metadata");
    }
    if (metadata == null) {
        metadata = Metadata.empty;
    }

    URL url;
    try {
        url = new URL(urlString);
    } catch (MalformedURLException e) {
        LOG.error("{} is a malformed URL", urlString);
        // report to the status stream and ack
        if (metadata == Metadata.empty) {
            metadata = new Metadata();
        }
        metadata.setValue("error.cause", "malformed URL");
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input,
                new Values(urlString, metadata, Status.ERROR));
        _collector.ack(input);
        return;
    }

    // check when we are allowed to process it
    String key = getPolitenessKey(url);
    Long timeAllowed = throttler.getIfPresent(key);
    if (timeAllowed != null) {
        long now = System.currentTimeMillis();
        long timeToWait = timeAllowed - now;
        if (timeToWait > 0) {
            try {
                Thread.sleep(timeToWait);
            } catch (InterruptedException e) {
                LOG.error("[Fetcher #{}] InterruptedException caught while waiting", taskIndex);
            }
        }
    }

    long delay = this.crawlDelay;
    try {
        Protocol protocol = protocolFactory.getProtocol(url);
        BaseRobotRules rules = protocol.getRobotRules(urlString);

        // autodiscovery of sitemaps
        // the sitemaps will be sent down the topology
        // as many times as there is a URL for a given host
        // the status updater will certainly cache things
        // but we could also have a simple cache mechanism here as well
        if (sitemapsAutoDiscovery) {
            for (String sitemapURL : rules.getSitemaps()) {
                handleOutlink(input, url, sitemapURL, metadata);
            }
        }

        if (!rules.isAllowed(urlString)) {
            LOG.info("Denied by robots.txt: {}", urlString);
            metadata.setValue("error.cause", "robots.txt");
            // report to the status stream and ack
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, metadata, Status.ERROR));
            _collector.ack(input);
            return;
        }

        // get the delay from robots
        // the value is negative when not set
        long robotsDelay = rules.getCrawlDelay();
        if (robotsDelay > 0) {
            // cap the value to a maximum
            // as some sites specify ridiculous values
            if (robotsDelay > maxCrawlDelay) {
                LOG.debug("Delay from robots capped at {} for {}", robotsDelay, url);
                delay = maxCrawlDelay;
            } else {
                delay = robotsDelay;
            }
        }

        long start = System.currentTimeMillis();
        ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata);
        long timeFetching = System.currentTimeMillis() - start;

        averagedMetrics.scope("fetch_time").update(timeFetching);
        averagedMetrics.scope("bytes_fetched").update(response.getContent().length);
        eventCounter.scope("fetched").incrBy(1);
        perSecMetrics.scope("bytes_fetched_perSec").update(response.getContent().length);
        perSecMetrics.scope("fetched_perSec").update(1);

        LOG.info(
                "[Fetcher #{}] Fetched {} with status {} in {}",
                taskIndex,
                urlString,
                response.getStatusCode(),
                timeFetching);

        response.getMetadata()
                .setValue("fetch.statusCode", Integer.toString(response.getStatusCode()));
        response.getMetadata().putAll(metadata);

        // determine the status based on the status code
        Status status = Status.fromHTTPCode(response.getStatusCode());

        // if the status is OK emit on the default stream
        if (status.equals(Status.FETCHED)) {
            _collector.emit(
                    Utils.DEFAULT_STREAM_ID,
                    input,
                    new Values(urlString, response.getContent(), response.getMetadata()));
        } else if (status.equals(Status.REDIRECTION)) {
            // mark the URL as redirected
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, response.getMetadata(), status));
            // find the URL it redirects to
            String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
            if (allowRedirs && StringUtils.isNotBlank(redirection)) {
                handleOutlink(input, url, redirection, response.getMetadata());
            }
        } else {
            // error
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, response.getMetadata(), status));
        }
    } catch (Exception e) {
        String message = e.getMessage();
        if (message == null) {
            message = "";
        }
        // common exceptions for which we log only a short message
        if (e.getCause() instanceof java.util.concurrent.TimeoutException
                || message.contains(" timed out")) {
            LOG.error("Socket timeout fetching {}", urlString);
        } else if (e.getCause() instanceof java.net.UnknownHostException) {
            LOG.error("Unknown host {}", urlString);
        } else {
            // log the full stacktrace
            LOG.error("Exception while fetching {}", urlString, e);
        }
        eventCounter.scope("exception").incrBy(1);

        // could be the empty, immutable Metadata
        if (metadata.size() == 0) {
            metadata = new Metadata();
        }
        // add the reason for the failure to the metadata
        metadata.setValue("fetch.exception", message);
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input,
                new Values(urlString, metadata, Status.FETCH_ERROR));
    }

    // update the throttler
    throttler.put(key, System.currentTimeMillis() + delay);
    _collector.ack(input);
}
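// The throttler's getIfPresent/put calls match Guava's Cache API; below is a
// minimal sketch of how such a throttler and politeness key could be wired up.
// The host-level key, the 30s expiry and the 1000ms delay are assumptions for
// illustration, not the production configuration.
import java.net.URL;
import java.util.concurrent.TimeUnit;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

public class ThrottlerExample {
    // maps a politeness key to the earliest time (ms) the next fetch may start
    private final Cache<String, Long> throttler =
            CacheBuilder.newBuilder().expireAfterWrite(30, TimeUnit.SECONDS).build();

    // hypothetical key function: one politeness queue per host
    private String getPolitenessKey(URL url) {
        return url.getHost();
    }

    public void demo() throws Exception {
        URL url = new URL("http://example.com/page");
        String key = getPolitenessKey(url);
        long delay = 1000; // crawl delay in ms
        Long timeAllowed = throttler.getIfPresent(key);
        // wait until the slot granted for this host has been reached
        if (timeAllowed != null && timeAllowed > System.currentTimeMillis()) {
            Thread.sleep(timeAllowed - System.currentTimeMillis());
        }
        // ... fetch the URL ...
        // grant the next slot for this host
        throttler.put(key, System.currentTimeMillis() + delay);
    }
}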
@Override
public void execute(Tuple tuple) {
    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that the content type is HTML
    // by looking at the value found in the HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        if (httpCT.toLowerCase().contains("html")) {
            CT_OK = true;
        }
    } else {
        // simply ignore cases where the content type has not been set
        // TODO sniff the content with Tika?
        CT_OK = true;
    }

    if (!CT_OK) {
        String errorMessage = "Exception content-type " + httpCT + " for " + url;
        RuntimeException e = new RuntimeException(errorMessage);
        handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        return;
    }

    LOG.info("Parsing : starting {}", url);
    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);
        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extract the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in the metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if nofollow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains
                // for relative URLs,
                // e.g. /foo resolves to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove the link altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as nofollow
                // but the whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only if nofollow is false
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }
        text = jsoupDoc.body().text();
    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store the identified charset in the metadata
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;
    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {
        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : parse.getOutlinks()) {
            collector.emit(
                    StatusStreamName,
                    tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL
    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();
        collector.emit(
                tuple,
                new Values(
                        doc.getKey(),
                        parseDoc.getContent(),
                        parseDoc.getMetadata(),
                        parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}
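// A standalone sketch of the jsoup link extraction used above: abs:href
// resolves relative hrefs against the base URI, and rel="nofollow" is read
// per link. The HTML snippet and base URL are invented for illustration.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupLinksExample {
    public static void main(String[] args) {
        String html = "<html><body>"
                + "<a href='/foo'>Foo</a>"
                + "<a href='/bar' rel='nofollow'>Bar</a>"
                + "</body></html>";
        // the base URI is what lets abs:href return fully qualified URLs
        Document doc = Jsoup.parse(html, "http://example.com/");
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println(link.attr("abs:href") + " nofollow="
                    + "nofollow".equalsIgnoreCase(link.attr("rel"))
                    + " anchor=" + link.text());
        }
        // prints:
        // http://example.com/foo nofollow=false anchor=Foo
        // http://example.com/bar nofollow=true anchor=Bar
    }
}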