Esempio n. 1
0
  /**
   * Turns the links extracted from a page into {@link Outlink} objects.
   *
   * <p>Every target URL is first passed through {@code urlFilters} (when configured); a filter
   * may rewrite the URL or reject it by returning {@code null}. Because rewriting can map several
   * distinct links onto the same target, the anchors of such links are merged rather than
   * overwritten. Each surviving target then gets the metadata computed by {@code
   * metadataTransfer} and, if {@code trackAnchors} is set, its anchors.
   *
   * @param url URL of the page the links were found on
   * @param metadata metadata of the source page, handed to the filters and to the metadata
   *     transfer
   * @param slinks target URL to anchor texts, as extracted from the page
   * @return one outlink per distinct surviving target URL; empty if {@code url} is malformed
   */
  private List<Outlink> toOutlinks(
      String url, Metadata metadata, Map<String, List<String>> slinks) {
    List<Outlink> outlinks = new LinkedList<>();
    URL sourceUrl;
    try {
      sourceUrl = new URL(url);
    } catch (MalformedURLException e) {
      // we would have known by now as previous components check whether
      // the URL is valid
      LOG.error("MalformedURLException on {}", url);
      eventCounter.scope("error_invalid_source_url").incrBy(1);
      return outlinks;
    }

    Map<String, List<String>> linksKept = new HashMap<>();

    for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
      String targetURL = linkEntry.getKey();
      // filter the urls
      if (urlFilters != null) {
        targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
        if (targetURL == null) {
          eventCounter.scope("outlink_filtered").incr();
          continue;
        }
      }
      // a null key with no filters configured is skipped without being counted
      if (targetURL == null) {
        continue;
      }

      // the link has survived the various filters; several source links may end up
      // with the same target once filtered, so accumulate their anchors in a
      // private list instead of replacing the ones already collected
      List<String> keptAnchors = linksKept.get(targetURL);
      if (keptAnchors == null) {
        keptAnchors = new LinkedList<>();
        linksKept.put(targetURL, keptAnchors);
      }
      List<String> anchors = linkEntry.getValue();
      if (anchors != null) {
        keptAnchors.addAll(anchors);
      }
      eventCounter.scope("outlink_kept").incr();
    }

    for (Map.Entry<String, List<String>> kept : linksKept.entrySet()) {
      String outlink = kept.getKey();
      // configure which metadata gets inherited from parent
      Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata);
      Outlink ol = new Outlink(outlink);
      // add the anchors to the metadata?
      if (trackAnchors) {
        List<String> anchors = kept.getValue();
        if (!anchors.isEmpty()) {
          linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
          // sets the first anchor
          ol.setAnchor(anchors.get(0));
        }
      }
      ol.setMetadata(linkMetadata);
      outlinks.add(ol);
    }
    return outlinks;
  }
Esempio n. 2
0
 /**
  * Reports a failed tuple: logs the message, records the error source and message in the
  * metadata, emits the URL with {@code Status.ERROR} on the status stream, acks the tuple and
  * bumps the error counters.
  *
  * @param url URL being processed when the failure occurred
  * @param e the failure; only its simple class name is used, as part of a metric name
  * @param metadata metadata of the URL; the error source and message are written into it
  * @param tuple the tuple to anchor the status emission on and to ack
  * @param errorSource short description of the failing step; spaces become underscores in the
  *     metric name
  * @param errorMessage message to log and to store in the metadata
  */
 private void handleException(
     String url,
     Throwable e,
     Metadata metadata,
     Tuple tuple,
     String errorSource,
     String errorMessage) {
   LOG.error(errorMessage);

   // let other components update their status through the status stream
   metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
   metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
   Values statusUpdate = new Values(url, metadata, Status.ERROR);
   collector.emit(StatusStreamName, tuple, statusUpdate);
   collector.ack(tuple);

   // metric specific to the failing step and exception type
   String specificMetric =
       "error_" + errorSource.replace(" ", "_") + "_" + e.getClass().getSimpleName();
   eventCounter.scope(specificMetric).incrBy(1);

   // metric shared by all failures handled here
   eventCounter.scope("parse exception").incrBy(1);
 }
  /**
   * Fetches the URL carried by the tuple and emits the outcome.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>tuples without a URL are acked and dropped; malformed URLs are reported with {@code
   *       Status.ERROR} on the status stream;
   *   <li>the thread sleeps until the time recorded in {@code throttler} for the URL's
   *       politeness key has passed;
   *   <li>robots rules are obtained from the protocol; sitemaps they list are passed to {@code
   *       handleOutlink} when {@code sitemapsAutoDiscovery} is set, and disallowed URLs are
   *       reported with {@code Status.ERROR};
   *   <li>the content is fetched; depending on the status derived from the HTTP code, the tuple
   *       goes to the default stream (fetched) or to the status stream (redirection, error);
   *   <li>any exception during these steps is reported as {@code Status.FETCH_ERROR}.
   * </ol>
   *
   * <p>Unless the method returned early, the throttler is updated with the next allowed fetch
   * time for the key and the tuple is acked.
   *
   * @param input tuple with a {@code url} field and an optional {@code metadata} field
   */
  @Override
  public void execute(Tuple input) {

    String urlString = input.getStringByField("url");
    if (StringUtils.isBlank(urlString)) {
      LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskIndex, input);
      // ignore silently
      _collector.ack(input);
      return;
    }

    Metadata metadata = null;

    if (input.contains("metadata")) {
      metadata = (Metadata) input.getValueByField("metadata");
    }
    if (metadata == null) {
      metadata = Metadata.empty;
    }

    URL url;

    try {
      url = new URL(urlString);
    } catch (MalformedURLException e) {
      LOG.error("{} is a malformed URL", urlString);
      // Report to status stream and ack
      if (metadata == Metadata.empty) {
        metadata = new Metadata();
      }
      metadata.setValue("error.cause", "malformed URL");
      _collector.emit(
          com.digitalpebble.storm.crawler.Constants.StatusStreamName,
          input,
          new Values(urlString, metadata, Status.ERROR));
      _collector.ack(input);
      return;
    }

    // check when we are allowed to process it
    String key = getPolitenessKey(url);

    Long timeAllowed = throttler.getIfPresent(key);

    if (timeAllowed != null) {
      long now = System.currentTimeMillis();
      long timeToWait = timeAllowed - now;
      if (timeToWait > 0) {
        try {
          Thread.sleep(timeToWait);
        } catch (InterruptedException e) {
          LOG.error("[Fetcher #{}] caught InterruptedException while waiting", taskIndex);
          // restore the interrupt flag so that the code running this bolt
          // can still see that the thread was interrupted
          Thread.currentThread().interrupt();
        }
      }
    }

    long delay = this.crawlDelay;

    try {
      Protocol protocol = protocolFactory.getProtocol(url);

      BaseRobotRules rules = protocol.getRobotRules(urlString);

      // autodiscovery of sitemaps
      // the sitemaps will be sent down the topology
      // as many times as there is a URL for a given host
      // the status updater will certainly cache things
      // but we could also have a simple cache mechanism here
      // as well.
      if (sitemapsAutoDiscovery) {
        for (String sitemapURL : rules.getSitemaps()) {
          handleOutlink(input, url, sitemapURL, metadata);
        }
      }

      if (!rules.isAllowed(urlString)) {
        LOG.info("Denied by robots.txt: {}", urlString);

        // never write into the shared empty instance, same as for malformed URLs
        if (metadata == Metadata.empty) {
          metadata = new Metadata();
        }
        metadata.setValue("error.cause", "robots.txt");

        // Report to status stream and ack
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, metadata, Status.ERROR));
        _collector.ack(input);
        return;
      }

      // get the delay from robots
      // value is negative when not set
      long robotsDelay = rules.getCrawlDelay();
      if (robotsDelay > 0) {
        // cap the value to a maximum
        // as some sites specify ridiculous values
        if (robotsDelay > maxCrawlDelay) {
          LOG.debug("Delay from robots capped at {} for {}", maxCrawlDelay, url);
          delay = maxCrawlDelay;
        } else {
          delay = robotsDelay;
        }
      }

      long start = System.currentTimeMillis();
      ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata);
      long timeFetching = System.currentTimeMillis() - start;

      averagedMetrics.scope("fetch_time").update(timeFetching);
      averagedMetrics.scope("bytes_fetched").update(response.getContent().length);
      eventCounter.scope("fetched").incrBy(1);
      perSecMetrics.scope("bytes_fetched_perSec").update(response.getContent().length);
      perSecMetrics.scope("fetched_perSec").update(1);

      LOG.info(
          "[Fetcher #{}] Fetched {} with status {} in {}",
          taskIndex,
          urlString,
          response.getStatusCode(),
          timeFetching);

      response
          .getMetadata()
          .setValue("fetch.statusCode", Integer.toString(response.getStatusCode()));

      response.getMetadata().putAll(metadata);

      // determine the status based on the status code
      Status status = Status.fromHTTPCode(response.getStatusCode());

      // if the status is OK emit on default stream
      if (status.equals(Status.FETCHED)) {
        _collector.emit(
            Utils.DEFAULT_STREAM_ID,
            input,
            new Values(urlString, response.getContent(), response.getMetadata()));
      } else if (status.equals(Status.REDIRECTION)) {
        // Mark URL as redirected
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, response.getMetadata(), status));

        // find the URL it redirects to
        String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);

        // isNotBlank also covers the null case
        if (allowRedirs && StringUtils.isNotBlank(redirection)) {
          handleOutlink(input, url, redirection, response.getMetadata());
        }
      } else {
        // Error
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, response.getMetadata(), status));
      }

    } catch (Exception exece) {

      String message = exece.getMessage();
      if (message == null) {
        message = "";
      }

      // common exceptions for which we log only a short message
      if (exece.getCause() instanceof java.util.concurrent.TimeoutException) {
        LOG.error("Socket timeout fetching {}", urlString);
      } else if (message.contains(" timed out")) {
        LOG.error("Socket timeout fetching {}", urlString);
      } else if (exece.getCause() instanceof java.net.UnknownHostException) {
        LOG.error("Unknown host {}", urlString);
      } else {
        // log the full stacktrace
        LOG.error("Exception while fetching {}", urlString, exece);
      }

      eventCounter.scope("exception").incrBy(1);

      // could be an empty, immutable Metadata
      if (metadata.size() == 0) {
        metadata = new Metadata();
      }

      // add the reason of the failure in the metadata
      metadata.setValue("fetch.exception", message);

      _collector.emit(
          com.digitalpebble.storm.crawler.Constants.StatusStreamName,
          input,
          new Values(urlString, metadata, Status.FETCH_ERROR));
    }

    // update the throttler
    throttler.put(key, System.currentTimeMillis() + delay);

    _collector.ack(input);
  }
Esempio n. 4
0
  /**
   * Parses the HTML content carried by the tuple with jsoup and emits the result.
   *
   * <p>Documents whose HTTP content type is set but does not mention "html" are reported through
   * {@code handleException}. Otherwise the content is parsed, the robots directives are extracted
   * and stored in the metadata, links and their anchors are collected (honouring {@code nofollow}
   * at page and link level), the parse filters are applied, the outlinks are emitted on the
   * status stream as {@code Status.DISCOVERED} when {@code emitOutlinks} is set, and every
   * document of the parse result is emitted on the default stream before the tuple is acked.
   *
   * @param tuple tuple with the fields {@code content}, {@code url} and {@code metadata}
   */
  @Override
  public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
      if (httpCT.toLowerCase().contains("html")) {
        CT_OK = true;
      }
    }
    // simply ignore cases where the content type has not been set
    // TODO sniff content with Tika?
    else {
      CT_OK = true;
    }

    if (!CT_OK) {
      String errorMessage = "Exception content-type " + httpCT + " for " + url;
      RuntimeException e = new RuntimeException(errorMessage);
      handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
      return;
    }

    LOG.info("Parsing : starting {}", url);

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
      org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

      fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

      // extracts the robots directives from the meta tags
      robotsTags.extractMetaTags(fragment);

      // store a normalised representation in metadata
      // so that the indexer is aware of it
      robotsTags.normaliseToMetadata(metadata);

      // do not extract the links if no follow has been set
      // and we are in strict mode
      if (robotsTags.isNoFollow() && robots_noFollow_strict) {
        slinks = new HashMap<>(0);
      } else {
        Elements links = jsoupDoc.select("a[href]");
        slinks = new HashMap<>(links.size());
        for (Element link : links) {
          // abs:href tells jsoup to return fully qualified domains
          // for
          // relative urls.
          // e.g.: /foo will resolve to http://shopstyle.com/foo
          String targetURL = link.attr("abs:href");

          // nofollow: rel holds a whitespace-separated list of link types,
          // e.g. rel="nofollow noopener", so each token has to be checked
          boolean noFollow = false;
          for (String relToken : link.attr("rel").trim().split("\\s+")) {
            if ("nofollow".equalsIgnoreCase(relToken)) {
              noFollow = true;
              break;
            }
          }
          // remove altogether
          if (noFollow && robots_noFollow_strict) {
            continue;
          }

          // link not specifically marked as no follow
          // but whole page is
          if (!noFollow && robotsTags.isNoFollow()) {
            noFollow = true;
          }

          String anchor = link.text();
          if (StringUtils.isNotBlank(targetURL)) {
            // any existing anchors for the same target?
            List<String> anchors = slinks.get(targetURL);
            if (anchors == null) {
              anchors = new LinkedList<>();
              slinks.put(targetURL, anchors);
            }
            // track the anchors only if no follow is false
            if (!noFollow && StringUtils.isNotBlank(anchor)) {
              anchors.add(anchor);
            }
          }
        }
      }

      text = jsoupDoc.body().text();

    } catch (Throwable e) {
      String errorMessage = "Exception while parsing " + url + ": " + e;
      handleException(url, e, metadata, tuple, "content parsing", errorMessage);
      return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
      parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

      String errorMessage = "Exception while running parse filters on " + url + ": " + e;
      handleException(url, e, metadata, tuple, "content filtering", errorMessage);
      return;
    }

    if (emitOutlinks) {
      for (Outlink outlink : parse.getOutlinks()) {
        collector.emit(
            StatusStreamName,
            tuple,
            new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
      }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
      ParseData parseDoc = doc.getValue();

      collector.emit(
          tuple,
          new Values(
              doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
  }