Example #1
  public static void main(String[] args) throws Exception {
    HttpProtocol protocol = new HttpProtocol();
    Config conf = new Config();
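    // Expected arguments: args[0] is the URL to fetch, args[1] the path to the
    // configuration file loaded by ConfUtils below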

    String url = args[0];
    ConfUtils.loadConf(args[1], conf);
    protocol.configure(conf);

    if (!protocol.skipRobots) {
      BaseRobotRules rules = protocol.getRobotRules(url);
      System.out.println("is allowed : " + rules.isAllowed(url));
    }

    Metadata md = new Metadata();
    ProtocolResponse response = protocol.getProtocolOutput(url, md);
    System.out.println(url);
    System.out.println(response.getMetadata());
    System.out.println(response.getStatusCode());
    System.out.println(response.getContent().length);
  }
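
The BaseRobotRules object used above comes from the crawler-commons library. As a rough, hypothetical sketch (not part of the example), the same rules can be built directly from an already-fetched robots.txt payload with SimpleRobotRulesParser; the helper name and parameters below are illustrative, and the exact parseContent signature varies across crawler-commons versions:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

  // Illustrative helper (not part of StormCrawler): parse an already-fetched
  // robots.txt body and check whether a target URL may be crawled.
  public static boolean isAllowedByRobots(
      String robotsTxtUrl, byte[] robotsContent, String agentName, String targetUrl) {
    SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
    BaseRobotRules rules =
        parser.parseContent(robotsTxtUrl, robotsContent, "text/plain", agentName);
    return rules.isAllowed(targetUrl);
  }

The getCrawlDelay() and getSitemaps() calls in the bolt below come from the same BaseRobotRules API.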

Example #2
  @Override
  public void execute(Tuple input) {

    String urlString = input.getStringByField("url");
    if (StringUtils.isBlank(urlString)) {
      LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskIndex, input);
      // ignore silently
      _collector.ack(input);
      return;
    }

    Metadata metadata = null;

    if (input.contains("metadata")) metadata = (Metadata) input.getValueByField("metadata");
    if (metadata == null) metadata = Metadata.empty;
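    // Metadata.empty is a shared, immutable placeholder; the error branches below
    // replace it with a fresh Metadata instance before writing any value to it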

    URL url;

    try {
      url = new URL(urlString);
    } catch (MalformedURLException e) {
      LOG.error("{} is a malformed URL", urlString);
      // Report to status stream and ack
      if (metadata == Metadata.empty) {
        metadata = new Metadata();
      }
      metadata.setValue("error.cause", "malformed URL");
      _collector.emit(
          com.digitalpebble.storm.crawler.Constants.StatusStreamName,
          input,
          new Values(urlString, metadata, Status.ERROR));
      _collector.ack(input);
      return;
    }

    // check when we are allowed to process it
    String key = getPolitenessKey(url);

    Long timeAllowed = throttler.getIfPresent(key);

    if (timeAllowed != null) {
      long now = System.currentTimeMillis();
      long timeToWait = timeAllowed - now;
      if (timeToWait > 0) {
        try {
          Thread.sleep(timeToWait);
        } catch (InterruptedException e) {
          LOG.error("[Fetcher #{}] caught InterruptedException while waiting", taskIndex);
        }
      }
    }

    long delay = this.crawlDelay;

    try {
      Protocol protocol = protocolFactory.getProtocol(url);

      BaseRobotRules rules = protocol.getRobotRules(urlString);

      // autodiscovery of sitemaps
      // the sitemaps will be sent down the topology
      // as many times as there is a URL for a given host
      // the status updater will certainly cache things
      // but we could also have a simple cache mechanism here
      // as well.
      if (sitemapsAutoDiscovery) {
        for (String sitemapURL : rules.getSitemaps()) {
          handleOutlink(input, url, sitemapURL, metadata);
        }
      }

      if (!rules.isAllowed(urlString)) {
        LOG.info("Denied by robots.txt: {}", urlString);

        metadata.setValue("error.cause", "robots.txt");

        // Report to status stream and ack
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, metadata, Status.ERROR));
        _collector.ack(input);
        return;
      }

      // get the delay from robots
      // value is negative when not set
      long robotsDelay = rules.getCrawlDelay();
      if (robotsDelay > 0) {
        // cap the value to a maximum
        // as some sites specify ridiculous values
        if (robotsDelay > maxCrawlDelay) {
          LOG.debug(
              "Crawl delay from robots.txt ({}) capped at {} for {}",
              robotsDelay,
              maxCrawlDelay,
              url);
          delay = maxCrawlDelay;
        } else {
          delay = robotsDelay;
        }
      }

      long start = System.currentTimeMillis();
      ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata);
      long timeFetching = System.currentTimeMillis() - start;

      averagedMetrics.scope("fetch_time").update(timeFetching);
      averagedMetrics.scope("bytes_fetched").update(response.getContent().length);
      eventCounter.scope("fetched").incrBy(1);
      perSecMetrics.scope("bytes_fetched_perSec").update(response.getContent().length);
      perSecMetrics.scope("fetched_perSec").update(1);

      LOG.info(
          "[Fetcher #{}] Fetched {} with status {} in {}",
          taskIndex,
          urlString,
          response.getStatusCode(),
          timeFetching);

      response
          .getMetadata()
          .setValue("fetch.statusCode", Integer.toString(response.getStatusCode()));

      response.getMetadata().putAll(metadata);

      // determine the status based on the status code
      Status status = Status.fromHTTPCode(response.getStatusCode());

      // if the status is OK emit on default stream
      if (status.equals(Status.FETCHED)) {
        _collector.emit(
            Utils.DEFAULT_STREAM_ID,
            input,
            new Values(urlString, response.getContent(), response.getMetadata()));
      } else if (status.equals(Status.REDIRECTION)) {
        // Mark URL as redirected
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, response.getMetadata(), status));

        // find the URL it redirects to
        String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);

        if (allowRedirs && redirection != null && StringUtils.isNotBlank(redirection)) {
          handleOutlink(input, url, redirection, response.getMetadata());
        }
      } else {
        // Error
        _collector.emit(
            com.digitalpebble.storm.crawler.Constants.StatusStreamName,
            input,
            new Values(urlString, response.getMetadata(), status));
      }

    } catch (Exception exece) {

      String message = exece.getMessage();
      if (message == null) message = "";

      // common exceptions for which we log only a short message
      if (exece.getCause() instanceof java.util.concurrent.TimeoutException)
        LOG.error("Socket timeout fetching {}", urlString);
      else if (message.contains(" timed out")) LOG.error("Socket timeout fetching {}", urlString);
      else if (exece.getCause() instanceof java.net.UnknownHostException)
        LOG.error("Unknown host {}", urlString);
      // log the full stacktrace
      else LOG.error("Exception while fetching {}", urlString, exece);

      eventCounter.scope("exception").incrBy(1);

      // could be an empty, immutable Metadata
      if (metadata.size() == 0) {
        metadata = new Metadata();
      }

      // add the reason of the failure in the metadata
      metadata.setValue("fetch.exception", message);

      _collector.emit(
          com.digitalpebble.storm.crawler.Constants.StatusStreamName,
          input,
          new Values(urlString, metadata, Status.FETCH_ERROR));
    }

    // update the throttler
    throttler.put(key, System.currentTimeMillis() + delay);

    _collector.ack(input);
  }
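
The getPolitenessKey helper and the throttler field are not shown in this snippet. The getIfPresent/put calls above match Guava's Cache API, so a plausible sketch, with hypothetical names and settings, is to key politeness on the host and cache the next allowed fetch time per host:

import java.net.URL;
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

  // Hypothetical sketch of the fields/helpers referenced in execute(); the real
  // implementation may differ. Entries map a politeness key to the next time
  // (epoch millis) a fetch for that key is allowed.
  private final Cache<String, Long> throttler =
      CacheBuilder.newBuilder().expireAfterAccess(5, TimeUnit.MINUTES).build();

  // One delay per host, so the crawl delay computed in execute() applies to
  // every URL served by the same host.
  private String getPolitenessKey(URL url) {
    return url.getHost().toLowerCase();
  }

Note that execute() enforces the delay by sleeping on the executor thread (the Thread.sleep call above), so long robots.txt crawl delays directly slow that executor down.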