/** Used for redirections or when discovering sitemap URLs * */
  private void handleOutlink(Tuple t, URL sURL, String newUrl, Metadata sourceMetadata) {
    // build an absolute URL
    try {
      URL tmpURL = URLUtil.resolveURL(sURL, newUrl);
      newUrl = tmpURL.toExternalForm();
    } catch (MalformedURLException e) {
      LOG.debug("MalformedURLException on {} or {}: {}", sURL.toExternalForm(), newUrl, e);
      return;
    }

    // apply URL filters
    if (this.urlFilters != null) {
      newUrl = this.urlFilters.filter(sURL, sourceMetadata, newUrl);
    }

    // filtered
    if (newUrl == null) {
      return;
    }

    Metadata metadata =
        metadataTransfer.getMetaForOutlink(newUrl, sURL.toExternalForm(), sourceMetadata);

    // TODO check that hasn't exceeded max number of redirections

    _collector.emit(
        com.digitalpebble.storm.crawler.Constants.StatusStreamName,
        t,
        new Values(newUrl, metadata, Status.DISCOVERED));
  }
Ejemplo n.º 2
0
  private List<Outlink> toOutlinks(
      String url, Metadata metadata, Map<String, List<String>> slinks) {
    List<Outlink> outlinks = new LinkedList<>();
    URL sourceUrl;
    try {
      sourceUrl = new URL(url);
    } catch (MalformedURLException e) {
      // we would have known by now as previous components check whether
      // the URL is valid
      LOG.error("MalformedURLException on {}", url);
      eventCounter.scope("error_invalid_source_url").incrBy(1);
      return outlinks;
    }

    Map<String, List<String>> linksKept = new HashMap<>();

    for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
      String targetURL = linkEntry.getKey();
      // filter the urls
      if (urlFilters != null) {
        targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
        if (targetURL == null) {
          eventCounter.scope("outlink_filtered").incr();
          continue;
        }
      }
      // the link has survived the various filters
      if (targetURL != null) {
        List<String> anchors = linkEntry.getValue();
        linksKept.put(targetURL, anchors);
        eventCounter.scope("outlink_kept").incr();
      }
    }

    for (String outlink : linksKept.keySet()) {
      // configure which metadata gets inherited from parent
      Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata);
      Outlink ol = new Outlink(outlink);
      // add the anchors to the metadata?
      if (trackAnchors) {
        List<String> anchors = linksKept.get(outlink);
        if (anchors.size() > 0) {
          linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
          // sets the first anchor
          ol.setAnchor(anchors.get(0));
        }
      }
      ol.setMetadata(linkMetadata);
      outlinks.add(ol);
    }
    return outlinks;
  }
Ejemplo n.º 3
0
  @SuppressWarnings({"rawtypes", "unchecked"})
  @Override
  public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
    this.collector = collector;

    eventCounter =
        context.registerMetric(this.getClass().getSimpleName(), new MultiCountMetric(), 10);

    parseFilters = ParseFilters.emptyParseFilter;

    String parseconfigfile =
        ConfUtils.getString(conf, "parsefilters.config.file", "parsefilters.json");
    if (parseconfigfile != null) {
      try {
        parseFilters = new ParseFilters(conf, parseconfigfile);
      } catch (IOException e) {
        LOG.error("Exception caught while loading the ParseFilters");
        throw new RuntimeException("Exception caught while loading the ParseFilters", e);
      }
    }

    urlFilters = URLFilters.emptyURLFilters;
    emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);

    if (emitOutlinks) {
      String urlconfigfile = ConfUtils.getString(conf, "urlfilters.config.file", "urlfilters.json");

      if (urlconfigfile != null) {
        try {
          urlFilters = new URLFilters(conf, urlconfigfile);
        } catch (IOException e) {
          LOG.error("Exception caught while loading the URLFilters");
          throw new RuntimeException("Exception caught while loading the URLFilters", e);
        }
      }
    }

    trackAnchors = ConfUtils.getBoolean(conf, "track.anchors", true);

    robots_noFollow_strict = ConfUtils.getBoolean(conf, RobotsTags.ROBOTS_NO_FOLLOW_STRICT, true);

    metadataTransfer = MetadataTransfer.getInstance(conf);
  }
  @SuppressWarnings({"rawtypes", "unchecked"})
  @Override
  public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {

    _collector = collector;
    this.conf = new Config();
    this.conf.putAll(stormConf);

    checkConfiguration();

    this.taskIndex = context.getThisTaskIndex();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
    long start = System.currentTimeMillis();
    LOG.info("[Fetcher #{}] : starting at {}", taskIndex, sdf.format(start));

    // Register a "MultiCountMetric" to count different events in this bolt
    // Storm will emit the counts every n seconds to a special bolt via a
    // system stream
    // The data can be accessed by registering a "MetricConsumer" in the
    // topology
    this.eventCounter = context.registerMetric("fetcher_counter", new MultiCountMetric(), 10);

    this.averagedMetrics =
        context.registerMetric("fetcher_average", new MultiReducedMetric(new MeanReducer()), 10);

    this.perSecMetrics =
        context.registerMetric(
            "fetcher_average_persec", new MultiReducedMetric(new PerSecondReducer()), 10);

    protocolFactory = new ProtocolFactory(conf);

    String urlconfigfile = ConfUtils.getString(conf, "urlfilters.config.file", "urlfilters.json");

    if (urlconfigfile != null)
      try {
        urlFilters = new URLFilters(conf, urlconfigfile);
      } catch (IOException e) {
        LOG.error("Exception caught while loading the URLFilters");
        throw new RuntimeException("Exception caught while loading the URLFilters", e);
      }

    metadataTransfer = MetadataTransfer.getInstance(stormConf);

    allowRedirs =
        ConfUtils.getBoolean(
            stormConf, com.digitalpebble.storm.crawler.Constants.AllowRedirParamName, true);

    sitemapsAutoDiscovery = ConfUtils.getBoolean(stormConf, "sitemap.discovery", false);

    queueMode = ConfUtils.getString(conf, "fetcher.queue.mode", QUEUE_MODE_HOST);
    // check that the mode is known
    if (!queueMode.equals(QUEUE_MODE_IP)
        && !queueMode.equals(QUEUE_MODE_DOMAIN)
        && !queueMode.equals(QUEUE_MODE_HOST)) {
      LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode);
      queueMode = QUEUE_MODE_HOST;
    }
    LOG.info("Using queue mode : {}", queueMode);

    this.crawlDelay = (long) (ConfUtils.getFloat(conf, "fetcher.server.delay", 1.0f) * 1000);

    this.maxCrawlDelay = (long) ConfUtils.getInt(conf, "fetcher.max.crawl.delay", 30) * 1000;
  }