private List<Outlink> toOutlinks(String url, Metadata metadata, Map<String, List<String>> slinks) {
    List<Outlink> outlinks = new LinkedList<>();
    URL sourceUrl;
    try {
        sourceUrl = new URL(url);
    } catch (MalformedURLException e) {
        // we would have known by now as previous components check whether
        // the URL is valid
        LOG.error("MalformedURLException on {}", url);
        eventCounter.scope("error_invalid_source_url").incrBy(1);
        return outlinks;
    }

    Map<String, List<String>> linksKept = new HashMap<>();

    for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
        String targetURL = linkEntry.getKey();

        // filter the URLs
        if (urlFilters != null) {
            targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
            if (targetURL == null) {
                eventCounter.scope("outlink_filtered").incr();
                continue;
            }
        }

        // the link has survived the various filters
        List<String> anchors = linkEntry.getValue();
        linksKept.put(targetURL, anchors);
        eventCounter.scope("outlink_kept").incr();
    }

    for (String outlink : linksKept.keySet()) {
        // configure which metadata gets inherited from the parent
        Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink, url, metadata);
        Outlink ol = new Outlink(outlink);
        // add the anchors to the metadata?
        if (trackAnchors) {
            List<String> anchors = linksKept.get(outlink);
            if (!anchors.isEmpty()) {
                linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
                // sets the first anchor
                ol.setAnchor(anchors.get(0));
            }
        }
        ol.setMetadata(linkMetadata);
        outlinks.add(ol);
    }
    return outlinks;
}
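// Usage sketch (not part of the production class): the slinks map consumed
// above is keyed by target URL, so repeated links to the same target share
// one entry and their anchor texts accumulate. The URLs and anchors below
// are invented for illustration.
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class SlinksExample {
    public static void main(String[] args) {
        Map<String, List<String>> slinks = new HashMap<>();
        // two <a> elements pointing at the same target collapse into one entry
        slinks.computeIfAbsent("http://example.com/foo", k -> new LinkedList<>()).add("Foo");
        slinks.computeIfAbsent("http://example.com/foo", k -> new LinkedList<>()).add("More foo");
        // toOutlinks would emit a single Outlink for this entry, with "Foo"
        // set as its anchor and both values stored under ANCHORS_KEY_NAME
        System.out.println(slinks); // {http://example.com/foo=[Foo, More foo]}
    }
}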
private void handleException(
        String url,
        Throwable e,
        Metadata metadata,
        Tuple tuple,
        String errorSource,
        String errorMessage) {
    LOG.error(errorMessage);
    // send to the status stream in case another component wants to update
    // its status
    metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
    metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
    collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
    collector.ack(tuple);
    // increment the context-specific metric
    String s = "error_" + errorSource.replaceAll(" ", "_") + "_";
    eventCounter.scope(s + e.getClass().getSimpleName()).incrBy(1);
    // increment the general metric
    eventCounter.scope("parse exception").incrBy(1);
}
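// A minimal standalone sketch (assumption: mirrors the string handling
// above, not taken from the production class) of how the context-specific
// metric name is derived: the error source and the exception class are
// combined into one counter scope.
public class MetricNameExample {
    public static void main(String[] args) {
        String errorSource = "content parsing";
        Throwable e = new RuntimeException("boom");
        String scope = "error_" + errorSource.replaceAll(" ", "_") + "_"
                + e.getClass().getSimpleName();
        System.out.println(scope); // error_content_parsing_RuntimeException
    }
}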
@Override
public void execute(Tuple input) {
    String urlString = input.getStringByField("url");
    if (StringUtils.isBlank(urlString)) {
        LOG.info("[Fetcher #{}] Missing value for field url in tuple {}", taskIndex, input);
        // ignore silently
        _collector.ack(input);
        return;
    }

    Metadata metadata = null;
    if (input.contains("metadata")) {
        metadata = (Metadata) input.getValueByField("metadata");
    }
    if (metadata == null) {
        metadata = Metadata.empty;
    }

    URL url;
    try {
        url = new URL(urlString);
    } catch (MalformedURLException e) {
        LOG.error("{} is a malformed URL", urlString);
        // report to the status stream and ack
        if (metadata == Metadata.empty) {
            metadata = new Metadata();
        }
        metadata.setValue("error.cause", "malformed URL");
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input,
                new Values(urlString, metadata, Status.ERROR));
        _collector.ack(input);
        return;
    }

    // check when we are allowed to process it
    String key = getPolitenessKey(url);
    Long timeAllowed = throttler.getIfPresent(key);
    if (timeAllowed != null) {
        long now = System.currentTimeMillis();
        long timeToWait = timeAllowed - now;
        if (timeToWait > 0) {
            try {
                Thread.sleep(timeToWait);
            } catch (InterruptedException e) {
                LOG.error("[Fetcher #{}] InterruptedException caught while waiting", taskIndex);
            }
        }
    }

    long delay = this.crawlDelay;
    try {
        Protocol protocol = protocolFactory.getProtocol(url);
        BaseRobotRules rules = protocol.getRobotRules(urlString);

        // autodiscovery of sitemaps
        // the sitemaps will be sent down the topology
        // as many times as there is a URL for a given host
        // the status updater will certainly cache things
        // but we could also have a simple cache mechanism here as well
        if (sitemapsAutoDiscovery) {
            for (String sitemapURL : rules.getSitemaps()) {
                handleOutlink(input, url, sitemapURL, metadata);
            }
        }

        if (!rules.isAllowed(urlString)) {
            LOG.info("Denied by robots.txt: {}", urlString);
            metadata.setValue("error.cause", "robots.txt");
            // report to the status stream and ack
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, metadata, Status.ERROR));
            _collector.ack(input);
            return;
        }

        // get the delay from robots
        // the value is negative when not set
        long robotsDelay = rules.getCrawlDelay();
        if (robotsDelay > 0) {
            // cap the value to a maximum
            // as some sites specify ridiculous values
            if (robotsDelay > maxCrawlDelay) {
                LOG.debug("Delay from robots capped at {} for {}", robotsDelay, url);
                delay = maxCrawlDelay;
            } else {
                delay = robotsDelay;
            }
        }

        long start = System.currentTimeMillis();
        ProtocolResponse response = protocol.getProtocolOutput(urlString, metadata);
        long timeFetching = System.currentTimeMillis() - start;

        averagedMetrics.scope("fetch_time").update(timeFetching);
        averagedMetrics.scope("bytes_fetched").update(response.getContent().length);
        eventCounter.scope("fetched").incrBy(1);
        perSecMetrics.scope("bytes_fetched_perSec").update(response.getContent().length);
        perSecMetrics.scope("fetched_perSec").update(1);

        LOG.info(
                "[Fetcher #{}] Fetched {} with status {} in {}",
                taskIndex,
                urlString,
                response.getStatusCode(),
                timeFetching);

        response.getMetadata()
                .setValue("fetch.statusCode", Integer.toString(response.getStatusCode()));
        response.getMetadata().putAll(metadata);

        // determine the status based on the status code
        Status status = Status.fromHTTPCode(response.getStatusCode());

        // if the status is OK emit on the default stream
        if (status.equals(Status.FETCHED)) {
            _collector.emit(
                    Utils.DEFAULT_STREAM_ID,
                    input,
                    new Values(urlString, response.getContent(), response.getMetadata()));
        } else if (status.equals(Status.REDIRECTION)) {
            // mark the URL as redirected
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, response.getMetadata(), status));
            // find the URL it redirects to
            String redirection = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
            if (allowRedirs && StringUtils.isNotBlank(redirection)) {
                handleOutlink(input, url, redirection, response.getMetadata());
            }
        } else {
            // error
            _collector.emit(
                    com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                    input,
                    new Values(urlString, response.getMetadata(), status));
        }
    } catch (Exception e) {
        String message = e.getMessage();
        if (message == null) {
            message = "";
        }
        // common exceptions for which we log only a short message
        if (e.getCause() instanceof java.util.concurrent.TimeoutException
                || message.contains(" timed out")) {
            LOG.error("Socket timeout fetching {}", urlString);
        } else if (e.getCause() instanceof java.net.UnknownHostException) {
            LOG.error("Unknown host {}", urlString);
        } else {
            // log the full stacktrace
            LOG.error("Exception while fetching {}", urlString, e);
        }
        eventCounter.scope("exception").incrBy(1);

        // could be the empty, immutable Metadata
        if (metadata.size() == 0) {
            metadata = new Metadata();
        }
        // add the reason for the failure to the metadata
        metadata.setValue("fetch.exception", message);
        _collector.emit(
                com.digitalpebble.storm.crawler.Constants.StatusStreamName,
                input,
                new Values(urlString, metadata, Status.FETCH_ERROR));
    }

    // update the throttler
    throttler.put(key, System.currentTimeMillis() + delay);
    _collector.ack(input);
}
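// The throttler's getIfPresent/put calls match Guava's Cache API; below is a
// minimal sketch of how such a throttler and politeness key could be wired up.
// The host-level key, the 30s expiry and the 1000ms delay are assumptions for
// illustration, not the production configuration.
import java.net.URL;
import java.util.concurrent.TimeUnit;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

public class ThrottlerExample {
    // maps a politeness key to the earliest time (ms) the next fetch may start
    private final Cache<String, Long> throttler =
            CacheBuilder.newBuilder().expireAfterWrite(30, TimeUnit.SECONDS).build();

    // hypothetical key function: one politeness queue per host
    private String getPolitenessKey(URL url) {
        return url.getHost();
    }

    public void demo() throws Exception {
        URL url = new URL("http://example.com/page");
        String key = getPolitenessKey(url);
        long delay = 1000; // crawl delay in ms
        Long timeAllowed = throttler.getIfPresent(key);
        // wait until the slot granted for this host has been reached
        if (timeAllowed != null && timeAllowed > System.currentTimeMillis()) {
            Thread.sleep(timeAllowed - System.currentTimeMillis());
        }
        // ... fetch the URL ...
        // grant the next slot for this host
        throttler.put(key, System.currentTimeMillis() + delay);
    }
}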
@Override
public void execute(Tuple tuple) {
    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that the content type is HTML
    // by looking at the value found in the HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        if (httpCT.toLowerCase().contains("html")) {
            CT_OK = true;
        }
    } else {
        // simply ignore cases where the content type has not been set
        // TODO sniff the content with Tika?
        CT_OK = true;
    }

    if (!CT_OK) {
        String errorMessage = "Exception content-type " + httpCT + " for " + url;
        RuntimeException e = new RuntimeException(errorMessage);
        handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        return;
    }

    LOG.info("Parsing : starting {}", url);
    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);
        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extract the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in the metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if nofollow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains
                // for relative URLs,
                // e.g. /foo resolves to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow
                boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
                // remove the link altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as nofollow
                // but the whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only if nofollow is false
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }
        text = jsoupDoc.body().text();
    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store the identified charset in the metadata
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;
    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {
        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : parse.getOutlinks()) {
            collector.emit(
                    StatusStreamName,
                    tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL
    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();
        collector.emit(
                tuple,
                new Values(
                        doc.getKey(),
                        parseDoc.getContent(),
                        parseDoc.getMetadata(),
                        parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}
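// A standalone sketch of the jsoup link extraction used above: abs:href
// resolves relative hrefs against the base URI, and rel="nofollow" is read
// per link. The HTML snippet and base URL are invented for illustration.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class JsoupLinksExample {
    public static void main(String[] args) {
        String html = "<html><body>"
                + "<a href='/foo'>Foo</a>"
                + "<a href='/bar' rel='nofollow'>Bar</a>"
                + "</body></html>";
        // the base URI is what lets abs:href return fully qualified URLs
        Document doc = Jsoup.parse(html, "http://example.com/");
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            System.out.println(link.attr("abs:href") + " nofollow="
                    + "nofollow".equalsIgnoreCase(link.attr("rel"))
                    + " anchor=" + link.text());
        }
        // prints:
        // http://example.com/foo nofollow=false anchor=Foo
        // http://example.com/bar nofollow=true anchor=Bar
    }
}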