예제 #1
0
  /**
   * Handles one incoming document: forwards it untouched when it is not a feed, otherwise parses
   * it as a feed and reports the discovered entries on the status stream.
   *
   * <p>A document counts as a feed when the metadata value stored under {@code isFeedKey} parses
   * as {@code true}, or, if {@code sniffWhenNoMDKey} is enabled, when its Content-Type header
   * contains {@code "rss+xml"}.
   *
   * <p>Outcomes, each of which acks the tuple exactly once:
   *
   * <ul>
   *   <li>not a feed: the tuple values are re-emitted as-is on the default stream;
   *   <li>feed parsing or parse filtering throws: the URL is emitted on the status stream with
   *       {@code Status.ERROR} and the error source/message recorded in its metadata;
   *   <li>success: every outlink is emitted as {@code Status.DISCOVERED}, then the feed URL itself
   *       as {@code Status.FETCHED}.
   * </ul>
   *
   * @param tuple input tuple carrying the fields {@code "url"}, {@code "content"} and {@code
   *     "metadata"}
   */
  @Override
  public void execute(Tuple tuple) {
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");

    boolean isfeed = Boolean.parseBoolean(metadata.getFirstValue(isFeedKey));

    // The metadata does not flag the document as a feed: optionally fall back on the mime-type.
    // This won't work when servers return text/xml.
    // TODO use Tika instead?
    if (!isfeed && sniffWhenNoMDKey) {
      String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
      // Guard against a missing Content-Type header, which would otherwise cause an NPE here.
      if (ct != null && ct.contains("rss+xml")) {
        isfeed = true;
      }
    }

    // still not a feed file: just pass it on
    if (!isfeed) {
      this.collector.emit(tuple, tuple.getValues());
      this.collector.ack(tuple);
      return;
    }

    // can be used later on for custom scheduling
    metadata.setValue(isFeedKey, "true");

    List<Outlink> outlinks;
    try {
      outlinks = parseFeed(url, content, metadata);
    } catch (Exception e) {
      // exception while parsing the feed
      String errorMessage = "Exception while parsing " + url + ": " + e;
      // pass the exception along so that the stack trace is not lost
      LOG.error(errorMessage, e);
      // send to status stream in case another component wants to update
      // its status
      metadata.setValue(Constants.STATUS_ERROR_SOURCE, "feed parsing");
      metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
      collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
      this.collector.ack(tuple);
      return;
    }

    // apply the parse filters if any to the current document
    try {
      ParseResult parse = new ParseResult();
      parse.setOutlinks(outlinks);
      ParseData parseData = parse.get(url);
      parseData.setMetadata(metadata);
      parseFilters.filter(url, content, null, parse);
    } catch (RuntimeException e) {
      String errorMessage = "Exception while running parse filters on " + url + ": " + e;
      LOG.error(errorMessage, e);
      metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
      metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
      collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
      this.collector.ack(tuple);
      return;
    }

    // send the outlinks to the status stream
    for (Outlink ol : outlinks) {
      Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED);
      collector.emit(Constants.StatusStreamName, tuple, v);
    }

    // Mark the feed URL itself as successfully fetched. This point is only reached when
    // neither the feed parsing nor the parse filters threw an exception.
    collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED));
    this.collector.ack(tuple);
  }
예제 #2
0
  /**
   * Parses feed content and builds one outlink for each entry that survives URL resolution, URL
   * filtering and the optional publication-date filter.
   *
   * <p>Each retained outlink gets metadata from {@code metadataTransfer}, enriched with {@code
   * feed.title}, {@code feed.publishedDate} and {@code feed.description} when the entry provides
   * them. Entries without a link, with a link that cannot be resolved, rejected by the URL
   * filters, or published more than {@code filterHoursSincePub} hours ago (unless that setting is
   * {@code -1}) are skipped.
   *
   * @param url URL of the feed document, used as the base for resolving entry links
   * @param content raw bytes of the feed document
   * @param parentMetadata metadata of the feed document, handed to the URL filters and to the
   *     metadata transfer
   * @return the outlinks for the retained entries; an empty list if the content could not be
   *     parsed as a feed
   * @throws MalformedURLException if {@code url} itself is not a valid URL
   */
  private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata)
      throws MalformedURLException {
    List<Outlink> links = new ArrayList<>();

    SyndFeed feed = null;
    try (ByteArrayInputStream is = new ByteArrayInputStream(content)) {
      SyndFeedInput input = new SyndFeedInput();
      feed = input.build(new InputSource(is));
    } catch (Exception e) {
      // Best effort: an unparsable feed yields no outlinks, but keep the cause in the logs.
      LOG.error("Exception parsing feed from DOM {}", url, e);
      return links;
    }

    URL sURL = new URL(url);

    // The cutoff does not depend on the entry, so compute it once instead of per iteration.
    Date oldestAccepted = null;
    if (filterHoursSincePub != -1) {
      Calendar cutoff = Calendar.getInstance();
      cutoff.add(Calendar.HOUR, -filterHoursSincePub);
      oldestAccepted = cutoff.getTime();
    }

    List<SyndEntry> entries = feed.getEntries();
    for (SyndEntry entry : entries) {
      String targetURL = entry.getLink();

      // An entry without a link cannot produce an outlink; skip it rather than let a null
      // reach the URL resolution and fail the whole feed.
      if (StringUtils.isBlank(targetURL)) {
        LOG.debug("Skipping feed entry without link in {}", url);
        continue;
      }

      // build an absolute URL
      try {
        targetURL = URLUtil.resolveURL(sURL, targetURL).toExternalForm();
      } catch (MalformedURLException e) {
        LOG.debug("MalformedURLException on {}", targetURL);
        continue;
      }

      targetURL = urlFilters.filter(sURL, parentMetadata, targetURL);

      // a blank result means that the URL filters rejected the link
      if (StringUtils.isBlank(targetURL)) {
        continue;
      }

      Outlink newLink = new Outlink(targetURL);

      Metadata targetMD = metadataTransfer.getMetaForOutlink(targetURL, url, parentMetadata);
      newLink.setMetadata(targetMD);

      String title = entry.getTitle();
      if (StringUtils.isNotBlank(title)) {
        targetMD.setValue("feed.title", title.trim());
      }

      Date publishedDate = entry.getPublishedDate();
      if (publishedDate != null) {
        // filter based on the published date
        if (oldestAccepted != null && publishedDate.before(oldestAccepted)) {
          LOG.info(
              "{} has a published date {} which is more than {} hours old",
              targetURL,
              publishedDate.toString(),
              filterHoursSincePub);
          continue;
        }
        targetMD.setValue("feed.publishedDate", publishedDate.toString());
      }

      SyndContent description = entry.getDescription();
      if (description != null && StringUtils.isNotBlank(description.getValue())) {
        targetMD.setValue("feed.description", description.getValue());
      }

      links.add(newLink);
    }

    return links;
  }