Ejemplo n.º 1
0
 @Override
 @SuppressWarnings({"rawtypes", "unchecked"})
 public void prepare(Map stormConf, TopologyContext context, OutputCollector collect) {
   collector = collect;
   metadataTransfer = MetadataTransfer.getInstance(stormConf);
   sniffWhenNoMDKey = ConfUtils.getBoolean(stormConf, "feed.sniffContent", false);
   filterHoursSincePub = ConfUtils.getInt(stormConf, "feed.filter.hours.since.published", -1);
   urlFilters = URLFilters.fromConf(stormConf);
   parseFilters = ParseFilters.fromConf(stormConf);
 }
Ejemplo n.º 2
0
  private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata)
      throws MalformedURLException {
    List<Outlink> links = new ArrayList<>();

    SyndFeed feed = null;
    try (ByteArrayInputStream is = new ByteArrayInputStream(content)) {
      SyndFeedInput input = new SyndFeedInput();
      feed = input.build(new InputSource(is));
    } catch (Exception e) {
      LOG.error("Exception parsing feed from DOM {}", url);
      return links;
    }

    URL sURL = new URL(url);

    List<SyndEntry> entries = feed.getEntries();
    for (SyndEntry entry : entries) {
      String targetURL = entry.getLink();

      // build an absolute URL
      try {
        targetURL = URLUtil.resolveURL(sURL, targetURL).toExternalForm();
      } catch (MalformedURLException e) {
        LOG.debug("MalformedURLException on {}", targetURL);
        continue;
      }

      targetURL = urlFilters.filter(sURL, parentMetadata, targetURL);

      if (StringUtils.isBlank(targetURL)) continue;

      Outlink newLink = new Outlink(targetURL);

      Metadata targetMD = metadataTransfer.getMetaForOutlink(targetURL, url, parentMetadata);
      newLink.setMetadata(targetMD);

      String title = entry.getTitle();
      if (StringUtils.isNotBlank(title)) {
        targetMD.setValue("feed.title", title.trim());
      }

      Date publishedDate = entry.getPublishedDate();
      if (publishedDate != null) {
        // filter based on the published date
        if (filterHoursSincePub != -1) {
          Calendar rightNow = Calendar.getInstance();
          rightNow.add(Calendar.HOUR, -filterHoursSincePub);
          if (publishedDate.before(rightNow.getTime())) {
            LOG.info(
                "{} has a published date {} which is more than {} hours old",
                targetURL,
                publishedDate.toString(),
                filterHoursSincePub);
            continue;
          }
        }
        targetMD.setValue("feed.publishedDate", publishedDate.toString());
      }

      SyndContent description = entry.getDescription();
      if (description != null && StringUtils.isNotBlank(description.getValue())) {
        targetMD.setValue("feed.description", description.getValue());
      }

      links.add(newLink);
    }

    return links;
  }