@Override public void execute(Tuple tuple) { Metadata metadata = (Metadata) tuple.getValueByField("metadata"); byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); boolean isfeed = Boolean.valueOf(metadata.getFirstValue(isFeedKey)); // doesn't have the metadata expected if (!isfeed) { if (sniffWhenNoMDKey) { // uses mime-type // won't work when servers return text/xml // TODO use Tika instead? String ct = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (ct.contains("rss+xml")) isfeed = true; } } // still not a feed file if (!isfeed) { // just pass it on this.collector.emit(tuple, tuple.getValues()); this.collector.ack(tuple); return; } else { // can be used later on for custom scheduling metadata.setValue(isFeedKey, "true"); } List<Outlink> outlinks; try { outlinks = parseFeed(url, content, metadata); } catch (Exception e) { // exception while parsing the feed String errorMessage = "Exception while parsing " + url + ": " + e; LOG.error(errorMessage); // send to status stream in case another component wants to update // its status metadata.setValue(Constants.STATUS_ERROR_SOURCE, "feed parsing"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR)); this.collector.ack(tuple); return; } // apply the parse filters if any to the current document try { ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseFilters.filter(url, content, null, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; LOG.error(errorMessage); metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering"); metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage); collector.emit(StatusStreamName, tuple, new Values(url, metadata, Status.ERROR)); collector.ack(tuple); return; } // send to status stream for (Outlink ol : outlinks) { Values v = new Values(ol.getTargetURL(), ol.getMetadata(), Status.DISCOVERED); collector.emit(Constants.StatusStreamName, tuple, v); } // marking the main URL as successfully fetched // regardless of whether we got a parse exception or not collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.FETCHED)); this.collector.ack(tuple); }
private List<Outlink> parseFeed(String url, byte[] content, Metadata parentMetadata) throws MalformedURLException { List<Outlink> links = new ArrayList<>(); SyndFeed feed = null; try (ByteArrayInputStream is = new ByteArrayInputStream(content)) { SyndFeedInput input = new SyndFeedInput(); feed = input.build(new InputSource(is)); } catch (Exception e) { LOG.error("Exception parsing feed from DOM {}", url); return links; } URL sURL = new URL(url); List<SyndEntry> entries = feed.getEntries(); for (SyndEntry entry : entries) { String targetURL = entry.getLink(); // build an absolute URL try { targetURL = URLUtil.resolveURL(sURL, targetURL).toExternalForm(); } catch (MalformedURLException e) { LOG.debug("MalformedURLException on {}", targetURL); continue; } targetURL = urlFilters.filter(sURL, parentMetadata, targetURL); if (StringUtils.isBlank(targetURL)) continue; Outlink newLink = new Outlink(targetURL); Metadata targetMD = metadataTransfer.getMetaForOutlink(targetURL, url, parentMetadata); newLink.setMetadata(targetMD); String title = entry.getTitle(); if (StringUtils.isNotBlank(title)) { targetMD.setValue("feed.title", title.trim()); } Date publishedDate = entry.getPublishedDate(); if (publishedDate != null) { // filter based on the published date if (filterHoursSincePub != -1) { Calendar rightNow = Calendar.getInstance(); rightNow.add(Calendar.HOUR, -filterHoursSincePub); if (publishedDate.before(rightNow.getTime())) { LOG.info( "{} has a published date {} which is more than {} hours old", targetURL, publishedDate.toString(), filterHoursSincePub); continue; } } targetMD.setValue("feed.publishedDate", publishedDate.toString()); } SyndContent description = entry.getDescription(); if (description != null && StringUtils.isNotBlank(description.getValue())) { targetMD.setValue("feed.description", description.getValue()); } links.add(newLink); } return links; }