private void handleRssError(Exception e, SourcePojo source) { // Error handling: // - If it's a 500 or 502 or 503 or 504 then just log and carry on // - Otherwise, if you get the same message twice in succession then error out boolean bSuspendSource = false; String sNewMessage = e.getMessage(); if (null != sNewMessage) { if (sNewMessage.matches(".*50[0234].*")) { // Do nothing, this is just a temporary error } else if (null != source.getHarvestStatus()) { String sOldMessage = source.getHarvestStatus().getHarvest_message(); if ((null != sOldMessage) && sOldMessage.equals(sNewMessage)) { bSuspendSource = true; } } } _context .getHarvestStatus() .update(source, new Date(), HarvestEnum.error, sNewMessage, bSuspendSource, false); // If an exception occurs log the error logger.error("Exception Message: " + e.getMessage(), e); }
// Process the feed private void processFeed(SourcePojo source) throws Exception { // Process the feed LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>(); boolean bExtraUrls = (null == source.getUrl()); if ((null != source.getUrl()) && ((null == source.getRssConfig()) || (null == source.getRssConfig().getSearchConfig()))) { // (if the second clause is false, the URL is a search query, will process differently, inside // buildFeedList) SyndFeed feed = getFeed(source, null); if (null != feed) { feeds.add(feed); } } else if ((null != source.getRssConfig()) && (null != source.getRssConfig().getSearchConfig())) { FeedHarvester_searchEngineSubsystem searchEngineSubsystem = new FeedHarvester_searchEngineSubsystem(); searchEngineSubsystem.generateFeedFromSearch(source, _context); bExtraUrls = true; } // TESTED if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls()) && (null == source.getRssConfig().getSearchConfig())) { // Some of these might be RSS feeds, check if title==null for (ExtraUrlPojo url : source.getRssConfig().getExtraUrls()) { if ((null == url.title) && (null != url.url)) { SyndFeed feed = getFeed(source, url.url); if (null != feed) { feeds.add(feed); } } } } // TESTED if (!feeds.isEmpty() || bExtraUrls) // (second case: also have extra URLs) { // Error handling, part 1: this.nTmpHttpErrors = 0; this.nTmpDocsSubmitted = 0; // Extract the feed and place into the pojo try { buildFeedList(feeds, source); } catch (Exception e) { // Propagate upwards: throw e; } // Error handling part 2: // clean up if ((nTmpHttpErrors == this.nTmpDocsSubmitted) && (this.nTmpDocsSubmitted > 5)) { // any time when all a decent number of feeds are errors logger.error( "Source generates only invalid feeds: " + " http_errs=" + nTmpHttpErrors + " source=" + source.getUrl()); if (this.nTmpDocsSubmitted < 20) { // harvested unsucessfully, post in mongo _context .getHarvestStatus() .update( source, new Date(), HarvestEnum.error, "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors, true, false); } else { // harvested unsucessfully, post in mongo *AND DISABLE* _context .getHarvestStatus() .update( source, new Date(), HarvestEnum.error, "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors, true, true); } } else { // harvested successfully, post in mongo _context .getHarvestStatus() .update(source, new Date(), HarvestEnum.in_progress, "", false, false); } } }