private void handleRssError(Exception e, SourcePojo source) {
  // Error handling:
  // - If it's a 500 or 502 or 503 or 504 then just log and carry on
  // - Otherwise, if you get the same message twice in succession then error out
  boolean bSuspendSource = false;
  String sNewMessage = e.getMessage();
  if (null != sNewMessage) {
    if (sNewMessage.matches(".*50[0234].*")) {
      // Do nothing, this is just a temporary error
    } else if (null != source.getHarvestStatus()) {
      String sOldMessage = source.getHarvestStatus().getHarvest_message();
      if ((null != sOldMessage) && sOldMessage.equals(sNewMessage)) {
        bSuspendSource = true;
      }
    }
  }
  _context
      .getHarvestStatus()
      .update(source, new Date(), HarvestEnum.error, sNewMessage, bSuspendSource, false);

  // If an exception occurs log the error
  logger.error("Exception Message: " + e.getMessage(), e);
}
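// A minimal sketch (not part of the original class) of the transient-error test used in
// handleRssError above. It assumes the exception message contains the HTTP status code, as
// java.net.HttpURLConnection-style messages typically do (e.g. "...response code: 503 for URL: ...").
// The helper name isTransientHttpError is hypothetical.
private static boolean isTransientHttpError(String message) {
  // 500/502/503/504 anywhere in the message => temporary server-side error, just log and carry on
  return (null != message) && message.matches(".*50[0234].*");
}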
// Build the feed list
@SuppressWarnings("unchecked")
private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) {
  // If there's a max number of docs to get per source per harvest, configure that here:
  long nWaitTime_ms = props.getWebCrawlWaitTime();
  long nMaxTime_ms = props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
  int nMaxDocsPerSource = props.getMaxDocsPerSource();
  long nNow = new Date().getTime();
  if (null != source.getRssConfig()) {
    if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
      nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
    }
  }
  long nMaxDocs = Long.MAX_VALUE;
  if (nWaitTime_ms > 0) {
    nMaxDocs = nMaxTime_ms / nWaitTime_ms;
  }
  if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
    nMaxDocs = nMaxDocsPerSource;
  }
  // (end per feed configuration)

  // Add extra docs
  List<SyndEntry> tmpList = null;
  boolean bCreatedAggregateList = false;
  int nRealSyndEntries = 0;
  for (SyndFeed feed : syndFeeds) {
    if (0 == nRealSyndEntries) {
      tmpList = feed.getEntries();
    } else if (!bCreatedAggregateList) {
      bCreatedAggregateList = true;
      tmpList = new LinkedList<SyndEntry>(tmpList);
      tmpList.addAll(feed.getEntries());
    } else {
      tmpList.addAll(feed.getEntries());
    }
    nRealSyndEntries += feed.getEntries().size();
  }
  if (null == tmpList) {
    tmpList = new LinkedList<SyndEntry>();
  } // TESTED

  if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
    for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
      if (null == extraUrl.title) {
        continue; // (this is an RSS feed, not a URL)
      } // TESTED
      SyndEntryImpl synd = new SyndEntryImpl();
      synd.setLink(extraUrl.url);
      if (null != extraUrl.description) {
        SyndContentImpl description = new SyndContentImpl();
        description.setValue(extraUrl.description);
        synd.setDescription(description);
      }
      synd.setTitle(extraUrl.title);
      if (null != extraUrl.publishedDate) {
        try {
          synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
        } catch (Exception e) {
          // (do nothing, "now" will be used as the publication date)
        }
      }
      tmpList.add((SyndEntry) synd);
      if (null != extraUrl.fullText) {
        SyndFeedImpl fullTextContainer = new SyndFeedImpl();
        fullTextContainer.setDescription(extraUrl.fullText);
        synd.setSource(fullTextContainer);
      }
    }
  }

  // Then begin looping over entries
  LinkedList<String> duplicateSources = new LinkedList<String>();
  try {
    Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
    int nSyndEntries = 0;
    for (Object synd : tmpList) {
      nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)
      final SyndEntry entry = (SyndEntry) synd;

      if (null != entry.getLink()) // if url returns null, skip this entry
      {
        String url = this.cleanUrlStart(entry.getLink());

        if (null != source.getRssConfig()) { // Some RSS-specific logic
          // If an include is specified, must match
          Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
          if (null != includeMatcher) {
            if (!includeMatcher.find()) {
              continue;
            }
          }
          // If an exclude is specified, must not match
          Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
          if (null != excludeMatcher) {
            if (excludeMatcher.find()) {
              continue;
            }
          }
        }

        // Some error checking:
        // sometimes the URL seems to have some characters in front of the "http" - remove these
        this.nTmpDocsSubmitted++;
        if (null == url) {
          this.nTmpHttpErrors++;
          continue;
        }

        // Also save the title and description:
        String title = "";
        if (null != entry.getTitle()) {
          title = entry.getTitle();
        }
        String desc = "";
        if (null != entry.getDescription()) {
          desc = entry.getDescription().getValue();
        }

        boolean duplicate = false;
        // Look for duplicates within the current set of sources
        List<SyndEntry> possDups = null;
        if (null == (possDups = urlDups.get(url))) {
          // (new URL)
          possDups = new LinkedList<SyndEntry>();
          possDups.add(entry);
          urlDups.put(url, possDups);
        } else {
          // (old URL, check if this is a duplicate...)
          int nCount = 0;
          for (SyndEntry possDup : possDups) {
            if (possDup.getTitle().equals(title)
                || ((null != possDup.getDescription())
                    && possDup.getDescription().getValue().equals(desc))
                || ((null != possDup.getDescription()) && (null == entry.getDescription()))) {
              // If *either* the title or the description matches, as well as the URL...
              duplicate = true;
              break;
            }
            nCount++;
          }
          if (!duplicate) {
            possDups.add(entry);
          } else {
            // DUPLICATE: ensure we have a minimal set of data to cover all cases:
            boolean bTitleMatch = false;
            boolean bDescMatch = false;
            for (SyndEntry possDup : possDups) {
              if (!bTitleMatch && possDup.getTitle().equals(title)) {
                // (don't bother if we already have a title match)
                bTitleMatch = true;
              } else if (!bDescMatch) { // (don't yet have a desc match)
                if (null != entry.getDescription()) {
                  if (null != possDup.getDescription()) { // (neither desc is null)
                    if (possDup.getDescription().getValue().equals(desc)) {
                      bDescMatch = true;
                    }
                  }
                } else { // curr desc is null
                  if (null == possDup.getDescription()) { // dup desc is null
                    bDescMatch = true;
                  }
                } // (end various title match/desc match/both have no desc cases)
              } // (end if no desc match)
              if (bTitleMatch && bDescMatch) {
                break; // (both already matched, nothing more can fire)
              }
            } // (end loop over dups)
            if (!bTitleMatch || !bDescMatch) {
              possDups.add(entry);
            }
          } // (end is duplicate, nasty logic to add minimal set to dup list to cover all titles, descs)
        }
        if (duplicate) {
          continue;
        }

        try {
          DuplicateManager qr = _context.getDuplicateManager();
          if (null != entry.getDescription()) {
            duplicate =
                qr.isDuplicate_UrlTitleDescription(
                    url,
                    title.replaceAll("\\<.*?\\>", "").trim(),
                    desc.replaceAll("\\<.*?\\>", "").trim(),
                    source,
                    duplicateSources);
          } else {
            duplicate =
                qr.isDuplicate_UrlTitleDescription(
                    url, title.replaceAll("\\<.*?\\>", "").trim(), null, source, duplicateSources);
            // ^^^ (this is different to isDuplicate_UrlTitle because it enforces that the
            // description be null, vs just checking the title)
          }
          if (duplicate
              && (null != source.getRssConfig())
              && (null != source.getRssConfig().getUpdateCycle_secs())) {
            // Check modified times...
            Date dupModDate = qr.getLastDuplicateModifiedTime();
            ObjectId dupId = qr.getLastDuplicateId();

            if ((null != dupModDate) && (null != dupId)) {
              if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs() * 1000 < nNow) {
                DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                  // (Use dummy TitleEx to create a "fake" full text block)
                  doc.setFullText(entry.getSource().getDescription());
                }
                doc.setUpdateId(dupId); // (set _id to the document I'm going to overwrite)
                this.docsToUpdate.add(doc);
                if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                  source.setReachedMaxDocs();
                  break; // (that's enough documents)
                }
              }
            }
          } // TESTED (duplicates we update instead of ignoring)

          if (!duplicate) {
            DocumentPojo doc = buildDocument(entry, source, duplicateSources);
            if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
              // (Use dummy TitleEx to create a "fake" full text block)
              doc.setFullText(entry.getSource().getDescription());
            }
            this.docsToAdd.add(doc);
            if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
              source.setReachedMaxDocs();
              break; // (that's enough documents)
            }
          }
          if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
            if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
              break;
            }
          }
        } catch (Exception e) {
          // If an exception occurs log the error
          logger.error("Exception Message: " + e.getMessage(), e);
        }
      }
    } // (end loop over the entries in the feeds)
  } catch (Exception e) {
    // If an exception occurs log the error
    logger.error("Exception Message: " + e.getMessage(), e);
  }
}
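// A minimal sketch (hypothetical helper, not part of the original class) of the per-source
// document cap computed at the top of buildFeedList: the smaller of (max time allowed per feed
// divided by the wait time between requests) and the configured per-source maximum.
private static long computeMaxDocs(long nMaxTime_ms, long nWaitTime_ms, int nMaxDocsPerSource) {
  long nMaxDocs = Long.MAX_VALUE;
  if (nWaitTime_ms > 0) {
    nMaxDocs = nMaxTime_ms / nWaitTime_ms; // can't fetch more docs than fit in the time budget
  }
  return Math.min(nMaxDocs, nMaxDocsPerSource); // (another limit, take the smaller of the 2)
}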
// Process the feed
private void processFeed(SourcePojo source) throws Exception {
  // Process the feed
  LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>();
  boolean bExtraUrls = (null == source.getUrl());

  if ((null != source.getUrl())
      && ((null == source.getRssConfig()) || (null == source.getRssConfig().getSearchConfig()))) {
    // (if the second clause is false, the URL is a search query and will be processed differently,
    // inside buildFeedList)
    SyndFeed feed = getFeed(source, null);
    if (null != feed) {
      feeds.add(feed);
    }
  } else if ((null != source.getRssConfig()) && (null != source.getRssConfig().getSearchConfig())) {
    FeedHarvester_searchEngineSubsystem searchEngineSubsystem =
        new FeedHarvester_searchEngineSubsystem();
    searchEngineSubsystem.generateFeedFromSearch(source, _context);
    bExtraUrls = true;
  } // TESTED

  if ((null != source.getRssConfig())
      && (null != source.getRssConfig().getExtraUrls())
      && (null == source.getRssConfig().getSearchConfig())) {
    // Some of these might be RSS feeds, check if title==null
    for (ExtraUrlPojo url : source.getRssConfig().getExtraUrls()) {
      if ((null == url.title) && (null != url.url)) {
        SyndFeed feed = getFeed(source, url.url);
        if (null != feed) {
          feeds.add(feed);
        }
      }
    }
  } // TESTED

  if (!feeds.isEmpty() || bExtraUrls) // (second case: also have extra URLs)
  {
    // Error handling, part 1:
    this.nTmpHttpErrors = 0;
    this.nTmpDocsSubmitted = 0;

    // Extract the feed and place into the pojo
    try {
      buildFeedList(feeds, source);
    } catch (Exception e) {
      // Propagate upwards:
      throw e;
    }

    // Error handling, part 2: clean up
    if ((nTmpHttpErrors == this.nTmpDocsSubmitted) && (this.nTmpDocsSubmitted > 5)) {
      // (i.e. a decent number of documents were attempted and every one of them was an error)
      logger.error(
          "Source generates only invalid feeds: "
              + " http_errs=" + nTmpHttpErrors
              + " source=" + source.getUrl());

      if (this.nTmpDocsSubmitted < 20) {
        // harvested unsuccessfully, post in mongo
        _context
            .getHarvestStatus()
            .update(
                source,
                new Date(),
                HarvestEnum.error,
                "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors,
                true,
                false);
      } else {
        // harvested unsuccessfully, post in mongo *AND DISABLE*
        _context
            .getHarvestStatus()
            .update(
                source,
                new Date(),
                HarvestEnum.error,
                "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors,
                true,
                true);
      }
    } else {
      // harvested successfully, post in mongo
      _context
          .getHarvestStatus()
          .update(source, new Date(), HarvestEnum.in_progress, "", false, false);
    }
  }
}
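// A minimal sketch (hypothetical names, not part of the original class) of the post-harvest error
// heuristic used at the end of processFeed above: a source is only flagged when every submitted
// document failed and more than a handful were attempted; with 20 or more failures it is disabled
// outright rather than just suspended.
private enum SourceErrorAction {
  NONE,
  SUSPEND,
  DISABLE
}

private static SourceErrorAction classifyHarvestErrors(int nDocsSubmitted, int nHttpErrors) {
  if ((nHttpErrors != nDocsSubmitted) || (nDocsSubmitted <= 5)) {
    return SourceErrorAction.NONE; // some docs succeeded, or too few attempts to judge
  }
  return (nDocsSubmitted < 20) ? SourceErrorAction.SUSPEND : SourceErrorAction.DISABLE;
}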