private DocumentPojo buildDocument(SyndEntry entry, SourcePojo source, LinkedList<String> duplicateSources)
{
    String tmpURL = this.cleanUrlStart(entry.getLink());
    // (can't return null because it is called from code which checks this)

    // Create the feed pojo
    DocumentPojo doc = new DocumentPojo();
    doc.setUrl(tmpURL);
    doc.setCreated(new Date());
    doc.setModified(new Date());

    // Strip out HTML if it is present
    if (entry.getTitle() != null) {
        doc.setTitle(entry.getTitle().replaceAll("\\<.*?\\>", "").trim());
    }
    if (entry.getDescription() != null) {
        doc.setDescription(entry.getDescription().getValue().replaceAll("\\<.*?\\>", "").trim());
    }
    if (entry.getPublishedDate() != null) {
        doc.setPublishedDate(entry.getPublishedDate());
    }
    else {
        doc.setPublishedDate(new Date());
    }

    // Clone from an existing source if we can:
    if (!duplicateSources.isEmpty() && (null == doc.getUpdateId())) { // (can't duplicate an updating document)
        doc.setDuplicateFrom(duplicateSources.getFirst());
    }

    // GeoRSS (currently does not handle <georss:circle>):
    GeoRSSModule geoRSSModule = GeoRSSUtils.getGeoRSS(entry);
    if (null != geoRSSModule) {
        if (null != geoRSSModule.getPosition()) { // <georss:point> - use the position directly
            GeoPojo gp = new GeoPojo();
            gp.lat = geoRSSModule.getPosition().getLatitude();
            gp.lon = geoRSSModule.getPosition().getLongitude();
            doc.setDocGeo(gp);
        }
        if (null != geoRSSModule.getGeometry()) {
            AbstractGeometry ag = geoRSSModule.getGeometry();
            if (ag.getClass().equals(LineString.class)) { // <georss:line> - average the points
                LineString ls = (LineString) ag;
                double latAvg = 0.0;
                double lonAvg = 0.0;
                int length = ls.getPositionList().size();
                for (int i = 0; i < length; i++) {
                    latAvg += ls.getPositionList().getLatitude(i);
                    lonAvg += ls.getPositionList().getLongitude(i);
                }
                GeoPojo gp = new GeoPojo();
                gp.lat = latAvg / length;
                gp.lon = lonAvg / length;
                doc.setDocGeo(gp);
            }
            else if (ag.getClass().equals(Polygon.class)) { // <georss:polygon> - average the exterior ring
                Polygon poly = (Polygon) ag;
                LinearRing lr = (LinearRing) poly.getExterior();
                double latAvg = 0.0;
                double lonAvg = 0.0;
                int length = lr.getPositionList().size();
                for (int i = 0; i < length; i++) {
                    latAvg += lr.getPositionList().getLatitude(i);
                    lonAvg += lr.getPositionList().getLongitude(i);
                }
                GeoPojo gp = new GeoPojo();
                gp.lat = latAvg / length;
                gp.lon = lonAvg / length;
                doc.setDocGeo(gp);
            }
            else if (ag.getClass().equals(Envelope.class)) { // <georss:box> - use the center of the box
                Envelope env = (Envelope) ag;
                GeoPojo gp = new GeoPojo();
                gp.lat = (env.getMaxLatitude() + env.getMinLatitude()) / 2;
                gp.lon = (env.getMaxLongitude() + env.getMinLongitude()) / 2;
                doc.setDocGeo(gp);
            }
        }
    } // (end if GeoRSS)

    // Arbitrary other metadata:
    if (null != entry.getForeignMarkup()) {
        JSONObject rssMetadata = new JSONObject();
        @SuppressWarnings("unchecked")
        List<Element> fms = (List<Element>) entry.getForeignMarkup();
        for (Element fm : fms) {
            try {
                JSONObject subObj = XML.toJSONObject(new XMLOutputter().outputString(fm));
                if (1 == subObj.length()) {
                    for (String name : JSONObject.getNames(subObj)) {
                        rssMetadata.put(name, subObj.get(name));
                    }
                }
                else { // (this will never happen in practice?)
                    rssMetadata.put(fm.getName(), subObj);
                }
            }
            catch (JSONException e) {} // (do nothing, just carry on)
        }
        if (!fms.isEmpty()) {
            doc.addToMetadata("_FEED_METADATA_", XmlToMetadataParser.convertJsonObjectToLinkedHashMap(rssMetadata));
        }
    }
    // TESTED (longs converted to string, eg edgar:assistantDirector from "http.www.sec.gov.archives.edgar.usgaap.rss.xml")

    return doc;
}
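// Illustrative sketch, not part of the original class: the <georss:line> and
// <georss:polygon> branches in buildDocument() above run the same centroid
// computation over a PositionList (the ROME GeoRSS type already used there).
// A hypothetical helper like this (the name centroidOf is ours) could factor
// that out; like the original code, it assumes a non-empty position list.
private static GeoPojo centroidOf(PositionList positions) {
    double latAvg = 0.0;
    double lonAvg = 0.0;
    int length = positions.size();
    for (int i = 0; i < length; i++) {
        latAvg += positions.getLatitude(i);
        lonAvg += positions.getLongitude(i);
    }
    GeoPojo gp = new GeoPojo();
    gp.lat = latAvg / length; // simple arithmetic mean of the vertices
    gp.lon = lonAvg / length; // (note: naive across the antimeridian, like the original)
    return gp;
}
// Hypothetical usage from the <georss:line> branch: doc.setDocGeo(centroidOf(ls.getPositionList()));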
// Build the feed list
@SuppressWarnings("unchecked")
private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source)
{
    // If there's a max number of docs to get per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms = props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
        if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
            nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
        }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
        nMaxDocs = nMaxTime_ms / nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit - take the smaller of the two)
        nMaxDocs = nMaxDocsPerSource;
    }
    // (end per-feed configuration)

    // Aggregate the feeds' entries into a single list:
    List<SyndEntry> tmpList = null;
    boolean bCreatedAggregateList = false;
    int nRealSyndEntries = 0;

    for (SyndFeed feed : syndFeeds) {
        if (0 == nRealSyndEntries) {
            tmpList = feed.getEntries();
        }
        else if (!bCreatedAggregateList) {
            bCreatedAggregateList = true;
            tmpList = new LinkedList<SyndEntry>(tmpList);
            tmpList.addAll(feed.getEntries());
        }
        else {
            tmpList.addAll(feed.getEntries());
        }
        nRealSyndEntries += feed.getEntries().size();
    }
    if (null == tmpList) {
        tmpList = new LinkedList<SyndEntry>();
    }
    // TESTED

    // Add extra docs (specified as individual URLs rather than RSS feeds):
    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
        for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
            if (null == extraUrl.title) {
                continue; // (this is an RSS feed, not a URL)
            }
            // TESTED
            SyndEntryImpl synd = new SyndEntryImpl();
            synd.setLink(extraUrl.url);
            if (null != extraUrl.description) {
                SyndContentImpl description = new SyndContentImpl();
                description.setValue(extraUrl.description);
                synd.setDescription(description);
            }
            synd.setTitle(extraUrl.title);
            if (null != extraUrl.publishedDate) {
                try {
                    synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
                }
                catch (Exception e) {} // (do nothing - will use "now" as the pub date)
            }
            tmpList.add((SyndEntry) synd);
            if (null != extraUrl.fullText) {
                SyndFeedImpl fullTextContainer = new SyndFeedImpl();
                fullTextContainer.setDescription(extraUrl.fullText);
                synd.setSource(fullTextContainer);
            }
        }
    }

    // Then begin looping over entries:
    LinkedList<String> duplicateSources = new LinkedList<String>();
    try {
        Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
        int nSyndEntries = 0;
        for (SyndEntry entry : tmpList) {
            nSyndEntries++; // (keep count so we know when we're accessing our own fake SyndEntryImpls)

            if (null != entry.getLink()) { // (if the URL is null, skip this entry)

                // (sometimes the URL has spurious characters in front of the "http" - cleanUrlStart removes them)
                String url = this.cleanUrlStart(entry.getLink());

                if (null != source.getRssConfig()) { // Some RSS-specific logic:
                    // If an include is specified, the URL must match it
                    Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
                    if (null != includeMatcher) {
                        if (!includeMatcher.find()) {
                            continue;
                        }
                    }
                    // If an exclude is specified, the URL must not match it
                    Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
                    if (null != excludeMatcher) {
                        if (excludeMatcher.find()) {
                            continue;
                        }
                    }
                }

                // Some error checking:
                this.nTmpDocsSubmitted++;
                if (null == url) {
                    this.nTmpHttpErrors++;
                    continue;
                }

                // Also save the title and description:
                String title = "";
                if (null != entry.getTitle()) {
                    title = entry.getTitle();
                }
                String desc = "";
                if (null != entry.getDescription()) {
                    desc = entry.getDescription().getValue();
                }
                boolean duplicate = false;

                // Look for duplicates within the current set of entries:
                List<SyndEntry> possDups = null;
                if (null == (possDups = urlDups.get(url))) { // (new URL)
                    possDups = new LinkedList<SyndEntry>();
                    possDups.add(entry);
                    urlDups.put(url, possDups);
                }
                else { // (existing URL - check if this is a duplicate...)
                    for (SyndEntry possDup : possDups) {
                        if (possDup.getTitle().equals(title)
                            || ((null != possDup.getDescription())
                                && possDup.getDescription().getValue().equals(desc))
                            || ((null != possDup.getDescription()) && (null == entry.getDescription())))
                        {
                            // If *either* the title or the description matches, as well as the URL...
                            duplicate = true;
                            break;
                        }
                    }
                    if (!duplicate) {
                        possDups.add(entry);
                    }
                    else { // DUPLICATE: ensure we keep a minimal set of entries covering all titles/descriptions:
                        boolean bTitleMatch = false;
                        boolean bDescMatch = false;
                        for (SyndEntry possDup : possDups) {
                            if (!bTitleMatch && possDup.getTitle().equals(title)) {
                                // (don't bother if we already have a title match)
                                bTitleMatch = true;
                            }
                            else if (!bDescMatch) { // (don't yet have a desc match)
                                if (null != entry.getDescription()) {
                                    if (null != possDup.getDescription()) { // (neither desc is null)
                                        if (possDup.getDescription().getValue().equals(desc)) {
                                            bDescMatch = true;
                                        }
                                    }
                                }
                                else { // (curr desc is null)
                                    if (null == possDup.getDescription()) { // (dup desc is also null)
                                        bDescMatch = true;
                                    }
                                }
                            } // (end title match/desc match/both-descs-null cases)
                            if (bTitleMatch && bDescMatch) {
                                break;
                            }
                        } // (end loop over dups)
                        if (!bTitleMatch || !bDescMatch) {
                            possDups.add(entry);
                        }
                    } // (end is-duplicate case - nasty logic to keep a minimal set covering all titles/descs)
                }
                if (duplicate) {
                    continue;
                }

                try {
                    // Now check for duplicates against previously harvested documents:
                    DuplicateManager qr = _context.getDuplicateManager();
                    if (null != entry.getDescription()) {
                        duplicate = qr.isDuplicate_UrlTitleDescription(url,
                                title.replaceAll("\\<.*?\\>", "").trim(),
                                desc.replaceAll("\\<.*?\\>", "").trim(),
                                source, duplicateSources);
                    }
                    else {
                        duplicate = qr.isDuplicate_UrlTitleDescription(url,
                                title.replaceAll("\\<.*?\\>", "").trim(),
                                null, source, duplicateSources);
                        // ^^^ (this differs from isDuplicate_UrlTitle because it enforces that the
                        //      description be null, vs just checking the title)
                    }
                    if (duplicate && (null != source.getRssConfig())
                            && (null != source.getRssConfig().getUpdateCycle_secs()))
                    {
                        // Check modified times...
                        Date dupModDate = qr.getLastDuplicateModifiedTime();
                        ObjectId dupId = qr.getLastDuplicateId();
                        if ((null != dupModDate) && (null != dupId)) {
                            if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs() * 1000L < nNow) {
                                // (1000L avoids int overflow for very long update cycles)
                                DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                                if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                                    // (use the dummy SyndFeed "source" to create a "fake" full-text block)
                                    doc.setFullText(entry.getSource().getDescription());
                                }
                                doc.setUpdateId(dupId); // (set _id of the document I'm going to overwrite)
                                this.docsToUpdate.add(doc);
                                if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                                    source.setReachedMaxDocs();
                                    break; // (that's enough documents)
                                }
                            }
                        }
                    }
                    // TESTED (duplicates we update instead of ignoring)

                    if (!duplicate) {
                        DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                        if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                            // (use the dummy SyndFeed "source" to create a "fake" full-text block)
                            doc.setFullText(entry.getSource().getDescription());
                        }
                        this.docsToAdd.add(doc);
                        if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                            source.setReachedMaxDocs();
                            break; // (that's enough documents)
                        }
                    }
                    if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
                        if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
                            break; // (every URL so far has errored - give up on this source)
                        }
                    }
                }
                catch (Exception e) {
                    // If an exception occurs, log the error
                    logger.error("Exception Message: " + e.getMessage(), e);
                }
            }
        } // (end loop over entries in the syndicate)
    }
    catch (Exception e) {
        // If an exception occurs, log the error
        logger.error("Exception Message: " + e.getMessage(), e);
    }
}
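// Illustrative sketch, not part of the original class: the per-source document
// cap computed at the top of buildFeedList(), expressed standalone (the name
// computeMaxDocs is ours). E.g. with nMaxTime_ms = 600000 and nWaitTime_ms = 1000,
// up to 600 docs fit in the time budget, further capped by nMaxDocsPerSource.
private static long computeMaxDocs(long nMaxTime_ms, long nWaitTime_ms, int nMaxDocsPerSource) {
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
        nMaxDocs = nMaxTime_ms / nWaitTime_ms; // docs that fit in the per-feed time budget
    }
    return Math.min(nMaxDocs, nMaxDocsPerSource); // take the smaller of the two limits
}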