Exemplo n.º 1
0
  /**
   * Builds a new {@link DocumentPojo} from a single syndication entry.
   *
   * <p>The document receives the cleaned entry URL, created/modified timestamps of "now", the
   * title and description with anything matching the {@code <...>} pattern removed, and the
   * entry's published date (or "now" when the entry has none). If the entry carries GeoRSS data,
   * it is reduced to a single lat/lon point; any foreign markup is converted to JSON and stored
   * under the {@code _FEED_METADATA_} metadata key.
   *
   * @param entry the feed entry to convert; its link is dereferenced without a null check, so the
   *     caller must have verified it
   * @param source not read by this method
   * @param duplicateSources if non-empty (and the new document has no update id), its first
   *     element is recorded on the document via {@code setDuplicateFrom}
   * @return the newly created document (never null)
   */
  private DocumentPojo buildDocument(
      SyndEntry entry, SourcePojo source, LinkedList<String> duplicateSources) {

    // (can't return null because called from code which checks this)
    String tmpURL = this.cleanUrlStart(entry.getLink());

    // create the feed pojo
    DocumentPojo doc = new DocumentPojo();

    doc.setUrl(tmpURL);
    doc.setCreated(new Date());
    doc.setModified(new Date());

    // Strip out html if it is present
    if (null != entry.getTitle()) {
      doc.setTitle(entry.getTitle().replaceAll("\\<.*?\\>", "").trim());
    }
    // (a description element can exist without a value, so check both before stripping)
    if ((null != entry.getDescription()) && (null != entry.getDescription().getValue())) {
      doc.setDescription(entry.getDescription().getValue().replaceAll("\\<.*?\\>", "").trim());
    }
    if (null != entry.getPublishedDate()) {
      doc.setPublishedDate(entry.getPublishedDate());
    } else {
      doc.setPublishedDate(new Date());
    }

    // Clone from an existing source if we can:
    if (!duplicateSources.isEmpty()
        && (null == doc.getUpdateId())) { // (can't duplicate updating document)
      doc.setDuplicateFrom(duplicateSources.getFirst());
    }

    // GeoRSS
    GeoRSSModule geoRSSModule =
        GeoRSSUtils.getGeoRSS(entry); // currently does not handle <georss:circle>
    if (null != geoRSSModule) {
      if (null != geoRSSModule.getPosition()) {
        doc.setDocGeo(
            newGeoPoint(
                geoRSSModule.getPosition().getLatitude(),
                geoRSSModule.getPosition().getLongitude()));
      }
      // A usable geometry takes precedence over the plain position set above
      GeoPojo geometryPoint = geometryToPoint(geoRSSModule.getGeometry());
      if (null != geometryPoint) {
        doc.setDocGeo(geometryPoint);
      }
    } // end if GeoRSS

    // Arbitrary other metadata:

    if (null != entry.getForeignMarkup()) {
      JSONObject rssMetadata = new JSONObject();
      XMLOutputter xmlOutputter = new XMLOutputter();

      @SuppressWarnings("unchecked")
      List<Element> fms = (List<Element>) entry.getForeignMarkup();
      for (Element fm : fms) {
        try {
          JSONObject subObj = XML.toJSONObject(xmlOutputter.outputString(fm));
          if (1 == subObj.length()) {
            // Single top-level key: hoist it straight into the metadata object
            for (String name : JSONObject.getNames(subObj)) {
              rssMetadata.put(name, subObj.get(name));
            }
          } else { // (this will never happen in practice?)
            rssMetadata.put(fm.getName(), subObj);
          }
        } catch (JSONException ignored) {
          // Best effort: an element that cannot be converted is skipped, the rest carry on
        }
      }
      if (!fms.isEmpty()) {
        doc.addToMetadata(
            "_FEED_METADATA_", XmlToMetadataParser.convertJsonObjectToLinkedHashMap(rssMetadata));
      }
    } // TESTED (longs converted to string, eg edgar:assistantDirector from
      // "http.www.sec.gov.archives.edgar.usgaap.rss.xml")

    return doc;
  }

  /** Creates a {@link GeoPojo} holding the given latitude and longitude. */
  private GeoPojo newGeoPoint(double lat, double lon) {
    GeoPojo gp = new GeoPojo();
    gp.lat = lat;
    gp.lon = lon;
    return gp;
  }

  /**
   * Reduces a GeoRSS geometry to one representative point: the arithmetic mean of the positions
   * of a line or of a polygon's exterior ring, or the midpoint of a box.
   *
   * @param geometry the geometry to reduce; may be null
   * @return the representative point, or null if the geometry is null, of an unhandled type, has
   *     no usable exterior ring, or contains no positions (avoids storing NaN coordinates)
   */
  private GeoPojo geometryToPoint(AbstractGeometry geometry) {
    if (geometry instanceof LineString) { // <georss:line>
      LineString ls = (LineString) geometry;
      if (null == ls.getPositionList()) {
        return null;
      }
      int length = ls.getPositionList().size();
      if (0 == length) {
        return null;
      }
      double latSum = 0.0;
      double lonSum = 0.0;
      for (int i = 0; i < length; i++) {
        latSum += ls.getPositionList().getLatitude(i);
        lonSum += ls.getPositionList().getLongitude(i);
      }
      return newGeoPoint(latSum / length, lonSum / length);
    }
    if (geometry instanceof Polygon) { // <georss:polygon>
      AbstractRing ar = ((Polygon) geometry).getExterior();
      if (!(ar instanceof LinearRing)) {
        return null; // (no exterior ring, or a ring type we can't average)
      }
      LinearRing lr = (LinearRing) ar;
      if (null == lr.getPositionList()) {
        return null;
      }
      int length = lr.getPositionList().size();
      if (0 == length) {
        return null;
      }
      double latSum = 0.0;
      double lonSum = 0.0;
      for (int i = 0; i < length; i++) {
        latSum += lr.getPositionList().getLatitude(i);
        lonSum += lr.getPositionList().getLongitude(i);
      }
      return newGeoPoint(latSum / length, lonSum / length);
    }
    if (geometry instanceof Envelope) { // <georss:box>
      Envelope env = (Envelope) geometry;
      return newGeoPoint(
          (env.getMaxLatitude() + env.getMinLatitude()) / 2,
          (env.getMaxLongitude() + env.getMinLongitude()) / 2);
    }
    return null;
  }
Exemplo n.º 2
0
  /**
   * Builds the lists of documents to add and to update from the entries of the supplied feeds
   * plus any titled "extra URLs" configured on the source.
   *
   * <p>For each entry with a link, the method cleans the URL, applies the source's include and
   * exclude matchers, discards duplicates within this batch, and then asks the duplicate manager
   * whether the document is already known. New entries are appended to {@code docsToAdd}. Known
   * entries are appended to {@code docsToUpdate} only when the source has an update cycle and the
   * stored copy is older than that cycle. Processing stops once the combined size of both lists
   * reaches the per-harvest document limit.
   *
   * <p>Exceptions are logged rather than propagated: one thrown while checking or building a
   * single entry skips that entry, any other one ends the loop.
   *
   * @param syndFeeds the feeds to process; their entries are copied into a working list, so the
   *     feeds' own entry lists are left unmodified
   * @param source the source being harvested; its RSS config (if any) supplies the wait-time
   *     override, extra URLs, include/exclude matchers and update cycle
   */
  @SuppressWarnings("unchecked")
  private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) {
    // If there's a max number of sources to get per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms =
        props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
      nMaxDocs = nMaxTime_ms / nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
      nMaxDocs = nMaxDocsPerSource;
    }
    // (end per feed configuration)

    // Gather every real feed entry into a private working list, so that the fake entries added
    // below never end up inside one of the caller's feeds
    List<SyndEntry> tmpList = new LinkedList<SyndEntry>();
    for (SyndFeed feed : syndFeeds) {
      tmpList.addAll(feed.getEntries());
    }
    int nRealSyndEntries = tmpList.size();

    // Add extra docs
    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
      for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
        if (null == extraUrl.title) {
          continue; // (this is an RSS feed not a URL)
        } // TESTED
        SyndEntryImpl synd = new SyndEntryImpl();
        synd.setLink(extraUrl.url);
        if (null != extraUrl.description) {
          SyndContentImpl description = new SyndContentImpl();
          description.setValue(extraUrl.description);
          synd.setDescription(description);
        }
        synd.setTitle(extraUrl.title);
        if (null != extraUrl.publishedDate) {
          try {
            synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
          } catch (Exception ignored) {
            // Best effort: leave the date unset when it cannot be parsed
          }
        }
        if (null != extraUrl.fullText) {
          // The full text rides along in the description of a dummy "source" feed
          SyndFeedImpl fullTextContainer = new SyndFeedImpl();
          fullTextContainer.setDescription(extraUrl.fullText);
          synd.setSource(fullTextContainer);
        }
        tmpList.add(synd);
      }
    }

    // Then begin looping over entries

    LinkedList<String> duplicateSources = new LinkedList<String>();
    try {
      Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
      int nSyndEntries = 0;
      for (SyndEntry entry : tmpList) {
        nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)

        if (null == entry.getLink()) { // if url returns null, skip this entry
          continue;
        }
        String url = this.cleanUrlStart(entry.getLink());

        // Some error checking:
        // sometimes the URL seems to have some characters in front of the HTTP - remove these
        // (checked before the include/exclude matchers so they never see a null URL)
        if (null == url) {
          this.nTmpDocsSubmitted++;
          this.nTmpHttpErrors++;
          continue;
        }

        if (null != source.getRssConfig()) { // Some RSS specific logic
          // If an include is specified, must match
          Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
          if ((null != includeMatcher) && !includeMatcher.find()) {
            continue;
          }
          // If an exclude is specified, must not match
          Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
          if ((null != excludeMatcher) && excludeMatcher.find()) {
            continue;
          }
        }
        this.nTmpDocsSubmitted++;

        // Also save the title and description (null is normalised to the empty string):
        String title = emptyIfNull(entry.getTitle());
        String desc = "";
        if (null != entry.getDescription()) {
          desc = emptyIfNull(entry.getDescription().getValue());
        }

        // Look for duplicates within the current set of sources
        if (isBatchDuplicate(urlDups, url, entry, title, desc)) {
          continue;
        }

        try {
          DuplicateManager qr = _context.getDuplicateManager();
          String cleanTitle = title.replaceAll("\\<.*?\\>", "").trim();
          // (a null description is different to isDuplicate_UrlTitle because it enforces that the
          // description be null, vs just checking the title)
          String cleanDesc = null;
          if (null != entry.getDescription()) {
            cleanDesc = desc.replaceAll("\\<.*?\\>", "").trim();
          }
          boolean duplicate =
              qr.isDuplicate_UrlTitleDescription(
                  url, cleanTitle, cleanDesc, source, duplicateSources);

          // Entries beyond the real ones are our own fake SyndEntryImpls
          boolean bFakeEntry = nSyndEntries > nRealSyndEntries;

          if (duplicate
              && (null != source.getRssConfig())
              && (null != source.getRssConfig().getUpdateCycle_secs())) {
            // Check modified times...
            Date dupModDate = qr.getLastDuplicateModifiedTime();
            ObjectId dupId = qr.getLastDuplicateId();

            if ((null != dupModDate) && (null != dupId)) {
              // (long arithmetic: secs * 1000 overflows an int for cycles beyond ~24.8 days)
              long nUpdateCycle_ms = source.getRssConfig().getUpdateCycle_secs() * 1000L;
              if (dupModDate.getTime() + nUpdateCycle_ms < nNow) {

                DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                if (bFakeEntry && (null != entry.getSource())) {
                  // (Use dummy TitleEx to create a "fake" full text block)
                  doc.setFullText(entry.getSource().getDescription());
                }
                doc.setUpdateId(dupId); // (set _id to document I'm going to overwrite)
                this.docsToUpdate.add(doc);

                if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                  source.setReachedMaxDocs();
                  break; // (that's enough documents)
                }
              }
            }
          } // TESTED (duplicates we update instead of ignoring)

          if (!duplicate) {
            DocumentPojo doc = buildDocument(entry, source, duplicateSources);
            if (bFakeEntry && (null != entry.getSource())) {
              // (Use dummy TitleEx to create a "fake" full text block)
              doc.setFullText(entry.getSource().getDescription());
            }
            this.docsToAdd.add(doc);

            if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
              source.setReachedMaxDocs();
              break; // (that's enough documents)
            }
          }
          if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
            if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
              break;
            }
          }
        } catch (Exception e) {
          // If an exception occurs log the error
          logger.error("Exception Message: " + e.getMessage(), e);
        }
      } // (end loop over feeds in a syndicate)
    } catch (Exception e) {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }

  /** Returns the given string, or the empty string when it is null. */
  private String emptyIfNull(String s) {
    return (null == s) ? "" : s;
  }

  /**
   * Decides whether an entry duplicates one already seen for the same URL in this batch, and
   * records the entry in {@code urlDups} when it contributes a new title or description.
   *
   * <p>An entry is a duplicate when, for some stored entry with the same URL, the titles are
   * equal, or the stored entry has a description whose value equals {@code desc}, or the stored
   * entry has a description and the new entry has none. Null titles and null description values
   * are compared as empty strings.
   *
   * @param urlDups entries seen so far, keyed by cleaned URL; updated in place
   * @param url the cleaned URL of the entry (non-null)
   * @param entry the entry being examined
   * @param title the entry's title, with null already replaced by ""
   * @param desc the entry's description value, "" when absent
   * @return true if the entry is a duplicate within this batch
   */
  private boolean isBatchDuplicate(
      Map<String, List<SyndEntry>> urlDups,
      String url,
      SyndEntry entry,
      String title,
      String desc) {
    List<SyndEntry> possDups = urlDups.get(url);
    if (null == possDups) { // (new URL)
      possDups = new LinkedList<SyndEntry>();
      possDups.add(entry);
      urlDups.put(url, possDups);
      return false;
    }

    // (old URL, check if this is a duplicate...)
    boolean duplicate = false;
    for (SyndEntry possDup : possDups) {
      boolean bDupHasDesc = null != possDup.getDescription();
      if (emptyIfNull(possDup.getTitle()).equals(title)
          || (bDupHasDesc && emptyIfNull(possDup.getDescription().getValue()).equals(desc))
          || (bDupHasDesc && (null == entry.getDescription()))) {
        // If *either* the title or the description matches as well as the URL...
        duplicate = true;
        break;
      }
    }
    if (!duplicate) {
      possDups.add(entry);
      return false;
    }

    // DUPLICATE: ensure we have minimal set of data to cover all cases.
    // (A stored entry that supplies the title match is not also tested for a description match.)
    boolean bTitleMatch = false;
    boolean bDescMatch = false;
    for (SyndEntry possDup : possDups) {
      if (!bTitleMatch
          && emptyIfNull(possDup.getTitle())
              .equals(title)) { // (don't bother if already have a title match)
        bTitleMatch = true;
      } else if (!bDescMatch) { // (don't yet have a desc match)
        if (null != entry.getDescription()) {
          if (null != possDup.getDescription()) { // (neither desc is null)
            if (emptyIfNull(possDup.getDescription().getValue()).equals(desc)) {
              bDescMatch = true;
            }
          }
        } else if (null == possDup.getDescription()) { // (both descs are null)
          bDescMatch = true;
        }
      }
      if (bTitleMatch && bDescMatch) {
        break;
      }
    } // (end loop over dups)

    if (!bTitleMatch || !bDescMatch) {
      possDups.add(entry);
    }
    return true;
  }