Example #1
  private void handleRssError(Exception e, SourcePojo source) {
    // Error handling:
    // - If it's a 500, 502, 503, or 504, just log and carry on
    // - Otherwise, if you get the same message twice in succession then error out
    boolean bSuspendSource = false;
    String sNewMessage = e.getMessage();
    if (null != sNewMessage) {
      if (sNewMessage.matches(".*50[0234].*")) {
        // Do nothing, this is just a temporary error
      } else if (null != source.getHarvestStatus()) {
        String sOldMessage = source.getHarvestStatus().getHarvest_message();
        if ((null != sOldMessage) && sOldMessage.equals(sNewMessage)) {
          bSuspendSource = true;
        }
      }
    }
    _context
        .getHarvestStatus()
        .update(source, new Date(), HarvestEnum.error, sNewMessage, bSuspendSource, false);

    // If an exception occurs log the error
    logger.error("Exception Message: " + e.getMessage(), e);
  }
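The suspend-or-ignore decision above hinges on a single regex test against the exception message. A minimal standalone sketch of just that check (the class and method names here are illustrative, not part of the harvester API):

public class TransientErrorCheckSketch {
  // Messages mentioning an HTTP 500/502/503/504 status are treated as temporary,
  // so they are logged but never cause the source to be suspended.
  static boolean isTransientHttpError(String message) {
    return (null != message) && message.matches(".*50[0234].*");
  }

  public static void main(String[] args) {
    System.out.println(isTransientHttpError("Server returned HTTP response code: 503 for URL")); // true
    System.out.println(isTransientHttpError("Connection timed out"));                            // false
  }
}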
Example #2
  // Build the feed list
  @SuppressWarnings("unchecked")
  private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) {
    // If there's a max number of docs to get per source per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms =
        props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
      nMaxDocs = nMaxTime_ms / nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
      nMaxDocs = nMaxDocsPerSource;
    }
    // (end per feed configuration)

    // Aggregate the entries from all feeds, then add any extra docs
    List<SyndEntry> tmpList = null;
    boolean bCreatedAggregateList = false;
    int nRealSyndEntries = 0;

    for (SyndFeed feed : syndFeeds) {
      if (0 == nRealSyndEntries) {
        tmpList = feed.getEntries();
      } else if (!bCreatedAggregateList) {
        bCreatedAggregateList = true;
        tmpList = new LinkedList<SyndEntry>(tmpList);
        tmpList.addAll(feed.getEntries());
      } else {
        tmpList.addAll(feed.getEntries());
      }
      nRealSyndEntries += feed.getEntries().size();
    }
    if (null == tmpList) {
      tmpList = new LinkedList<SyndEntry>();
    }
    // TESTED

    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
      for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
        if (null == extraUrl.title) {
          continue; // (this is an RSS feed not a URL)
        } // TESTED
        SyndEntryImpl synd = new SyndEntryImpl();
        synd.setLink(extraUrl.url);
        if (null != extraUrl.description) {
          SyndContentImpl description = new SyndContentImpl();
          description.setValue(extraUrl.description);
          synd.setDescription(description);
        }
        synd.setTitle(extraUrl.title);
        if (null != extraUrl.publishedDate) {
          try {
            synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
          } catch (Exception e) {
          } // (do nothing, will use "now" as the pub date)
        }
        tmpList.add((SyndEntry) synd);

        if (null != extraUrl.fullText) {
          SyndFeedImpl fullTextContainer = new SyndFeedImpl();
          fullTextContainer.setDescription(extraUrl.fullText);
          synd.setSource(fullTextContainer);
        }
      }
    }

    // Then begin looping over entries

    LinkedList<String> duplicateSources = new LinkedList<String>();
    try {
      Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
      int nSyndEntries = 0;
      for (Object synd : tmpList) {
        nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)
        final SyndEntry entry = (SyndEntry) synd;

        if (null != entry.getLink()) // (if the link is null, skip this entry)
        {
          String url = this.cleanUrlStart(entry.getLink());

          if (null != source.getRssConfig()) { // Some RSS specific logic
            // If an include is specified, must match
            Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
            if (null != includeMatcher) {
              if (!includeMatcher.find()) {
                continue;
              }
            }
            // If an exclude is specified, must not match
            Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
            if (null != excludeMatcher) {
              if (excludeMatcher.find()) {
                continue;
              }
            }
          }

          // Some error checking:
          // (sometimes the URL has stray characters in front of the "http" - cleanUrlStart strips
          // these, and returns null if the URL is unusable)
          this.nTmpDocsSubmitted++;
          if (null == url) {
            this.nTmpHttpErrors++;
            continue;
          }

          // Also save the title and description:
          String title = "";
          if (null != entry.getTitle()) {
            title = entry.getTitle();
          }
          String desc = "";
          if (null != entry.getDescription()) {
            desc = entry.getDescription().getValue();
          }
          boolean duplicate = false;

          // Look for duplicates within the current set of sources
          List<SyndEntry> possDups = null;
          if (null == (possDups = urlDups.get(url))) { // (new URL)
            possDups = new LinkedList<SyndEntry>();
            possDups.add(entry);
            urlDups.put(url, possDups);
          } else { // (old URL, check if this is a duplicate...)
            int nCount = 0;
            for (SyndEntry possDup : possDups) {
              if (possDup.getTitle().equals(title)
                  || ((null != possDup.getDescription())
                      && possDup.getDescription().getValue().equals(desc))
                  || ((null != possDup.getDescription()) && (null == entry.getDescription()))) {
                // If *either* the title or the description matches as well as the URL...
                duplicate = true;
                break;
              }
              nCount++;
            }

            if (!duplicate) {
              possDups.add(entry);
            } else { // DUPLICATE: ensure we have minimal set of data to cover all cases:
              boolean bTitleMatch = false;
              boolean bDescMatch = false;
              for (SyndEntry possDup : possDups) {
                if (!bTitleMatch
                    && possDup
                        .getTitle()
                        .equals(title)) { // (don't bother if already have a title match)
                  bTitleMatch = true;
                } else if (!bDescMatch) { // (don't yet have a desc match)
                  if (null != entry.getDescription()) {
                    if (null != possDup.getDescription()) { // (neither desc is null)
                      if (possDup.getDescription().getValue().equals(desc)) {
                        bDescMatch = true;
                      }
                    }
                  } else { // curr desc is null
                    if (null == possDup.getDescription()) { // dup desc is null
                      bDescMatch = true;
                    }
                  } // (end various title match/desc match/both have no desc cases)
                } // (end if no desc match)
                if (bTitleMatch && bDescMatch) {
                  break; // (both already matched, so the add below can't fire)
                }
              } // (end loop over dups)

              if (!bTitleMatch || !bDescMatch) {
                possDups.add(entry);
              }
            } // (end is duplicate - nasty logic to add a minimal set to the dup list covering all titles/descs)
          }
          if (duplicate) {
            continue;
          }

          try {
            DuplicateManager qr = _context.getDuplicateManager();
            if (null != entry.getDescription()) {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      desc.replaceAll("\\<.*?\\>", "").trim(),
                      source,
                      duplicateSources);
            } else {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      null,
                      source,
                      duplicateSources);
              // ^^^(this is different to isDuplicate_UrlTitle because it enforces that the
              // description be null, vs just checking the title)
            }
            if (duplicate
                && (null != source.getRssConfig())
                && (null != source.getRssConfig().getUpdateCycle_secs())) {
              // Check modified times...
              Date dupModDate = qr.getLastDuplicateModifiedTime();
              ObjectId dupId = qr.getLastDuplicateId();

              if ((null != dupModDate) && (null != dupId)) {
                if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs() * 1000
                    < nNow) {

                  DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                  if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                    // (Use dummy TitleEx to create a "fake" full text block)
                    doc.setFullText(entry.getSource().getDescription());
                  }
                  doc.setUpdateId(dupId); // (set _id to document I'm going to overwrite)
                  this.docsToUpdate.add(doc);

                  if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                    source.setReachedMaxDocs();
                    break; // (that's enough documents)
                  }
                }
              }
            } // TESTED (duplicates we update instead of ignoring)

            if (!duplicate) {
              DocumentPojo doc = buildDocument(entry, source, duplicateSources);
              if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                // (Use dummy TitleEx to create a "fake" full text block)
                doc.setFullText(entry.getSource().getDescription());
              }
              this.docsToAdd.add(doc);

              if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                source.setReachedMaxDocs();
                break; // (that's enough documents)
              }
            }
            if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
              if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
                break;
              }
            }
          } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
          }
        }
      } // (end loop over entries in the feed list)
    } catch (Exception e) {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }
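The per-feed cap computed at the top of buildFeedList is simply the smaller of a time-based budget (how many wait intervals fit into the maximum time per feed) and the configured per-source document limit. A standalone sketch of just that calculation, usable from any class (the method name and parameters are illustrative, not the harvester's API):

  // Sketch of the nMaxDocs calculation from buildFeedList (illustrative names).
  static long computeMaxDocs(long waitTimeMs, long maxTimePerFeedMs, int maxDocsPerSource) {
    long maxDocs = Long.MAX_VALUE;
    if (waitTimeMs > 0) {
      maxDocs = maxTimePerFeedMs / waitTimeMs; // time budget: number of waits that fit in the per-feed window
    }
    return Math.min(maxDocs, maxDocsPerSource); // take the smaller of the two limits
  }

For example, with a 10s crawl wait, a 10-minute per-feed budget, and a 500-doc source limit, computeMaxDocs(10000L, 600000L, 500) yields 60.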
Example #3
  // Process the feed
  private void processFeed(SourcePojo source) throws Exception {
    // Build the list of feeds to process
    LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>();
    boolean bExtraUrls = (null == source.getUrl());

    if ((null != source.getUrl())
        && ((null == source.getRssConfig()) || (null == source.getRssConfig().getSearchConfig()))) {
      // (if the second clause is false, the URL is a search query and will be processed
      // differently, inside buildFeedList)

      SyndFeed feed = getFeed(source, null);
      if (null != feed) {
        feeds.add(feed);
      }
    } else if ((null != source.getRssConfig())
        && (null != source.getRssConfig().getSearchConfig())) {
      FeedHarvester_searchEngineSubsystem searchEngineSubsystem =
          new FeedHarvester_searchEngineSubsystem();
      searchEngineSubsystem.generateFeedFromSearch(source, _context);
      bExtraUrls = true;
    } // TESTED

    if ((null != source.getRssConfig())
        && (null != source.getRssConfig().getExtraUrls())
        && (null == source.getRssConfig().getSearchConfig())) {
      // Some of these might be RSS feeds, check if title==null
      for (ExtraUrlPojo url : source.getRssConfig().getExtraUrls()) {
        if ((null == url.title) && (null != url.url)) {
          SyndFeed feed = getFeed(source, url.url);
          if (null != feed) {
            feeds.add(feed);
          }
        }
      }
    } // TESTED

    if (!feeds.isEmpty() || bExtraUrls) // (second case: also have extra URLs)
    {
      // Error handling, part 1:
      this.nTmpHttpErrors = 0;
      this.nTmpDocsSubmitted = 0;

      // Extract the feed and place into the pojo
      try {
        buildFeedList(feeds, source);
      } catch (Exception e) {
        // Propagate upwards:
        throw e;
      }

      // Error handling part 2:
      // clean up
      if ((nTmpHttpErrors == this.nTmpDocsSubmitted) && (this.nTmpDocsSubmitted > 5)) {
        // (any time a decent number of feeds were submitted and all of them were errors)

        logger.error(
            "Source generates only invalid feeds: "
                + " http_errs="
                + nTmpHttpErrors
                + " source="
                + source.getUrl());

        if (this.nTmpDocsSubmitted < 20) {
          // harvested unsuccessfully, post in mongo
          _context
              .getHarvestStatus()
              .update(
                  source,
                  new Date(),
                  HarvestEnum.error,
                  "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors,
                  true,
                  false);
        } else {
          // harvested unsuccessfully, post in mongo *AND DISABLE*
          _context
              .getHarvestStatus()
              .update(
                  source,
                  new Date(),
                  HarvestEnum.error,
                  "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors,
                  true,
                  true);
        }
      } else {
        // harvested successfully, post in mongo
        _context
            .getHarvestStatus()
            .update(source, new Date(), HarvestEnum.in_progress, "", false, false);
      }
    }
  }
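The clean-up block at the end of processFeed makes a three-way decision from the two counters populated during buildFeedList: if everything submitted errored and more than 5 docs were submitted, the source is recorded as an error, and at 20 or more submitted docs it is additionally disabled; otherwise the harvest is treated as successful. A compact sketch of that decision (the enum and method names are illustrative, not part of the harvester):

  enum HarvestOutcome { OK, ERROR, ERROR_AND_DISABLE }

  // Sketch of the post-harvest classification at the end of processFeed (illustrative names).
  static HarvestOutcome classifyHarvest(int docsSubmitted, int httpErrors) {
    if ((httpErrors == docsSubmitted) && (docsSubmitted > 5)) {
      // every submitted doc was an HTTP error
      return (docsSubmitted < 20) ? HarvestOutcome.ERROR : HarvestOutcome.ERROR_AND_DISABLE;
    }
    return HarvestOutcome.OK; // harvest treated as successful (status set to in_progress)
  }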