Example #1
0
  /**
   * Records a failed harvest for {@code source} in the harvest status and logs the exception.
   *
   * <p>Policy for deciding the suspend flag passed to the status update:
   *
   * <ul>
   *   <li>If the exception message contains 500, 502, 503 or 504 anywhere in its text, the error
   *       is treated as temporary and the flag stays {@code false}.
   *   <li>Otherwise, if the message is identical to the message stored on the source's current
   *       harvest status (i.e. the same error twice in succession), the flag is set to
   *       {@code true}.
   *   <li>A {@code null} exception message never sets the flag.
   * </ul>
   *
   * @param e the exception raised while harvesting; its message is stored as the harvest message
   * @param source the source whose harvest failed
   */
  private void handleRssError(Exception e, SourcePojo source) {
    boolean bSuspendSource = false;
    String sNewMessage = e.getMessage();
    if (null != sNewMessage) {
      // String.matches() must match the whole message and '.' normally stops at line
      // terminators, so without (?s) a multi-line message containing e.g. 503 would not be
      // recognised as a temporary server error.
      // NOTE(review): this is a plain substring test, so unrelated numbers such as "15000"
      // also count as temporary - unchanged from the previous behaviour.
      if (sNewMessage.matches("(?s).*50[0234].*")) {
        // Do nothing, this is just a temporary error
      } else if (null != source.getHarvestStatus()) {
        // Same message as last time => repeated failure => request suspension
        String sOldMessage = source.getHarvestStatus().getHarvest_message();
        if ((null != sOldMessage) && sOldMessage.equals(sNewMessage)) {
          bSuspendSource = true;
        }
      }
    }
    // Persist the error; the fifth argument carries the suspend decision made above
    _context
        .getHarvestStatus()
        .update(source, new Date(), HarvestEnum.error, sNewMessage, bSuspendSource, false);

    // Log the error together with its stack trace
    logger.error("Exception Message: " + e.getMessage(), e);
  }
Example #2
0
  /**
   * Gathers the feeds configured for {@code source}, passes them to {@code buildFeedList}, and
   * writes the outcome to the harvest status.
   *
   * <p>Feed gathering:
   *
   * <ul>
   *   <li>If the source has a URL and no search configuration, the URL itself is fetched as a
   *       feed.
   *   <li>If the source has a search configuration, the search-engine subsystem is invoked with
   *       the source instead (the URL is then a search query, handled inside
   *       {@code buildFeedList}).
   *   <li>If there is no search configuration, every extra URL that has a URL but no title is
   *       also tried as a feed.
   * </ul>
   *
   * <p>Nothing further happens when no feed was obtained and the source neither lacks a URL nor
   * has a search configuration. Otherwise, after {@code buildFeedList} returns, the run is
   * reported as an error when more than 5 documents were submitted and every one of them counted
   * as an HTTP error; in all other cases it is reported as {@code in_progress}.
   *
   * @param source the source to harvest
   * @throws Exception whatever {@code getFeed}, the search-engine subsystem or
   *     {@code buildFeedList} throws; nothing is caught here
   */
  private void processFeed(SourcePojo source) throws Exception {
    LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>();
    // With no main URL, any documents can only come from extra URLs
    boolean bExtraUrls = (null == source.getUrl());

    if ((null != source.getUrl())
        && ((null == source.getRssConfig()) || (null == source.getRssConfig().getSearchConfig()))) {
      // (if the second clause is false, the URL is a search query, will process differently, inside
      // buildFeedList)

      SyndFeed feed = getFeed(source, null);
      if (null != feed) {
        feeds.add(feed);
      }
    } else if ((null != source.getRssConfig())
        && (null != source.getRssConfig().getSearchConfig())) {
      FeedHarvester_searchEngineSubsystem searchEngineSubsystem =
          new FeedHarvester_searchEngineSubsystem();
      searchEngineSubsystem.generateFeedFromSearch(source, _context);
      bExtraUrls = true;
    } // TESTED

    if ((null != source.getRssConfig())
        && (null != source.getRssConfig().getExtraUrls())
        && (null == source.getRssConfig().getSearchConfig())) {
      // Some of these might be RSS feeds, check if title==null
      for (ExtraUrlPojo url : source.getRssConfig().getExtraUrls()) {
        if ((null == url.title) && (null != url.url)) {
          SyndFeed feed = getFeed(source, url.url);
          if (null != feed) {
            feeds.add(feed);
          }
        }
      }
    } // TESTED

    if (feeds.isEmpty() && !bExtraUrls) {
      // Nothing to build from: no feeds and no extra URLs
      return;
    }

    // Error handling, part 1: reset the per-run counters
    this.nTmpHttpErrors = 0;
    this.nTmpDocsSubmitted = 0;

    // Extract the feed and place into the pojo (exceptions propagate to the caller)
    buildFeedList(feeds, source);

    // Error handling, part 2:
    // a decent number of documents were submitted and every one of them was an HTTP error
    boolean bAllErrors =
        (this.nTmpHttpErrors == this.nTmpDocsSubmitted) && (this.nTmpDocsSubmitted > 5);

    if (bAllErrors) {
      logger.error(
          "Source generates only invalid feeds: "
              + " http_errs="
              + nTmpHttpErrors
              + " source="
              + source.getUrl());

      // Harvested unsuccessfully, post in mongo; with 20 or more failed documents *AND DISABLE*.
      // NOTE(review): the exact meaning of the last two flags is defined by update(), which is
      // not visible here - the description follows the comments of the previous implementation.
      boolean bDisable = (this.nTmpDocsSubmitted >= 20);
      String sErrorMessage = "Extraction errors: http_errs=" + nTmpHttpErrors;
      _context
          .getHarvestStatus()
          .update(source, new Date(), HarvestEnum.error, sErrorMessage, true, bDisable);
    } else {
      // harvested successfully, post in mongo
      _context
          .getHarvestStatus()
          .update(source, new Date(), HarvestEnum.in_progress, "", false, false);
    }
  }