Java HarvestContext Examples

Programming Language: Java

Namespace/Package Name: com.ikanow.infinit.e.harvest

Class/Type: HarvestContext

Examples at hotexamples.com: 3

Java HarvestContext - 3 examples found. These are the top-rated real-world Java examples of com.ikanow.infinit.e.harvest.HarvestContext extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

getHarvestStatus(2)

getDuplicateManager(1)

Example #1

Show file

File: FeedHarvester.java Project: agilee/Infinit.e

  /**
   * Records a failed RSS harvest against the source's harvest status and logs the exception.
   *
   * <p>Suspension policy:
   *
   * <ul>
   *   <li>a message containing 500, 502, 503 or 504 is treated as a temporary error and never
   *       suspends the source;
   *   <li>any other message suspends the source when it equals the message already stored in the
   *       source's harvest status (i.e. the same error twice in succession).
   * </ul>
   *
   * @param e the exception raised while harvesting; its message may be {@code null}
   * @param source the source that was being harvested
   */
  private void handleRssError(Exception e, SourcePojo source) {
    boolean bSuspendSource = false;
    String sNewMessage = e.getMessage();
    if (null != sNewMessage) {
      // Use find() rather than String.matches(".*50[0234].*"): '.' does not match line
      // terminators, so the matches() form missed status codes inside multi-line messages.
      // (Pattern is fully qualified so that no additional import is required.)
      boolean bTemporaryError =
          java.util.regex.Pattern.compile("50[0234]").matcher(sNewMessage).find();

      if (!bTemporaryError && (null != source.getHarvestStatus())) {
        // Same message as last time => error out (equals(null) is simply false)
        String sOldMessage = source.getHarvestStatus().getHarvest_message();
        bSuspendSource = sNewMessage.equals(sOldMessage);
      }
      // (temporary errors: do nothing, just log and carry on)
    }
    _context
        .getHarvestStatus()
        .update(source, new Date(), HarvestEnum.error, sNewMessage, bSuspendSource, false);

    // If an exception occurs log the error
    logger.error("Exception Message: " + e.getMessage(), e);
  }

Example #2

Show file

File: FeedHarvester.java Project: agilee/Infinit.e

  /**
   * Builds the harvest work lists ({@code docsToAdd} / {@code docsToUpdate}) from the entries of
   * the supplied feeds plus any titled "extra URLs" configured on the source.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>work out the per-feed document cap from the wait time, the max time per feed and the
   *       max docs per source;
   *   <li>aggregate the entries of all feeds into one list and append a synthetic entry for each
   *       extra URL that has a title;
   *   <li>for each entry with a link: apply the source's include/exclude matchers, de-duplicate
   *       against earlier entries in this run (same URL plus matching title or description),
   *       then against the {@code DuplicateManager};
   *   <li>new entries become documents to add; known duplicates older than the source's update
   *       cycle become documents to update.
   * </ol>
   *
   * <p>Exceptions are logged rather than propagated.
   *
   * @param syndFeeds the feeds whose entries should be harvested (may be empty)
   * @param source the source being harvested; supplies the RSS configuration
   */
  @SuppressWarnings("unchecked")
  private void buildFeedList(LinkedList<SyndFeed> syndFeeds, SourcePojo source) {
    // If there's a max number of sources to get per harvest, configure that here:
    long nWaitTime_ms = props.getWebCrawlWaitTime();
    long nMaxTime_ms =
        props.getMaxTimePerFeed(); // (can't override this, too easy to break the system...)
    int nMaxDocsPerSource = props.getMaxDocsPerSource();
    long nNow = new Date().getTime();
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nWaitTime_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    long nMaxDocs = Long.MAX_VALUE;
    if (nWaitTime_ms > 0) {
      nMaxDocs = nMaxTime_ms / nWaitTime_ms;
    }
    if (nMaxDocs > nMaxDocsPerSource) { // (another limit, take the smaller of the 2)
      nMaxDocs = nMaxDocsPerSource;
    }
    // (end per feed configuration)

    // Aggregate the entries of all feeds. The first non-empty feed's own list is used directly;
    // a copy is only made once a second feed needs to be appended.
    List<SyndEntry> tmpList = null;
    boolean bCreatedAggregateList = false;
    int nRealSyndEntries = 0;

    for (SyndFeed feed : syndFeeds) {
      if (0 == nRealSyndEntries) {
        tmpList = feed.getEntries();
      } else if (!bCreatedAggregateList) {
        bCreatedAggregateList = true;
        tmpList = new LinkedList<SyndEntry>(tmpList);
        tmpList.addAll(feed.getEntries());
      } else {
        tmpList.addAll(feed.getEntries());
      }
      nRealSyndEntries += feed.getEntries().size();
    }
    if (null == tmpList) {
      tmpList = new LinkedList<SyndEntry>();
    }
    // TESTED

    // Add extra docs: every extra URL with a title becomes a synthetic entry appended after the
    // real ones
    if ((null != source.getRssConfig()) && (null != source.getRssConfig().getExtraUrls())) {
      for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
        if (null == extraUrl.title) {
          continue; // (this is an RSS feed not a URL)
        } // TESTED
        SyndEntryImpl synd = new SyndEntryImpl();
        synd.setLink(extraUrl.url);
        if (null != extraUrl.description) {
          SyndContentImpl description = new SyndContentImpl();
          description.setValue(extraUrl.description);
          synd.setDescription(description);
        }
        synd.setTitle(extraUrl.title);
        if (null != extraUrl.publishedDate) {
          try {
            synd.setPublishedDate(new Date(DateUtility.parseDate(extraUrl.publishedDate)));
          } catch (Exception ignored) {
            // Deliberately best-effort: an unparseable date just leaves the published date unset
            // (do nothing, will use now as pub date)
          }
        }
        tmpList.add((SyndEntry) synd);

        if (null != extraUrl.fullText) {
          // The full text travels in the description of a container feed set as the entry source
          SyndFeedImpl fullTextContainer = new SyndFeedImpl();
          fullTextContainer.setDescription(extraUrl.fullText);
          synd.setSource(fullTextContainer);
        }
      }
    }

    // Then begin looping over entries

    LinkedList<String> duplicateSources = new LinkedList<String>();
    try {
      // URL -> entries already seen with that URL in this run
      Map<String, List<SyndEntry>> urlDups = new HashMap<String, List<SyndEntry>>();
      int nSyndEntries = 0;
      for (Object synd : tmpList) {
        nSyndEntries++; // (keep count so we know we're accessing our own fake SyndEntryImpls)
        final SyndEntry entry = (SyndEntry) synd;

        if (null != entry.getLink()) // if url returns null, skip this entry
        {
          // sometimes the URL seems to have some characters in front of the HTTP - remove these
          String url = this.cleanUrlStart(entry.getLink());

          // Some error checking: reject an unusable URL *before* it is handed to the
          // include/exclude matchers, which may not tolerate null. Counters are updated exactly
          // as they would have been had the check been reached after the matchers.
          if (null == url) {
            this.nTmpDocsSubmitted++;
            this.nTmpHttpErrors++;
            continue;
          }

          if (null != source.getRssConfig()) { // Some RSS specific logic
            // If an include is specified, must match
            Matcher includeMatcher = source.getRssConfig().getIncludeMatcher(url);
            if (null != includeMatcher) {
              if (!includeMatcher.find()) {
                continue;
              }
            }
            // If an exclude is specified, must not match
            Matcher excludeMatcher = source.getRssConfig().getExcludeMatcher(url);
            if (null != excludeMatcher) {
              if (excludeMatcher.find()) {
                continue;
              }
            }
          }

          this.nTmpDocsSubmitted++;

          // Also save the title and description (both normalized to "" when absent):
          String title = feedEntryTitleOrEmpty(entry);
          String desc = feedEntryDescriptionOrEmpty(entry);
          boolean duplicate = false;

          // Look for duplicates within the current set of sources
          List<SyndEntry> possDups = null;
          if (null == (possDups = urlDups.get(url))) { // (new URL)
            possDups = new LinkedList<SyndEntry>();
            possDups.add(entry);
            urlDups.put(url, possDups);
          } else { // (old URL, check if this is a duplicate...)
            for (SyndEntry possDup : possDups) {
              // (null-safe comparisons: a stored entry may have no title/description text)
              if (feedEntryTitleOrEmpty(possDup).equals(title)
                  || ((null != possDup.getDescription())
                      && feedEntryDescriptionOrEmpty(possDup).equals(desc))
                  || ((null != possDup.getDescription()) && (null == entry.getDescription()))) {
                // If *either* the title or the description matches as well as the URL...
                duplicate = true;
                break;
              }
            }

            if (!duplicate) {
              possDups.add(entry);
            } else { // DUPLICATE: ensure we have minimal set of data to cover all cases:
              boolean bTitleMatch = false;
              boolean bDescMatch = false;
              for (SyndEntry possDup : possDups) {
                if (!bTitleMatch
                    && feedEntryTitleOrEmpty(possDup)
                        .equals(title)) { // (don't bother if already have a title match)
                  bTitleMatch = true;
                } else if (!bDescMatch) { // (don't yet have a desc match)
                  if (null != entry.getDescription()) {
                    if (null != possDup.getDescription()) { // (neither desc is null)
                      if (feedEntryDescriptionOrEmpty(possDup).equals(desc)) {
                        bDescMatch = true;
                      }
                    }
                  } else { // curr desc is null
                    if (null == possDup.getDescription()) { // dup desc is null
                      bDescMatch = true;
                    }
                  } // (end various title match/desc match/both have no desc cases
                } // (end if no desc match)
                if (bTitleMatch && bDescMatch) {
                  break; // (both already covered by stored entries, nothing more to learn)
                }
              } // (end loop over dups)

              if (!bTitleMatch || !bDescMatch) {
                possDups.add(entry);
              }
            } // (end is duplicate, nasty logic to add minimal set to dup list to cover all titles,
              // descs)
          }
          if (duplicate) {
            continue;
          }

          try {
            DuplicateManager qr = _context.getDuplicateManager();
            if (null != entry.getDescription()) {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      desc.replaceAll("\\<.*?\\>", "").trim(),
                      source,
                      duplicateSources);
            } else {
              duplicate =
                  qr.isDuplicate_UrlTitleDescription(
                      url,
                      title.replaceAll("\\<.*?\\>", "").trim(),
                      null,
                      source,
                      duplicateSources);
              // ^^^(this is different to isDuplicate_UrlTitle because it enforces that the
              // description be null, vs just checking the title)
            }
            if (duplicate
                && (null != source.getRssConfig())
                && (null != source.getRssConfig().getUpdateCycle_secs())) {
              // Check modified times...
              Date dupModDate = qr.getLastDuplicateModifiedTime();
              ObjectId dupId = qr.getLastDuplicateId();

              if ((null != dupModDate) && (null != dupId)) {
                // (1000L forces long arithmetic so a large update cycle cannot overflow while
                // being converted from seconds to milliseconds)
                if (dupModDate.getTime() + source.getRssConfig().getUpdateCycle_secs() * 1000L
                    < nNow) {

                  DocumentPojo doc = buildDocument(entry, source, duplicateSources);
                  if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                    // (Use dummy TitleEx to create a "fake" full text block)
                    doc.setFullText(entry.getSource().getDescription());
                  }
                  doc.setUpdateId(dupId); // (set _id to document I'm going to overwrite)
                  this.docsToUpdate.add(doc);

                  if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                    source.setReachedMaxDocs();
                    break; // (that's enough documents)
                  }
                }
              }
            } // TESTED (duplicates we update instead of ignoring)

            if (!duplicate) {
              DocumentPojo doc = buildDocument(entry, source, duplicateSources);
              if ((nSyndEntries > nRealSyndEntries) && (null != entry.getSource())) {
                // (Use dummy TitleEx to create a "fake" full text block)
                doc.setFullText(entry.getSource().getDescription());
              }
              this.docsToAdd.add(doc);

              if ((this.docsToAdd.size() + this.docsToUpdate.size()) >= nMaxDocs) {
                source.setReachedMaxDocs();
                break; // (that's enough documents)
              }
            }
            if (this.nTmpDocsSubmitted > 20) { // (some arbitrary "significant" number)
              if (nTmpHttpErrors == this.nTmpDocsSubmitted) {
                break; // (everything submitted so far was an error, give up on this source)
              }
            }
          } catch (Exception e) {
            // If an exception occurs log the error
            logger.error("Exception Message: " + e.getMessage(), e);
          }
        }
      } // (end loop over feeds in a syndicate)
    } catch (Exception e) {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }

  /** Returns the entry's title, or the empty string when the entry has no title. */
  private static String feedEntryTitleOrEmpty(SyndEntry entry) {
    String title = entry.getTitle();
    return (null == title) ? "" : title;
  }

  /**
   * Returns the text of the entry's description, or the empty string when the entry has no
   * description or the description carries no value.
   */
  private static String feedEntryDescriptionOrEmpty(SyndEntry entry) {
    if (null == entry.getDescription()) {
      return "";
    }
    String value = entry.getDescription().getValue();
    return (null == value) ? "" : value;
  }

Example #3

Show file

File: FeedHarvester.java Project: agilee/Infinit.e

  /**
   * Harvests one source: collects its feeds, hands them to {@code buildFeedList} and then posts
   * the outcome to the harvest status.
   *
   * <p>Feed collection:
   *
   * <ul>
   *   <li>a source URL without a search configuration is fetched as a feed;
   *   <li>a source with a search configuration is delegated to the search engine subsystem;
   *   <li>without a search configuration, every extra URL that has a URL but no title is fetched
   *       as an additional feed.
   * </ul>
   *
   * <p>Nothing further happens when no feed was collected and no extra URLs are in play.
   *
   * @param source the source to harvest
   * @throws Exception anything raised while fetching or building the feed list is propagated
   */
  private void processFeed(SourcePojo source) throws Exception {
    LinkedList<SyndFeed> feeds = new LinkedList<SyndFeed>();

    boolean bNoSourceUrl = (null == source.getUrl());
    boolean bHasSearchConfig =
        (null != source.getRssConfig()) && (null != source.getRssConfig().getSearchConfig());
    // (with no source URL, extra URLs are all there is to work from)
    boolean bExtraUrls = bNoSourceUrl;

    if (!bNoSourceUrl && !bHasSearchConfig) {
      // (plain feed URL; a search query is processed differently, see the next branch)
      SyndFeed mainFeed = getFeed(source, null);
      if (null != mainFeed) {
        feeds.add(mainFeed);
      }
    } else if (bHasSearchConfig) {
      FeedHarvester_searchEngineSubsystem searchEngineSubsystem =
          new FeedHarvester_searchEngineSubsystem();
      searchEngineSubsystem.generateFeedFromSearch(source, _context);
      bExtraUrls = true;
    } // TESTED

    // (conditions re-read from the source here, after the search subsystem has run)
    if (null != source.getRssConfig()) {
      if ((null != source.getRssConfig().getExtraUrls())
          && (null == source.getRssConfig().getSearchConfig())) {
        // Some of these might be RSS feeds, check if title==null
        for (ExtraUrlPojo extraUrl : source.getRssConfig().getExtraUrls()) {
          if ((null != extraUrl.title) || (null == extraUrl.url)) {
            continue; // (a titled entry is a plain URL, handled inside buildFeedList)
          }
          SyndFeed extraFeed = getFeed(source, extraUrl.url);
          if (null != extraFeed) {
            feeds.add(extraFeed);
          }
        }
      }
    } // TESTED

    if (feeds.isEmpty() && !bExtraUrls) {
      return; // (nothing to harvest)
    }

    // Error handling, part 1: reset the per-run counters
    this.nTmpHttpErrors = 0;
    this.nTmpDocsSubmitted = 0;

    // Extract the feed and place into the pojo (any exception propagates upwards)
    buildFeedList(feeds, source);

    // Error handling part 2: a decent number of submissions, all of which were errors
    boolean bOnlyErrors =
        (this.nTmpDocsSubmitted > 5) && (nTmpHttpErrors == this.nTmpDocsSubmitted);

    if (!bOnlyErrors) {
      // harvested successfully, post in mongo
      _context
          .getHarvestStatus()
          .update(source, new Date(), HarvestEnum.in_progress, "", false, false);
      return;
    }

    logger.error(
        "Source generates only invalid feeds: "
            + " http_errs="
            + nTmpHttpErrors
            + " source="
            + source.getUrl());

    // harvested unsuccessfully, post in mongo; from 20 submissions upwards *AND DISABLE*
    boolean bDisableSource = (this.nTmpDocsSubmitted >= 20);
    // NOTE(review): the message renders as "redirect_errs=http_errs=N" - looks like a redirect
    // counter was dropped at some point; left unchanged here, confirm before rewording.
    _context
        .getHarvestStatus()
        .update(
            source,
            new Date(),
            HarvestEnum.error,
            "Extraction errors: redirect_errs=" + "http_errs=" + nTmpHttpErrors,
            true,
            bDisableSource);
  }