コード例 #1
0
 /**
  * Resolves the URI of a page link to a full URL and records it in {@code urls}.
  *
  * <p>The link's URI is trimmed first. A URI that is empty or exactly {@code "/"} is rejected.
  * Otherwise the URL is chosen as follows:
  *
  * <ul>
  *   <li>if {@code extractLinks} returns at least one entry for the URI, the URI is used
  *       unchanged (presumably it is already absolute; confirm against {@code extractLinks});
  *   <li>if the URI starts with {@code "/"}, it is appended to the scheme and host of
  *       {@code lastBuilder};
  *   <li>otherwise it is appended to the scheme, host and {@code parsePath(...)} result for the
  *       path of {@code lastBuilder}.
  * </ul>
  *
  * @param link the link whose URI is resolved; its URI must not be null
  * @param lastBuilder source of the scheme, host and path used for relative URIs
  * @param urls set of URLs already seen; a newly resolved URL is added to it
  * @return the resolved URL, or {@code null} when the URI is rejected or the resolved URL was
  *     already present in {@code urls}
  */
 protected String fullURL(final Link link, final URIBuilder lastBuilder, final Set<String> urls) {
   final String trimmed = link.getUri().trim();
   // Nothing useful to resolve for an empty reference or the bare root path.
   if (trimmed.isEmpty() || trimmed.equals("/")) {
     return null;
   }

   final String resolved;
   if (extractLinks(trimmed).size() > 0) {
     // The URI itself already yields at least one URL, so take it as is.
     resolved = trimmed;
   } else if (trimmed.startsWith("/")) {
     resolved = lastBuilder.getScheme() + "://" + lastBuilder.getHost() + trimmed;
   } else {
     final String basePath = parsePath(lastBuilder.getPath());
     resolved = lastBuilder.getScheme() + "://" + lastBuilder.getHost() + basePath + trimmed;
   }

   // Set.add reports false for a duplicate; null then tells the caller not to reprocess it.
   return urls.add(resolved) ? resolved : null;
 }
コード例 #2
0
  /**
   * Parses an HTML document, randomly samples some of its links for further processing, and
   * persists the originating link.
   *
   * <p>The document is run through an {@code HtmlParser} with handlers that collect links, body
   * text and HTML output. Pages for which {@code NullRef.hasValue} rejects the {@code "title"}
   * metadata entry are skipped. Otherwise the page's links are shuffled and each link with a
   * non-null URI is selected with a probability of roughly 35%; at most {@code MAX_LINKS_PAGE}
   * selected links are resolved through {@code fullURL} and handed to {@code processFullURL}.
   * Finally the total link count is stored on {@code origLink}, {@code writeFileAndSave} is
   * called, and the collected links are passed to {@code processLinksForQueue}.
   *
   * <p>A session is opened from the {@code "sessionFactory"} bean and is always closed before the
   * method returns.
   *
   * @param origLink the link the document belongs to; its number of links is updated
   * @param lastBuilder used to resolve relative links found in the document
   * @param document the HTML content to parse
   * @return the links collected for processing, or {@code null} when the page has no usable title
   *     or any error occurs while parsing or saving
   */
  public List<BotLink> parse(
      final BotLink origLink, final URIBuilder lastBuilder, final String document) {
    final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory");
    final Session session = sf.openSession();
    try {
      // NOTE(review): getBytes() uses the platform default charset; confirm that this matches
      // what the parser expects for the document.
      final InputStream input = new ByteArrayInputStream(document.getBytes());
      final LinkContentHandler linkHandler = new LinkContentHandler();
      final ContentHandler textHandler = new BodyContentHandler();
      final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
      final TeeContentHandler teeHandler =
          new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
      final Metadata metadata = new Metadata();
      final ParseContext parseContext = new ParseContext();
      final HtmlParser parser = new HtmlParser();
      parser.parse(input, teeHandler, metadata, parseContext);

      final String titleOfPage = metadata.get("title");
      // For analytical data, ignore pages that don't have titles
      if (!NullRef.hasValue(titleOfPage)) {
        logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink);
        return null;
      }

      // Continue with parsing //
      final List<BotLink> linksForProcessing = new ArrayList<BotLink>();
      final Set<String> urls = new HashSet<String>();

      // Total number of links on the page, independent of how many get sampled below.
      final List<Link> linksFromPage = linkHandler.getLinks();
      final int fullLinkCount = linksFromPage.size();

      // Loop through the links on the page in random order
      // and add a limited number of them to the queue.
      final Random rchk = new Random(System.currentTimeMillis());
      Collections.shuffle(linksFromPage);
      int linkcount = 0;
      for (final Link link : linksFromPage) {
        // Roughly a 35% chance of adding this link (uniform value above 0.65)
        final double rval = rchk.nextDouble();
        final boolean okToAdd = rval > 0.65;
        if (okToAdd && link.getUri() != null) {
          linkcount++;
          if (linkcount > MAX_LINKS_PAGE) {
            // Only process a given number of links on a page //
            break;
          }
          final String fullURL = this.fullURL(link, lastBuilder, urls);
          if (fullURL != null) {
            try {
              this.processFullURL(linksForProcessing, link, fullURL);
            } catch (final Throwable te) {
              // Best effort: a single failing link must not abort the rest of the page.
              logger.error("Error processing link, fullURL=" + fullURL, te);
            }
          }
        }
      }

      // Parse the available URLS //
      logger.info(
          "In Web Parser for "
              + lastBuilder
              + " # availableNumberOfLinks="
              + urls.size()
              + " fullLinkCount="
              + fullLinkCount);

      // Persist the current link //
      origLink.setNumberLinks(fullLinkCount);
      this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString());

      processLinksForQueue(linksForProcessing);
      return linksForProcessing;

    } catch (final Throwable e) {
      // Callers receive null on any failure; record the cause in the log instead of stderr.
      logger.error("Error parsing document, link=" + origLink, e);
    } finally {
      if (session != null) {
        session.close();
      }
    }
    return null;
  }