Пример #1
0
 /**
  * Splits an absolute URL into scheme, host, path and query, and queues a {@code BotLink} built
  * from those parts when both a scheme and a host were found.
  *
  * <p>{@code SIMPLE_LINK} is expected to capture the scheme in group 1 and the remainder in group
  * 2; {@code SIMPLE_LINK2} is applied to that remainder and is expected to capture the host in
  * group 1 and the path (with optional query) in group 2. If either pattern matches more than
  * once, values from later matches overwrite earlier ones.
  *
  * @param linksForProcessing list that receives the newly created link
  * @param tkLink the page link the URL came from; attached to the created {@code BotLink}
  * @param u the full URL to split
  */
 protected void processFullURL(
     final List<BotLink> linksForProcessing, final Link tkLink, final String u) {
   String scheme = "";
   String host = "";
   String path = "";
   String query = "";
   final Matcher m = SIMPLE_LINK.matcher(u);
   while (m.find()) {
     if (m.groupCount() >= 2) {
       scheme = m.group(1).trim();
       final String hostAndPath = m.group(2).trim();
       final Matcher m2 = SIMPLE_LINK2.matcher(hostAndPath);
       while (m2.find()) {
         if (m2.groupCount() >= 2) {
           host = m2.group(1).trim();
           // Separate the path from the optional query section.
           final String pathAndQuery = m2.group(2).trim();
           final int queryStart = pathAndQuery.indexOf('?');
           // '>= 0' (not '> 0'): a '?' in first position means an empty path followed by a
           // query; it must not be kept as part of the path.
           if (queryStart >= 0) {
             path = pathAndQuery.substring(0, queryStart);
             query = pathAndQuery.substring(queryStart + 1);
           } else {
             path = pathAndQuery;
           }
         }
       }
     }
   }
   if (scheme.length() > 0 && host.length() > 0) {
     // Create a link for further processing
     final BotLink link = new BotLink();
     link.setHost(host);
     if (path.length() > 0) {
       link.setPath("/" + path);
     }
     link.setScheme(scheme);
     link.setQuery(query);
     link.setLink(tkLink);
     logger.info("Attempt to process and add to queue / link , link=" + link);
     linksForProcessing.add(link);
   }
 } // End of method //
Пример #2
0
  /**
   * Writes the text-only and full versions of a crawled document to files under {@code
   * OctaneCrawlerConstants.CRAWLER_HOME}, then persists a {@code BotCrawlerLink} record describing
   * the link via {@code BotCrawlerDAO.createLink}.
   *
   * <p>Any failure while writing or persisting is logged and is not propagated to the caller.
   *
   * @param link the crawled link; supplies the host, URL, status, link count and link text, and
   *     is used to derive the output directory and file names
   * @param session session passed to the DAO when the record is created
   * @param metadata page metadata; its {@code title} and {@code description} values are persisted
   * @param document full document content, written to the "full" file
   * @param txtOnlyDoc text-only content, written to the "text" file
   */
  protected void writeFileAndSave(
      final BotLink link,
      final Session session,
      final Metadata metadata,
      final String document,
      final String txtOnlyDoc) {
    // Link added to processing, write the information to file
    final TextHelpers text = new TextHelpers();
    final String dir = text.baseDirectory(link);
    final String fpathText = text.mangleText(link);
    final String fpathFull = text.mangleFull(link);
    final String home = OctaneCrawlerConstants.CRAWLER_HOME + "/";
    final File chkdir = new File(home + dir);
    // mkdirs() also returns false when the directory already exists, so the result is only logged.
    final boolean res = chkdir.mkdirs();
    final long wstart = System.currentTimeMillis();
    final File txtfile = new File(home + fpathText);
    final File fullfile = new File(home + fpathFull);
    try {
      // Write text version and full file
      new IO<Void>()
          .w(
              txtfile.getAbsolutePath(),
              new Fx<PrintWriter>() {
                public void $(final PrintWriter o, final int idx) {
                  o.println(txtOnlyDoc);
                }
              });
      new IO<Void>()
          .w(
              fullfile.getAbsolutePath(),
              new Fx<PrintWriter>() {
                public void $(final PrintWriter o, final int idx) {
                  o.println(document);
                }
              });
      final long diff = System.currentTimeMillis() - wstart;
      logger.info(
          "Result after directory and writes / mk:" + chkdir + " res=" + res + " procTime=" + diff);

      // Build and store the database record for this link
      final BotCrawlerDAO dao = new BotCrawlerDAO();
      final BotCrawlerLink persistLink = new BotCrawlerLink();
      persistLink.setHost(link.getHost());
      persistLink.setUrl(String.valueOf(link.toBuilder()));
      persistLink.setTitle(metadata.get("title"));
      persistLink.setDescr(metadata.get("description"));
      persistLink.setPath(fpathText);
      persistLink.setLinkcount(link.getNumberLinks());
      persistLink.setSource(Thread.currentThread().getName());
      // Status line and code are only stored when a status line is available
      if (NullRef.hasValue(link.getStatusline())) {
        persistLink.setStatusline(link.getStatusline());
        persistLink.setStatus(link.getCode());
      }
      if (link.getLink() != null && link.getLink().getText() != null) {
        persistLink.setLinktext(link.getLink().getText());
      }
      dao.createLink(session, persistLink);

    } catch (final Throwable e) {
      // Best effort: report through the logger (with the link for context) rather than stderr.
      logger.error("Error writing files or persisting link, link=" + link, e);
    } // End of try - catch //
  } // End of method write file and persist //
Пример #3
0
  /**
   * Parses an HTML document, persists the originating link together with the document, and
   * returns a random sample of the links found on the page.
   *
   * <p>Pages without a title are skipped. The page links are shuffled, each link is then accepted
   * with a probability of roughly 35%, and processing stops once more than {@code MAX_LINKS_PAGE}
   * links have been accepted. The selected links are also passed to {@code processLinksForQueue}.
   *
   * @param origLink the link the document was fetched from; its link count is updated
   * @param lastBuilder URI builder passed to {@code fullURL} when resolving each page link
   * @param document the HTML content to parse
   * @return the links selected for further processing, or {@code null} when the page has no title
   *     or an error occurred
   */
  public List<BotLink> parse(
      final BotLink origLink, final URIBuilder lastBuilder, final String document) {
    final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory");
    final Session session = sf.openSession();
    try {
      // NOTE(review): getBytes() uses the platform default charset; confirm whether an explicit
      // charset is intended here.
      final InputStream input = new ByteArrayInputStream(document.getBytes());
      final LinkContentHandler linkHandler = new LinkContentHandler();
      final ContentHandler textHandler = new BodyContentHandler();
      final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
      final TeeContentHandler teeHandler =
          new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
      final Metadata metadata = new Metadata();
      final ParseContext parseContext = new ParseContext();
      final HtmlParser parser = new HtmlParser();
      parser.parse(input, teeHandler, metadata, parseContext);

      final String titleOfPage = metadata.get("title");
      // For analytical data, ignore pages that don't have titles
      if (!NullRef.hasValue(titleOfPage)) {
        logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink);
        return null;
      }

      // Continue with parsing //
      final List<BotLink> linksForProcessing = new ArrayList<BotLink>();
      final Set<String> urls = new HashSet<String>();

      final List<Link> linksFromPage = linkHandler.getLinks();
      // Total number of links on the page, regardless of how many are sampled below
      final int fullLinkCount = linksFromPage.size();

      // Loop through the links on the page in random order
      // and add a limited number of them to the queue.
      int linkcount = 0;
      final Random rchk = new Random(System.currentTimeMillis());
      Collections.shuffle(linksFromPage);
      for (final Link link : linksFromPage) {
        // Roughly a 35% chance of adding this link (nextDouble() > 0.65)
        final boolean okToAdd = rchk.nextDouble() > 0.65;
        if (okToAdd && link.getUri() != null) {
          linkcount++;
          if (linkcount > MAX_LINKS_PAGE) {
            // Only process a given number of links on a page //
            break;
          }
          final String fullURL = this.fullURL(link, lastBuilder, urls);
          if (fullURL != null) {
            try {
              this.processFullURL(linksForProcessing, link, fullURL);
            } catch (final Throwable te) {
              // A single bad URL must not abort the rest of the page
              logger.error("Error processing URL, url=" + fullURL, te);
            }
          }
        } // End of the if //
      } // End of the for through the links //

      // Parse the available URLS //
      logger.info(
          "In Web Parser for "
              + lastBuilder
              + " # availableNumberOfLinks="
              + urls.size()
              + " fullLinkCount="
              + fullLinkCount);

      // Persist the current link //
      origLink.setNumberLinks(fullLinkCount);
      this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString());

      processLinksForQueue(linksForProcessing);
      return linksForProcessing;

    } catch (final Throwable e) {
      logger.error("Error parsing document, link=" + origLink, e);
    } finally {
      if (session != null) {
        session.close();
      }
    } // End of the try - catch //
    return null;
  } // End of the method //