Ejemplo n.º 1
0
  /**
   * Writes the text-only and full versions of a crawled document to disk and then persists a
   * {@code BotCrawlerLink} record describing the link.
   *
   * <p>Both files are placed under {@code OctaneCrawlerConstants.CRAWLER_HOME}; their relative
   * paths come from {@code TextHelpers}. This method is best-effort: any failure while writing or
   * persisting is logged and NOT propagated to the caller.
   *
   * @param link the crawled link; supplies host, URL, status and link-count information
   * @param session the session handed to {@code BotCrawlerDAO#createLink}; it is not closed here
   * @param metadata parse metadata; the "title" and "description" entries are persisted
   * @param document the full document content, written to the "full" file
   * @param txtOnlyDoc the text-only content, written to the "text" file
   */
  protected void writeFileAndSave(
      final BotLink link,
      final Session session,
      final Metadata metadata,
      final String document,
      final String txtOnlyDoc) {
    // Link added to processing, write the information to file
    final TextHelpers text = new TextHelpers();
    final String dir = text.baseDirectory(link);
    final String fpathText = text.mangleText(link);
    final String fpathFull = text.mangleFull(link);
    final File chkdir = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + dir);
    // mkdirs() returns false when the directory already exists, so 'res' is informational only.
    final boolean res = chkdir.mkdirs();
    // Timer starts after directory creation, so procTime covers the two file writes.
    final long wstart = System.currentTimeMillis();
    final File txtfile = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + fpathText);
    final File fullfile = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + fpathFull);
    try {
      // Write the text-only version //
      new IO<Void>()
          .w(
              txtfile.getAbsolutePath(),
              new Fx<PrintWriter>() {
                public void $(final PrintWriter o, final int idx) {
                  o.println(txtOnlyDoc);
                }
              });
      // Write the full document //
      new IO<Void>()
          .w(
              fullfile.getAbsolutePath(),
              new Fx<PrintWriter>() {
                public void $(final PrintWriter o, final int idx) {
                  o.println(document);
                }
              });
      final long diff = System.currentTimeMillis() - wstart;
      logger.info(
          "Result after directory and writes / mk:" + chkdir + " res=" + res + " procTime=" + diff);

      // Build the record to persist; the stored path is the text-only file's relative path.
      final BotCrawlerDAO dao = new BotCrawlerDAO();
      final BotCrawlerLink persistLink = new BotCrawlerLink();
      persistLink.setHost(link.getHost());
      persistLink.setUrl(String.valueOf(link.toBuilder()));
      persistLink.setTitle(metadata.get("title"));
      persistLink.setDescr(metadata.get("description"));
      persistLink.setPath(fpathText);
      persistLink.setLinkcount(link.getNumberLinks());
      persistLink.setSource(Thread.currentThread().getName());
      // Status line and code are only recorded when a status line is present //
      if (NullRef.hasValue(link.getStatusline())) {
        persistLink.setStatusline(link.getStatusline());
        persistLink.setStatus(link.getCode());
      }
      // Anchor text is optional: both the underlying link and its text may be absent.
      if (link.getLink() != null && link.getLink().getText() != null) {
        persistLink.setLinktext(link.getLink().getText());
      }
      dao.createLink(session, persistLink);

    } catch (final Throwable e) {
      // Best-effort: report through the logger (not stderr) so the failure is not lost,
      // but do not propagate it to the caller.
      logger.error("Failed to write or persist crawled link, link=" + link, e);
    } // End of try - catch //
  } // End of method write file and persist //
Ejemplo n.º 2
0
  /**
   * Parses an HTML document, selects a random subset of its links for further processing, and
   * persists the originating link together with the document content.
   *
   * <p>A session is opened from the "sessionFactory" bean for the duration of the call and is
   * always closed before returning.
   *
   * @param origLink the link the document was fetched from; its number-of-links value is updated
   *     with the total count of links found on the page
   * @param lastBuilder builder passed to {@code fullURL} when resolving each link
   * @param document the HTML content to parse
   * @return the links selected for processing, or {@code null} when the page has no title or
   *     when any error occurs (errors are logged, not propagated)
   */
  public List<BotLink> parse(
      final BotLink origLink, final URIBuilder lastBuilder, final String document) {
    final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory");
    final Session session = sf.openSession();
    try {
      // NOTE(review): getBytes() uses the platform default charset, so parsing may differ
      // between environments - confirm whether an explicit charset should be used here.
      final InputStream input = new ByteArrayInputStream(document.getBytes());
      final LinkContentHandler linkHandler = new LinkContentHandler();
      final ContentHandler textHandler = new BodyContentHandler();
      final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
      // The tee handler feeds the same parse events to all three handlers.
      final TeeContentHandler teeHandler =
          new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
      final Metadata metadata = new Metadata();
      final ParseContext parseContext = new ParseContext();
      final HtmlParser parser = new HtmlParser();
      parser.parse(input, teeHandler, metadata, parseContext);

      final String titleOfPage = metadata.get("title");
      // For analytical data, ignore pages that don't have titles
      if (!NullRef.hasValue(titleOfPage)) {
        logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink);
        return null;
      }

      // Continue with parsing //
      final List<BotLink> linksForProcessing = new ArrayList<BotLink>();
      // Passed to fullURL for each link; its size is reported in the log line below.
      final Set<String> urls = new HashSet<String>();

      final List<Link> linksFromPage = linkHandler.getLinks();
      // Total number of links found on the page, regardless of how many get queued.
      final int fullLinkCount = linksFromPage.size();

      int linkcount = 0;
      // Loop through the links on the page (in random order)
      // and add a limited number to the queue.
      final Random rchk = new Random(System.currentTimeMillis());
      Collections.shuffle(linksFromPage);
      for (final Link link : linksFromPage) {
        // Each link has roughly a 35% chance of being considered (rval > 0.65)
        final double rval = rchk.nextDouble();
        final boolean okToAdd = rval > 0.65;
        if (okToAdd && link.getUri() != null) {
          linkcount++;
          if (linkcount > MAX_LINKS_PAGE) {
            // Only process a given number of links on a page //
            break;
          } // End of if max reached
          final String fullURL = this.fullURL(link, lastBuilder, urls);
          if (fullURL != null) {
            try {
              this.processFullURL(linksForProcessing, link, fullURL);
            } catch (final Throwable te) {
              // A single bad link must not abort the rest of the page.
              logger.error("Failed to process link, fullURL=" + fullURL, te);
            }
          }
        } // End of the if //
      } // End of the for through the links //

      // Parse the available URLS //
      logger.info(
          "In Web Parser for "
              + lastBuilder
              + " # availableNumberOfLinks="
              + urls.size()
              + " fullLinkCount="
              + fullLinkCount);

      // Persist the current link //
      origLink.setNumberLinks(fullLinkCount);
      this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString());

      processLinksForQueue(linksForProcessing);
      return linksForProcessing;

    } catch (final Throwable e) {
      // Report through the logger (not stderr); the caller receives null below.
      logger.error("Failed to parse document, link=" + origLink, e);
    } finally {
      if (session != null) {
        session.close();
      }
    } // End of the try - catch //
    return null;
  } // End of the method //