protected void processFullURL( final List<BotLink> linksForProcessing, final Link tkLink, final String u) { String scheme = ""; String host = ""; String path = ""; String query = ""; final Matcher m = SIMPLE_LINK.matcher(u); while (m.find()) { if (m.groupCount() >= 2) { scheme = m.group(1).trim(); final String tmp = m.group(2).trim(); final Matcher m2 = SIMPLE_LINK2.matcher(tmp); while (m2.find()) { if (m2.groupCount() >= 2) { host = m2.group(1).trim(); // At this point we should have a path // Remove the 'query' section if available final String tmp2 = m2.group(2).trim(); if (tmp2.indexOf('?') > 0) { final String wQuery = tmp2.substring(tmp2.indexOf('?') + 1); path = tmp2.substring(0, tmp2.indexOf('?')); query = wQuery; } else { path = tmp2; } } // End of the if // } } } // End of the while if (scheme.length() > 0 && host.length() > 0) { // Create a link for for further processing // final BotLink link = new BotLink(); link.setHost(host); if (path.length() > 0) { link.setPath("/" + path); } // End of the if // link.setScheme(scheme); link.setQuery(query); link.setLink(tkLink); logger.info("Attempt to process and add to queue / link , link=" + link); linksForProcessing.add(link); } // End of the if // } // End of method //
protected void writeFileAndSave( final BotLink link, final Session session, final Metadata metadata, final String document, final String txtOnlyDoc) { // Link added to processing, write the information to file final TextHelpers text = new TextHelpers(); final String dir = text.baseDirectory(link); final String fpathText = text.mangleText(link); final String fpathFull = text.mangleFull(link); final File chkdir = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + dir); final boolean res = chkdir.mkdirs(); final long wstart = System.currentTimeMillis(); final File txtfile = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + fpathText); final File fullfile = new File(OctaneCrawlerConstants.CRAWLER_HOME + "/" + fpathFull); try { // Write text version and full file // new IO<Void>() .w( txtfile.getAbsolutePath(), new Fx<PrintWriter>() { public void $(final PrintWriter o, final int idx) { o.println(txtOnlyDoc); } }); new IO<Void>() .w( fullfile.getAbsolutePath(), new Fx<PrintWriter>() { public void $(final PrintWriter o, final int idx) { o.println(document); } }); final long diff = System.currentTimeMillis() - wstart; logger.info( "Result after directory and writes / mk:" + chkdir + " res=" + res + " procTime=" + diff); final BotCrawlerDAO dao = new BotCrawlerDAO(); final BotCrawlerLink persistLink = new BotCrawlerLink(); persistLink.setHost(link.getHost()); persistLink.setUrl(String.valueOf(link.toBuilder())); persistLink.setTitle(metadata.get("title")); persistLink.setDescr(metadata.get("description")); persistLink.setPath(fpathText); persistLink.setLinkcount(link.getNumberLinks()); persistLink.setSource(Thread.currentThread().getName()); if (NullRef.hasValue(link.getStatusline())) { persistLink.setStatusline(link.getStatusline()); persistLink.setStatus(link.getCode()); } // Check the status line // if (link.getLink() != null) { if (link.getLink().getText() != null) { persistLink.setLinktext(link.getLink().getText()); } } dao.createLink(session, persistLink); } catch (final Throwable e) { e.printStackTrace(); } // End of try - catch // } // End of method write file and persist //
public List<BotLink> parse( final BotLink origLink, final URIBuilder lastBuilder, final String document) { final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory"); final Session session = sf.openSession(); try { final InputStream input = new ByteArrayInputStream(document.getBytes()); final LinkContentHandler linkHandler = new LinkContentHandler(); final ContentHandler textHandler = new BodyContentHandler(); final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler(); final TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler, toHTMLHandler); final Metadata metadata = new Metadata(); final ParseContext parseContext = new ParseContext(); final HtmlParser parser = new HtmlParser(); parser.parse(input, teeHandler, metadata, parseContext); final String titleOfPage = metadata.get("title"); // For analytical data, ignore pages that don't have titles if (!NullRef.hasValue(titleOfPage)) { logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink); return null; } // Continue with parsing // final List<BotLink> linksForProcessing = new ArrayList<BotLink>(); final Set<String> urls = new HashSet<String>(); int fullLinkCount = 0; for (final Link link : linkHandler.getLinks()) { fullLinkCount++; } int linkcount = 0; // Loop through the links on the page // And add a set number to the queue. final Random rchk = new Random(System.currentTimeMillis()); final List<Link> linksFromPage = linkHandler.getLinks(); Collections.shuffle(linksFromPage); for (final Link link : linksFromPage) { // Add a 30% chance of adding this link final double rval = rchk.nextDouble(); final boolean okToAdd = rval > 0.65; if (okToAdd && link.getUri() != null) { linkcount++; if (linkcount > MAX_LINKS_PAGE) { // Only process a given number of links on a page // break; } // End of if max reached final String fullURL = this.fullURL(link, lastBuilder, urls); if (fullURL != null) { try { this.processFullURL(linksForProcessing, link, fullURL); } catch (final Throwable te) { te.printStackTrace(); } } } // End of the if // } // End of the for through the links // // Parse the available URLS // logger.info( "In Web Parser for " + lastBuilder + " # availableNumberOfLinks=" + urls.size() + " fullLinkCount=" + fullLinkCount); // Persist the current link // origLink.setNumberLinks(fullLinkCount); this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString()); processLinksForQueue(linksForProcessing); return linksForProcessing; } catch (final Throwable e) { e.printStackTrace(); } finally { if (session != null) { session.close(); } } // End of the try - catch // return null; } // End of the method //