protected String fullURL(final Link link, final URIBuilder lastBuilder, final Set<String> urls) { final String uri = link.getUri().trim(); final boolean basicExclude = (uri.length() != 0) && !uri.equals("/"); if (basicExclude) { String fullURL = ""; // We have some URL's if (extractLinks(uri).size() > 0) { // First add full URL to set // fullURL = uri; } else { if (uri.startsWith("/")) { fullURL = lastBuilder.getScheme() + "://" + lastBuilder.getHost() + uri; } else { final String path = parsePath(lastBuilder.getPath()); fullURL = lastBuilder.getScheme() + "://" + lastBuilder.getHost() + path + uri; } // End of the if - else // } // End // if (urls.contains(fullURL)) { // Return null so we don't reprocess return null; } // We should have a valid full URL. urls.add(fullURL); return fullURL; } // End of the if // return null; }
public List<BotLink> parse( final BotLink origLink, final URIBuilder lastBuilder, final String document) { final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory"); final Session session = sf.openSession(); try { final InputStream input = new ByteArrayInputStream(document.getBytes()); final LinkContentHandler linkHandler = new LinkContentHandler(); final ContentHandler textHandler = new BodyContentHandler(); final ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler(); final TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler, toHTMLHandler); final Metadata metadata = new Metadata(); final ParseContext parseContext = new ParseContext(); final HtmlParser parser = new HtmlParser(); parser.parse(input, teeHandler, metadata, parseContext); final String titleOfPage = metadata.get("title"); // For analytical data, ignore pages that don't have titles if (!NullRef.hasValue(titleOfPage)) { logger.warn("Warning, invalid title for page, EXITING logic, link=" + origLink); return null; } // Continue with parsing // final List<BotLink> linksForProcessing = new ArrayList<BotLink>(); final Set<String> urls = new HashSet<String>(); int fullLinkCount = 0; for (final Link link : linkHandler.getLinks()) { fullLinkCount++; } int linkcount = 0; // Loop through the links on the page // And add a set number to the queue. final Random rchk = new Random(System.currentTimeMillis()); final List<Link> linksFromPage = linkHandler.getLinks(); Collections.shuffle(linksFromPage); for (final Link link : linksFromPage) { // Add a 30% chance of adding this link final double rval = rchk.nextDouble(); final boolean okToAdd = rval > 0.65; if (okToAdd && link.getUri() != null) { linkcount++; if (linkcount > MAX_LINKS_PAGE) { // Only process a given number of links on a page // break; } // End of if max reached final String fullURL = this.fullURL(link, lastBuilder, urls); if (fullURL != null) { try { this.processFullURL(linksForProcessing, link, fullURL); } catch (final Throwable te) { te.printStackTrace(); } } } // End of the if // } // End of the for through the links // // Parse the available URLS // logger.info( "In Web Parser for " + lastBuilder + " # availableNumberOfLinks=" + urls.size() + " fullLinkCount=" + fullLinkCount); // Persist the current link // origLink.setNumberLinks(fullLinkCount); this.writeFileAndSave(origLink, session, metadata, document, textHandler.toString()); processLinksForQueue(linksForProcessing); return linksForProcessing; } catch (final Throwable e) { e.printStackTrace(); } finally { if (session != null) { session.close(); } } // End of the try - catch // return null; } // End of the method //