/**
 * Parses a fetched page and collects the anchor URLs that the blog detector classifies as blogs.
 *
 * @param in - stream of the page content to parse
 * @return the blog URLs found in the page, without duplicates
 * @throws BlogCrawlingException if the page cannot be read or parsed
 */
private String[] processBlog(InputStream in) throws BlogCrawlingException {

    // using a set here to avoid duplicates
    Set<String> linksToBlogs = new TreeSet<String>();

    try {
        Page page = new Page(in, null);
        Parser parser = new Parser(new Lexer(page));

        // register a filter to extract all the anchor tags
        TagNameFilter anchorTagsFilter = new TagNameFilter("a");
        NodeList anchorTagsList = parser.parse(anchorTagsFilter);

        for (int i = 0; i < anchorTagsList.size(); i++) {
            Node node = anchorTagsList.elementAt(i);
            LinkTag tag = (LinkTag) node;
            String linkURL = tag.getLink();

            if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) {
                // logger.info(" *BLOG Detected* ==> " + linkURL);
                System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL);
                linksToBlogs.add(linkURL);
            } else {
                System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL);
            }
        }

        String[] links = new String[linksToBlogs.size()];
        int count = 0;
        for (String linksToBlog : linksToBlogs) {
            links[count++] = linksToBlog;
        }

        return links;

    } catch (ParserException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    } catch (IOException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    }
}
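To see the parsing pipeline on its own, here is a minimal, hypothetical sketch that exercises the same org.htmlparser steps (Page, Lexer, Parser, TagNameFilter, LinkTag) outside the crawler. The class name LinkExtractorSketch, the sample HTML, and the UTF-8 charset are invented for illustration; it assumes htmlparser is on the classpath and simply prints every anchor href instead of consulting blogDetector or the database.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LinkExtractorSketch {

    public static void main(String[] args) throws IOException, ParserException {
        // Invented sample HTML; in the crawler the stream comes from a fetched page on disk.
        String html = "<html><body>"
                + "<a href=\"http://example.org/blog/\">a blog</a>"
                + "<a href=\"http://example.org/about.html\">about us</a>"
                + "</body></html>";
        InputStream in = new ByteArrayInputStream(html.getBytes("UTF-8"));

        // Same pipeline as processBlog(): Page -> Lexer -> Parser, filtered to anchor tags.
        Parser parser = new Parser(new Lexer(new Page(in, "UTF-8")));
        NodeList anchors = parser.parse(new TagNameFilter("a"));

        for (int i = 0; i < anchors.size(); i++) {
            LinkTag tag = (LinkTag) anchors.elementAt(i);
            // processBlog() would hand this URL to blogDetector; here we just print it.
            System.out.println(tag.getLink());
        }
    }
}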
/**
 * Processes and saves a given web page. First this needs to identify whether the given URL is
 * a blog or not. If it is a blog, the processing of the page is handed over to the relevant
 * blog processor, which returns a set of URLs to be fetched by the crawler.
 *
 * <p>Having got the result from the blog processor, this method then saves all the relevant
 * information: the link information, the whole page, etc.
 *
 * @param webPage - the file where the fetched web page is saved
 * @param pageURL - original URL of the web page
 * @return list of URLs to be fetched next
 * @throws BlogCrawlingException if reading, parsing, or saving the page fails
 */
private String[] processPage(File webPage, String pageURL) throws BlogCrawlingException {

    // let's not fetch media files
    if (blogDetector.isMediaFile(pageURL)) {
        return new String[0];
    }

    try {
        // first let's get the blog id. If this URL is not a blog, this should return
        // Constants.NOT_A_BLOG
        // int blogId = blogDetector.identifyURL(pageURL, webPage);

        // if (blogId > 0) { // if this is a processable blog

        // process it and get the set of blog URLs found in the page
        FileInputStream fileInputStream = new FileInputStream(webPage);
        String[] result = processBlog(fileInputStream);

        // save the link connection information.
        if (result.length > 0) {
            System.out.println("[" + myNumber + "] Inserting " + result.length + " links to database ");
            dbManager.insertBlogLinks(pageURL, result);
        }

        fileInputStream.close();

        // return the set of urls to be fetched for further processing
        return result;
        // }

    } catch (FileNotFoundException e) {
        e.printStackTrace();
        dbManager.setBlogProcessingFailed(pageURL);
        throw new BlogCrawlingException(e);
    } catch (IOException e) {
        e.printStackTrace();
        dbManager.setBlogProcessingFailed(pageURL);
        throw new BlogCrawlingException(e);
    } catch (SQLException e) {
        e.printStackTrace();
        dbManager.setBlogProcessingFailed(pageURL);
        throw new BlogCrawlingException(e);
    }
}
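To show how the returned URL list might feed back into the crawl, here is a hedged sketch of a driver loop assumed to live in the same crawler class as processPage(). The crawl() method, the frontier queue, and the temp-file download via java.net.URL are all invented for illustration; the crawler's real fetching and scheduling logic is not shown in this excerpt.

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

// Hypothetical driver: breadth-first crawl seeded with one URL.
private void crawl(String seedUrl) throws BlogCrawlingException, IOException {
    Deque<String> frontier = new ArrayDeque<String>();
    Set<String> visited = new HashSet<String>();
    frontier.add(seedUrl);

    while (!frontier.isEmpty()) {
        String pageURL = frontier.poll();
        if (!visited.add(pageURL)) {
            continue; // skip URLs we have already processed
        }

        // Save the page to a temporary file, since processPage() expects a File on disk.
        Path tmp = Files.createTempFile("crawler-", ".html");
        try (InputStream in = new URL(pageURL).openStream()) {
            Files.copy(in, tmp, StandardCopyOption.REPLACE_EXISTING);
        }

        // processPage() returns the next batch of blog URLs to fetch.
        for (String next : processPage(tmp.toFile(), pageURL)) {
            frontier.add(next);
        }
        Files.deleteIfExists(tmp);
    }
}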