/**
 * Processes and saves a given web page. First it identifies whether the given URL is a blog
 * or not. If it is a blog, it hands the page over to the relevant BlogProcessor, which returns
 * a set of URLs to be fetched by the crawler.
 *
 * <p>Once the blog processor returns, this method saves all the relevant information, such as
 * the link information and the whole page.
 *
 * @param webPage the file in which the web page is saved
 * @param pageURL the original URL of the web page
 * @return the list of URLs to be fetched for further processing
 * @throws BlogCrawlingException if reading the page or saving its links fails
 */
private String[] processPage(File webPage, String pageURL) throws BlogCrawlingException {

  // let's not fetch media files
  if (blogDetector.isMediaFile(pageURL)) {
    return new String[0];
  }

  // first let's get the blog id. If this URL is not a blog, this should return
  // Constants.NOT_A_BLOG
  // int blogId = blogDetector.identifyURL(pageURL, webPage);
  // if (blogId > 0) { // if this is a processable blog

  // Process the page and get the set of URLs it links to. try-with-resources guarantees
  // the stream is closed even if processBlog(...) throws.
  try (FileInputStream fileInputStream = new FileInputStream(webPage)) {
    String[] result = processBlog(fileInputStream);

    // save the link connection information
    if (result.length > 0) {
      System.out.println("[" + myNumber + "] Inserting " + result.length + " links to database");
      dbManager.insertBlogLinks(pageURL, result);
    }

    // return the set of URLs to be fetched for further processing
    return result;
    // }
  } catch (IOException | SQLException e) {
    // FileNotFoundException is a subclass of IOException, so a single multi-catch
    // covers every failure here: mark the blog failed and rethrow as a crawl error
    e.printStackTrace();
    dbManager.setBlogProcessingFailed(pageURL);
    throw new BlogCrawlingException(e);
  }
}
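BlogCrawlingException itself is not shown in this snippet; since it is constructed with a cause
(new BlogCrawlingException(e)), a minimal sketch of a checked wrapper exception consistent with
that usage would be:

public class BlogCrawlingException extends Exception {
  // chains the underlying I/O or SQL failure so callers can inspect the root cause
  public BlogCrawlingException(Throwable cause) {
    super(cause);
  }
}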
public void run() {
  System.out.println("[" + myNumber + "] Starting Blog Processor ...");

  while (true) {
    try {
      // fetch the next blog URL to process
      BlogInfo blogInfo = dbManager.getNextBlogToProcess();

      if (blogInfo != null) {
        // locate the saved page file and process it
        System.out.println(
            "[" + myNumber + "] Starting to process blog [" + blogInfo.getUrl() + " ]");
        File blogPage = new File(blogFileStore, blogInfo.getFileName());
        try {
          processPage(blogPage, blogInfo.getUrl());
          System.out.println(
              "[" + myNumber + "] Finished processing blog [" + blogInfo.getUrl() + " ]");
        } catch (BlogCrawlingException e) {
          dbManager.setBlogProcessingFailed(blogInfo.getUrl());
          System.out.println("Blog processing failed [" + blogInfo.getUrl() + " ]");
        }
      } else {
        // nothing queued; back off for one minute before polling again
        try {
          Thread.sleep(1000 * 60);
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      }
    } catch (Throwable t) {
      // Exception is a subclass of Throwable, so a single catch is enough to keep
      // the worker alive through any failure in an iteration
      t.printStackTrace();
    }
  }
}
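For context, run() implies this class implements Runnable and that each worker polls the
database in its own thread. A minimal wiring sketch, assuming a hypothetical
BlogProcessorWorker class; the class name, constructor parameters, and DbManager type below
are assumptions, not from the original:

import java.io.File;

public class CrawlerMain {
  public static void main(String[] args) {
    // All names here are illustrative: the snippet only shows that a worker uses a
    // dbManager, a blogFileStore directory, and a worker number (myNumber).
    DbManager dbManager = new DbManager();       // hypothetical DB facade
    File blogFileStore = new File("blog-pages"); // directory of saved pages
    int workerCount = 4;                         // pool size is a free choice

    for (int i = 0; i < workerCount; i++) {
      // each worker loops forever in run(), sleeping when the queue is empty
      new Thread(new BlogProcessorWorker(i, blogFileStore, dbManager)).start();
    }
  }
}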