예제 #1
0
  /**
   * Processes a saved web page and stores the links extracted from it.
   *
   * <p>URLs that {@code blogDetector} classifies as media files are skipped and yield an empty
   * array. Every other page is opened and handed to {@code processBlog}; when that returns at
   * least one link, the links are stored through {@code dbManager.insertBlogLinks} under {@code
   * pageURL}. No blog / non-blog detection is applied here: every non-media page is treated as
   * processable.
   *
   * <p>If reading the file or storing the links fails, the stack trace is printed, the page is
   * flagged through {@code dbManager.setBlogProcessingFailed} and the error is rethrown wrapped
   * in a {@link BlogCrawlingException}.
   *
   * @param webPage - the location where the web site is saved.
   * @param pageURL - original url of the web page
   * @return the links returned by {@code processBlog}, or an empty array for media files.
   * @throws BlogCrawlingException if the saved page cannot be read or the links cannot be stored
   */
  private String[] processPage(File webPage, String pageURL) throws BlogCrawlingException {

    // let's not fetch media files
    if (blogDetector.isMediaFile(pageURL)) {
      return new String[0];
    }

    // try-with-resources closes the stream on every exit path. Closing it only after a
    // successful run leaked the file handle whenever processBlog or the insert threw.
    try (FileInputStream fileInputStream = new FileInputStream(webPage)) {

      String[] result = processBlog(fileInputStream);

      // save the link connection information.
      if (result.length > 0) {
        System.out.println("[" + myNumber + "] Inserting " + result.length + " links to database ");
        dbManager.insertBlogLinks(pageURL, result);
      }

      // return the set of urls to be fetched for further processing
      return result;

    } catch (IOException | SQLException e) {
      // FileNotFoundException is an IOException, so a missing file is handled here as well.
      e.printStackTrace();
      dbManager.setBlogProcessingFailed(pageURL);
      throw new BlogCrawlingException(e);
    }
  }
예제 #2
0
  /**
   * Worker loop of the blog processor.
   *
   * <p>Repeatedly asks {@code dbManager} for the next blog to process. When one is available,
   * its saved page is resolved under {@code blogFileStore} and passed to {@code processPage}; a
   * {@link BlogCrawlingException} marks the blog as failed and the loop moves on. When nothing is
   * available the thread sleeps for one minute before asking again.
   *
   * <p>Any other exception or error raised by an iteration is printed and the loop continues.
   * The loop ends only when the thread is interrupted.
   */
  public void run() {

    // pause between polls when there is nothing to process
    final long idleSleepMillis = 1000L * 60L;

    System.out.println("[" + myNumber + "] Starting Blog Processor ...");
    while (!Thread.currentThread().isInterrupted()) {
      try {
        // fetch blog url to process
        BlogInfo blogInfo = dbManager.getNextBlogToProcess();

        if (blogInfo == null) {
          try {
            Thread.sleep(idleSleepMillis);
          } catch (InterruptedException e) {
            // Interruption is the only way to stop this worker. Restore the flag for the
            // caller and leave; swallowing it would make the thread unstoppable.
            Thread.currentThread().interrupt();
            return;
          }
          continue;
        }

        // find the file and process it
        System.out.println(
            "[" + myNumber + "] Starting to process blog [" + blogInfo.getUrl() + " ]");
        File blogPage = new File(blogFileStore, blogInfo.getFileName());
        try {
          processPage(blogPage, blogInfo.getUrl());
          System.out.println(
              "[" + myNumber + "] Finished processing Blog [" + blogInfo.getUrl() + " ]");
        } catch (BlogCrawlingException e) {
          dbManager.setBlogProcessingFailed(blogInfo.getUrl());
          System.out.println("Blog processing failed [" + blogInfo.getUrl() + " ]");
        }
      } catch (Throwable t) {
        // Deliberately broad: a failure in one iteration must not kill the worker thread.
        t.printStackTrace();
      }
    }
  }