  /**
   * Parses the given web page and extracts all the anchor tags in it. Each link is checked
   * against the blog detector, and only the URLs identified as blogs are returned.
   *
   * @param in - input stream of the downloaded web page
   * @return array of blog URLs found in the page, without duplicates
   * @throws BlogCrawlingException if parsing or reading the page fails
   */
  private String[] processBlog(InputStream in) throws BlogCrawlingException {

    // using a set here to avoid duplicates
    Set<String> linksToBlogs = new TreeSet<String>();

    try {

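      // wrap the raw stream in an HTML Parser Page/Lexer so the anchor tags can be extracted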
      Page page = new Page(in, null);
      Parser parser = new Parser(new Lexer(page));

      // register a filter to extract all the anchor tags
      TagNameFilter anchorTagsFilter = new TagNameFilter("a");

      NodeList anchorTagsList = parser.parse(anchorTagsFilter);

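      // go through every anchor tag and keep only the links identified as blogs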
      for (int i = 0; i < anchorTagsList.size(); i++) {
        Node node = anchorTagsList.elementAt(i);
        LinkTag tag = (LinkTag) node;
        String linkURL = tag.getLink();

        if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) {
          // logger.info(" *BLOG Detected* ==> " + linkURL);
          System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL);
          linksToBlogs.add(linkURL);
        } else {
          System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL);
        }
      }

      // copy the collected links into an array to return
      return linksToBlogs.toArray(new String[linksToBlogs.size()]);

    } catch (ParserException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (IOException e) {
      // UnsupportedEncodingException is an IOException, so it is handled here as well
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    }
  }
  /**
   * This will process/save a given web page. First it needs to identify whether the given URL is
   * a blog or not. If it is a blog, it hands the processing of the page over to the relevant
   * BlogProcessor, which will return a set of URLs to be fetched by the crawler.
   *
   * <p>Once the blog processor returns, this method saves all the relevant information, such as
   * link information, the whole page, etc.
   *
   * @param webPage - the file where the downloaded web page is saved
   * @param pageURL - original URL of the web page
   * @return list of URLs to be fetched
   * @throws BlogCrawlingException
   */
  private String[] processPage(File webPage, String pageURL) throws BlogCrawlingException {

    // let's not fetch media files
    if (blogDetector.isMediaFile(pageURL)) return new String[0];

    try {

      // first let's get the blog id. If this URL is not a blog, this should return
      // Constants.NOT_A_BLOG
      //            int blogId = blogDetector.identifyURL(pageURL, webPage);

      //            if (blogId > 0) {             // if this is a processable blog

      // process the page and get the set of links found in it. The array returned contains
      // the URLs that were identified as blogs.
      FileInputStream fileInputStream = new FileInputStream(webPage);
      String[] result;
      try {
        result = processBlog(fileInputStream);
      } finally {
        // make sure the stream gets closed even if processing fails
        fileInputStream.close();
      }

      // save the link connection information.
      if (result.length > 0) {
        System.out.println("[" + myNumber + "] Inserting " + result.length + " links to database ");
        dbManager.insertBlogLinks(pageURL, result);
      }

      // return the set of urls to be fetched for further processing
      return result;

      //            }

    } catch (IOException e) {
      // FileNotFoundException is an IOException, so it is handled here as well
      e.printStackTrace();
      dbManager.setBlogProcessingFailed(pageURL);
      throw new BlogCrawlingException(e);
    } catch (SQLException e) {
      e.printStackTrace();
      dbManager.setBlogProcessingFailed(pageURL);
      throw new BlogCrawlingException(e);
    }
  }