private String[] processBlog(InputStream in) throws BlogCrawlingException { // using a set here to avoid duplicates Set<String> linksToBlogs = new TreeSet<String>(); try { Page page = new Page(in, null); Parser parser = new Parser(new Lexer(page)); // register a filter to extract all the anchor tags TagNameFilter anchorTagsFilter = new TagNameFilter("a"); StringBuffer buf = new StringBuffer(); NodeList anchorTagsList = parser.parse(anchorTagsFilter); for (int i = 0; i < anchorTagsList.size(); i++) { Node node = anchorTagsList.elementAt(i); LinkTag tag = (LinkTag) node; String linkURL = tag.getLink(); if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) { // logger.info(" *BLOG Detected* ==> " + linkURL); System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL); linksToBlogs.add(linkURL); } else { System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL); } } String[] links = new String[linksToBlogs.size()]; int count = 0; for (String linksToBlog : linksToBlogs) { links[count++] = linksToBlog; } return links; } catch (ParserException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (UnsupportedEncodingException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (IOException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } }