private void getImgLinks(HTML dequeuedHtml) { // TODO Auto-generated method stub String[] allValidExtension = m_ConfigData.getImageExtensions(); StringBuilder extenstionAddToRegex = new StringBuilder(); extenstionAddToRegex.append("("); for (int i = 0; i < allValidExtension.length; i++) { if (i == allValidExtension.length - 1) { extenstionAddToRegex.append(allValidExtension[i].toLowerCase() + "|"); extenstionAddToRegex.append(allValidExtension[i].toUpperCase() + ")"); } else { extenstionAddToRegex.append(allValidExtension[i].toLowerCase() + "|"); extenstionAddToRegex.append(allValidExtension[i].toUpperCase() + "|"); } } String regex = "<(img|IMG)\\s+(src|SRC)=\"(.*?\\." + extenstionAddToRegex.toString() + ")\""; System.out.println("Regex for image is " + regex); Pattern pattern = Pattern.compile(regex); Matcher m = pattern.matcher(dequeuedHtml.GetBody()); System.out.println("fetching image links"); while (m.find()) { String link = m.group(3); System.out.println("enqueue to Downloader: " + link); if (!m_AllTraversedLinks.Exists(link)) { // m_DownloadQueue.enqueue(link); m_DownloadQueue.enqueue(link); // m_AllTraversedLinks.enqueue(link); System.out.println("Analyzer: adding image link: " + link); } } System.out.println("Done with fetching image links"); }
private void getUrlLinks(HTML dequeuedHtml) { // String regex = "<(a|A) (href|HREF)=\"(http|HTTP)://" + m_Domain + // "/(.*?)\">"; String regex = "<(a|A)\\s+(href|HREF)=\"(?!#)(.*?)\">"; // TODO: do HTTP // prefix means // its external? Pattern pattern = Pattern.compile(regex); Matcher m = pattern.matcher(dequeuedHtml.GetBody()); System.out.println("fetching links"); while (m.find()) { String link = m.group(3); System.out.println("enqueue to Downloader: " + link); if (!m_AllTraversedLinks.Exists(link)) { // m_DownloadQueue.enqueue(link); m_DownloadQueue.enqueue(link); // m_AllTraversedLinks.enqueue(link); System.out.println("Analyzer: adding link: " + link); } } System.out.println("Done with fetching links"); }