public static List genSitemap(String mapUrl, String base) { try { Document doc = Jsoup.connect(mapUrl).get(); Elements links = doc.select("a"); Elements imgs = doc.select("img"); List<String> stringLinks = new ArrayList<String>(); for (Element link : links) { stringLinks.add(link.attr("abs:href")); } Iterator<String> domIt = stringLinks.iterator(); // filter out links to external domains while (domIt.hasNext()) { String incDom = domIt.next(); boolean domTest; domTest = incDom.contains(base); if (domTest == false) { domIt.remove(); } } Iterator<String> i = stringLinks.iterator(); while (i.hasNext()) { // remove index.html from incoming links prevents infinite loop String incA = i.next(); if (incA.contains("index")) { i.remove(); } } return stringLinks; } catch (Exception e) { // System.out.println(e); return null; } }
public Worker(String url, boolean verbose) throws Exception { Document doc; doc = Jsoup.connect(url).get(); // select anchors with href only Elements links = doc.select("a[href]"); String l_Href; String host; int linksNum; Parser parser; for (Element link : links) { // absolute = http:// added l_Href = link.attr("abs:href"); if (!l_Href.isEmpty()) { parser = new Parser(l_Href); host = parser.getHost(); // if tempStats contains the url, add one to the value if (tempStats.containsKey(host)) { linksNum = tempStats.get(host); tempStats.put(host, linksNum += 1); } // if it doesn't, add it else { tempStats.put(host, 1); } // parse the url tempQueue.add(parser.getURL()); } } if (verbose) { System.out.println( Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url); } }
/**
 * Fetches the page at {@code mapUrl} and lists the absolute sources of its
 * {@code img} elements.
 *
 * <p>The first entry of the result is always {@code mapUrl} itself; the image
 * sources follow in document order. Any source containing the substring
 * {@code "paypal"} is left out.
 *
 * @param mapUrl URL of the page to scan
 * @return a list whose first element is {@code mapUrl} followed by the
 *         retained image URLs, or {@code null} if any exception occurred
 *         (the exception is printed to standard output)
 */
public static List getImgs(String mapUrl) {
    try {
        Document page = Jsoup.connect(mapUrl).get();
        Elements imageTags = page.select("img");

        List<String> result = new ArrayList<String>();
        // The page URL itself leads the list, ahead of the image URLs.
        result.add(mapUrl);

        for (Element tag : imageTags) {
            // "abs:" asks Jsoup for the absolute form of the src.
            String source = tag.attr("abs:src");
            if (!source.contains("paypal")) {
                result.add(source);
            }
        }
        return result;
    } catch (Exception e) {
        System.out.println(e);
        return null;
    }
}