public Worker(String url, boolean verbose) throws Exception { Document doc; doc = Jsoup.connect(url).get(); // select anchors with href only Elements links = doc.select("a[href]"); String l_Href; String host; int linksNum; Parser parser; for (Element link : links) { // absolute = http:// added l_Href = link.attr("abs:href"); if (!l_Href.isEmpty()) { parser = new Parser(l_Href); host = parser.getHost(); // if tempStats contains the url, add one to the value if (tempStats.containsKey(host)) { linksNum = tempStats.get(host); tempStats.put(host, linksNum += 1); } // if it doesn't, add it else { tempStats.put(host, 1); } // parse the url tempQueue.add(parser.getURL()); } } if (verbose) { System.out.println( Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url); } }
public static List genSitemap(String mapUrl, String base) { try { Document doc = Jsoup.connect(mapUrl).get(); Elements links = doc.select("a"); Elements imgs = doc.select("img"); List<String> stringLinks = new ArrayList<String>(); for (Element link : links) { stringLinks.add(link.attr("abs:href")); } Iterator<String> domIt = stringLinks.iterator(); // filter out links to external domains while (domIt.hasNext()) { String incDom = domIt.next(); boolean domTest; domTest = incDom.contains(base); if (domTest == false) { domIt.remove(); } } Iterator<String> i = stringLinks.iterator(); while (i.hasNext()) { // remove index.html from incoming links prevents infinite loop String incA = i.next(); if (incA.contains("index")) { i.remove(); } } return stringLinks; } catch (Exception e) { // System.out.println(e); return null; } }
/**
 * Appends the ancestors of {@code el} to {@code parents}, nearest ancestor first.
 * The climb stops when there is no parent left or when the parent's tag name is
 * {@code "#root"}; that parent is not added.
 *
 * @param el      element whose ancestors are collected
 * @param parents collection receiving the ancestors
 */
private static void accumulateParents(Element el, Elements parents) {
    Element ancestor = el.parent();
    while (ancestor != null && !ancestor.tagName().equals("#root")) {
        parents.add(ancestor);
        ancestor = ancestor.parent();
    }
}
static boolean preserveWhitespace(Node node) { // looks only at this element and one level up, to prevent recursion & needless stack searches if (node != null && node instanceof Element) { Element element = (Element) node; return element.tag.preserveWhitespace() || element.parent() != null && element.parent().tag.preserveWhitespace(); } return false; }
/**
 * Checks whether this element, or any element nested inside it, contains text that is
 * not blank.
 *
 * @return {@code true} if some non-blank text content exists; {@code false} otherwise
 */
public boolean hasText() {
    for (Node child : childNodes) {
        boolean carriesText;
        if (child instanceof TextNode) {
            carriesText = !((TextNode) child).isBlank();
        } else if (child instanceof Element) {
            // descend into nested elements
            carriesText = ((Element) child).hasText();
        } else {
            carriesText = false;
        }
        if (carriesText) {
            return true;
        }
    }
    return false;
}
/**
 * Returns the concatenated data held by this element and its descendant elements.
 * Data is, for example, the content inside a {@code script} tag.
 *
 * @return the combined data in child order, or an empty string when there is none
 * @see #dataNodes()
 */
public String data() {
    StringBuilder combined = new StringBuilder();
    for (Node child : childNodes) {
        if (child instanceof DataNode) {
            combined.append(((DataNode) child).getWholeData());
            continue;
        }
        if (child instanceof Element) {
            // nested elements contribute their own combined data
            combined.append(((Element) child).data());
        }
    }
    return combined.toString();
}
/**
 * Downloads the page at {@code mapUrl} and lists the absolute sources of its images.
 *
 * <p>The first entry of the result is always {@code mapUrl} itself; the image sources
 * follow in document order. Sources containing the text {@code "paypal"} are left out.
 *
 * @param mapUrl address of the page to scan
 * @return the page url followed by the retained image sources (a list of {@code String}),
 *         or {@code null} if an exception occurred (the exception is printed to stdout)
 */
public static List getImgs(String mapUrl) {
    try {
        Elements images = Jsoup.connect(mapUrl).get().select("img");

        List<String> collected = new ArrayList<String>();
        collected.add(mapUrl);

        for (Element image : images) {
            String source = image.attr("abs:src");
            if (source.contains("paypal")) {
                continue;
            }
            collected.add(source);
        }
        return collected;
    } catch (Exception e) {
        System.out.println(e);
        return null;
    }
}
/**
 * Visits {@code el} and all of its descendant elements depth-first, recording every
 * heading ({@code h1} to {@code h6}) it meets.
 *
 * <p>For each heading the section counter is incremented and the filtered heading text is
 * stored in {@code dataStore} under the key
 * {@code "SectionTitle/" + title + "/" + hash + "/" + sectionIndex}.
 *
 * @param el element at which the walk starts
 * @return the section counter after this element and its subtree have been processed
 */
public int walk(Element el) {
    String tagName = el.tagName().toLowerCase();
    if (tagName.matches("h[1-6]")) {
        secIndex++;
        String secName = el.text();
        // Integer.toString replaces the deprecated new Integer(int) boxing constructor
        String key = "SectionTitle" + "/" + title + "/" + hash + "/" + Integer.toString(secIndex);
        String value = secNameFilter(secName);
        dataStore.put(key, value);
    }
    for (Element child : el.children()) {
        // each child is walked by a fresh Walker seeded with the current counter
        secIndex = (new Walker(secIndex)).walk(child);
    }
    return secIndex;
}
/**
 * Downloads the sample table page and prints the text of the second cell of every row of
 * the element with id {@code datatable}.
 *
 * <p>If the page cannot be downloaded the stack trace is printed and nothing else happens;
 * if the page has no {@code datatable} element a message is written to stderr. Rows with
 * fewer than two cells are skipped.
 */
public Scraper() {
    Document doc;
    try {
        doc =
            Jsoup.connect(
                    "http://www.geog.leeds.ac.uk/courses/other/programming/practicals/general/web/scraping-intro/table.html")
                .get();
    } catch (IOException ioe) {
        ioe.printStackTrace();
        // without a document there is nothing to scrape; avoid dereferencing null below
        return;
    }

    Element table = doc.getElementById("datatable");
    if (table == null) {
        System.err.println("No element with id 'datatable' found");
        return;
    }

    for (Element row : table.getElementsByTag("TR")) {
        Elements tds = row.getElementsByTag("TD");
        // only the second cell (index 1) is printed
        if (tds.size() > 1) {
            System.out.println(tds.get(1).text());
        }
    }
}
public static void main(String[] args) { Document doc = null; try { // get page doc = (Document) Jsoup.connect("http://fskm.uitm.edu.my/v1/fakulti/staff-directory/academic/1097.html") .get(); } catch (IOException ex) { ex.printStackTrace(); } // Get Element with specific ID Element table = doc.getElementById("mytable"); // Get text inside Element Elements rows = table.getElementsByTag("TR"); for (Element row : rows) { Elements tds = row.getElementsByTag("TD"); for (int i = 0; i < tds.size(); i++) { if (i == 1) System.out.println(tds.get(i).text()); } } }
/**
 * Builds the deck-list URL from the filters in {@code deckBrowserRequest}, downloads the
 * page and converts the rows of {@code table#decks} into {@code HSDeck} objects.
 *
 * <p>Query parameters are appended in this order, each only when its filter is set:
 * sorting key (non-blank), deck name (non-blank), class filter (non-empty and not
 * containing {@code HSPlayerClass.ALL}; the values of the single classes are summed),
 * order-by (non-empty, prefixed with {@code "-"} when the request is not ascending).
 *
 * <p>The first and the last selected row are not converted.
 *
 * @param deckBrowserRequest filter and ordering settings for the request
 * @return the decks read from the page; if an {@code IOException} occurs its stack trace is
 *         printed and the decks collected so far (possibly none) are returned
 */
@Override
public List<HSDeck> getDeckListFiltered(final DeckBrowserRequest deckBrowserRequest) {
    final List<HSPlayerClass> classFilter = deckBrowserRequest.getClassFilter();
    final ArrayList<HSDeck> decks = new ArrayList<HSDeck>();
    try {
        String url = HPDeckSource.BASE_URL + HPDeckSource.DECKS_URL;

        final String sortingKey = deckBrowserRequest.getSortingKey();
        if (sortingKey != null && !sortingKey.trim().isEmpty()) {
            url += "&" + HP_REQUEST_PARAMS.FILTER_OPTION.requestParam + sortingKey;
        }

        final String deckNameFilter = deckBrowserRequest.getDeckNameFilter();
        if (deckNameFilter != null && !deckNameFilter.trim().isEmpty()) {
            url += "&" + HP_REQUEST_PARAMS.FILTER_SEARCH.requestParam
                + this.constructDeckNameFilter(deckNameFilter);
        }

        if (classFilter != null
            && classFilter.size() > 0
            && !classFilter.contains(HSPlayerClass.ALL)) {
            // the class parameter is the sum of the filter values of all single classes
            int classMask = 0;
            for (final HSPlayerClass playerClass : classFilter) {
                if (playerClass.isSingleClass()) {
                    classMask += playerClass.getHsFilterValue();
                }
            }
            url += "&" + HP_REQUEST_PARAMS.FILTER_CLASS.requestParam + classMask;
        }

        final String orderBy = deckBrowserRequest.getOrderBy();
        if (orderBy != null && !orderBy.isEmpty()) {
            // a leading "-" marks descending order
            final String direction = deckBrowserRequest.isAsc() ? "" : "-";
            url += "&" + HP_REQUEST_PARAMS.FILTER_SORT.requestParam + direction + orderBy;
        }

        final Elements rows = Jsoup.connect(url)
            .referrer(HPDeckSource.BASE_URL + "/")
            .followRedirects(true)
            .ignoreHttpErrors(true)
            .get()
            .select("table#decks tr");

        // rows 1 .. size-2: the first and the last row are skipped
        final int lastExclusive = rows.size() - 1;
        for (int rowIndex = 1; rowIndex < lastExclusive; ++rowIndex) {
            final Element row = rows.get(rowIndex);
            final Elements nameLinks = row.select("td.col-name span.tip a");
            final Elements deckTypes = row.select("td.col-deck-type");
            final Elements playerClasses = row.select("td.col-class");
            final Elements ratings = row.select("td.col-ratings div.rating-sum");
            final Elements dustCosts = row.select("td.col-dust-cost");
            final Elements updatedMarks = row.select("td.col-updated abbr");

            final HSDeck deck = new HSDeck();
            final Element nameLink = nameLinks.get(0);
            deck.setName(nameLink.text());
            deck.setUrl(nameLink.attr("href"));
            deck.setType(deckTypes.get(0).text());
            deck.setPlayerClass(playerClasses.get(0).text());
            deck.setRating(ratings.get(0).text());
            deck.setCost(dustCosts.get(0).text());

            final Element updated = updatedMarks.get(0);
            if (updated.hasAttr("data-epoch")) {
                deck.setLastUpdate(updated.attributes().get("data-epoch"));
            }
            deck.setLastUpdateAsString(updated.text());

            decks.add(deck);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return decks;
}
@Override public Element clone() { Element clone = (Element) super.clone(); clone.classNames = null; // derived on first hit, otherwise gets a pointer to source classnames return clone; }