/**
 * Loads the detail page of {@code hsDeck} and stores its class cards, neutral cards and
 * description on the deck.
 *
 * <p>The page is fetched from {@code HPDeckSource.BASE_URL + hsDeck.getUrl()}. Card cells are read
 * from the {@code section.class-listing} and {@code section.neutral-listing} tables; the
 * description comes from {@code div.deck-description}.
 *
 * @param hsDeck deck to populate; its URL is read and its item maps, item lists and description
 *     are set
 * @param n passed through unchanged to {@code HtmlHelper.parseDescription}
 * @return the same {@code hsDeck} instance; if an {@link IOException} occurs its stack trace is
 *     printed and the deck is returned with whatever had been set up to that point
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
  try {
    final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

    final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
    final ArrayList<String> classNames = new ArrayList<String>();
    collectCardCells(
        page.select("section.class-listing table.listing td.col-name"),
        classHsItemMap,
        classNames);
    hsDeck.setClassHsItemMap(classHsItemMap);
    hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classNames));

    final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
    final ArrayList<String> neutralNames = new ArrayList<String>();
    collectCardCells(
        page.select("section.neutral-listing table.listing td.col-name"),
        neutralHsItemMap,
        neutralNames);
    hsDeck.setNeutralHsItemMap(neutralHsItemMap);
    hsDeck.setNeutralHsItemList(DataBaseManager.getInstance().getAllCardsByNames(neutralNames));

    hsDeck.setDescription(
        HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
    return hsDeck;
  } catch (IOException ex) {
    ex.printStackTrace();
    return hsDeck;
  }
}

/**
 * Reads every card cell into {@code items} (link text mapped to the last character of the trimmed
 * cell text) and appends the link text to {@code names} in document order.
 *
 * <p>Cells without a link or without any text are skipped, so a single malformed row does not
 * abort the whole deck with an unchecked index exception.
 */
private void collectCardCells(
    final Elements cells, final HashMap<String, String> items, final ArrayList<String> names) {
  for (int i = 0; i < cells.size(); ++i) {
    final Elements links = cells.get(i).select("a");
    final String cellText = cells.get(i).text().trim();
    if (links.size() == 0 || cellText.length() == 0) {
      continue;
    }
    final String name = links.get(0).text();
    // The trailing character is stored as the value (presumably the card count; confirm against
    // the page markup).
    items.put(name, cellText.substring(cellText.length() - 1));
    names.add(name);
  }
}
@Test public void testPseudoHas() { Document doc = Jsoup.parse( "<div id=0><p><span>Hello</span></p></div> <div id=1><span class=foo>There</span></div> <div id=2><p>Not</p></div>"); Elements divs1 = doc.select("div:has(span)"); assertEquals(2, divs1.size()); assertEquals("0", divs1.get(0).id()); assertEquals("1", divs1.get(1).id()); Elements divs2 = doc.select("div:has([class]"); assertEquals(1, divs2.size()); assertEquals("1", divs2.get(0).id()); Elements divs3 = doc.select("div:has(span, p)"); assertEquals(3, divs3.size()); assertEquals("0", divs3.get(0).id()); assertEquals("1", divs3.get(1).id()); assertEquals("2", divs3.get(2).id()); Elements els1 = doc.body().select(":has(p)"); assertEquals(3, els1.size()); // body, div, dib assertEquals("body", els1.first().tagName()); assertEquals("0", els1.get(1).id()); assertEquals("2", els1.get(2).id()); }
/**
 * Expects {@code div p:gt(0)} to pick the second and third paragraphs of the first div and
 * nothing from the second div, which holds a single paragraph.
 */
@Test
public void testPseudoGreaterThan() {
  final String html = "<div><p>One</p><p>Two</p><p>Three</p></div><div><p>Four</p>";
  final String[] expectedTexts = {"Two", "Three"};

  final Document parsed = Jsoup.parse(html);
  final Elements matched = parsed.select("div p:gt(0)");

  assertEquals(2, matched.size());
  for (int idx = 0; idx < expectedTexts.length; idx++) {
    assertEquals(expectedTexts[idx], matched.get(idx).text());
  }
}
/**
 * Expects {@code div p:lt(2)} to pick "One" and "Two" from the first div and "Four" from the
 * second div.
 */
@Test
public void testPseudoLessThan() {
  // The fixture closes its third paragraph with "</>p" as written; it is kept verbatim.
  final String html = "<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>";
  final String[] expectedTexts = {"One", "Two", "Four"};

  final Document parsed = Jsoup.parse(html);
  final Elements matched = parsed.select("div p:lt(2)");

  assertEquals(3, matched.size());
  for (int idx = 0; idx < expectedTexts.length; idx++) {
    assertEquals(expectedTexts[idx], matched.get(idx).text());
  }
}
/**
 * Expects the adjacent-sibling query {@code li + li} to return the second and third list items
 * of a three-item list.
 */
@Test
public void adjacentSiblings() {
  final Document parsed = Jsoup.parse("<ol><li>One<li>Two<li>Three</ol>");

  final Elements followers = parsed.select("li + li");

  assertEquals(2, followers.size());
  assertEquals("Two", followers.get(0).text());
  assertEquals("Three", followers.get(1).text());
}
/**
 * Expects {@code Elements.parents()} on the two paragraphs to yield three ancestors, in the
 * order div, body, html.
 */
@Test
public void parents() {
  final Document parsed = Jsoup.parse("<div><p>Hello</p></div><p>There</p>");
  final Elements paragraphs = parsed.select("p");

  final Elements ancestors = paragraphs.parents();

  final String[] expectedTags = {"div", "body", "html"};
  assertEquals(3, ancestors.size());
  for (int idx = 0; idx < expectedTags.length; idx++) {
    assertEquals(expectedTags[idx], ancestors.get(idx).tagName());
  }
}
/**
 * Expects {@code div > *} to return the direct children of both divs: two paragraphs, then the
 * span.
 */
@Test
public void parentChildStar() {
  final String markup =
      "<div id=1><p>Hello<p><b>there</b></p></div><div id=2><span>Hi</span></div>";
  final Document parsed = Jsoup.parse(markup);

  final Elements children = parsed.select("div > *");

  final String[] expectedTags = {"p", "p", "span"};
  assertEquals(3, children.size());
  for (int idx = 0; idx < expectedTags.length; idx++) {
    assertEquals(expectedTags[idx], children.get(idx).tagName());
  }
}
/**
 * Expects the grouped query {@code [id],[title=foo]} to return the two divs with an id followed
 * by the div whose title is "foo".
 */
@Test
public void testGroupOrAttribute() {
  final String markup = "<div id=1 /><div id=2 /><div title=foo /><div title=bar />";

  final Elements matched = Jsoup.parse(markup).select("[id],[title=foo]");

  assertEquals(3, matched.size());
  assertEquals("1", matched.get(0).id());
  assertEquals("2", matched.get(1).id());
  assertEquals("foo", matched.get(2).attr("title"));
}
/**
 * Expects a query mixing child, descendant and adjacent-sibling combinators
 * ({@code body > div.foo li + li}) to return the second and third list items.
 */
@Test
public void mixCombinator() {
  final Document parsed = Jsoup.parse("<div class=foo><ol><li>One<li>Two<li>Three</ol></div>");

  final Elements followers = parsed.select("body > div.foo li + li");

  final String[] expectedTexts = {"Two", "Three"};
  assertEquals(2, followers.size());
  for (int idx = 0; idx < expectedTexts.length; idx++) {
    assertEquals(expectedTexts[idx], followers.get(idx).text());
  }
}
/**
 * Expects the attribute-regex query to match the three images whose {@code src} ends in png, jpg
 * or JPEG (the pattern carries the case-insensitive flag), and to skip the gif and the image
 * without a {@code src}.
 */
@Test
public void testByAttributeRegex() {
  final String markup =
      "<p><img src=foo.png id=1><img src=bar.jpg id=2><img src=qux.JPEG id=3><img src=old.gif><img></p>";
  final Document parsed = Jsoup.parse(markup);

  final Elements images = parsed.select("img[src~=(?i)\\.(png|jpe?g)]");

  final String[] expectedIds = {"1", "2", "3"};
  assertEquals(3, images.size());
  for (int idx = 0; idx < expectedIds.length; idx++) {
    assertEquals(expectedIds[idx], images.get(idx).id());
  }
}
/**
 * Expects {@code #foo} to return both paragraphs sharing the id "foo", and to return nothing for
 * a document that has no such id.
 */
@Test
public void testById() {
  final String duplicatedIds = "<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>";
  final Elements matched = Jsoup.parse(duplicatedIds).select("#foo");
  assertEquals(2, matched.size());
  assertEquals("Hello", matched.get(0).text());
  assertEquals("Foo two!", matched.get(1).text());

  final String otherId = "<div id=1></div>";
  final Elements unmatched = Jsoup.parse(otherId).select("#foo");
  assertEquals(0, unmatched.size());
}
/**
 * Expects the grouped query {@code .foo > ol, ol > li + li} to return the list itself followed
 * by its second and third items.
 */
@Test
public void mixCombinatorGroup() {
  final Document parsed = Jsoup.parse("<div class=foo><ol><li>One<li>Two<li>Three</ol></div>");

  final Elements matched = parsed.select(".foo > ol, ol > li + li");

  assertEquals(3, matched.size());
  assertEquals("ol", matched.get(0).tagName());
  assertEquals("Two", matched.get(1).text());
  assertEquals("Three", matched.get(2).text());
}
/**
 * Checks bulk HTML mutation on a selection: prepend and append wrap the existing content, and
 * {@code html(...)} replaces it. Both results are verified on the second paragraph.
 */
@Test
public void setHtml() {
  final Document parsed = Jsoup.parse("<p>One</p><p>Two</p><p>Three</p>");
  final Elements paragraphs = parsed.select("p");

  paragraphs.prepend("<b>Bold</b>").append("<i>Ital</i>");
  final String wrapped = TextUtil.stripNewlines(paragraphs.get(1).outerHtml());
  assertEquals("<p><b>Bold</b>Two<i>Ital</i></p>", wrapped);

  paragraphs.html("<span>Gone</span>");
  final String replaced = TextUtil.stripNewlines(paragraphs.get(1).outerHtml());
  assertEquals("<p><span>Gone</span></p>", replaced);
}
/**
 * Expects {@code div p:eq(0)} to return the first paragraph of each div, and
 * {@code div:eq(0) p:eq(0)} to return only the first paragraph of the first div.
 */
@Test
public void testPseudoEquals() {
  // The fixture closes its third paragraph with "</>p" as written; it is kept verbatim.
  final String html = "<div><p>One</p><p>Two</p><p>Three</>p></div><div><p>Four</p>";
  final Document parsed = Jsoup.parse(html);

  final Elements firstOfEachDiv = parsed.select("div p:eq(0)");
  assertEquals(2, firstOfEachDiv.size());
  assertEquals("One", firstOfEachDiv.get(0).text());
  assertEquals("Four", firstOfEachDiv.get(1).text());

  final Elements firstOfFirstDiv = parsed.select("div:eq(0) p:eq(0)");
  assertEquals(1, firstOfFirstDiv.size());
  assertEquals("One", firstOfFirstDiv.get(0).text());
  assertEquals("p", firstOfFirstDiv.get(0).tagName());
}
@Test public void parentChildElement() { String h = "<div id=1><div id=2><div id = 3></div></div></div><div id=4></div>"; Document doc = Jsoup.parse(h); Elements divs = doc.select("div > div"); assertEquals(2, divs.size()); assertEquals("2", divs.get(0).id()); // 2 is child of 1 assertEquals("3", divs.get(1).id()); // 3 is child of 2 Elements div2 = doc.select("div#1 > div"); assertEquals(1, div2.size()); assertEquals("2", div2.get(0).id()); }
/**
 * Checks class queries and bulk class edits on a selection of two paragraphs: {@code hasClass}
 * before any change, then the class names left after adding "blue", removing "yellow" and
 * toggling "mellow".
 */
@Test
public void classes() {
  final String markup = "<div><p class='mellow yellow'></p><p class='red green'></p>";
  final Document parsed = Jsoup.parse(markup);
  final Elements paragraphs = parsed.select("p");

  assertTrue(paragraphs.hasClass("red"));
  assertFalse(paragraphs.hasClass("blue"));

  paragraphs.addClass("blue");
  paragraphs.removeClass("yellow");
  paragraphs.toggleClass("mellow");

  final String firstClasses = paragraphs.get(0).className();
  final String secondClasses = paragraphs.get(1).className();
  assertEquals("blue", firstClasses);
  assertEquals("red green blue mellow", secondClasses);
}
@Test public void testByTag() { // should be case insensitive Elements els = Jsoup.parse("<div id=1><div id=2><p>Hello</p></div></div><DIV id=3>").select("DIV"); assertEquals(3, els.size()); assertEquals("1", els.get(0).id()); assertEquals("2", els.get(1).id()); assertEquals("3", els.get(2).id()); Elements none = Jsoup.parse("<div id=1><div id=2><p>Hello</p></div></div><div id=3>").select("span"); assertEquals(0, none.size()); }
/**
 * Expects commas inside an attribute value or inside a {@code :matches(...)} pattern to be
 * treated as part of that value rather than as a group separator.
 */
@Test
public void handlesCommasInSelector() {
  final String markup =
      "<p name='1,2'>One</p><div>Two</div><ol><li>123</li><li>Text</li></ol>";
  final Document parsed = Jsoup.parse(markup);

  final Elements byName = parsed.select("[name=1,2]");
  assertEquals(1, byName.size());

  final Elements grouped = parsed.select("div, li:matches([0-9,]+)");
  assertEquals(2, grouped.size());
  assertEquals("div", grouped.get(0).tagName());
  assertEquals("li", grouped.get(1).tagName());
  assertEquals("123", grouped.get(1).text());
}
/**
 * Looks up an element by its ID, searching this element and everything beneath it.
 *
 * <p>The search begins at this element and stops at the first element whose ID matches, so
 * starting from a different element may return a different result. To look up an ID that is
 * unique within a whole Document, use {@link Document#getElementById(String)}.
 *
 * @param id the ID to look for; validated as not empty
 * @return the first element with a matching ID, starting from this element, or {@code null} when
 *     no element matches
 */
public Element getElementById(String id) {
  Validate.notEmpty(id);
  final Elements matches = Collector.collect(new Evaluator.Id(id), this);
  return matches.size() > 0 ? matches.get(0) : null;
}
/**
 * Expects {@code li#1 + li#2} to return only the second list item, the one that directly
 * follows item 1 and carries id 2.
 */
@Test
public void adjacentSiblingsWithId() {
  final Document parsed = Jsoup.parse("<ol><li id=1>One<li id=2>Two<li id=3>Three</ol>");

  final Elements followers = parsed.select("li#1 + li#2");

  assertEquals(1, followers.size());
  assertEquals("Two", followers.get(0).text());
}
/**
 * Expects a class filter combined with {@code :gt(0)} to return only the second paragraph of the
 * div with class "foo", ignoring the paragraphs of the other div.
 */
@Test
public void testPseudoCombined() {
  final String markup =
      "<div class='foo'><p>One</p><p>Two</p></div><div><p>Three</p><p>Four</p></div>";
  final Document parsed = Jsoup.parse(markup);

  final Elements matched = parsed.select("div.foo p:gt(0)");

  assertEquals(1, matched.size());
  assertEquals("Two", matched.get(0).text());
}
/**
 * Expects an attribute regex holding a character class ({@code [o]}) to be parsed intact, so it
 * matches the two images whose {@code src} contains an "o".
 */
@Test
public void testByAttributeRegexCharacterClass() {
  final String markup =
      "<p><img src=foo.png id=1><img src=bar.jpg id=2><img src=qux.JPEG id=3><img src=old.gif id=4></p>";
  final Document parsed = Jsoup.parse(markup);

  final Elements images = parsed.select("img[src~=[o]]");

  assertEquals(2, images.size());
  assertEquals("1", images.get(0).id());
  assertEquals("4", images.get(1).id());
}
/**
 * Expects a second {@code select} on an existing selection to search only within it: the
 * paragraphs inside the ".headline" divs are returned, the paragraph outside them is not.
 */
@Test
public void filter() {
  final String markup =
      "<p>Excl</p><div class=headline><p>Hello</p><p>There</p></div><div class=headline><h1>Headline</h1></div>";
  final Document parsed = Jsoup.parse(markup);

  final Elements headlines = parsed.select(".headline");
  final Elements paragraphs = headlines.select("p");

  assertEquals(2, paragraphs.size());
  assertEquals("Hello", paragraphs.get(0).text());
  assertEquals("There", paragraphs.get(1).text());
}
/**
 * Checks attribute-value matching when the class value has a trailing space: the unquoted and
 * the quoted query are each expected to match both divs, while the query containing a backslash
 * before the space is expected to match none.
 */
@Test
public void selectClassWithSpace() {
  final String html =
      "<div class=\"value\">class without space</div>\n"
          + "<div class=\"value \">class with space</div>";
  final Document parsed = Jsoup.parse(html);

  // Both spellings of the query are expected to produce the same two matches.
  final String[] matchingQueries = {"div[class=value ]", "div[class=\"value \"]"};
  for (int idx = 0; idx < matchingQueries.length; idx++) {
    final Elements hits = parsed.select(matchingQueries[idx]);
    assertEquals(2, hits.size());
    assertEquals("class without space", hits.get(0).text());
    assertEquals("class with space", hits.get(1).text());
  }

  final Elements escaped = parsed.select("div[class=\"value\\ \"]");
  assertEquals(0, escaped.size());
}
@Test public void descendant() { String h = "<div class=head><p class=first>Hello</p><p>There</p></div><p>None</p>"; Document doc = Jsoup.parse(h); Element root = doc.getElementsByClass("HEAD").first(); Elements els = root.select(".head p"); assertEquals(2, els.size()); assertEquals("Hello", els.get(0).text()); assertEquals("There", els.get(1).text()); Elements p = root.select("p.first"); assertEquals(1, p.size()); assertEquals("Hello", p.get(0).text()); Elements empty = root.select("p .first"); // self, not descend, should not match assertEquals(0, empty.size()); Elements aboveRoot = root.select("body div.head"); assertEquals(0, aboveRoot.size()); }
@Test public void containsData() { String html = "<p>jsoup</p><script>jsoup</script><span><!-- comments --></span>"; Document doc = Jsoup.parse(html); Element body = doc.body(); Elements dataEls1 = body.select(":containsData(jsoup)"); Elements dataEls2 = body.select("script:containsData(jsoup)"); Elements dataEls3 = body.select("span:containsData(comments)"); Elements dataEls4 = body.select(":containsData(s)"); assertEquals(2, dataEls1.size()); // body and script assertEquals(1, dataEls2.size()); assertEquals(dataEls1.last(), dataEls2.first()); assertEquals("<script>jsoup</script>", dataEls2.outerHtml()); assertEquals(1, dataEls3.size()); assertEquals("span", dataEls3.first().tagName()); assertEquals(3, dataEls4.size()); assertEquals("body", dataEls4.first().tagName()); assertEquals("script", dataEls4.get(1).tagName()); assertEquals("span", dataEls4.get(2).tagName()); }
/**
 * Expects class selection to ignore case for both tag and class name, to return nothing for a
 * class that is absent, and to match a hyphenated class name regardless of case.
 */
@Test
public void testByClass() {
  final String paragraphs = "<p id=0 class='ONE two'><p id=1 class='one'><p id=2 class='two'>";
  final Elements withOne = Jsoup.parse(paragraphs).select("P.One");
  assertEquals(2, withOne.size());
  assertEquals("0", withOne.get(0).id());
  assertEquals("1", withOne.get(1).id());

  final Elements missing = Jsoup.parse("<div class='one'></div>").select(".foo");
  assertEquals(0, missing.size());

  final Elements hyphenated = Jsoup.parse("<div class='One-Two'></div>").select(".one-two");
  assertEquals(1, hyphenated.size());
}
@Test public void testGroupOr() { String h = "<div title=foo /><div title=bar /><div /><p></p><img /><span title=qux>"; Document doc = Jsoup.parse(h); Elements els = doc.select("p,div,[title]"); assertEquals(5, els.size()); assertEquals("div", els.get(0).tagName()); assertEquals("foo", els.get(0).attr("title")); assertEquals("div", els.get(1).tagName()); assertEquals("bar", els.get(1).attr("title")); assertEquals("div", els.get(2).tagName()); assertTrue( els.get(2).attr("title").length() == 0); // missing attributes come back as empty string assertFalse(els.get(2).hasAttr("title")); assertEquals("p", els.get(3).tagName()); assertEquals("span", els.get(4).tagName()); }
/**
 * Downloads the sample table page and prints the text of the second {@code TD} cell of every
 * {@code TR} row inside the element with id "datatable" to standard output.
 *
 * <p>If the page cannot be fetched the stack trace is printed and nothing else happens. If the
 * page has no element with that id, a message is written to standard error.
 */
public Scraper() {
  final Document doc;
  try {
    doc =
        Jsoup.connect(
                "http://www.geog.leeds.ac.uk/courses/other/programming/practicals/general/web/scraping-intro/table.html")
            .get();
  } catch (IOException ioe) {
    ioe.printStackTrace();
    // Without a document there is nothing to scrape; carrying on would dereference null.
    return;
  }

  final Element table = doc.getElementById("datatable");
  if (table == null) {
    System.err.println("No element with id \"datatable\" found in the page");
    return;
  }

  for (Element row : table.getElementsByTag("TR")) {
    final Elements tds = row.getElementsByTag("TD");
    // Only the second cell is printed; rows with fewer than two cells are skipped.
    if (tds.size() > 1) {
      System.out.println(tds.get(1).text());
    }
  }
}
public static void main(String[] args) { Document doc = null; try { // get page doc = (Document) Jsoup.connect("http://fskm.uitm.edu.my/v1/fakulti/staff-directory/academic/1097.html") .get(); } catch (IOException ex) { ex.printStackTrace(); } // Get Element with specific ID Element table = doc.getElementById("mytable"); // Get text inside Element Elements rows = table.getElementsByTag("TR"); for (Element row : rows) { Elements tds = row.getElementsByTag("TD"); for (int i = 0; i < tds.size(); i++) { if (i == 1) System.out.println(tds.get(i).text()); } } }