/**
 * Downloads the detail page of the given deck and fills in its class cards, neutral cards and
 * description.
 *
 * <p>If the page cannot be fetched the stack trace is printed and the deck is returned
 * unchanged, so callers always get the same instance back.
 *
 * @param hsDeck deck whose URL (relative to {@code HPDeckSource.BASE_URL}) is fetched; it is
 *     updated in place
 * @param n value forwarded as-is to {@code HtmlHelper.parseDescription}
 * @return the same {@code hsDeck} instance
 */
@Override
public HSDeck getDeckDetail(final HSDeck hsDeck, final float n) {
    try {
        final Document page = Jsoup.connect(HPDeckSource.BASE_URL + hsDeck.getUrl()).get();

        final HashMap<String, String> classHsItemMap = new HashMap<String, String>();
        final ArrayList<String> classCardNames = new ArrayList<String>();
        collectCardCells(
            page.select("section.class-listing table.listing td.col-name"),
            classHsItemMap,
            classCardNames);
        hsDeck.setClassHsItemMap(classHsItemMap);
        hsDeck.setClassHsItemList(DataBaseManager.getInstance().getAllCardsByNames(classCardNames));

        final HashMap<String, String> neutralHsItemMap = new HashMap<String, String>();
        final ArrayList<String> neutralCardNames = new ArrayList<String>();
        collectCardCells(
            page.select("section.neutral-listing table.listing td.col-name"),
            neutralHsItemMap,
            neutralCardNames);
        hsDeck.setNeutralHsItemMap(neutralHsItemMap);
        hsDeck.setNeutralHsItemList(
            DataBaseManager.getInstance().getAllCardsByNames(neutralCardNames));

        hsDeck.setDescription(
            HtmlHelper.parseDescription(page.select("div.deck-description").html(), n, false));
        return hsDeck;
    } catch (IOException ex) {
        // Best effort: a network failure leaves the deck as it was passed in.
        ex.printStackTrace();
        return hsDeck;
    }
}

/**
 * Reads one card listing: for every cell, maps the text of its first link to the last character
 * of the cell's trimmed text (presumably the card count -- confirm against the page markup) and
 * appends the link text to {@code names}.
 *
 * <p>Cells without a link or without any text are skipped instead of raising an index error.
 */
private void collectCardCells(
        final Elements cells, final HashMap<String, String> counts, final ArrayList<String> names) {
    for (int i = 0; i < cells.size(); ++i) {
        final Elements anchors = cells.get(i).select("a");
        final String cellText = cells.get(i).text().trim();
        if (anchors.isEmpty() || cellText.isEmpty()) {
            continue;
        }
        final String cardName = anchors.get(0).text();
        counts.put(cardName, cellText.substring(cellText.length() - 1));
        names.add(cardName);
    }
}
@Test public void dataset() { Document doc = Jsoup.parse( "<div id=1 data-name=jsoup class=new data-package=jar>Hello</div><p id=2>Hello</p>"); Element div = doc.select("div").first(); Map<String, String> dataset = div.dataset(); Attributes attributes = div.attributes(); // size, get, set, add, remove assertEquals(2, dataset.size()); assertEquals("jsoup", dataset.get("name")); assertEquals("jar", dataset.get("package")); dataset.put("name", "jsoup updated"); dataset.put("language", "java"); dataset.remove("package"); assertEquals(2, dataset.size()); assertEquals(4, attributes.size()); assertEquals("jsoup updated", attributes.get("data-name")); assertEquals("jsoup updated", dataset.get("name")); assertEquals("java", attributes.get("data-language")); assertEquals("java", dataset.get("language")); attributes.put("data-food", "bacon"); assertEquals(3, dataset.size()); assertEquals("bacon", dataset.get("food")); attributes.put("data-", "empty"); assertEquals(null, dataset.get("")); // data- is not a data attribute Element p = doc.select("p").first(); assertEquals(0, p.dataset().size()); }
@Test public void insertChildrenAtPosition() { Document doc = Jsoup.parse( "<div id=1>Text1 <p>One</p> Text2 <p>Two</p></div><div id=2>Text3 <p>Three</p></div>"); Element div1 = doc.select("div").get(0); Elements p1s = div1.select("p"); Element div2 = doc.select("div").get(1); assertEquals(2, div2.childNodeSize()); div2.insertChildren(-1, p1s); assertEquals(2, div1.childNodeSize()); // moved two out assertEquals(4, div2.childNodeSize()); assertEquals(3, p1s.get(1).siblingIndex()); // should be last List<Node> els = new ArrayList<Node>(); Element el1 = new Element(Tag.valueOf("span"), "").text("Span1"); Element el2 = new Element(Tag.valueOf("span"), "").text("Span2"); TextNode tn1 = new TextNode("Text4", ""); els.add(el1); els.add(el2); els.add(tn1); assertNull(el1.parent()); div2.insertChildren(-2, els); assertEquals(div2, el1.parent()); assertEquals(7, div2.childNodeSize()); assertEquals(3, el1.siblingIndex()); assertEquals(4, el2.siblingIndex()); assertEquals(5, tn1.siblingIndex()); }
public static List genSitemap(String mapUrl, String base) { try { Document doc = Jsoup.connect(mapUrl).get(); Elements links = doc.select("a"); Elements imgs = doc.select("img"); List<String> stringLinks = new ArrayList<String>(); for (Element link : links) { stringLinks.add(link.attr("abs:href")); } Iterator<String> domIt = stringLinks.iterator(); // filter out links to external domains while (domIt.hasNext()) { String incDom = domIt.next(); boolean domTest; domTest = incDom.contains(base); if (domTest == false) { domIt.remove(); } } Iterator<String> i = stringLinks.iterator(); while (i.hasNext()) { // remove index.html from incoming links prevents infinite loop String incA = i.next(); if (incA.contains("index")) { i.remove(); } } return stringLinks; } catch (Exception e) { // System.out.println(e); return null; } }
/** With indentation set to zero, the document HTML must contain each element's outer HTML. */
@Test
public void testHtmlContainsOuter() {
    Document parsed = Jsoup.parse("<title>Check</title> <div>Hello there</div>");
    parsed.outputSettings().indentAmount(0);

    String titleOuter = parsed.select("title").outerHtml();
    assertTrue(parsed.html().contains(titleOuter));

    String divOuter = parsed.select("div").outerHtml();
    assertTrue(parsed.html().contains(divOuter));
}
/** Renaming a tag must change which selectors match it and how it is rendered. */
@Test
public void testTagNameSet() {
    String html = "<div><i>Hello</i>";
    Document parsed = Jsoup.parse(html);

    // rename the only <i> to <em>
    parsed.select("i").first().tagName("em");

    assertEquals(0, parsed.select("i").size());
    assertEquals(1, parsed.select("em").size());
    assertEquals("<em>Hello</em>", parsed.select("div").first().html());
}
/** hasText() is true for elements with text (own or nested) and false for an empty element. */
@Test
public void testHasText() {
    Document parsed = Jsoup.parse("<div><p>Hello</p><p></p></div>");
    Elements paragraphs = parsed.select("p");
    Element container = parsed.select("div").first();

    assertTrue(container.hasText());
    assertTrue(paragraphs.first().hasText());
    assertFalse(paragraphs.last().hasText());
}
@Test public void testSetText() { String h = "<div id=1>Hello <p>there <b>now</b></p></div>"; Document doc = Jsoup.parse(h); assertEquals("Hello there now", doc.text()); // need to sort out node whitespace assertEquals("there now", doc.select("p").get(0).text()); Element div = doc.getElementById("1").text("Gone"); assertEquals("Gone", div.text()); assertEquals(0, doc.select("p").size()); }
/** textNodes() returns only the direct text children, with their whitespace as asserted. */
@Test
public void testGetTextNodes() {
    Document parsed = Jsoup.parse("<p>One <span>Two</span> Three <br> Four</p>");
    List<TextNode> directText = parsed.select("p").first().textNodes();

    assertEquals(3, directText.size());
    assertEquals("One ", directText.get(0).text());
    assertEquals(" Three ", directText.get(1).text());
    assertEquals(" Four", directText.get(2).text());

    // an element with no text children yields an empty list
    assertEquals(0, parsed.select("br").first().textNodes().size());
}
/** Pins the rendered output of a title, a div containing paragraphs, and the whole body. */
@Test
public void testContainerOutput() {
    Document parsed =
        Jsoup.parse(
            "<title>Hello there</title> <div><p>Hello</p><p>there</p></div> <div>Another</div>");

    String titleOuter = parsed.select("title").first().outerHtml();
    assertEquals("<title>Hello there</title>", titleOuter);

    String firstDivOuter = parsed.select("div").first().outerHtml();
    assertEquals("<div>\n <p>Hello</p>\n <p>there</p>\n</div>", firstDivOuter);

    String bodyInner = parsed.select("body").first().html();
    assertEquals(
        "<div>\n <p>Hello</p>\n <p>there</p>\n</div> \n<div>\n Another\n</div>", bodyInner);
}
/** after(html) inserts the parsed nodes directly behind the element, in order. */
@Test
public void after() {
    Document parsed = Jsoup.parse("<div><p>Hello</p><p>There</p></div>");

    Element firstParagraph = parsed.select("p").first();
    firstParagraph.after("<div>one</div><div>two</div>");
    String afterFirstInsert = TextUtil.stripNewlines(parsed.body().html());
    assertEquals(
        "<div><p>Hello</p><div>one</div><div>two</div><p>There</p></div>", afterFirstInsert);

    // also works on the last child, and with non-element nodes such as comments
    parsed.select("p").last().after("<p>Three</p><!-- four -->");
    String afterSecondInsert = TextUtil.stripNewlines(parsed.body().html());
    assertEquals(
        "<div><p>Hello</p><div>one</div><div>two</div><p>There</p><p>Three</p><!-- four --></div>",
        afterSecondInsert);
}
/**
 * Checks the selector generated by cssSelector() for an element with an id, a plain element
 * and an element with classes, and that each selector resolves back to the same element.
 */
@Test
public void testCssPath() {
    Document doc = Jsoup.parse("<div id=\"id1\">A</div><div>B</div><div class=\"c1 c2\">C</div>");
    Element divA = doc.select("div").get(0);
    Element divB = doc.select("div").get(1);
    Element divC = doc.select("div").get(2);

    // expected value first, so a failure message labels the two values correctly
    assertEquals("#id1", divA.cssSelector());
    assertEquals("html > body > div:nth-child(2)", divB.cssSelector());
    assertEquals("html > body > div.c1.c2", divC.cssSelector());

    // identity, not equality: the selector must find the very same node
    assertTrue(divA == doc.select(divA.cssSelector()).first());
    assertTrue(divB == doc.select(divB.cssSelector()).first());
    assertTrue(divC == doc.select(divC.cssSelector()).first());
}
@Test public void insertChildrenAsCopy() { Document doc = Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>"); Element div1 = doc.select("div").get(0); Element div2 = doc.select("div").get(1); Elements ps = doc.select("p").clone(); ps.first().text("One cloned"); div2.insertChildren(-1, ps); assertEquals(4, div1.childNodeSize()); // not moved -- cloned assertEquals(2, div2.childNodeSize()); assertEquals( "<div id=\"1\">Text <p>One</p> Text <p>Two</p></div><div id=\"2\"><p>One cloned</p><p>Two</p></div>", TextUtil.stripNewlines(doc.body().html())); }
/** text() includes text of descendants; ownText() only the element's direct text. */
@Test
public void testGetChildText() {
    Element paragraph = Jsoup.parse("<p>Hello <b>there</b> now").select("p").first();

    String fullText = paragraph.text();
    String directText = paragraph.ownText();

    assertEquals("Hello there now", fullText);
    assertEquals("Hello now", directText);
}
@Test public void testPrependRowToTable() { Document doc = Jsoup.parse("<table><tr><td>1</td></tr></table>"); Element table = doc.select("tbody").first(); table.prepend("<tr><td>2</td></tr>"); assertEquals( "<table><tbody><tr><td>2</td></tr><tr><td>1</td></tr></tbody></table>", TextUtil.stripNewlines(doc.body().html())); // check sibling index (reindexChildren): Elements ps = doc.select("tr"); for (int i = 0; i < ps.size(); i++) { assertEquals(i, ps.get(i).siblingIndex); } }
/** Class-name sets of an element and its clone must be independent of each other. */
@Test
public void testClonesClassnames() {
    Document parsed = Jsoup.parse("<div class='one two'></div>");
    Element original = parsed.select("div").first();
    Set<String> originalClasses = original.classNames();
    assertEquals(2, originalClasses.size());
    assertTrue(originalClasses.contains("one"));
    assertTrue(originalClasses.contains("two"));

    Element duplicate = original.clone();
    Set<String> duplicateClasses = duplicate.classNames();
    assertEquals(2, duplicateClasses.size());
    assertTrue(duplicateClasses.contains("one"));
    assertTrue(duplicateClasses.contains("two"));

    // mutate only the clone's set
    duplicateClasses.add("three");
    duplicateClasses.remove("one");

    assertTrue(originalClasses.contains("one"));
    assertFalse(originalClasses.contains("three"));
    assertFalse(duplicateClasses.contains("one"));
    assertTrue(duplicateClasses.contains("three"));

    assertEquals("", original.html());
    assertEquals("", duplicate.html());
}
@Test public void testClone() { Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>"); Element p = doc.select("p").get(1); Element clone = p.clone(); assertNull(clone.parent()); // should be orphaned assertEquals(0, clone.siblingIndex); assertEquals(1, p.siblingIndex); assertNotNull(p.parent()); clone.append("<span>Three"); assertEquals( "<p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(clone.outerHtml())); assertEquals( "<div><p>One</p><p><span>Two</span></p></div>", TextUtil.stripNewlines(doc.body().html())); // not modified doc.body().appendChild(clone); // adopt assertNotNull(clone.parent()); assertEquals( "<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(doc.body().html())); }
public Worker(String url, boolean verbose) throws Exception { Document doc; doc = Jsoup.connect(url).get(); // select anchors with href only Elements links = doc.select("a[href]"); String l_Href; String host; int linksNum; Parser parser; for (Element link : links) { // absolute = http:// added l_Href = link.attr("abs:href"); if (!l_Href.isEmpty()) { parser = new Parser(l_Href); host = parser.getHost(); // if tempStats contains the url, add one to the value if (tempStats.containsKey(host)) { linksNum = tempStats.get(host); tempStats.put(host, linksNum += 1); } // if it doesn't, add it else { tempStats.put(host, 1); } // parse the url tempQueue.add(parser.getURL()); } } if (verbose) { System.out.println( Thread.currentThread().getName() + " : " + tempQueue.size() + " links from " + url); } }
public List<String> extractCities(Document doc) { HashMap<String, String> cityMap = new HashMap<String, String>(); cityMap.put("Adana", "Adana"); cityMap.put("Konya", "Konya"); cityMap.put("Tekirda\u011f", "Tekirda\u011f"); // \u011f List<String> cityList = new ArrayList<String>(); Element ilanDetay = doc.select("div#divIlanDetay").first(); String patternJobTitle = ".*(\u015eehir/\u00dclke|City/Country|Location).*"; Pattern pattern = Pattern.compile(patternJobTitle); Matcher matcher = pattern.matcher(getPlainText(ilanDetay)); if (matcher.find()) { String cityLine = matcher.group(); String[] cityLineArr = cityLine.split(":"); if (cityLineArr.length > 1) { String cityCommaStr = cityLineArr[1].trim(); String[] cityArr = cityCommaStr.split(" ")[0].split(","); for (String city : cityArr) { cityList.add(trim(city)); } } } if (cityList.size() == 0) { Set<String> tokenSet = tokenize(doc.text()); for (String s : tokenSet) { if (cityMap.containsKey(s)) { cityList.add(trim(cityMap.get(s))); } } } return cityList; }
public JobData parse(Document doc) { String keyword = doc.select("meta[name=keywords]").first().attr("content"); String[] keywordArr = keyword.split(","); String uniqueId = extractUniqueId(doc); String companyName = keywordArr[0]; String jobTitle = keywordArr[1]; List<String> cityList = extractCities(doc); Date date = extractDate(doc); String summary = summary(doc); JobData jobData = null; try { jobData = new JobData( trim(SOURCE), trim(uniqueId), trim(companyName), trim(jobTitle), cityList, trim(summary), date, trim(doc.baseUri()), ""); } catch (Exception ex) { System.err.println(doc.baseUri() + " , " + ex.getMessage()); } return jobData; // To change body of implemented methods use File | Settings | File Templates. }
@Test public void testClassNames() { Document doc = Jsoup.parse("<div class=\"c1 c2\">C</div>"); Element div = doc.select("div").get(0); assertEquals("c1 c2", div.className()); final Set<String> set1 = div.classNames(); final Object[] arr1 = set1.toArray(); assertTrue(arr1.length == 2); assertEquals("c1", arr1[0]); assertEquals("c2", arr1[1]); // Changes to the set should not be reflected in the Elements getters set1.add("c3"); assertTrue(2 == div.classNames().size()); assertEquals("c1 c2", div.className()); // Update the class names to a fresh set final Set<String> newSet = new LinkedHashSet<String>(3); newSet.addAll(set1); newSet.add("c3"); div.classNames(newSet); assertEquals("c1 c2 c3", div.className()); final Set<String> set2 = div.classNames(); final Object[] arr2 = set2.toArray(); assertTrue(arr2.length == 3); assertEquals("c1", arr2[0]); assertEquals("c2", arr2[1]); assertEquals("c3", arr2[2]); }
/**
 * insertChildren must reject an index past the end, an index too far before the start, and a
 * null collection, each with an IllegalArgumentException.
 */
@Test
public void insertChildrenArgumentValidation() {
    Document parsed =
        Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>");
    Element source = parsed.select("div").get(0);
    Element target = parsed.select("div").get(1);
    List<Node> nodes = source.childNodes();

    // index beyond the end
    try {
        target.insertChildren(6, nodes);
        fail();
    } catch (IllegalArgumentException expected) {
    }

    // negative index out of range
    try {
        target.insertChildren(-5, nodes);
        fail();
    } catch (IllegalArgumentException expected) {
    }

    // null collection
    try {
        target.insertChildren(0, null);
        fail();
    } catch (IllegalArgumentException expected) {
    }
}
/** Elements with identical content must still report their own element sibling index. */
@Test
public void testElementSiblingIndexSameContent() {
    Document parsed = Jsoup.parse("<div><p>One</p>...<p>One</p>...<p>One</p>");
    Elements paragraphs = parsed.select("p");

    // the three paragraphs are expected at element positions 0, 1 and 2
    for (int position = 0; position < 3; position++) {
        assertTrue(position == paragraphs.get(position).elementSiblingIndex());
    }
}
/** A br between words is rendered as a single space in text() and ownText(). */
@Test
public void testBrHasSpace() {
    Document tight = Jsoup.parse("<p>Hello<br>there</p>");
    assertEquals("Hello there", tight.text());
    assertEquals("Hello there", tight.select("p").first().ownText());

    // surrounding whitespace must not add extra spaces
    Document spaced = Jsoup.parse("<p>Hello <br> there</p>");
    assertEquals("Hello there", spaced.text());
}
/**
 * Returns the text of the {@code #divIlanDetay} element, or an empty string when the document
 * has no such element.
 */
private String summary(Document doc) {
    Element detail = doc.select("#divIlanDetay").first();
    return detail == null ? "" : detail.text();
}
/** Attribute selectors must work for attribute names that contain a dash. */
@Test
public void testGetElementsWithAttributeDash() {
    String html =
        "<meta http-equiv=content-type value=utf8 id=1> <meta name=foo content=bar id=2> <div http-equiv=content-type value=utf8 id=3>";
    Document parsed = Jsoup.parse(html);

    Elements matches = parsed.select("meta[http-equiv=content-type], meta[charset]");

    // only the first meta qualifies; the div carries the attribute but is not a meta
    assertEquals(1, matches.size());
    assertEquals("1", matches.first().id());
}
/** Wrapping with HTML that has trailing siblings places those siblings inside the wrapper. */
@Test
public void testWrapWithRemainder() {
    Document parsed = Jsoup.parse("<div><p>Hello</p></div>");
    Element paragraph = parsed.select("p").first();

    paragraph.wrap("<div class='head'></div><p>There!</p>");

    String bodyHtml = TextUtil.stripNewlines(parsed.body().html());
    assertEquals("<div><div class=\"head\"><p>Hello</p><p>There!</p></div></div>", bodyHtml);
}
@Test public void parentlessToString() { Document doc = Jsoup.parse("<img src='foo'>"); Element img = doc.select("img").first(); assertEquals("<img src=\"foo\">", img.toString()); img.remove(); // lost its parent assertEquals("<img src=\"foo\">", img.toString()); }
@Test public void testNamespacedElements() { // Namespaces with ns:tag in HTML must be translated to ns|tag in CSS. String html = "<html><body><fb:comments /></body></html>"; Document doc = Jsoup.parse(html, "http://example.com/bar/"); Elements els = doc.select("fb|comments"); assertEquals(1, els.size()); assertEquals("html > body > fb|comments", els.get(0).cssSelector()); }
/** script and style contents are exposed as data nodes; a paragraph has none. */
@Test
public void testGetDataNodes() {
    Document parsed =
        Jsoup.parse("<script>One Two</script> <style>Three Four</style> <p>Fix Six</p>");
    Element scriptEl = parsed.select("script").first();
    Element styleEl = parsed.select("style").first();
    Element paragraphEl = parsed.select("p").first();

    List<DataNode> fromScript = scriptEl.dataNodes();
    assertEquals(1, fromScript.size());
    assertEquals("One Two", fromScript.get(0).getWholeData());

    List<DataNode> fromStyle = styleEl.dataNodes();
    assertEquals(1, fromStyle.size());
    assertEquals("Three Four", fromStyle.get(0).getWholeData());

    List<DataNode> fromParagraph = paragraphEl.dataNodes();
    assertEquals(0, fromParagraph.size());
}