public void scrapeInformation() throws IOException, SolrServerException { String key = ""; String value = ""; for (int i = 9900; i < 10101; i++) { Document doc = Jsoup.connect("http://www.auto-data.net/en/?f=showCar&car_id=" + i).get(); Elements td = doc.select("td"); for (Element el : td) { if (el.childNodeSize() == 1) { key = el.ownText(); continue; } if (el.childNodeSize() > 1) { value = el.getAllElements().select("strong").text(); if (!key.equals("") && !value.equals("")) { car.put(key, value); key = ""; value = ""; } } } createCar(car, i); if ((i % 100) == 0) persistDocuments(cars); } // persistDocuments(cars); // REMOVE !!!!!!!!!!!!!!!!!!!!!!!!!!! return; }
@Test public void insertChildrenAtPosition() { Document doc = Jsoup.parse( "<div id=1>Text1 <p>One</p> Text2 <p>Two</p></div><div id=2>Text3 <p>Three</p></div>"); Element div1 = doc.select("div").get(0); Elements p1s = div1.select("p"); Element div2 = doc.select("div").get(1); assertEquals(2, div2.childNodeSize()); div2.insertChildren(-1, p1s); assertEquals(2, div1.childNodeSize()); // moved two out assertEquals(4, div2.childNodeSize()); assertEquals(3, p1s.get(1).siblingIndex()); // should be last List<Node> els = new ArrayList<Node>(); Element el1 = new Element(Tag.valueOf("span"), "").text("Span1"); Element el2 = new Element(Tag.valueOf("span"), "").text("Span2"); TextNode tn1 = new TextNode("Text4", ""); els.add(el1); els.add(el2); els.add(tn1); assertNull(el1.parent()); div2.insertChildren(-2, els); assertEquals(div2, el1.parent()); assertEquals(7, div2.childNodeSize()); assertEquals(3, el1.siblingIndex()); assertEquals(4, el2.siblingIndex()); assertEquals(5, tn1.siblingIndex()); }
@Test public void insertChildrenAsCopy() { Document doc = Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>"); Element div1 = doc.select("div").get(0); Element div2 = doc.select("div").get(1); Elements ps = doc.select("p").clone(); ps.first().text("One cloned"); div2.insertChildren(-1, ps); assertEquals(4, div1.childNodeSize()); // not moved -- cloned assertEquals(2, div2.childNodeSize()); assertEquals( "<div id=\"1\">Text <p>One</p> Text <p>Two</p></div><div id=\"2\"><p>One cloned</p><p>Two</p></div>", TextUtil.stripNewlines(doc.body().html())); }
@Test public void moveByAppend() { // test for https://github.com/jhy/jsoup/issues/239 // can empty an element and append its children to another element Document doc = Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>"); Element div1 = doc.select("div").get(0); Element div2 = doc.select("div").get(1); assertEquals(4, div1.childNodeSize()); List<Node> children = div1.childNodes(); assertEquals(4, children.size()); div2.insertChildren(0, children); assertEquals( 0, children.size()); // children is backed by div1.childNodes, moved, so should be 0 now assertEquals(0, div1.childNodeSize()); assertEquals(4, div2.childNodeSize()); assertEquals( "<div id=\"1\"></div>\n<div id=\"2\">\n Text \n <p>One</p> Text \n <p>Two</p>\n</div>", doc.body().html()); }
private Collection<Course> getCourseListForDepartment(String url) { List<Course> courses = new ArrayList<>(); url = url.replaceAll(" ", "_"); Document doc; try { doc = Jsoup.connect(url).timeout(0).get(); } catch (IOException e) { e.printStackTrace(); return courses; } final Elements spans = doc.getElementsByTag("span"); // java 7 solution for (Element e : spans) { if (e.childNodeSize() < 1 || e.textNodes().size() < 2) continue; String code; String name; // I'm not too sure why this algorithm is so complicated, but it works // maybe we can clean it up... // 0 should be code, 1 should be name String potentialCourseCode = e.child(0).text(); if (potentialCourseCode.matches(COURSE_REGEX)) code = potentialCourseCode; else continue; // This must be the text if we got the course code already name = e.ownText(); // Strip all the   - note that trim() does not work name = name.replaceAll("\u00A0", ""); Course course = new Course(code, name); courses.add(course); } return courses; }
// start setting of selectorbar public void selectorBarTranslate( Node selectorBarPanelNode, Element ele, Map<String, String> urlMap, String locale) { try { String title = (ele != null ? ele.getElementsByTag("a").first().text() : ""); String titleUrl = ele.getElementsByTag("a").first().absUrl("href"); if (StringUtil.isBlank(titleUrl)) { titleUrl = ele.getElementsByTag("a").first().attr("href"); } // Start extracting valid href log.debug("Before selector bar title LinkUrl" + titleUrl + "\n"); titleUrl = FrameworkUtils.getLocaleReference(titleUrl, urlMap, locale, sb); log.debug("after selector bar title LinkUrl" + titleUrl + "\n"); // End extracting valid href log.debug("selector component titleUrl: " + titleUrl); selectorBarPanelNode.setProperty("title", title); selectorBarPanelNode.setProperty("titleurl", titleUrl); if (ele.childNodeSize() >= 2) { log.debug("Child node size is greater than 1."); if (ele.select("div.menu").isEmpty()) { log.debug("Menu is not available."); sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } else { log.debug("Menu is available."); Element menuEle = ele.child(1); if (menuEle != null) { log.debug("selector component menuEle: " + menuEle.toString()); Element anchor = menuEle.getElementsByTag("a").last(); String allLinkText = anchor != null ? anchor.text() : ""; String allLinkUrl = anchor != null ? anchor.absUrl("href") : ""; if (StringUtil.isBlank(allLinkUrl)) { allLinkUrl = anchor.attr("href"); } // Start extracting valid href log.debug("Before selector bar menu LinkUrl" + allLinkUrl + "\n"); allLinkUrl = FrameworkUtils.getLocaleReference(allLinkUrl, urlMap, locale, sb); log.debug("after selector bar menu LinkUrl" + allLinkUrl + "\n"); // End extracting valid href selectorBarPanelNode.setProperty("alllinktext", allLinkText); selectorBarPanelNode.setProperty("alllinkurl", allLinkUrl); Elements menuUlList = menuEle.getElementsByTag("ul"); for (Element element : menuUlList) { java.util.List<String> list = new ArrayList<String>(); Elements menuLiList = element.getElementsByTag("li"); System.out.println(menuLiList.size()); for (Element li : menuLiList) { JSONObject jsonObj = new JSONObject(); Element listItemAnchor = li.getElementsByTag("a").first(); String anchorText = listItemAnchor != null ? listItemAnchor.text() : ""; String anchorHref = listItemAnchor.absUrl("href"); if (StringUtil.isBlank(anchorHref)) { anchorHref = listItemAnchor.attr("href"); } // Start extracting valid href log.debug("Before selectorbarLinkUrl" + anchorHref + "\n"); anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb); log.debug("after selectorbarLinkUrl" + anchorHref + "\n"); // End extracting valid href jsonObj.put("linktext", anchorText); jsonObj.put("linkurl", anchorHref); jsonObj.put("size", ""); list.add(jsonObj.toString()); } selectorBarPanelNode.setProperty("panelitems", list.toArray(new String[list.size()])); } } else { sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } } } else { sb.append( "<li>Selector bar drop down menu elements does not exist on the locale page.</li>"); } } catch (Exception e) { e.printStackTrace(); } }