/**
 * Checks that a comment which is opened but never closed is still exposed as a
 * {@link Comment} node, and that the text in front of it stays the paragraph's text.
 */
@Test
public void parsesUnterminatedComments() {
    Document parsed = Jsoup.parse("<p>Hello<!-- <tr><td>");
    Element paragraph = parsed.getElementsByTag("p").get(0);

    // The visible text of the paragraph excludes the comment body.
    assertEquals("Hello", paragraph.text());

    // First child: the plain text that precedes the comment.
    TextNode leadingText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", leadingText.getWholeText());

    // Second child: everything after the comment opener becomes the comment data.
    Comment openComment = (Comment) paragraph.childNode(1);
    assertEquals(" <tr><td>", openComment.getData());
}
/**
 * Checks that markup inside a terminated comment is kept verbatim as comment data
 * and does not disturb the paragraph that follows it.
 */
@Test
public void parsesComments() {
    Document parsed = Jsoup.parse(
            "<html><head></head><body><!-- <table><tr><td></table> --><p>Hello</p></body></html>");
    Element body = parsed.child(1);

    // The comment is the body's first child node; its content must not be parsed as tags.
    Comment leadingComment = (Comment) body.childNode(0);
    assertEquals(" <table><tr><td></table> ", leadingComment.getData());

    // The paragraph is the body's first child element and holds a single text node.
    Element paragraph = body.child(0);
    TextNode paragraphText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
}
/**
 * Downloads a fixed M18 product page (20 second timeout) and prints, for every element
 * matching {@code #styleId} and then every element matching {@code #stylePrice}, that
 * element's first child node.
 *
 * @throws IOException if the page cannot be fetched or parsed
 */
public static void m18Page() throws IOException {
    URL productUrl = new URL("http://product.m18.com/p-F118420.htm");
    Document productPage = Jsoup.parse(productUrl, 20000);

    // Printed label reads "M18 product number".
    for (Element styleIdElement : productPage.select("#styleId")) {
        System.out.println("麦考林商品编号:[" + styleIdElement.childNode(0) + "]");
    }

    // Printed label reads "M18 current price".
    for (Element priceElement : productPage.select("#stylePrice")) {
        System.out.println("麦考林当前价格:[" + priceElement.childNode(0) + "]");
    }
}
public static void ddPage() throws IOException { // String ddURL = "http://product.dangdang.com/product.aspx?product_id=20754996"; String ddURL = "http://product.dangdang.com/product.aspx?product_id=1262418002#ref=www-0-H"; Document ddDoc = Jsoup.parse(new URL(ddURL), 20000); Elements elements = ddDoc.select("#salePriceTag"); for (Element element : elements) { System.out.println("当当网当前价格:[" + element.childNode(0) + "]"); } System.out.println("///////////////////////"); HttpServletRequest request = HttpRequestParser.parse(ddURL); System.out.println("full URL: " + ddURL); System.out.println("request URL: " + request.getRequestURL()); System.out.println("id: " + request.getParameter("product_id")); }
protected ArrayList<Event> parseMonthPage(Document doc) { ArrayList<Event> events = new ArrayList<Event>(); String query = "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr"; for (int i = 1; i <= 31; i++) { query = query + " div#" + i + "_May_2005"; Elements days = doc.select(query); for (Element eachday : days) { // This will loop only once because it is the whole text String actualDate = null; String modifiedDate = eachday.attr( "id"); // This is essential to do because wikipedia present dates in weird manner // and if we want to faciliate search using dates in our database then they // should be present in this format YYYY-MM-DD int firstoccur = modifiedDate.indexOf("_"); String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9); String day = modifiedDate.substring(0, 2).replace('_', ' ').trim(); if (day.length() == 1) { day = "0" + day; } actualDate = year + "-05-" + day; try { Date.valueOf(actualDate); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } Elements individual = eachday.children(); for (Element dateplustext : individual) { // This consists of alternate date and events (with or withour newsStory) if (dateplustext.tagName().equals("ul")) { // Complete news under a given date Elements stories = dateplustext .children(); // This contains different stories (newsStory may be present or // not) for (Element li : stories) { Elements uls = li .children(); // These are either <a> tags if it doesn't have a newsStory or it // is <a> and <ul> tag if it contains a newsStory boolean hasUL = false; for (Element ul : uls) { if (ul.tagName() .equals("ul")) { // If li has ul then it implies that it contains a news story hasUL = true; // news story is there Node storyNode = li.childNode( 0); // this the story .. 
it is used later at the end for each event Elements eventsNodes = ul .children(); // Now we get inside the ul element which containd different // li elements for (Element eventNode : eventsNodes) { // Here we are picking one li Event event = extractDescriptionAndLinks(eventNode); try { event.setDate(Date.valueOf(actualDate)); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } // News story if (!storyNode.attr("title").isEmpty() && !storyNode.attr("href").isEmpty()) { if (isValidWikiURL(storyNode.attr("href"))) { Story story = new Story(); // story.setName(st.attr("title")); story.setName(getEntityName(storyNode.attr("href"))); story.setWikipediaUrl(getEntityURL(storyNode.attr("href"))); event.setStory(story); } } events.add(event); } } } if (!hasUL) { // event does not have a story Event event = extractDescriptionAndLinks(li); try { event.setDate(Date.valueOf(actualDate)); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } events.add(event); } } } } } } return events; }