Ejemplo n.º 1
0
 /**
  * Parses a paragraph followed by a comment that is never closed and checks the resulting nodes:
  * the paragraph's text is "Hello", its first child node is a text node holding "Hello", and its
  * second child node is a comment whose data is everything after the comment opener.
  */
 @Test
 public void parsesUnterminatedComments() {
   Document parsed = Jsoup.parse("<p>Hello<!-- <tr><td>");
   Element paragraph = parsed.getElementsByTag("p").get(0);
   assertEquals("Hello", paragraph.text());

   // Child node 0: the literal text that precedes the comment opener.
   TextNode leadingText = (TextNode) paragraph.childNode(0);
   assertEquals("Hello", leadingText.getWholeText());

   // Child node 1: the unterminated comment, with the tag-like markup kept as plain data.
   Comment openComment = (Comment) paragraph.childNode(1);
   assertEquals(" <tr><td>", openComment.getData());
 }
Ejemplo n.º 2
0
  /**
   * Parses a document whose body starts with a closed comment containing table markup, then a
   * paragraph. Asserts that the body's first child node is a comment carrying that markup verbatim
   * as its data, and that the body's first child element is the paragraph with the text "Hello".
   */
  @Test
  public void parsesComments() {
    Document parsed =
        Jsoup.parse(
            "<html><head></head><body><!-- <table><tr><td></table> --><p>Hello</p></body></html>");

    // The test takes the element at index 1 of the document as the body.
    Element bodyElement = parsed.child(1);

    Comment leadingComment = (Comment) bodyElement.childNode(0);
    assertEquals(" <table><tr><td></table> ", leadingComment.getData());

    // child(0) is the first child *element*, so the comment node is not counted here.
    Element paragraph = bodyElement.child(0);
    TextNode paragraphText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
  }
Ejemplo n.º 3
0
  /**
   * Fetches a fixed product page from product.m18.com, parses it, and prints to standard output the
   * first child node of every element selected by {@code #styleId} and then of every element
   * selected by {@code #stylePrice}, each wrapped in a labelled pair of square brackets.
   *
   * @throws IOException if the URL is malformed or fetching/parsing the page fails
   */
  public static void m18Page() throws IOException {
    final URL productPage = new URL("http://product.m18.com/p-F118420.htm");
    // 20000 is passed as the second argument of Jsoup.parse(URL, int) (presumably the timeout in
    // milliseconds; confirm against the jsoup version in use).
    final Document page = Jsoup.parse(productPage, 20000);

    for (Element idElement : page.select("#styleId")) {
      System.out.println("麦考林商品编号:[" + idElement.childNode(0) + "]");
    }

    for (Element priceElement : page.select("#stylePrice")) {
      System.out.println("麦考林当前价格:[" + priceElement.childNode(0) + "]");
    }
  }
Ejemplo n.º 4
0
  /**
   * Fetches a fixed product page from product.dangdang.com and prints the first child node of every
   * element selected by {@code #salePriceTag}. Afterwards it feeds the same URL string to
   * {@code HttpRequestParser.parse} and prints the full URL, the request URL reported by the
   * resulting request object, and its {@code product_id} parameter.
   *
   * @throws IOException if the URL is malformed or fetching/parsing the page fails
   */
  public static void ddPage() throws IOException {
    // Alternative product page that was used previously:
    //   http://product.dangdang.com/product.aspx?product_id=20754996
    final String productUrl =
        "http://product.dangdang.com/product.aspx?product_id=1262418002#ref=www-0-H";

    final Document page = Jsoup.parse(new URL(productUrl), 20000);

    for (Element priceTag : page.select("#salePriceTag")) {
      System.out.println("当当网当前价格:[" + priceTag.childNode(0) + "]");
    }

    System.out.println("///////////////////////");

    // Re-interpret the same URL string as a servlet-style request and dump its parts.
    final HttpServletRequest parsedRequest = HttpRequestParser.parse(productUrl);
    System.out.println("full URL:     " + productUrl);
    System.out.println("request URL:  " + parsedRequest.getRequestURL());
    System.out.println("id:           " + parsedRequest.getParameter("product_id"));
  }
  /**
   * Collects the events listed on a parsed "May 2005" month page.
   *
   * <p>For each day 1..31 the method selects the container {@code div#<day>_May_2005} beneath the
   * main content area, derives a {@code YYYY-MM-DD} date from the container's id, and walks every
   * {@code <ul>} that is a direct child of the container. Each {@code <li>} in such a list is
   * either:
   *
   * <ul>
   *   <li>a plain event (no nested {@code <ul>}): turned into one {@link Event}; or
   *   <li>a news story (has a nested {@code <ul>}): every {@code <li>} of the nested list becomes
   *       an {@link Event}, and the story link taken from the first child node of the outer
   *       {@code <li>} is attached to each of them when it is a valid wiki URL.
   * </ul>
   *
   * <p>Days whose derived date cannot be parsed are reported on standard error and skipped.
   *
   * @param doc the parsed month page
   * @return the events found, in document order; empty if no day container matched
   */
  protected ArrayList<Event> parseMonthPage(Document doc) {

    ArrayList<Event> events = new ArrayList<Event>();

    // Ancestor chain shared by every day container. It must stay constant: appending each day's
    // selector to a running query would demand that day N be nested inside days 1..N-1, which
    // matches nothing after the first day.
    final String baseQuery =
        "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr";

    for (int i = 1; i <= 31; i++) {
      String dayQuery = baseQuery + " div#" + i + "_May_2005";
      Elements days = doc.select(dayQuery);

      for (Element eachday : days) { // normally a single container per day
        // The id looks like "<d>_May_<yyyy>"; convert it to YYYY-MM-DD so that dates can be
        // searched uniformly in the database.
        String modifiedDate = eachday.attr("id");
        int firstoccur = modifiedDate.indexOf("_");
        // Skip "_May_" (5 characters) to reach the four-digit year.
        String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9);
        String day = modifiedDate.substring(0, 2).replace('_', ' ').trim();
        if (day.length() == 1) {
          day = "0" + day;
        }
        String actualDate = year + "-05-" + day;

        // Validate once up front so a malformed id skips the whole day.
        try {
          Date.valueOf(actualDate);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
          continue;
        }

        // Children alternate between date headings and event lists; only the lists matter.
        for (Element dateplustext : eachday.children()) {
          if (!dateplustext.tagName().equals("ul")) {
            continue;
          }

          // Each <li> is one entry for this date, with or without a news story.
          for (Element li : dateplustext.children()) {
            boolean hasUL = false;

            // A nested <ul> among the <li>'s children marks the entry as a news story.
            for (Element ul : li.children()) {
              if (!ul.tagName().equals("ul")) {
                continue;
              }
              hasUL = true;

              // First child node of the <li> is the story link; it is attached to every event
              // of the nested list below.
              Node storyNode = li.childNode(0);
              String storyTitle = storyNode.attr("title");
              String storyHref = storyNode.attr("href");

              for (Element eventNode : ul.children()) {
                Event event = extractDescriptionAndLinks(eventNode);
                try {
                  // A fresh Date per event so events never share one mutable instance.
                  event.setDate(Date.valueOf(actualDate));
                } catch (Exception ex) {
                  ex.printStackTrace();
                  System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                  continue;
                }

                if (!storyTitle.isEmpty() && !storyHref.isEmpty() && isValidWikiURL(storyHref)) {
                  Story story = new Story();
                  story.setName(getEntityName(storyHref));
                  story.setWikipediaUrl(getEntityURL(storyHref));
                  event.setStory(story);
                }
                events.add(event);
              }
            }

            if (!hasUL) { // plain event without a story
              Event event = extractDescriptionAndLinks(li);
              try {
                event.setDate(Date.valueOf(actualDate));
              } catch (Exception ex) {
                ex.printStackTrace();
                System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                continue;
              }
              events.add(event);
            }
          }
        }
      }
    }
    return events;
  }