Exemplo n.º 1
1
  /**
   * Downloads the auto-data.net car pages for ids 9900 through 10100 (inclusive) and collects
   * label/value pairs from their table cells.
   *
   * <p>For each page, every {@code td} with exactly one child node is treated as a label (its own
   * text), and the next {@code td} with more than one child node supplies the value (the text of
   * its {@code strong} descendants). Non-empty pairs are stored in {@code car}, which is then handed
   * to {@code createCar} together with the id. Whenever the id is a multiple of 100,
   * {@code persistDocuments(cars)} is called; the range ends on 10100, so the last batch is flushed
   * inside the loop.
   *
   * <p>NOTE(review): {@code car} and {@code cars} are declared outside this method; whether
   * {@code createCar} resets {@code car} between pages cannot be seen from here.
   *
   * @throws IOException if a page cannot be fetched
   * @throws SolrServerException declared for the calls into the persistence helpers
   */
  public void scrapeInformation() throws IOException, SolrServerException {
    for (int i = 9900; i < 10101; i++) {
      Document doc = Jsoup.connect("http://www.auto-data.net/en/?f=showCar&car_id=" + i).get();

      // The pending label is scoped to a single page so that a label left unmatched at the end of
      // one page can never be paired with a value cell from the next page.
      String key = "";

      for (Element el : doc.select("td")) {
        int childCount = el.childNodeSize();

        if (childCount == 1) {
          key = el.ownText();
          continue;
        }

        if (childCount > 1) {
          String value = el.getAllElements().select("strong").text();
          if (!key.isEmpty() && !value.isEmpty()) {
            car.put(key, value);
            // A label is consumed by the first value that follows it.
            key = "";
          }
        }
      }

      createCar(car, i);

      if ((i % 100) == 0) {
        persistDocuments(cars);
      }
    }
  }
Exemplo n.º 2
0
  /**
   * Checks {@code insertChildren} with negative indexes: first by moving existing paragraphs from
   * one div to the end of another, then by inserting a list of detached nodes just before the
   * last child.
   */
  @Test
  public void insertChildrenAtPosition() {
    Document document =
        Jsoup.parse(
            "<div id=1>Text1 <p>One</p> Text2 <p>Two</p></div><div id=2>Text3 <p>Three</p></div>");
    Element source = document.select("div").get(0);
    Elements sourceParagraphs = source.select("p");
    Element target = document.select("div").get(1);

    // Phase 1: move the two paragraphs of the first div to the end of the second div.
    assertEquals(2, target.childNodeSize());
    target.insertChildren(-1, sourceParagraphs);
    assertEquals(2, source.childNodeSize()); // the two paragraphs left the first div
    assertEquals(4, target.childNodeSize());
    assertEquals(3, sourceParagraphs.get(1).siblingIndex()); // last of four children

    // Phase 2: insert three brand-new, parentless nodes at index -2.
    Element firstSpan = new Element(Tag.valueOf("span"), "").text("Span1");
    Element secondSpan = new Element(Tag.valueOf("span"), "").text("Span2");
    TextNode trailingText = new TextNode("Text4", "");

    List<Node> detachedNodes = new ArrayList<Node>();
    detachedNodes.add(firstSpan);
    detachedNodes.add(secondSpan);
    detachedNodes.add(trailingText);

    assertNull(firstSpan.parent());
    target.insertChildren(-2, detachedNodes);
    assertEquals(target, firstSpan.parent());
    assertEquals(7, target.childNodeSize());

    // The new nodes occupy consecutive positions in insertion order.
    assertEquals(3, firstSpan.siblingIndex());
    assertEquals(4, secondSpan.siblingIndex());
    assertEquals(5, trailingText.siblingIndex());
  }
Exemplo n.º 3
0
  /**
   * Checks that inserting cloned paragraphs into another div leaves the originals in place and
   * that edits to a clone do not show up in the source element.
   */
  @Test
  public void insertChildrenAsCopy() {
    Document document =
        Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>");
    Element original = document.select("div").get(0);
    Element destination = document.select("div").get(1);

    // Work on clones so the first div keeps its own paragraphs.
    Elements clonedParagraphs = document.select("p").clone();
    clonedParagraphs.first().text("One cloned");
    destination.insertChildren(-1, clonedParagraphs);

    assertEquals(4, original.childNodeSize()); // untouched: the nodes were copied, not moved
    assertEquals(2, destination.childNodeSize());

    String actualHtml = TextUtil.stripNewlines(document.body().html());
    assertEquals(
        "<div id=\"1\">Text <p>One</p> Text <p>Two</p></div><div id=\"2\"><p>One cloned</p><p>Two</p></div>",
        actualHtml);
  }
Exemplo n.º 4
0
  /**
   * Regression test for https://github.com/jhy/jsoup/issues/239: all child nodes of one element
   * can be moved into another element by passing the source's child-node list to
   * {@code insertChildren}, leaving the source empty.
   */
  @Test
  public void moveByAppend() {
    Document document =
        Jsoup.parse("<div id=1>Text <p>One</p> Text <p>Two</p></div><div id=2></div>");
    Element source = document.select("div").get(0);
    Element target = document.select("div").get(1);

    assertEquals(4, source.childNodeSize());
    List<Node> sourceChildren = source.childNodes();
    assertEquals(4, sourceChildren.size());

    target.insertChildren(0, sourceChildren);

    // The list is expected to reflect the source's children, so it empties as the nodes move.
    assertEquals(0, sourceChildren.size());
    assertEquals(0, source.childNodeSize());
    assertEquals(4, target.childNodeSize());

    String renderedBody = document.body().html();
    assertEquals(
        "<div id=\"1\"></div>\n<div id=\"2\">\n Text \n <p>One</p> Text \n <p>Two</p>\n</div>",
        renderedBody);
  }
Exemplo n.º 5
0
  /**
   * Fetches a department page and extracts the courses listed on it.
   *
   * <p>Spaces in {@code url} are replaced with underscores before the request. Every {@code span}
   * that has at least two text nodes and at least one child element is inspected: if the text of
   * its first child element matches {@code COURSE_REGEX}, that text becomes the course code and the
   * span's own text (with non-breaking spaces removed) becomes the course name.
   *
   * @param url address of the department page; may contain spaces
   * @return the courses found, in document order; empty if the page could not be fetched
   */
  private Collection<Course> getCourseListForDepartment(String url) {
    List<Course> courses = new ArrayList<>();

    Document doc;
    try {
      // Plain character replacement: the pattern is a literal, so no regex is needed.
      // NOTE(review): timeout(0) is kept as-is; per the jsoup docs this means "no timeout".
      doc = Jsoup.connect(url.replace(' ', '_')).timeout(0).get();
    } catch (IOException e) {
      // Best effort: report the failure and return what we have (nothing).
      e.printStackTrace();
      return courses;
    }

    for (Element span : doc.getElementsByTag("span")) {
      // Fewer than two text nodes also covers the "no child nodes at all" case.
      if (span.textNodes().size() < 2) {
        continue;
      }

      // Two text nodes do not guarantee a child element (e.g. text, comment, text), and
      // child(0) throws on an element-less span, so check before reading the first child.
      Elements childElements = span.children();
      if (childElements.isEmpty()) {
        continue;
      }

      // The first child element is expected to hold the course code.
      String code = childElements.first().text();
      if (!code.matches(COURSE_REGEX)) {
        continue;
      }

      // With the code in a child element, the span's own text is the course name.
      // Non-breaking spaces are stripped explicitly because trim() does not remove them.
      String name = span.ownText().replace("\u00A0", "");

      courses.add(new Course(code, name));
    }

    return courses;
  }
  // start setting of selectorbar
  /**
   * Copies the selector bar's title, title link, "all" link and drop-down menu items from a parsed
   * element onto the given node.
   *
   * <p>The first anchor under {@code ele} supplies the {@code title} and {@code titleurl}
   * properties. If {@code ele} has at least two child nodes, contains a {@code div.menu}, and has a
   * second child element, that second child is treated as the menu: its last anchor supplies
   * {@code alllinktext} / {@code alllinkurl}, and for each {@code ul} inside it the {@code li}
   * anchors are serialised as JSON strings and stored as {@code panelitems} (a later {@code ul}
   * overwrites the value written for an earlier one). Every URL is passed through
   * {@code FrameworkUtils.getLocaleReference} before being stored.
   *
   * <p>A missing element or anchor is treated as empty text and an empty URL rather than aborting
   * the whole translation. When no menu can be located, a list-item message is appended to
   * {@code sb}. Any exception is logged and not propagated.
   *
   * @param selectorBarPanelNode node that receives the properties
   * @param ele parsed selector bar element; may be {@code null}
   * @param urlMap passed through to {@code FrameworkUtils.getLocaleReference}
   * @param locale passed through to {@code FrameworkUtils.getLocaleReference}
   */
  public void selectorBarTranslate(
      Node selectorBarPanelNode, Element ele, Map<String, String> urlMap, String locale) {

    final String menuMissingMessage =
        "<li>Selector bar drop down menu elements does not exist on the locale page.</li>";

    try {
      Element titleAnchor = ele != null ? ele.getElementsByTag("a").first() : null;
      String title = titleAnchor != null ? titleAnchor.text() : "";
      String titleUrl = selectorBarHref(titleAnchor);
      // Start extracting valid href
      log.debug("Before selector bar title LinkUrl" + titleUrl + "\n");
      titleUrl = FrameworkUtils.getLocaleReference(titleUrl, urlMap, locale, sb);
      log.debug("after selector bar title LinkUrl" + titleUrl + "\n");
      // End extracting valid href
      log.debug("selector component titleUrl: " + titleUrl);
      selectorBarPanelNode.setProperty("title", title);
      selectorBarPanelNode.setProperty("titleurl", titleUrl);

      if (ele == null || ele.childNodeSize() < 2) {
        sb.append(menuMissingMessage);
        return;
      }
      log.debug("Child node size is greater than 1.");

      if (ele.select("div.menu").isEmpty()) {
        log.debug("Menu is not available.");
        sb.append(menuMissingMessage);
        return;
      }
      log.debug("Menu is available.");

      // child(1) throws when there is no second child element (child nodes may be text only),
      // so test the element count instead of comparing the result with null.
      Elements childElements = ele.children();
      if (childElements.size() < 2) {
        sb.append(menuMissingMessage);
        return;
      }
      Element menuEle = childElements.get(1);
      log.debug("selector component menuEle: " + menuEle.toString());

      Element allLinkAnchor = menuEle.getElementsByTag("a").last();
      String allLinkText = allLinkAnchor != null ? allLinkAnchor.text() : "";
      String allLinkUrl = selectorBarHref(allLinkAnchor);
      // Start extracting valid href
      log.debug("Before selector bar menu LinkUrl" + allLinkUrl + "\n");
      allLinkUrl = FrameworkUtils.getLocaleReference(allLinkUrl, urlMap, locale, sb);
      log.debug("after selector bar menu LinkUrl" + allLinkUrl + "\n");
      // End extracting valid href
      selectorBarPanelNode.setProperty("alllinktext", allLinkText);
      selectorBarPanelNode.setProperty("alllinkurl", allLinkUrl);

      for (Element menuUl : menuEle.getElementsByTag("ul")) {
        java.util.List<String> list = new ArrayList<String>();
        Elements menuLiList = menuUl.getElementsByTag("li");
        log.debug("selector component menu item count: " + menuLiList.size());

        for (Element li : menuLiList) {
          Element listItemAnchor = li.getElementsByTag("a").first();
          String anchorText = listItemAnchor != null ? listItemAnchor.text() : "";
          String anchorHref = selectorBarHref(listItemAnchor);
          // Start extracting valid href
          log.debug("Before selectorbarLinkUrl" + anchorHref + "\n");
          anchorHref = FrameworkUtils.getLocaleReference(anchorHref, urlMap, locale, sb);
          log.debug("after selectorbarLinkUrl" + anchorHref + "\n");
          // End extracting valid href

          JSONObject jsonObj = new JSONObject();
          jsonObj.put("linktext", anchorText);
          jsonObj.put("linkurl", anchorHref);
          jsonObj.put("size", "");
          list.add(jsonObj.toString());
        }

        selectorBarPanelNode.setProperty("panelitems", list.toArray(new String[list.size()]));
      }
    } catch (Exception e) {
      log.error("Failed to translate selector bar", e);
    }
  }

  /**
   * Returns the anchor's absolute {@code href}, falling back to the raw attribute value when the
   * absolute form is blank; returns an empty string when {@code anchor} is {@code null}.
   */
  private String selectorBarHref(Element anchor) {
    if (anchor == null) {
      return "";
    }
    String href = anchor.absUrl("href");
    return StringUtil.isBlank(href) ? anchor.attr("href") : href;
  }