Beispiel #1
0
  /**
   * Checks that cloning an element produces a detached copy: the copy has no parent, can be
   * modified without touching the source document, and can afterwards be attached to a document.
   */
  @Test
  public void testClone() {
    Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>");
    Element original = doc.select("p").get(1);
    Element copy = original.clone();

    // The copy starts out without a parent; the source element keeps its place in the tree.
    assertNull(copy.parent());
    assertEquals(0, copy.siblingIndex);
    assertEquals(1, original.siblingIndex);
    assertNotNull(original.parent());

    // Appending to the copy must not show up in the source document.
    copy.append("<span>Three");
    String copyHtml = TextUtil.stripNewlines(copy.outerHtml());
    assertEquals("<p><span>Two</span><span>Three</span></p>", copyHtml);
    String bodyBeforeAdoption = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p>One</p><p><span>Two</span></p></div>", bodyBeforeAdoption);

    // Once appended to the body, the copy has a parent and is part of the body's HTML.
    doc.body().appendChild(copy);
    assertNotNull(copy.parent());
    String bodyAfterAdoption = TextUtil.stripNewlines(doc.body().html());
    assertEquals(
        "<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>",
        bodyAfterAdoption);
  }
Beispiel #2
0
 /**
  * Produces the HTML of the document returned by {@code getDocument()} with one entry inserted
  * per vacancy.
  *
  * <p>The element whose {@code class} attribute is exactly {@code "vacancy template"} is used as
  * the prototype: a copy of it, with its {@code style} attribute removed and its class replaced
  * by {@code "vacancy"}, is filled in for each vacancy and inserted immediately before the
  * template. Existing {@code tr} elements whose class is exactly {@code "vacancy"} are removed
  * first.
  *
  * @param vacancies vacancies to insert, in the order they should appear
  * @return the HTML of the updated document
  * @throws IllegalStateException if the document cannot be loaded or contains no template
  *     element
  */
 private String getUpdatedFileContent(List<Vacancy> vacancies) {
   try {
     Document document = getDocument();
     Element template = document.select("[class=vacancy template]").first();
     if (template == null) {
       throw new IllegalStateException(
           "No element with class \"vacancy template\" found in the document");
     }

     // Prototype for generated entries: the template minus its style and template class.
     Element prototype = template.clone();
     prototype.removeAttr("style");
     prototype.removeAttr("class");
     prototype.addClass("vacancy");

     // The template itself has class "vacancy template" and so is not matched here.
     document.select("tr[class=vacancy]").remove();

     for (Vacancy vacancy : vacancies) {
       Element entry = prototype.clone();
       entry.select("[class=city]").first().text(vacancy.getCity());
       entry.select("[class=companyName]").first().text(vacancy.getCompanyName());
       entry.select("[class=salary]").first().text(vacancy.getSalary());

       Element titleLink = entry.select("[class=title]").select("a[href]").first();
       titleLink.text(vacancy.getTitle());
       titleLink.attr("href", vacancy.getUrl());

       // Inserting before the template keeps the entries in list order, template last.
       template.before(entry.outerHtml());
     }
     return document.html();
   } catch (IOException e) {
     // Previously the exception was only printed and the method then failed with a
     // NullPointerException on the unloaded document; fail with the real cause instead.
     throw new IllegalStateException("Failed to load the vacancies document", e);
   }
 }
Beispiel #3
0
 /**
  * Checks that {@code removeAttr} calls can be chained and that the chain strips every attribute
  * from the element, including when some of the requested names are not present on it.
  */
 @Test
 public void testChainedRemoveAttributes() {
   Document doc = Jsoup.parse("<a one two three four>Text</a>");
   Element anchor = doc.select("a").first();

   // "zero" and "five" do not exist on the element; "one" to "four" do.
   anchor
       .removeAttr("zero")
       .removeAttr("one")
       .removeAttr("two")
       .removeAttr("three")
       .removeAttr("four")
       .removeAttr("five");

   String rendered = anchor.outerHtml();
   assertEquals("<a>Text</a>", rendered);
 }
Beispiel #4
0
  /**
   * Checks the boolean overload of {@code attr}: passing {@code true} adds a value-less boolean
   * attribute, passing {@code false} removes an attribute that was set before.
   */
  @Test
  public void testAddBooleanAttribute() {
    Element element = new Element(Tag.valueOf("div"), "");

    element.attr("true", true);

    // Set a regular attribute, then switch it off through the boolean overload.
    element.attr("false", "value");
    element.attr("false", false);

    assertTrue(element.hasAttr("true"));
    assertEquals("", element.attr("true"));

    List<Attribute> remaining = element.attributes().asList();
    assertEquals("There should be one attribute", 1, remaining.size());
    Attribute sole = remaining.get(0);
    assertTrue("Attribute should be boolean", sole instanceof BooleanAttribute);

    assertFalse(element.hasAttr("false"));

    String rendered = element.outerHtml();
    assertEquals("<div true></div>", rendered);
  }
 /**
  * Writes every {@code table-wrap} element of a paper into its own HTML file.
  *
  * <p>The input file is first passed to {@code ExtractFiles.convertHTML}; if that returns
  * {@code null}, nothing is written. Each table is saved in {@code file_dir} as
  * {@code PMC<pmc><id>.html}, where {@code <id>} is the element's {@code id} attribute or, when
  * the element has no (or an empty) id, {@code T<n>} with {@code n} the 1-based position of the
  * table among all {@code table-wrap} elements.
  *
  * @param file source file handed to the converter
  * @param file_dir directory that receives the per-table HTML files
  * @param paper_dir directory passed through to the converter
  * @param pmc PMC identifier used in the output file names
  * @throws IOException if reading the converted file or writing a table file fails
  */
 public static void translateTables(File file, String file_dir, String paper_dir, Integer pmc)
     throws IOException {
   ExtractFiles ext = new ExtractFiles();
   File html = ext.convertHTML(file, paper_dir, pmc + "");
   if (html == null) {
     return;
   }

   Document doc = Jsoup.parse(html, "UTF-8");
   int count = 1;
   for (Element e : doc.getElementsByTag("table-wrap")) {
     // jsoup returns "" (never null) for a missing attribute, so test for emptiness;
     // otherwise every table without an id would be written to the same file.
     String id = e.attr("id");
     if (id.isEmpty()) {
       id = "T" + count;
     }
     File tab = new File(file_dir + File.separator + "PMC" + pmc + id + ".html");
     // try-with-resources closes the writer even when write() throws.
     try (FileWriter w = new FileWriter(tab)) {
       w.write(e.outerHtml());
     }
     count++;
   }
 }
Beispiel #6
0
  /**
   * Fetches the page at {@code major.getUrl()} and copies the details found there onto
   * {@code major}, then calls {@code mark(major, true)}.
   *
   * <p>Each detail is only set when the corresponding element exists on the page: length and
   * month of entry (basic course table), the link stored in the {@code applicationFee} field,
   * academic requirements, school, and tuition fee. Unless the stored link is missing, empty or
   * one of two excluded handbook pages, that link is fetched as well and the course structure is
   * extracted from it.
   *
   * @param major the record to fill in; its URL is read and its fields are updated in place
   * @throws Exception if fetching or processing a page fails
   */
  public static void getDetails(MajorForCollection major) throws Exception {
    Connection conn = Jsoup.connect(major.getUrl());
    Document doc = conn.timeout(10000).followRedirects(true).get();

    Element basicTable = doc.select("table.course-page__table-basic").first();
    if (basicTable != null) {
      for (Element tr : basicTable.select("tr")) {
        String rowText = tr.text();
        // NOTE(review): the row decides which field is set, but the text of the whole table
        // (not just the row) is handed to the parser - confirm this is intended.
        if (rowText.contains("Duration")) {
          major.setLength(getLength(basicTable.text()));
        } else if (rowText.contains("Start date")) {
          major.setMonthOfEntry(getMonthOfEntry(basicTable.text()));
        }
      }
    }

    Element borderedButton = doc.select("a.btn.btn-bordered").first();
    if (borderedButton != null) {
      major.setApplicationFee(borderedButton.attr("href"));
    }

    Element entryRequirements = doc.select("#entry-requirements-2").first();
    if (entryRequirements != null) {
      major.setAcademicRequirements(entryRequirements.text());
    }

    Element facultyLink = doc.select("div.course-page.row a").last();
    if (facultyLink != null) {
      // Store the raw link first; it stays in place when no school name is known for it.
      major.setSchool(facultyLink.attr("href"));
      String schoolName = schoolNameForFacultyUrl(major.getSchool());
      if (schoolName != null) {
        major.setSchool(schoolName);
      }
    }

    Element feesHeading = doc.select("#fees").first();
    if (feesHeading != null) {
      // The fee text is taken from the element following #fees; skip when there is none.
      Element feesBody = feesHeading.nextElementSibling();
      if (feesBody != null) {
        major.setTuitionFee(feesBody.text());
      }
    }

    // The link may be absent when the page has no matching button; without it there is
    // nothing to fetch, so guard instead of failing with a NullPointerException.
    String handbookUrl = major.getApplicationFee();
    boolean fetchHandbook =
        handbookUrl != null
            && !handbookUrl.isEmpty()
            && !handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/A6015.html")
            && !handbookUrl.equals("http://www.monash.edu.au/pubs/handbooks/courses/2276.html");
    if (fetchHandbook) {
      doc = WebUtils.getDocument(handbookUrl, WebUtils.METHOD_GET, 10 * 1000);
      Element requirements =
          doc.select("h2.black.pub_heading:containsOwn(Requirements) + div.pub_body_text").first();
      if (requirements != null) {
        major.setStructure(replaceSpecialCharacter(html2Str(requirements.outerHtml())).trim());
        String structure = major.getStructure();
        int partA = structure.indexOf("Part A.");
        if (partA >= 0) {
          // Keep only the text from "Part A." onwards.
          major.setStructure(structure.substring(partA));
        } else {
          // No "Part A." in the requirements text: use the Structure section when present.
          Element structureSection =
              doc.select("h2.black.pub_heading:containsOwn(Structure) + div.pub_body_text")
                  .first();
          if (structureSection != null) {
            major.setStructure(replaceSpecialCharacter(html2Str(structureSection.text())).trim());
          }
        }
      }
    }

    mark(major, true);
  }

  /**
   * Looks up the school name recorded for a faculty-website link.
   *
   * @param facultyUrl link taken from the course page; may be {@code null}
   * @return the school name for a known link, or {@code null} when the link is not recognised
   */
  private static String schoolNameForFacultyUrl(String facultyUrl) {
    if (facultyUrl == null) {
      return null;
    }
    switch (facultyUrl) {
      case "http://www.study.monash/media/links/faculty-websites/design-and-architecture":
        return "Monash Art Design & Architecture";
      case "http://www.study.monash/media/links/faculty-websites/business-and-economics":
        return "Monash Business School";
      case "http://www.study.monash/media/links/faculty-websites/arts":
        return "Faculty of Arts, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/science":
        return "Faculty of Science";
      case "http://www.study.monash/media/links/faculty-websites/medicine":
        return "Faculty of Medicine, Nursing and Health Sciences";
      case "http://www.study.monash/media/links/faculty-websites/education":
        return "Faculty of Education - Faculty of Education";
      case "http://www.study.monash/media/links/faculty-websites/engineering":
        return "Faculty of Engineering, Monash University";
      case "http://www.study.monash/media/links/faculty-websites/information-technology":
        return "Faculty of Information Technology - Monash University";
      case "http://www.study.monash/media/links/faculty-websites/pharmacy":
        return "Faculty of Pharmacy and Pharmaceutical Sciences";
      case "http://www.study.monash/media/links/faculty-websites/law":
        return "Faculty of Law";
      default:
        return null;
    }
  }
 /**
  * Copies title, description and call-to-action data from the given list elements into the
  * {@code tile} child nodes of the {@code Th-Third*} nodes under {@code sBThirdNode2}.
  *
  * <p>List elements and nodes are paired in iteration order. If a node is left over after all
  * list elements have been consumed, it is filled from the first {@code div.rc-persel} element
  * of the enclosing {@code doc}; there the first anchor is moved into the description and the
  * last remaining anchor supplies the call to action. Every missing piece is reported by
  * appending a message to the enclosing {@code sb}; nothing is thrown for missing content.
  *
  * @param listEles list elements to migrate; {@code null} is reported and otherwise ignored
  * @param sBThirdNode2 parent of the {@code Th-Third*} nodes to fill
  * @param locale locale passed to {@code FrameworkUtils.getLocaleReference}
  * @param urlMap URL map passed to {@code FrameworkUtils.getLocaleReference}
  * @param catType not used by this method
  * @param type not used by this method
  * @throws RepositoryException if reading or writing a repository node fails
  */
 private void migrateThird2(
     Elements listEles,
     Node sBThirdNode2,
     String locale,
     Map<String, String> urlMap,
     String catType,
     String type)
     throws RepositoryException {
   if (listEles == null) {
     sb.append(Constants.LIST_COMPONENT_NOT_FOUND);
     return;
   }
   int eleSize = listEles.size();
   NodeIterator thirdNodes =
       sBThirdNode2.hasNode("Th-Third-1") ? sBThirdNode2.getNodes("Th-Third*") : null;
   if (thirdNodes == null) {
     sb.append(Constants.NO_LIST_NODES_FOUND);
     return;
   }
   int size = (int) thirdNodes.getSize();

   for (Element list : listEles) {
     if (!thirdNodes.hasNext()) {
       // More list elements than nodes: reported once per surplus element.
       sb.append(
           Constants.MISMATCH_IN_LIST_ELEMENT
               + eleSize
               + Constants.LIST_NODES_COUNT
               + size
               + ".</li>");
       continue;
     }
     Node thirdNode = thirdNodes.nextNode();
     if (!thirdNode.hasNode("tile")) {
       sb.append(Constants.LIST_ITEM_NODE_NOT_FOUND);
       continue;
     }
     Node tileNode = thirdNode.getNode("tile");
     Element title = list.getElementsByTag("h3").first();
     Element description = list.getElementsByTag("p").first();
     Element anchor = list.getElementsByTag("a").first();

     if (title != null) {
       tileNode.setProperty("title", title.text());
     } else {
       sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND);
     }
     if (description != null) {
       tileNode.setProperty("description", description.text());
     } else {
       sb.append(Constants.LIST_INTRO_PARAGRAPH_ELEMENT_NOT_FOUND);
     }
     // The title is already written above when present; it is deliberately not written again
     // here, since doing so dereferenced a possibly missing heading.
     if (anchor != null) {
       populateTileCta(tileNode, anchor, locale, urlMap);
     } else {
       sb.append(Constants.LIST_ANCHOR_ELEMENTS_NOT_FOUND);
     }
   }

   if (!thirdNodes.hasNext()) {
     return;
   }

   // A node is left over: fill it from the div.rc-persel element of the page.
   Element letUsHelp = doc.select("div.rc-persel").first();
   if (letUsHelp == null) {
     sb.append(
         Constants.MISMATCH_IN_LIST_NODES
             + eleSize
             + Constants.LIST_NODES_COUNT
             + size
             + ".</li>");
     return;
   }
   Node thirdNode = thirdNodes.nextNode();
   if (!thirdNode.hasNode("tile")) {
     sb.append(Constants.LIST_ITEM_NODE_NOT_FOUND);
     return;
   }
   Node tileNode = thirdNode.getNode("tile");
   Element title = letUsHelp.getElementsByTag("h3").first();
   Element description = letUsHelp.getElementsByTag("p").first();
   Element dAnchor = letUsHelp.getElementsByTag("a").first();
   // Detach the first anchor so that it no longer contributes to the description text and
   // cannot be picked as the call-to-action anchor; there may be no anchor at all.
   if (dAnchor != null) {
     dAnchor.remove();
   }
   Element anchor = letUsHelp.getElementsByTag("a").last();
   boolean missingAnchorReported = false;

   if (title != null) {
     tileNode.setProperty("title", title.text());
   } else {
     sb.append(Constants.LIST_HEADING_COMPONENT_NOT_FOUND);
   }
   if (description != null) {
     String anchorTag = "";
     if (dAnchor != null) {
       anchorTag = dAnchor.outerHtml();
     } else {
       missingAnchorReported = true;
       sb.append(Constants.LIST_ANCHOR_ELEMENTS_NOT_FOUND);
     }
     // Fall back to the last paragraph when the first one has no text.
     if (description.text().equals("")) {
       description = letUsHelp.getElementsByTag("p").last();
     }
     tileNode.setProperty("description", description.text() + "</br></br>" + anchorTag);
   } else {
     sb.append(Constants.LIST_INTRO_PARAGRAPH_ELEMENT_NOT_FOUND);
   }
   if (anchor != null) {
     populateTileCta(tileNode, anchor, locale, urlMap);
   } else if (!missingAnchorReported) {
     // Avoid reporting the missing anchor twice.
     sb.append(Constants.LIST_ANCHOR_ELEMENTS_NOT_FOUND);
   }
 }

 /**
  * Writes the link text and URL of {@code anchor} to the {@code cta} child of {@code tileNode},
  * appending a message to the enclosing {@code sb} when the child node or the URL is missing.
  */
 private void populateTileCta(
     Node tileNode, Element anchor, String locale, Map<String, String> urlMap)
     throws RepositoryException {
   if (!tileNode.hasNode("cta")) {
     sb.append(Constants.LINK_DATA_NODE_FOR_LIST_NOT_FOUND);
     return;
   }
   Node ctaNode = tileNode.getNode("cta");
   ctaNode.setProperty("linktext", anchor.text());

   // Use the absolute URL when it resolves, the raw href attribute otherwise.
   String aUrl = anchor.absUrl("href");
   if (StringUtils.isBlank(aUrl)) {
     aUrl = anchor.attr("href");
   }
   aUrl = FrameworkUtils.getLocaleReference(aUrl, urlMap, locale, sb);
   if (!aUrl.isEmpty()) {
     ctaNode.setProperty("url", aUrl);
   } else {
     sb.append(Constants.LINK_URL_NOT_FOUND_IN_LIST);
   }
 }