예제 #1
0
  /**
   * Exercises {@code insertChildren} with negative indexes: first moving existing elements out of
   * one parent into another, then inserting freshly created, parentless nodes.
   */
  @Test
  public void insertChildrenAtPosition() {
    final String html =
        "<div id=1>Text1 <p>One</p> Text2 <p>Two</p></div><div id=2>Text3 <p>Three</p></div>";
    final Document doc = Jsoup.parse(html);
    final Elements divs = doc.select("div");
    final Element source = divs.get(0);
    final Element target = divs.get(1);
    final Elements paragraphs = source.select("p");

    // Moving the two <p> elements: the source shrinks, the target grows.
    assertEquals(2, target.childNodeSize());
    target.insertChildren(-1, paragraphs);
    assertEquals(2, source.childNodeSize()); // moved two out
    assertEquals(4, target.childNodeSize());
    assertEquals(3, paragraphs.get(1).siblingIndex()); // should be last

    // Brand new nodes that do not belong to any parent yet.
    final Element firstSpan = new Element(Tag.valueOf("span"), "").text("Span1");
    final Element secondSpan = new Element(Tag.valueOf("span"), "").text("Span2");
    final TextNode trailingText = new TextNode("Text4", "");
    final List<Node> fresh = new ArrayList<Node>();
    fresh.add(firstSpan);
    fresh.add(secondSpan);
    fresh.add(trailingText);

    assertNull(firstSpan.parent());
    target.insertChildren(-2, fresh);
    assertEquals(target, firstSpan.parent());
    assertEquals(7, target.childNodeSize());
    assertEquals(3, firstSpan.siblingIndex());
    assertEquals(4, secondSpan.siblingIndex());
    assertEquals(5, trailingText.siblingIndex());
  }
예제 #2
0
  /**
   * Walks {@code element} and its descendants, copying class-driven data into {@code values}.
   *
   * <p>Elements without class names are ignored (their children are not visited). For block tags,
   * the top level ({@code depth == 0}) stores its class set under {@code "type"}; deeper block
   * tags open a nested map keyed by their first class name. For inline tags, an {@code href} is
   * stored under {@code "url"} when the element carries the {@code url} class, and the extracted
   * content is stored once per remaining class name.
   *
   * @param element the element to inspect
   * @param values the map receiving the extracted entries
   * @param depth nesting level, 0 for the element the traversal started from
   */
  private void recurse(final Element element, final Map<String, Object> values, final int depth) {

    final Tag elementTag = element.tag();
    final Set<String> classNames = element.classNames();
    final String href = element.attr("href");
    final Object childContent = extractChildContent(element);

    if (classNames.isEmpty()) {
      return;
    }

    removeEmpty(classNames);

    if (elementTag.isBlock()) {

      // children of a block element write either into the same map (top level)
      // or into a freshly created nested map (deeper levels)
      final Map<String, Object> target;

      if (depth == 0) {

        // toplevel classes define type
        values.put("type", classNames);
        target = values;

      } else {

        target = new LinkedHashMap<>();
        values.put(classNames.iterator().next(), target);

        if (childContent != null) {
          target.put("name", childContent);
        }
      }

      for (final Element child : element.children()) {
        recurse(child, target, depth + 1);
      }
      return;
    }

    if (!elementTag.isInline()) {
      return;
    }

    // extract href and store as URL; the class is consumed so it is not reused as a key below
    if (classNames.contains("url") && StringUtils.isNotBlank(href)) {
      values.put("url", href);
      classNames.remove("url");
    }

    if (childContent == null) {
      return;
    }

    for (final String key : classNames) {
      values.put(key, childContent);
    }
  }
예제 #3
0
  /**
   * Checks that an element stored in a {@link HashSet} is still found after a child has been
   * appended to it.
   */
  @Test
  public void testHashcodeIsStableWithContentChanges() {
    final Element parent = new Element(Tag.valueOf("root"), "");
    final HashSet<Element> seen = new HashSet<Element>();
    seen.add(parent);

    // Change the element's content after it has been hashed into the set.
    final Element child = new Element(Tag.valueOf("a"), "");
    parent.appendChild(child);

    assertTrue(seen.contains(parent));
  }
예제 #4
0
  /**
   * Appends this element's opening tag (name and attributes) to {@code accum}.
   *
   * <p>When pretty-printing into a non-empty buffer, the tag is indented first if this element or
   * its parent formats as a block, or if outline mode is on. A childless element with a
   * self-closing tag is terminated with {@code " />"}, everything else with {@code ">"}.
   *
   * @param accum buffer receiving the HTML
   * @param depth nesting depth passed on to {@code indent}
   * @param out output settings controlling pretty-print and outline behaviour
   */
  void outerHtmlHead(StringBuilder accum, int depth, Document.OutputSettings out) {
    if (accum.length() > 0 && out.prettyPrint()) {
      final boolean blockContext =
          tag.formatAsBlock() || (parent() != null && parent().tag().formatAsBlock());
      if (blockContext || out.outline()) {
        indent(accum, depth, out);
      }
    }

    accum.append("<").append(tagName());
    attributes.html(accum, out);

    final boolean selfClosed = childNodes.isEmpty() && tag.isSelfClosing();
    accum.append(selfClosed ? " />" : ">");
  }
예제 #5
0
 /**
  * Appends this element's closing tag to {@code accum}, unless the element is childless and its
  * tag is self-closing (in which case nothing is written).
  *
  * <p>When pretty-printing an element that has children, the closing tag is indented if the tag
  * formats as a block, or if outline mode is on and the content is anything other than a single
  * text node.
  *
  * @param accum buffer receiving the HTML
  * @param depth nesting depth passed on to {@code indent}
  * @param out output settings controlling pretty-print and outline behaviour
  */
 void outerHtmlTail(StringBuilder accum, int depth, Document.OutputSettings out) {
   if (childNodes.isEmpty() && tag.isSelfClosing()) {
     return;
   }

   if (out.prettyPrint() && !childNodes.isEmpty()) {
     boolean indentClosingTag = tag.formatAsBlock();
     if (!indentClosingTag && out.outline()) {
       // childNodes is non-empty here, so "not more than one" means exactly one child
       indentClosingTag = childNodes.size() > 1 || !(childNodes.get(0) instanceof TextNode);
     }
     if (indentClosingTag) {
       indent(accum, depth, out);
     }
   }

   accum.append("</").append(tagName()).append(">");
 }
예제 #6
0
  /**
   * Walks the child nodes of {@code e}, counting whitespace-separated words found in text nodes
   * against the running counter {@code c}, and inserts a marker element each time the word count
   * exceeds {@code PAGE_COUNT}.
   *
   * <p>A text node that received at least one marker is replaced by a {@code span} holding the
   * resulting sequence of text and marker nodes; text nodes without a marker are left untouched.
   * Child elements are processed recursively.
   *
   * @param e the element whose children are scanned (and possibly rewritten in place)
   * @param c the running word/page counter; it is updated in place
   * @return {@code c} itself, or a new {@code Count} carrying the same two values when {@code e}
   *     has no child nodes
   */
  private static Count modify(Element e, Count c) {
    List<Node> o = e.childNodes();
    // Nothing to scan: hand back a copy of the current counters.
    if (o.size() == 0 && e.textNodes().size() == 0) return new Count(c.getCount(), c.getPgCount());
    for (Node n : o) {
      if (n instanceof TextNode) {
        TextNode nd = (TextNode) n;
        String[] arr = nd.text().trim().split("\\s");
        String txt = ""; // NOTE(review): never read; left over from the commented-out approach
        // Replacement sequence for this text node; index j always points at the text node that
        // is currently being filled with words.
        List<Node> nodes = new ArrayList<Node>();
        int j = 0;
        // NOTE(review): the second constructor argument is " " here but "" for the text nodes
        // created below - confirm which is intended.
        TextNode ndTemp = new TextNode("", " ");
        nodes.add(j, ndTemp);
        for (int i = 0; i < arr.length; i++) {
          // Consecutive whitespace yields empty tokens from split; those are not counted.
          if (arr[i].length() > 0) c.incrementCount();
          if (c.getCount() > PAGE_COUNT) {
            // Limit exceeded: close the current text node, add a marker element whose tag name
            // is "pageid=<page counter>", and start a new text node with the current word.
            ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " ");
            j++;
            nodes.add(j, new Element(Tag.valueOf("pageid=" + c.getPgCount()), ""));
            j++;
            nodes.add(j, new TextNode(" " + arr[i] + " ", ""));
            // An earlier variant (kept here only as a note) embedded the marker in the text as
            // an HTML comment or a hidden div instead of a separate element.
            c.incrementPgCount();
            c.setCount(0);

          } else {
            // Still within the limit: append the word to the current text node.
            ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " " + arr[i]);
          }
        }
        // More than one node means at least one marker was inserted; only then is the original
        // text node swapped for a span wrapping the new sequence.
        if (nodes.size() > 1) {
          Element etemp = new Element(Tag.valueOf("span"), "");
          nd.replaceWith(etemp);
          for (Node d : nodes) {
            etemp.appendChild(d);
          }
        }

      } else if (n instanceof Element) {
        // Recurse with the same counter, then copy the returned values back into c (the result
        // may be c itself or the copy produced by the early return above).
        Count ctemp = modify((Element) n, c);
        c.setCount(ctemp.getCount());
        c.setPgCount(ctemp.getPgCount());
      }
    }

    return c;
  }
예제 #7
0
 /**
  * Combines the superclass hash with the hash of this element's tag (0 when the tag is null).
  *
  * @return the combined hash code
  */
 @Override
 public int hashCode() {
   // todo: fixup, not very useful
   final int inherited = super.hashCode();
   final int tagHash = (tag == null) ? 0 : tag.hashCode();
   return 31 * inherited + tagHash;
 }
예제 #8
0
 /**
  * Initialises the mocks of this test class and wraps a bare {@code meta} element (base URI
  * {@code about:blank}, no attributes) in the {@code JSoupMetaElement} under test.
  */
 @Before
 public void setUp() throws IOException {
   initMocks(this);

   final Tag metaTag = Tag.valueOf("meta");
   final Element rawMeta = new Element(metaTag, "about:blank", new Attributes());
   metaElement = new JSoupMetaElement(rawMeta, ownerDocument);
 }
예제 #9
0
 /**
  * Reads the EPUB at {@code bookPath}, rewrites every content resource and writes the result to
  * {@code dest}.
  *
  * <p>For each resource the markup is parsed, a {@code meta http-equiv="content-type"} element
  * declaring UTF-8 is added, the body is passed through {@code alterElement} and {@code modify}
  * (which carries one running {@code Count} across all resources), and the serialised document
  * replaces the resource data. Resources without a media type get {@code html/html}.
  *
  * <p>Resource bytes are decoded and re-encoded explicitly as UTF-8 so that the result matches
  * the declared charset regardless of the platform default, and both file streams are closed
  * even when reading or writing fails.
  *
  * @param bookPath path of the EPUB file to read
  * @param dest path of the EPUB file to write
  * @throws FileNotFoundException if {@code bookPath} cannot be opened or {@code dest} cannot be
  *     created
  * @throws IOException if reading, decoding or writing fails
  */
 public static void processEpub(String bookPath, String dest)
     throws FileNotFoundException, IOException {
   final Book b;
   try (FileInputStream in = new FileInputStream(new File(bookPath))) {
     b = new EpubReader().readEpub(in);
   }

   Count cnt = new Count(0, 0);
   for (Resource res : b.getContents()) {
     String content = new String(res.getData(), "UTF-8");
     // NOTE(review): the second argument of Jsoup.parse(String, String) is the base URI, not a
     // charset; kept as-is because nothing here resolves relative URLs - confirm and clean up.
     Document doc = Jsoup.parse(content, "UTF-8");

     Element elem = new Element(Tag.valueOf("meta"), "");
     elem.attr("http-equiv", "content-type");
     elem.attr("content", "text/html; charset=utf-8");
     // NOTE(review): after() adds the meta as a sibling following <head>, not as a child of it;
     // appendChild() may have been intended - confirm before changing.
     doc.head().after(elem);
     System.out.println(doc.head().data());

     Element ele = doc.body();
     alterElement(ele);
     Count cTemp = modify(ele, cnt);
     cnt.setCount(cTemp.getCount());
     cnt.setPgCount(cTemp.getPgCount());
     doc.body().html(ele.html());

     res.setData(doc.html().getBytes("UTF-8"));
     if (res.getMediaType() == null) res.setMediaType(new MediaType("html", "html"));
   }

   try (FileOutputStream out = new FileOutputStream(new File(dest))) {
     new EpubWriter().write(b, out);
   }
 }
 /**
  * Builds a simplified copy of {@code el}.
  *
  * <p>{@code img}, {@code a} and {@code div} elements become {@code x} elements and {@code em}
  * becomes {@code b}; any other element keeps its tag. Images and some links are reduced to a
  * single text child (image source, {@code @user}, video/thumbnail description, or link target).
  * In all other cases the children are copied: child elements recursively through this method,
  * every other node as a text node holding its {@code toString()} output.
  *
  * @param el the element to copy; it is not modified
  * @return a new, parentless element
  */
 private Element cleanupElement(Element el) {
   final String name = el.nodeName();
   Tag replacementTag = null;
   String replacementText = null;

   if (name.equals("img")) {
     replacementTag = Tag.valueOf("x");
     replacementText = el.attr("src");
   } else if (name.equals("em")) {
     replacementTag = Tag.valueOf("b");
   } else if (name.equals("a")) {
     final String cssClass = el.attr("class");
     // every kind of link is rewritten to <x>; only the replacement text differs
     replacementTag = Tag.valueOf("x");
     if (cssClass.equals("user")) {
       replacementText = "@" + el.text().trim();
     } else if (cssClass.startsWith("postimg video")) {
       replacementText =
           "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src");
     } else if (!cssClass.startsWith("postimg") && !cssClass.equals("post")) {
       replacementText = el.attr("href");
     }
     // "postimg..." and "post" links keep their children instead of getting a text
   } else if (name.equals("div")) {
     replacementTag = Tag.valueOf("x");
   }

   final Tag resultTag = (replacementTag != null) ? replacementTag : el.tag();
   final Element cleaned = new Element(resultTag, "");

   if (replacementText != null) {
     cleaned.appendChild(new TextNode(replacementText, ""));
     return cleaned;
   }

   for (Node child : el.childNodes()) {
     final Node copy =
         (child instanceof Element)
             ? cleanupElement((Element) child)
             : new TextNode(child.toString(), "");
     cleaned.appendChild(copy);
   }
   return cleaned;
 }
예제 #11
0
  /**
   * Checks {@code hasClass} against class attribute values with leading, trailing and repeated
   * whitespace, as well as values that merely contain the class name as a substring.
   */
  @Test
  public void testHasClassDomMethods() {
    final Attributes attribs = new Attributes();
    final Element el = new Element(Tag.valueOf("a"), "", attribs);

    // values in which "toto" is present as a whole class name
    final String[] withToto = {"toto", " toto", "toto ", "\ttoto ", "  toto "};
    for (String classValue : withToto) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("toto"));
    }

    // values in which "toto" is absent or only part of a longer name
    final String[] withoutToto = {"ab", "     ", "tototo"};
    for (String classValue : withoutToto) {
      attribs.put("class", classValue);
      assertFalse(el.hasClass("toto"));
    }

    // a longer class name at the start, middle and end of the attribute
    final String[] withRaulpismuth = {
      "raulpismuth  ",
      " abcd  raulpismuth efgh ",
      " abcd efgh raulpismuth",
      " abcd efgh raulpismuth "
    };
    for (String classValue : withRaulpismuth) {
      attribs.put("class", classValue);
      assertTrue(el.hasClass("raulpismuth"));
    }
  }
예제 #12
0
  /**
   * Creates a new element with the same tag name and base URI as {@code sourceEl}, carrying only
   * the attributes the whitelist accepts plus the attributes the whitelist enforces for that tag.
   *
   * @param sourceEl the element to copy from; it is not modified
   * @return the new element together with the number of attributes that were rejected
   */
  private ElementMeta createSafeElement(Element sourceEl) {
    final String tagName = sourceEl.tagName();
    final Attributes keptAttrs = new Attributes();
    final Element safeEl = new Element(Tag.valueOf(tagName), sourceEl.baseUri(), keptAttrs);

    int dropped = 0;
    for (Attribute candidate : sourceEl.attributes()) {
      if (!whitelist.isSafeAttribute(tagName, sourceEl, candidate)) {
        dropped++;
        continue;
      }
      keptAttrs.put(candidate);
    }

    // enforced attributes are added after the accepted ones
    keptAttrs.addAll(whitelist.getEnforcedAttributes(tagName));

    return new ElementMeta(safeEl, dropped);
  }
예제 #13
0
  /**
   * Pads columns with blank cells for lines on which they have no content. Intended for tables
   * with many empty cells (matrix-style tables).
   *
   * <p>For every line, the columns are scanned in order. A column that already holds a cell on
   * that line is skipped. The first column whose cells reach a later line without matching the
   * current one receives a single blank cell, after which processing moves on to the next line.
   */
  private void fillBlankCells() {
    for (Line line : data) {
      final int lineNumber = line.getLineNumber();
      columns:
      for (Column2 column : dataInColumns) {
        for (Cell cell : column.getCellObjects()) {
          if (cell.getLineNumber() == lineNumber) {
            // this column already has a cell on the line; look at the next column
            break;
          }
          if (cell.getLineNumber() > line.getLineNumber()) {
            // passed the line without a match: add a blank cell and stop scanning columns
            final ArrayList<Element> blankCell = new ArrayList<Element>();
            blankCell.add(buildBlankWord(column, line));
            column.addCell(blankCell);
            break columns;
          }
        }
      }
    }
  }

  /**
   * Creates a word span containing a single space, shaped like
   * {@code <span class='ocrx_word' id='...' title="bbox x1 y1 x2 y2">}, whose bounding box uses
   * the column's average x coordinates and the line's average y coordinates.
   */
  private Element buildBlankWord(Column2 column, Line line) {
    final Tag spanTag = Tag.valueOf("span");
    final Attributes attrs = new Attributes();
    attrs.put("class", "ocrx_word");
    attrs.put("id", "word_ADDEDBYTEA");
    attrs.put(
        "title",
        "bbox "
            + column.getAverageX1()
            + " "
            + (int) line.getAverageY1()
            + " "
            + column.getAverageX2()
            + " "
            + (int) line.getAverageY2());

    final Element blank = new Element(spanTag, "localhost:8080", attrs);
    blank.text(" ");
    return blank;
  }
예제 #14
0
 /**
  * Handles the "next node" selector part.
  *
  * <p>When exactly one element is currently selected, its next sibling node is rendered with
  * {@code toString()} and wrapped as the {@code value} attribute of a synthetic
  * {@code nextnode} element; if there is no next sibling the result is empty. For any other
  * number of selected elements the current selection is returned unchanged.
  *
  * @param query the selector part; must equal {@code NEXT_NODE_TAG}
  * @return the resulting elements
  * @throws IllegalArgumentException if {@code query} is not the next-node tag
  */
 private Elements parseNextNode(String query) {
   if (!NEXT_NODE_TAG.equals(query)) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   if (elements.size() != 1) {
     return elements;
   }

   final Elements result = new Elements();
   final Node sibling = elements.first().nextSibling();
   if (sibling == null) {
     return result;
   }

   final Attributes attrs = new Attributes();
   attrs.put("value", sibling.toString());
   result.add(new Element(Tag.valueOf("nextnode"), "", attrs));
   return result;
 }
예제 #15
0
  /**
   * Checks boolean attributes: setting one to {@code true} adds a value-less attribute, setting
   * one to {@code false} removes a previously set attribute of that name.
   */
  @Test
  public void testAddBooleanAttribute() {
    final Element element = new Element(Tag.valueOf("div"), "");

    element.attr("true", true);

    // set a regular value first, then switch the attribute off
    element.attr("false", "value");
    element.attr("false", false);

    assertTrue(element.hasAttr("true"));
    assertEquals("", element.attr("true"));

    final List<Attribute> remaining = element.attributes().asList();
    assertEquals("There should be one attribute", 1, remaining.size());
    final Attribute only = remaining.get(0);
    assertTrue("Attribute should be boolean", only instanceof BooleanAttribute);

    assertFalse(element.hasAttr("false"));

    assertEquals("<div true></div>", element.outerHtml());
  }
예제 #16
0
 /**
  * Replaces this element's tag, e.g. {@code el.tagName("div");} turns a {@code <span>} into a
  * {@code <div>}.
  *
  * @param tagName the new tag name; validated with {@code Validate.notEmpty}
  * @return this element, so calls can be chained
  */
 public Element tagName(String tagName) {
   Validate.notEmpty(tagName, "Tag name must not be empty.");
   final Tag replacement = Tag.valueOf(tagName);
   this.tag = replacement;
   return this;
 }
예제 #17
0
  /**
   * Downloads {@code originalUrl}, reduces the page body to its {@code div.content-main}
   * element and saves the resulting document to {@code htmlfile}.
   *
   * <p>Redirects are not followed. If the page has no {@code <base>} element, one pointing at
   * the head's base URI is inserted as the first child of the head. An {@code h1.title} inside
   * the content element is removed when present; a page without such a title is still
   * processed. Saving is attempted at most twice.
   *
   * @param htmlfile the file the processed HTML is written to
   * @param originalUrl the URL of the page to fetch
   * @param intent not used by this method
   * @return the result of {@code StringUtils.file2Url} for the saved file, or {@code
   *     PROCESS_FAILED_URL} when the page is empty, the content element is missing, saving
   *     fails, or an I/O error occurs
   */
  @Override
  protected String doProcess(File htmlfile, String originalUrl, Intent intent) {
    try {
      Connection coon = HttpConnection.connect(originalUrl);
      // we don't want it be redirected to other page,example: 10.254.7.4
      coon.followRedirects(false);
      Document doc = coon.get();
      Element head = doc.head();
      Element body = doc.body();
      if (body.children().size() == 0) {
        Log.e(TAG, "body has no child with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      Elements base = head.select("base");
      if (base.isEmpty()) {
        String b = head.baseUri();
        Attributes attrs = new Attributes();
        attrs.put("href", b);
        ArrayList<Element> a = new ArrayList<>();
        a.add(new Element(Tag.valueOf("base"), b, attrs));
        head.insertChildren(0, a);
      }

      Element div = doc.select("div.content-main").first();
      if (div == null) {
        Log.e(TAG, "not found specific element with url=" + originalUrl);
        return PROCESS_FAILED_URL;
      }

      // first() returns null when nothing matches; a missing title must not abort processing
      Element title = div.select("h1.title").first();
      if (title != null) {
        title.remove();
      }

      body.empty();
      ArrayList<Element> a = new ArrayList<>();
      a.add(div);
      body.insertChildren(0, a);

      // try two times.
      for (int attempt = 0; attempt < 2; attempt++) {
        if (FileUtil.saveStringToFile(doc.toString(), htmlfile, false)) {
          return StringUtils.file2Url(htmlfile, PROCESS_FAILED_URL);
        }
      }
      Log.e(TAG, "save html to file failed with url=" + originalUrl);
    } catch (MalformedURLException e) {
      Log.e(TAG, "malformed url=" + originalUrl, e);
    } catch (IOException e) {
      Log.e(TAG, "io error while processing url=" + originalUrl, e);
    }
    return PROCESS_FAILED_URL;
  }
예제 #18
0
 /**
  * Creates an element with the given tag name and this element's base URI, and inserts it via
  * {@code prependChild}.
  *
  * @param tagName the name of the tag to create (e.g. {@code div})
  * @return the newly created element, so content can be added to it directly, e.g.: {@code
  *     parent.prependElement("h1").attr("id", "header").text("Welcome");}
  */
 public Element prependElement(String tagName) {
   final Tag childTag = Tag.valueOf(tagName);
   final Element created = new Element(childTag, baseUri());
   prependChild(created);
   return created;
 }
예제 #19
0
 /**
  * Test if this element is a block-level element, as reported by its tag (e.g. a {@code <div>}).
  * Elements whose tag is not block-level are treated as inline.
  *
  * @return true if the element's tag is a block tag, false if not (and thus inline)
  */
 public boolean isBlock() {
   return tag.isBlock();
 }
예제 #20
0
 /**
  * Get the name of this element's tag, e.g. {@code div}. The value is read from the element's
  * current tag.
  *
  * @return the tag name
  */
 public String tagName() {
   return tag.getName();
 }
예제 #21
0
 /**
  * Get the node name of this element, which is the name of its tag.
  *
  * @return the tag name
  */
 @Override
 public String nodeName() {
   return tag.getName();
 }