public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
private Element cleanupElement(Element el) { Tag newTag = null; String newText = null; if (el.nodeName().equals("img")) { newTag = Tag.valueOf("x"); newText = el.attr("src"); } if (el.nodeName().equals("em")) { newTag = Tag.valueOf("b"); } if (el.nodeName().equals("a")) { String clazz = el.attr("class"); if (clazz.equals("user")) { newTag = Tag.valueOf("x"); newText = "@" + el.text().trim(); } else if (clazz.startsWith("postimg video")) { newTag = Tag.valueOf("x"); newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src"); } else if (clazz.startsWith("postimg")) { newTag = Tag.valueOf("x"); } else if (clazz.equals("post")) { newTag = Tag.valueOf("x"); } else { newTag = Tag.valueOf("x"); newText = el.attr("href"); } } if (el.nodeName().equals("div")) { newTag = Tag.valueOf("x"); } Element nel; if (newTag == null) { // el = el; nel = new Element(el.tag(), ""); // for(List<Node> children = nel.childNodes(); children.size() > 0; children = // nel.childNodes()) { // children.get(0).remove(); // } } else { nel = new Element(newTag, ""); } if (newText != null) { nel.appendChild(new TextNode(newText, "")); } else { List<Node> children = el.childNodes(); for (Node child : children) { if (child instanceof Element) { nel.appendChild(cleanupElement((Element) child)); } else { nel.appendChild(new TextNode(child.toString(), "")); } } } return nel; }
/**
 * Creates an element and appends the given nodes to it, in array order.
 *
 * @param tag tag of the new element
 * @param baseUri base URI passed to the element constructor
 * @param attrs attributes passed to the element constructor
 * @param contents nodes to append as children; must not be {@code null}
 * @return the newly created element
 */
public static Element createElement(Tag tag, String baseUri, Attributes attrs, Node[] contents) {
    final Element created = new Element(tag, baseUri, attrs);
    for (int i = 0; i < contents.length; i++) {
        created.appendChild(contents[i]);
    }
    return created;
}
/**
 * Replaces the content of this element with a single text node. Whatever the element held
 * before (text or child elements) is removed first.
 *
 * @param text unencoded text; must not be {@code null}
 * @return this element
 */
public Element text(String text) {
    Validate.notNull(text);

    empty();
    appendChild(new TextNode(text, baseUri));
    return this;
}
@Test public void testHashcodeIsStableWithContentChanges() { Element root = new Element(Tag.valueOf("root"), ""); HashSet<Element> set = new HashSet<Element>(); // Add root node: set.add(root); root.appendChild(new Element(Tag.valueOf("a"), "")); assertTrue(set.contains(root)); }
/** * Generates an html tree representation of the component hierarchy having the root * designContext.getRootComponent(). The hierarchy is stored under <body> in the tree. The * generated tree represents a valid html document. * * @param designContext a DesignContext object specifying the root component * (designContext.getRootComponent()) of the hierarchy * @return an html tree representation of the component hierarchy */ private static Document createHtml(DesignContext designContext) { // Create the html tree skeleton. Document doc = new Document(""); DocumentType docType = new DocumentType("html", "", "", ""); doc.appendChild(docType); Element html = doc.createElement("html"); doc.appendChild(html); html.appendChild(doc.createElement("head")); Element body = doc.createElement("body"); html.appendChild(body); // Append the design under <body> in the html tree. createNode // creates the entire component hierarchy rooted at the // given root node. Component root = designContext.getRootComponent(); if (root != null) { Node rootNode = designContext.createElement(root); body.appendChild(rootNode); } designContext.writePackageMappings(doc); return doc; }
private static Count modify(Element e, Count c) { List<Node> o = e.childNodes(); if (o.size() == 0 && e.textNodes().size() == 0) return new Count(c.getCount(), c.getPgCount()); for (Node n : o) { if (n instanceof TextNode) { TextNode nd = (TextNode) n; String[] arr = nd.text().trim().split("\\s"); String txt = ""; List<Node> nodes = new ArrayList<Node>(); int j = 0; TextNode ndTemp = new TextNode("", " "); nodes.add(j, ndTemp); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) c.incrementCount(); if (c.getCount() > PAGE_COUNT) { ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " "); j++; nodes.add(j, new Element(Tag.valueOf("pageid=" + c.getPgCount()), "")); j++; nodes.add(j, new TextNode(" " + arr[i] + " ", "")); // "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]); // txt = txt + " " + "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]; //<div // style='visibility:hidden'>Page="+pageCount+"</div> c.incrementPgCount(); c.setCount(0); } else { // txt = txt + " " + arr[i]; ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " " + arr[i]); } } if (nodes.size() > 1) { Element etemp = new Element(Tag.valueOf("span"), ""); nd.replaceWith(etemp); for (Node d : nodes) { etemp.appendChild(d); } } // nd.text(ndTemp.text()); } else if (n instanceof Element) { Count ctemp = modify((Element) n, c); c.setCount(ctemp.getCount()); c.setPgCount(ctemp.getPgCount()); } } return c; }
/** * Wrap the supplied HTML around this node. * * @param html HTML to wrap around this element, e.g. {@code <div class="head"></div>}. Can be * arbitrarily deep. * @return this node, for chaining. */ public Node wrap(String html) { Validate.notEmpty(html); Element context = parent() instanceof Element ? (Element) parent() : null; List<Node> wrapChildren = Parser.parseFragment(html, context, baseUri()); Node wrapNode = wrapChildren.get(0); if (wrapNode == null || !(wrapNode instanceof Element)) // nothing to wrap with; noop return null; Element wrap = (Element) wrapNode; Element deepest = getDeepChild(wrap); parentNode.replaceChild(this, wrap); deepest.addChildren(this); // remainder (unbalanced wrap, like <div></div><p></p> -- The <p> is remainder if (wrapChildren.size() > 0) { for (int i = 0; i < wrapChildren.size(); i++) { Node remainder = wrapChildren.get(i); remainder.parentNode.removeChild(remainder); wrap.appendChild(remainder); } } return this; }
/**
 * Appends every element of {@code children} to {@code parent}, in iteration order.
 *
 * @param parent the element that receives the children
 * @param children the elements to append
 * @return {@code parent}, to allow chaining
 */
public static Element appendChildren(final Element parent, final Elements children) {
    for (final Element child : children) {
        parent.appendChild(child);
    }
    return parent;
}
@Test public void shouldCleanHTMLContentReadability() throws IOException { // http://trib.al/KsvH2JE // http://tcrn.ch/1NxNAZJ String urll = "http://tcrn.ch/1NxNAZJ"; URL url = new URL(urll); final Readability extractor = new Readability(url, 10000); extractor.init(); String content = extractor.html(); Whitelist wl = Whitelist.relaxed(); // add additional tags here as necessary wl.addTags("figure"); String clean = Jsoup.clean(content, wl); Document mDocument = new Document(""); Element html = mDocument.createElement("html"); Element head = mDocument.createElement("head"); Element body = mDocument.createElement("body"); html.appendChild(head); body = body.append(clean).select("div").first(); html.appendChild(body); mDocument.appendChild(html); // Document document = Jsoup.parse(content); // Element head = document.head(); // String style = "<STYLE type=\"text/css\">"+ // "blockquote{"+ // " display:block;"+ // " background: #fff;"+ // " padding: 15px 20px 15px 45px;"+ // " margin: 0 0 20px;"+ // " position: relative;"+ // " "+ // " /*Font*/"+ // " font-family: Georgia, serif;"+ // " font-size: 16px;"+ // " line-height: 1.2;"+ // " color: #666;"+ // " text-align: justify;"+ // " "+ // " /*Borders - (Optional)*/"+ // " border-left: 15px solid #76AABA;"+ // " border-right: 2px solid #76AABA;"+ // " "+ // " /*Box Shadow - (Optional)*/"+ // " -moz-box-shadow: 2px 2px 15px #ccc;"+ // " -webkit-box-shadow: 2px 2px 15px #ccc;"+ // " box-shadow: 2px 2px 15px #ccc;"+ // "}"+ // ""+ // "blockquote::before{"+ // " content: \"\\201C\"; /*Unicode for Left Double Quote*/"+ // " "+ // " /*Font*/"+ // " font-family: Georgia, serif;"+ // " font-size: 60px;"+ // " font-weight: bold;"+ // " color: #999;"+ // " "+ // " /*Positioning*/"+ // " position: absolute;"+ // " left: 10px;"+ // " top:5px;"+ // "}"+ // ""+ // "blockquote::after{"+ // " /*Reset to make sure"+ // " content: \"\";*/"+ // " "+ // " content: \"\\201D\"; /*Unicode for Left Double Quote*/"+ // " "+ // " /*Font*/"+ // " 
font-family: Georgia, serif;"+ // " font-size: 60px;"+ // " font-weight: bold;"+ // " color: #999;"+ // " "+ // " /*Positioning*/"+ // " position: absolute;"+ // " right: 10px;"+ // " bottom:5px;"+ // "}"+ // ""+ // "blockquote a{"+ // " text-decoration: none;"+ // " background: #eee;"+ // " cursor: pointer;"+ // " padding: 0 3px;"+ // " color: #76AABA;"+ // "}"+ // ""+ // "blockquote a:hover{"+ // " color: #666;"+ // "}"+ // ""+ // "blockquote em{"+ // " font-style: italic;"+ // "}"+ // "</STYLE>"; // head.append(style); // // content = document.html(); LOG.debug("Article content : {}", mDocument.html()); // LOG.debug("Article content outer: {}", contentOuter); }
/**
 * Adds a new text node, holding the given text, as the last child of this element.
 *
 * @param text the unencoded text to add
 * @return this element
 */
public Element appendText(String text) {
    appendChild(new TextNode(text, baseUri()));
    return this;
}
/**
 * Creates an element with the given tag name and appends it as the last child of this element.
 *
 * @param tagName the name of the tag (e.g. {@code div}).
 * @return the newly created child (not this element), so that content can be added to it
 *     directly, e.g.: {@code parent.appendElement("h1").attr("id", "header").text("Welcome");}
 */
public Element appendElement(String tagName) {
    final Tag tag = Tag.valueOf(tagName);
    final Element created = new Element(tag, baseUri());
    appendChild(created);
    return created;
}