/**
 * Extracts the text of a text node and trims its edges according to the node's neighbours.
 *
 * <p>The starting text is {@code tn.text()} when {@code normalText} is set, otherwise
 * {@code tn.getWholeText()}. The leading edge is passed through {@code ltrim} when any of
 * these hold, checked in this order:
 * <ol>
 *   <li>the previous sibling is a block (per {@code isBlock});</li>
 *   <li>there is no previous sibling and the parent is a block;</li>
 *   <li>{@code normalText} is set and the previous sibling is a {@link TextNode} whose
 *       {@code text()} matches {@code EMPTY_MATCHER}.</li>
 * </ol>
 * The trailing edge is handled the same way with the next sibling and {@code rtrim}, but only
 * if the text is non-empty after the leading-edge step.
 *
 * @param tn         the text node to read
 * @param normalText {@code true} to start from {@code text()}, {@code false} to start from
 *                   {@code getWholeText()}; also enables the neighbouring-text-node checks
 * @return the possibly trimmed text
 */
private String getTextNodeText(TextNode tn, boolean normalText) {
    String result = normalText ? tn.text() : tn.getWholeText();
    final Node before = tn.previousSibling();
    final Node after = tn.nextSibling();
    final boolean blockParent = isBlock(tn.parent());

    // Short-circuit evaluation keeps the same check order as an if / else-if chain.
    final boolean trimLeading = isBlock(before)
            || (before == null && blockParent)
            || (normalText
                    && before instanceof TextNode
                    && EMPTY_MATCHER.matcher(((TextNode) before).text()).matches());
    if (trimLeading) {
        result = ltrim(result);
    }

    // Nothing left to trim on the right-hand side.
    if (result.length() == 0) {
        return result;
    }

    final boolean trimTrailing = isBlock(after)
            || (after == null && blockParent)
            || (normalText
                    && after instanceof TextNode
                    && EMPTY_MATCHER.matcher(((TextNode) after).text()).matches());
    if (trimTrailing) {
        result = rtrim(result);
    }
    return result;
}
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
/**
 * Parses a paragraph followed by a comment that is never closed and checks that the
 * paragraph ends up with a text child ("Hello") followed by a comment child holding the
 * remaining input.
 */
@Test
public void parsesUnterminatedComments() {
    Document parsed = Jsoup.parse("<p>Hello<!-- <tr><td>");

    Element para = parsed.getElementsByTag("p").get(0);
    assertEquals("Hello", para.text());

    // first child: the text preceding the comment
    TextNode leadingText = (TextNode) para.childNode(0);
    assertEquals("Hello", leadingText.getWholeText());

    // second child: the comment, holding everything after the opening marker
    Comment openComment = (Comment) para.childNode(1);
    assertEquals(" <tr><td>", openComment.getData());
}
/**
 * Parses a body that starts with a comment containing table markup and checks that the
 * markup is kept verbatim as comment data, and that the following paragraph still carries
 * its text.
 */
@Test
public void parsesComments() {
    Document parsed = Jsoup.parse(
            "<html><head></head><body><!-- <table><tr><td></table> --><p>Hello</p></body></html>");

    // NOTE(review): relies on the document's second child element being the body - confirm
    // against the Document structure this test targets.
    Element bodyEl = parsed.child(1);

    // The comment is the first child node of the body.
    Comment tableComment = (Comment) bodyEl.childNode(0);
    assertEquals(" <table><tr><td></table> ", tableComment.getData());

    // The paragraph is the body's first child element.
    Element para = bodyEl.child(0);
    TextNode paraText = (TextNode) para.childNode(0);
    assertEquals("Hello", paraText.getWholeText());
}
/**
 * Appends the whole text of {@code textNode} to {@code accum}.
 *
 * <p>When {@code preserveWhitespace} reports true for the node's parent, the text is appended
 * unchanged. Otherwise it is handed to {@code StringUtil.appendNormalisedWhitespace}, together
 * with the result of {@code lastCharIsWhitespace(accum)}.
 *
 * @param accum    the builder receiving the text
 * @param textNode the node whose whole text is appended
 */
private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
    final String wholeText = textNode.getWholeText();

    if (preserveWhitespace(textNode.parentNode())) {
        // whitespace is significant here: copy verbatim
        accum.append(wholeText);
        return;
    }

    final boolean endsWithWhitespace = lastCharIsWhitespace(accum);
    StringUtil.appendNormalisedWhitespace(accum, wholeText, endsWithWhitespace);
}