public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
private void recurse(Element element) { ElementAction action = classifyElement(element); if (action == ElementAction.Whitespace || action == ElementAction.Sentence) { appendSpace(); } for (Node childNode : element.childNodes()) { // n.b., cdata not possible if we are coming from TagSoup. If we also handle // real xhtml by directly parsing it, then we have another story on our hands. // though we could use canonical XML to get rid of them. if (childNode instanceof TextNode && action != ElementAction.Banned) { TextNode textContent = (TextNode) childNode; String textString = textContent.text(); append(textContent, textString); } else if (childNode instanceof Element) { recurse((Element) childNode); } } if (action == ElementAction.Whitespace) { appendSpace(); } else if (action == ElementAction.Sentence) { appendPeriod(); } else if (action == ElementAction.Mark) { Mark mark = new Mark(); mark.setOffset(pcDataOffset); mark.setTag(element.tagName()); } }
/**
 * Returns the text of {@code tn}, trimmed on either side depending on its neighbours.
 *
 * <p>The leading side is trimmed when the previous sibling is a block, when there is no
 * previous sibling and the parent is a block, or (only if {@code normalText}) when the
 * previous sibling is a text node whose text matches {@code EMPTY_MATCHER}. The trailing
 * side follows the same rules using the next sibling, and is only considered if the
 * text is still non-empty after the leading trim.
 *
 * @param tn         the text node to read
 * @param normalText {@code true} to start from {@code tn.text()}, {@code false} to start
 *                   from {@code tn.getWholeText()}
 * @return the possibly trimmed text
 */
private String getTextNodeText(TextNode tn, boolean normalText) {
    String input = normalText ? tn.text() : tn.getWholeText();
    final Node prev = tn.previousSibling();
    final Node next = tn.nextSibling();
    final boolean parentIsBlock = isBlock(tn.parent());

    final boolean trimLeading =
            isBlock(prev)
                    || (prev == null && parentIsBlock)
                    || (normalText
                            && prev instanceof TextNode
                            && EMPTY_MATCHER.matcher(((TextNode) prev).text()).matches());
    if (trimLeading) {
        input = ltrim(input);
    }

    if (input.length() == 0) {
        return input;
    }

    final boolean trimTrailing =
            isBlock(next)
                    || (next == null && parentIsBlock)
                    || (normalText
                            && next instanceof TextNode
                            && EMPTY_MATCHER.matcher(((TextNode) next).text()).matches());
    if (trimTrailing) {
        input = rtrim(input);
    }
    return input;
}
/**
 * An unterminated comment after text inside a paragraph is parsed as a comment node:
 * the paragraph keeps "Hello" as its text and the rest of the input becomes the
 * comment data.
 */
@Test
public void parsesUnterminatedComments() {
    Document doc = Jsoup.parse("<p>Hello<!-- <tr><td>");
    Element paragraph = doc.getElementsByTag("p").get(0);

    // The comment contributes nothing to the paragraph's visible text.
    assertEquals("Hello", paragraph.text());

    TextNode leadingText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", leadingText.getWholeText());

    Comment trailingComment = (Comment) paragraph.childNode(1);
    assertEquals(" <tr><td>", trailingComment.getData());
}
/**
 * Markup inside a comment is kept verbatim as comment data and is not parsed into
 * elements; the paragraph that follows the comment is parsed normally.
 */
@Test
public void parsesComments() {
    Document doc = Jsoup.parse(
            "<html><head></head><body><!-- <table><tr><td></table> --><p>Hello</p></body></html>");
    Element body = doc.child(1);

    // The first child node of the body is the comment itself.
    Comment leadingComment = (Comment) body.childNode(0);
    assertEquals(" <table><tr><td></table> ", leadingComment.getData());

    // child(0) is the first child *element*, i.e. the paragraph after the comment.
    Element paragraph = body.child(0);
    TextNode paragraphText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
}
private static Count modify(Element e, Count c) { List<Node> o = e.childNodes(); if (o.size() == 0 && e.textNodes().size() == 0) return new Count(c.getCount(), c.getPgCount()); for (Node n : o) { if (n instanceof TextNode) { TextNode nd = (TextNode) n; String[] arr = nd.text().trim().split("\\s"); String txt = ""; List<Node> nodes = new ArrayList<Node>(); int j = 0; TextNode ndTemp = new TextNode("", " "); nodes.add(j, ndTemp); for (int i = 0; i < arr.length; i++) { if (arr[i].length() > 0) c.incrementCount(); if (c.getCount() > PAGE_COUNT) { ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " "); j++; nodes.add(j, new Element(Tag.valueOf("pageid=" + c.getPgCount()), "")); j++; nodes.add(j, new TextNode(" " + arr[i] + " ", "")); // "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]); // txt = txt + " " + "<!--page id="+c.getPgCount()+ "--!>" + " " + arr[i]; //<div // style='visibility:hidden'>Page="+pageCount+"</div> c.incrementPgCount(); c.setCount(0); } else { // txt = txt + " " + arr[i]; ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " " + arr[i]); } } if (nodes.size() > 1) { Element etemp = new Element(Tag.valueOf("span"), ""); nd.replaceWith(etemp); for (Node d : nodes) { etemp.appendChild(d); } } // nd.text(ndTemp.text()); } else if (n instanceof Element) { Count ctemp = modify((Element) n, c); c.setCount(ctemp.getCount()); c.setPgCount(ctemp.getPgCount()); } } return c; }
/**
 * Runs every configured matcher against {@code element} and collects the extracted
 * values, keyed by the matcher's name.
 *
 * <p>The matcher groups are applied in this order, and a later group overwrites an
 * earlier value stored under the same key:
 * <ol>
 *   <li>{@code matchers}: decoded {@code element.text()}</li>
 *   <li>{@code textMatchers}: decoded outer HTML of the element's next sibling, if any</li>
 *   <li>{@code subtextMatchers}: decoded outer HTML of the element's first text-node
 *       child, if any</li>
 *   <li>{@code htmlMatchers}: {@code element.html()}</li>
 *   <li>{@code ptextMatchers}: {@code plainTextFormatter.getPlainText(element)}</li>
 *   <li>{@code attrMatchers}: the value of the attribute named by the entry's second
 *       array slot (the first slot is the matcher)</li>
 * </ol>
 *
 * @param element the element to test
 * @return a new mutable map of extracted attributes; empty when nothing matched
 */
public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();

    for (Entry<String, Matcher> entry : matchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), decode(element.text()));
        }
    }

    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            Node sibling = element.nextSibling();
            if (null != sibling) {
                attributes.put(entry.getKey(), decode(sibling.outerHtml()));
            }
        }
    }

    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            // An element without text-node children yields an empty list, so check
            // emptiness before get(0); get(0) on an empty list throws
            // IndexOutOfBoundsException rather than returning null.
            java.util.List<TextNode> textNodes = element.textNodes();
            if (!textNodes.isEmpty()) {
                attributes.put(entry.getKey(), decode(textNodes.get(0).outerHtml()));
            }
        }
    }

    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), element.html());
        }
    }

    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
        }
    }

    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
        // Each value is a pair: [0] the matcher, [1] the attribute name to read.
        Object[] objects = entry.getValue();
        Matcher matcher = (Matcher) objects[0];
        String attr = (String) objects[1];
        if (matcher.test(element)) {
            attributes.put(entry.getKey(), element.attr(attr));
        }
    }

    return attributes;
}
/**
 * Appends the text of {@code e}'s descendants to {@code accum}, skipping every child
 * for which {@code unlikely(child)} is true.
 *
 * <p>Before descending into a child element a single space is appended when either
 * the accumulator is non-empty, the child is a block element and the accumulator does
 * not already end in whitespace, or the child is a {@code br} tag.
 *
 * @param e     the element whose children are read
 * @param accum the buffer that receives the text
 */
void appendTextSkipHidden(Element e, StringBuilder accum) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }

        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }

        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean blockNeedsSeparator =
                accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
        if (blockNeedsSeparator || nested.tagName().equals("br")) {
            accum.append(" ");
        }
        appendTextSkipHidden(nested, accum);
    }
}
/**
 * Extracts the reply time from {@code textNode} using {@code PATTERN_REPLY_TIME} and
 * stores the first capture group on {@code topicBuilder}.
 *
 * @param topicBuilder the builder that receives the reply time
 * @param textNode     the text node whose text is searched
 * @throws FatalException if the pattern is not found in the node's text
 */
private static void parseReplyTime(Topic.Builder topicBuilder, TextNode textNode) {
    final String raw = textNode.text();
    final Matcher replyTimeMatcher = PATTERN_REPLY_TIME.matcher(raw);
    if (replyTimeMatcher.find()) {
        topicBuilder.setReplyTime(replyTimeMatcher.group(1));
        return;
    }
    throw new FatalException("match reply time for topic failed: " + raw);
}
/** Walks the DOM recursively, and converts elements into corresponding sitebricks widgets. */ @NotNull private <N extends Node> WidgetChain walk(PageCompilingContext pc, N node) { WidgetChain widgetChain = Chains.proceeding(); for (Node n : node.childNodes()) { if (n instanceof Element) { final Element child = (Element) n; // push form if this is a form tag if (child.tagName().equals("form")) pc.form = (Element) n; // setup a lexical scope if we're going into a repeat widget (by reading the previous node) final boolean shouldPopScope = lexicalClimb(pc, child); // continue recursing down, perform a post-order, depth-first traversal of the DOM WidgetChain childsChildren; try { childsChildren = walk(pc, child); // process the widget itself into a Renderable with child tree widgetChain.addWidget(widgetize(pc, child, childsChildren)); } finally { lexicalDescend(pc, child, shouldPopScope); } } else if (n instanceof TextNode) { TextNode child = (TextNode) n; Renderable textWidget; // setup a lexical scope if we're going into a repeat widget (by reading the previous node) final boolean shouldPopScope = lexicalClimb(pc, child); // construct the text widget try { textWidget = registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek()); // if there are no annotations, add the text widget to the chain if (!child.hasAttr(ANNOTATION_KEY)) { widgetChain.addWidget(textWidget); } else { // construct a new widget chain for this text node WidgetChain childsChildren = Chains.proceeding().addWidget(textWidget); // make a new widget for the annotation, making the text chain the child String widgetName = child.attr(ANNOTATION_KEY).toLowerCase(); Renderable annotationWidget = registry.newWidget( widgetName, child.attr(ANNOTATION_CONTENT), childsChildren, pc.lexicalScopes.peek()); widgetChain.addWidget(annotationWidget); } } catch (ExpressionCompileException e) { pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e)); } if (shouldPopScope) pc.lexicalScopes.pop(); } 
else if ((n instanceof Comment) || (n instanceof DataNode)) { // process as raw text widget try { widgetChain.addWidget(registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek())); } catch (ExpressionCompileException e) { pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e)); } } else if (n instanceof XmlDeclaration) { try { widgetChain.addWidget( registry.xmlDirectiveWidget( ((XmlDeclaration) n).getWholeDeclaration(), pc.lexicalScopes.peek())); } catch (ExpressionCompileException e) { pc.errors.add(CompileError.in(node.outerHtml()).near(line(node)).causedBy(e)); } } } // return computed chain, or a terminal return widgetChain; }
/**
 * Appends the text of {@code textNode} to {@code accum}.
 *
 * <p>When {@code preserveWhitespace} reports true for the node's parent, the whole text
 * is appended unchanged. Otherwise it is appended through
 * {@code StringUtil.appendNormalisedWhitespace}, which is also told whether the
 * accumulator currently ends in whitespace.
 *
 * @param accum    the buffer that receives the text
 * @param textNode the text node to read
 */
private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    if (!preserveWhitespace(textNode.parentNode())) {
        StringUtil.appendNormalisedWhitespace(accum, wholeText, lastCharIsWhitespace(accum));
        return;
    }
    accum.append(wholeText);
}