// @todo(dallison) handle rtl text private static void recurseNode(Node node, StringBuilder builder) { // boundry case for text if (node instanceof TextNode) { builder.append(((TextNode) node).text()); builder.append(" "); return; } // check for title text if (node.hasAttr(TITLE)) { builder.append(node.attr(TITLE)); builder.append(" "); } // if the current node has alt text append it if (node.hasAttr(ALT_TEXT)) { builder.append(node.attr(ALT_TEXT)); // Add trailing white space to separate elements builder.append(" "); } // recurse into the child nodes. for (Node child : node.childNodes()) { recurseNode(child, builder); } }
/**
 * Collects a clone of every {@code img} element found under {@code aInContent}.
 *
 * <p>The source elements are modified before they are cloned: an image carrying the class
 * {@code float-left} (or else {@code float-right}) gains {@code alignleft} (or
 * {@code alignright}) if it does not have it yet, and every image except the first one has its
 * {@code width} and {@code height} attributes removed.
 *
 * @param aInContent element to search for images
 * @return clones of the image nodes, in the order they were found; empty when there are none
 */
private static Collection<Node> extractImageNodes(Element aInContent) {
  Collection<Node> lCollected = new LinkedList<>();
  boolean lIsFirstImage = true;

  for (Element lImage : aInContent.getElementsByTag("img")) {
    if (lImage.hasClass("float-left")) {
      if (!lImage.hasClass("alignleft")) {
        lImage.addClass("alignleft");
      }
    } else if (lImage.hasClass("float-right")) {
      if (!lImage.hasClass("alignright")) {
        lImage.addClass("alignright");
      }
    }

    // Only the first image keeps its explicit dimensions.
    if (!lIsFirstImage) {
      lImage.removeAttr("width");
      lImage.removeAttr("height");
    }
    lIsFirstImage = false;

    lCollected.add(toNode(lImage).clone());
  }

  return lCollected;
}
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
@Test public void designIsSerializedWithCorrectPrefixesAndPackageNames() throws IOException { ByteArrayOutputStream out = serializeDesign(ctx); // Check the mapping from prefixes to package names using the html tree String[] expectedPrefixes = {"my"}; String[] expectedPackageNames = {"com.addon.mypackage"}; int index = 0; Document doc = Jsoup.parse(out.toString("UTF-8")); Element head = doc.head(); for (Node child : head.childNodes()) { if ("meta".equals(child.nodeName())) { String name = child.attributes().get("name"); if ("package-mapping".equals(name)) { String content = child.attributes().get("content"); String[] parts = content.split(":"); assertEquals("Unexpected prefix.", expectedPrefixes[index], parts[0]); assertEquals("Unexpected package name.", expectedPackageNames[index], parts[1]); index++; } } } assertEquals("Unexpected number of prefix - package name pairs.", 1, index); }
// Ensures that embed bound properties are writable private void checkEmbedAgainst( PageCompilingContext pc, EvaluatorCompiler compiler, Map<String, String> properties, Class<?> embedClass, Node node) { // TODO also type check them against expressions for (String property : properties.keySet()) { try { if (!compiler.isWritable(property)) { pc.errors.add( CompileError.in(node.outerHtml()) // TODO we need better line number detection if there is whitespace between the // annotation and tag. .near(node.siblingIndex() - 1) // TODO - line number of the annotation .causedBy( CompileErrors.PROPERTY_NOT_WRITEABLE, String.format( "Property %s#%s was not writable. Did you forget to create " + "a setter or @Visible annotation?", embedClass.getSimpleName(), property))); } } catch (ExpressionCompileException ece) { pc.errors.add( CompileError.in(node.outerHtml()) .near(node.siblingIndex()) // TODO - line number .causedBy(CompileErrors.ERROR_COMPILING_PROPERTY)); } } }
/**
 * Detaches {@code out} from this node's children.
 *
 * <p>The child is removed at its current sibling index, the remaining children are
 * re-indexed, and the child's parent reference is cleared.
 *
 * @param out the child to remove; its parent must be this node
 */
protected void removeChild(Node out) {
  Validate.isTrue(out.parentNode == this);
  // Keep this a primitive int so that List.remove(int) - removal by position - is chosen.
  final int position = out.siblingIndex();
  childNodes.remove(position);
  reindexChildren();
  out.parentNode = null;
}
/**
 * Returns the child nodes of {@code node}'s parent, i.e. the node together with its siblings.
 *
 * @param node the node whose siblings are wanted; must not be null
 * @return the parent's child nodes, or {@code null} when the node has no parent
 */
public List<Node> findSiblings(Node node) {
  Preconditions.checkNotNull(node);
  Node parentOfNode = node.parent();
  return (parentOfNode == null) ? null : parentOfNode.childNodes();
}
/**
 * Decides whether {@code e} is flagged as an "unlikely" node.
 *
 * <p>A node is flagged when its {@code class} attribute contains "caption" (ignoring case), or
 * when {@code unlikelyPattern} finds a match in its {@code style} or {@code class} attribute.
 *
 * @param e the node to inspect
 * @return {@code true} if the node is flagged by either check
 */
boolean unlikely(Node e) {
  String clazz = e.attr("class");
  // Lower-case with Locale.ROOT so the check does not depend on the JVM's default locale
  // (under a Turkish locale "CAPTION" would lower-case with a dotless i and never match).
  if (clazz != null && clazz.toLowerCase(Locale.ROOT).contains("caption")) {
    return true;
  }
  String style = e.attr("style");
  return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}
protected void addChildren(Node... children) { // most used. short circuit addChildren(int), which hits reindex children and array copy for (Node child : children) { reparentChild(child); childNodes.add(child); child.setSiblingIndex(childNodes.size() - 1); } }
/**
 * Looks up {@code aInElement} as a {@link Node} through its parent, using its sibling index.
 *
 * @param aInElement the element to resolve
 * @return the parent's child node at the element's sibling index
 * @throws RuntimeException if the name of the node found there differs from the element's tag
 *     name
 */
private static Node toNode(Element aInElement) {
  Node lResolved = aInElement.parent().childNode(aInElement.siblingIndex());
  boolean lSameName = lResolved.nodeName().equals(aInElement.tagName());
  if (lSameName) {
    return lResolved;
  }
  throw new RuntimeException(lResolved.nodeName() + " != " + aInElement.tagName());
}
/**
 * Tells whether {@code node} counts as empty.
 *
 * <p>A node is empty when it is {@code null}; when its node name is {@code #comment},
 * {@code #data}, {@code style} or {@code script}; when {@code isHidden} reports it hidden; when
 * {@code doFilter} is set and {@code isFiltered} reports it filtered; or when it is a
 * {@code #text} node whose string form is empty according to {@code CommonUtil.isEmpty}.
 *
 * @param node the node to test, may be null
 * @param doFilter whether the filter check should be applied
 * @return {@code true} if any of the conditions above holds
 */
public static boolean isEmpty(Node node, boolean doFilter) {
  if (node == null) {
    return true;
  }

  String name = node.nodeName();
  if (name.equals("#comment")
      || name.equals("#data")
      || name.equals("style")
      || name.equals("script")) {
    return true;
  }
  if (isHidden(node)) {
    return true;
  }
  if (doFilter && isFiltered(node)) {
    return true;
  }
  return name.equals("#text") && CommonUtil.isEmpty(node.toString(), true);
}
/**
 * Recursively removes every {@code #comment} node below {@code node}.
 *
 * @param node root of the subtree to clean
 */
private static void removeComments(Node node) {
  int position = 0;
  while (position < node.childNodes().size()) {
    Node current = node.childNode(position);
    boolean isComment = current.nodeName().equals("#comment");
    if (!isComment) {
      removeComments(current);
      position++;
    } else {
      // Do not advance: after the removal the next sibling sits at the same position.
      current.remove();
    }
  }
}
private Element cleanupElement(Element el) { Tag newTag = null; String newText = null; if (el.nodeName().equals("img")) { newTag = Tag.valueOf("x"); newText = el.attr("src"); } if (el.nodeName().equals("em")) { newTag = Tag.valueOf("b"); } if (el.nodeName().equals("a")) { String clazz = el.attr("class"); if (clazz.equals("user")) { newTag = Tag.valueOf("x"); newText = "@" + el.text().trim(); } else if (clazz.startsWith("postimg video")) { newTag = Tag.valueOf("x"); newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src"); } else if (clazz.startsWith("postimg")) { newTag = Tag.valueOf("x"); } else if (clazz.equals("post")) { newTag = Tag.valueOf("x"); } else { newTag = Tag.valueOf("x"); newText = el.attr("href"); } } if (el.nodeName().equals("div")) { newTag = Tag.valueOf("x"); } Element nel; if (newTag == null) { // el = el; nel = new Element(el.tag(), ""); // for(List<Node> children = nel.childNodes(); children.size() > 0; children = // nel.childNodes()) { // children.get(0).remove(); // } } else { nel = new Element(newTag, ""); } if (newText != null) { nel.appendChild(new TextNode(newText, "")); } else { List<Node> children = el.childNodes(); for (Node child : children) { if (child instanceof Element) { nel.appendChild(cleanupElement((Element) child)); } else { nel.appendChild(new TextNode(child.toString(), "")); } } } return nel; }
/**
 * Replaces the child {@code out} with {@code in}, at {@code out}'s position.
 *
 * <p>If {@code in} currently has a parent it is removed from that parent first. Afterwards
 * {@code in} is parented to this node with {@code out}'s sibling index and {@code out} has no
 * parent.
 *
 * <p>Replacing a node with itself is a no-op. Without that guard the node would first be
 * removed from this node and then written back at its stale sibling index, which either
 * overwrites a different child or fails with an index error.
 *
 * @param out the child to replace; its parent must be this node
 * @param in the replacement node; must not be null
 */
protected void replaceChild(Node out, Node in) {
  Validate.isTrue(out.parentNode == this);
  Validate.notNull(in);
  if (out == in) {
    return; // self replacement: nothing to do
  }
  if (in.parentNode != null) {
    in.parentNode.removeChild(in);
  }

  // Read the index only now: removing `in` above may have re-indexed this node's children.
  final int index = out.siblingIndex();
  childNodes.set(index, in);
  in.parentNode = this;
  in.setSiblingIndex(index);
  out.parentNode = null;
}
/**
 * Counts how many levels up the closest ancestor accepted by
 * {@code NodeUtil.isProximityBlock} is.
 *
 * @param node the node to start from
 * @return 1 for the parent, 2 for the grandparent, and so on; {@link Integer#MAX_VALUE} when no
 *     ancestor qualifies
 */
public static int nearestBlock(Node node) {
  int levelsUp = 1;
  for (Node ancestor = node.parent(); ancestor != null; ancestor = ancestor.parent()) {
    if (NodeUtil.isProximityBlock(ancestor.nodeName())) {
      return levelsUp;
    }
    levelsUp++;
  }
  return Integer.MAX_VALUE;
}
/**
 * Checks whether the node {@code test} matches the description {@code reference}.
 *
 * <p>Rules, in order:
 *
 * <ol>
 *   <li>A {@code null} node never matches.
 *   <li>If the reference has an id, only the id is compared (ignoring case); otherwise, if it
 *       has a name, only the name is compared.
 *   <li>Otherwise tag name, type, value, title, role, alt, href and - for elements - the
 *       stripped inner text are compared pairwise, ignoring case. Reference values that are
 *       empty are skipped. With {@code reference.any} the first equal pair yields
 *       {@code true}; without it the first differing pair yields {@code false}.
 *   <li>Then the lower-cased class names are compared: with {@code reference.any} the first
 *       class of the node that the reference also lists yields {@code true}; without it the
 *       first class of the node that the reference does not list yields {@code false}.
 *   <li>If nothing decided the outcome, the result is {@code !reference.any}.
 * </ol>
 *
 * @param reference the description to match against
 * @param test the node to test, may be null
 * @return whether the node matches
 */
public static boolean matches(HtmlNode reference, Node test) {
  if (test == null) {
    return false;
  }
  if (!CommonUtil.isEmpty(reference.id)) {
    return reference.id.equalsIgnoreCase(test.attr("id"));
  }
  if (!CommonUtil.isEmpty(reference.name)) {
    return reference.name.equalsIgnoreCase(test.attr("name"));
  }

  // Each entry is {expected value from the reference, actual value from the node}.
  List<String[]> pairs = new ArrayList<String[]>();
  pairs.add(new String[] {reference.tagName, test.nodeName()});
  pairs.add(new String[] {reference.type, test.attr("type")});
  pairs.add(new String[] {reference.value, test.attr("value")});
  pairs.add(new String[] {reference.title, test.attr("title")});
  pairs.add(new String[] {reference.role, test.attr("role")});
  pairs.add(new String[] {reference.alt, test.attr("alt")});
  pairs.add(new String[] {reference.href, test.attr("href")});
  if (test instanceof Element) {
    String expectedText = CommonUtil.strip(reference.innerText, false);
    String actualText = CommonUtil.strip(((Element) test).text(), false);
    pairs.add(new String[] {expectedText, actualText});
  }

  String[] refClassNames =
      CommonUtil.toString(reference.classes, " ").toLowerCase().split("\\s");
  String[] testClassNames = test.attr("class").toLowerCase().split("\\s");
  Collection<String> refClasses = new HashSet<String>(Arrays.asList(refClassNames));
  Collection<String> testClasses = new HashSet<String>(Arrays.asList(testClassNames));

  for (String[] pair : pairs) {
    String expected = pair[0];
    if (CommonUtil.isEmpty(expected)) {
      continue; // nothing required for this attribute
    }
    boolean same = expected.equalsIgnoreCase(pair[1]);
    if (reference.any) {
      if (same) {
        return true;
      }
    } else if (!same) {
      return false;
    }
  }

  // NOTE(review): String.split returns a one-element array for an empty input, so refClasses
  // presumably still holds "" when the reference lists no classes - confirm this guard ever
  // skips the loop.
  if (!refClasses.isEmpty()) {
    for (String testClass : testClasses) {
      boolean listed = refClasses.contains(testClass);
      if (reference.any) {
        if (listed) {
          return true;
        }
      } else if (!listed) {
        return false;
      }
    }
  }

  return !reference.any;
}
/**
 * Decides whether {@code node} is an item.
 *
 * <p>When {@code matchResult} is given, the node itself must match it. Otherwise, when
 * {@code matchParent} is given, the node must have a parent and that parent must match it. With
 * neither, the node name is compared, ignoring case, against the entries of {@code items}.
 *
 * @param node the node to classify
 * @param matchResult description the node itself has to match, or null
 * @param matchParent description the node's parent has to match, or null
 * @return whether the node is an item
 */
public static boolean isItem(Node node, HtmlNode matchResult, HtmlNode matchParent) {
  if (matchResult != null) {
    return matches(matchResult, node);
  }

  if (matchParent != null) {
    Node parent = node.parent();
    if (parent == null) {
      return false;
    }
    return matches(matchParent, parent);
  }

  String nodeName = node.nodeName();
  for (String itemName : items) {
    if (nodeName.equalsIgnoreCase(itemName)) {
      return true;
    }
  }
  return false;
}
private static String cleanHtml(final Node node) { if (node instanceof Element) { Element element = ((Element) node); StringBuilder accum = new StringBuilder(); accum.append("<").append(element.tagName()); for (Attribute attribute : element.attributes()) { if (!(attribute.getKey().startsWith("_"))) { accum.append(" "); accum.append(attribute.getKey()); accum.append("=\""); accum.append(attribute.getValue()); accum.append('"'); } } if (element.childNodes().isEmpty() && element.tag().isEmpty()) { accum.append(" />"); } else { accum.append(">"); for (Node child : element.childNodes()) accum.append(cleanHtml(child)); accum.append("</").append(element.tagName()).append(">"); } return accum.toString(); } else if (node instanceof TextNode) { return ((TextNode) node).getWholeText(); } else if (node instanceof XmlDeclaration) { // HACK if (node.childNodes().isEmpty()) { return ""; } return node.outerHtml(); } else if (node instanceof Comment) { // HACK: elide comments for now. return ""; } else if (node instanceof DataNode && node.childNodes().isEmpty()) { // No child nodes are defined but we have to handle content if such exists, example // <script language="JavaScript">var a = { name: "${user.name}"}</script> String content = node.attr("data"); if (Strings.empty(content)) { return ""; } return content; } else { return node.outerHtml(); } }
/**
 * Runs every configured matcher against {@code element} and collects the extracted values.
 *
 * <p>For each matcher that accepts the element, the entry's key is mapped to:
 *
 * <ul>
 *   <li>{@code matchers}: the decoded text of the element;
 *   <li>{@code textMatchers}: the decoded outer html of the element's next sibling, if any;
 *   <li>{@code subtextMatchers}: the decoded outer html of the element's first text node, if
 *       any;
 *   <li>{@code htmlMatchers}: the element's inner html;
 *   <li>{@code ptextMatchers}: the plain text produced by {@code plainTextFormatter};
 *   <li>{@code attrMatchers}: the value of the attribute named in the entry.
 * </ul>
 *
 * <p>The groups are processed in that order, so a later group overwrites an earlier value
 * stored under the same key.
 *
 * @param element the element to test
 * @return the extracted values by key; empty when no matcher accepts the element
 */
public Map<String, String> attempt(Element element) {
  Map<String, String> attributes = new HashMap<String, String>();

  for (Entry<String, Matcher> entry : matchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), decode(element.text()));
    }
  }

  for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      Node textNode = element.nextSibling();
      if (null != textNode) {
        attributes.put(entry.getKey(), decode(textNode.outerHtml()));
      }
    }
  }

  for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      // An element without text nodes yields an empty list; get(0) on it would throw instead
      // of returning null, so check for emptiness before taking the first entry.
      if (!element.textNodes().isEmpty()) {
        TextNode textNode = element.textNodes().get(0);
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }
  }

  for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), element.html());
    }
  }

  for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
    if (entry.getValue().test(element)) {
      attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
    }
  }

  for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
    // Each value is a pair: [0] the matcher, [1] the name of the attribute to read.
    Object[] objects = entry.getValue();
    Matcher matcher = (Matcher) objects[0];
    String attr = (String) objects[1];
    if (matcher.test(element)) {
      attributes.put(entry.getKey(), element.attr(attr));
    }
  }

  return attributes;
}
/**
 * Collects up to five words from {@code nextSib} and the siblings that follow it.
 *
 * <p>The text returned by {@code getData} for each sibling is split on single spaces. A word
 * that is not purely ASCII letters and digits has all other characters stripped; every word is
 * lower-cased. Collection stops after five words or as soon as {@code getData} returns
 * {@code null}.
 *
 * @param nextSib the first sibling to read, may be null
 * @return the collected words in reading order, or {@code null} if {@code nextSib} is null
 */
public List<String> getNextNeighbors(Node nextSib) {
  if (nextSib == null) {
    return null;
  }

  StringBuilder collected = new StringBuilder();
  int taken = 0;
  Node current = nextSib;

  while (taken < 5) {
    String data = getData(current);
    if (data == null) {
      break;
    }
    for (String token : data.split(" ")) {
      if (taken == 5) {
        break;
      }
      String word =
          token.matches("^[a-zA-Z0-9]+$")
              ? token
              : token.replaceAll("[^\\p{Alpha}\\p{Digit}]+", "");
      collected.append(word.toLowerCase()).append(" ");
      taken++;
    }
    current = current.nextSibling();
  }

  List<String> retList = new ArrayList<String>();
  String[] words = collected.toString().split(" ");
  for (int i = 0; i < words.length; i++) {
    retList.add(words[i]);
  }
  return retList;
}
/**
 * Inserts the given node into the DOM directly after this node, so that it becomes this node's
 * next sibling.
 *
 * <p>Both the given node and this node's parent must be non-null.
 *
 * @param node the node to place after this node
 * @return this node, for chaining
 * @see #before(Node)
 */
public Node after(Node node) {
  Validate.notNull(node);
  Validate.notNull(parentNode);

  final int insertAt = siblingIndex() + 1;
  parentNode.addChildren(insertAt, node);
  return this;
}
/**
 * Collects up to five words from {@code prevSib} and the siblings that precede it, walking
 * backwards.
 *
 * <p>The text returned by {@code getData} for each sibling is split on single spaces and read
 * from its last word to its first. A word that is not purely ASCII letters and digits has all
 * other characters stripped; every word is lower-cased. Collection stops after five words or
 * as soon as {@code getData} returns {@code null}.
 *
 * @param prevSib the first sibling to read, may be null
 * @return the collected words, nearest first, or {@code null} if {@code prevSib} is null
 */
public List<String> getPrevNeighbors(Node prevSib) {
  if (prevSib == null) {
    return null;
  }

  StringBuilder collected = new StringBuilder();
  int taken = 0;
  Node current = prevSib;

  while (taken < 5) {
    String data = getData(current);
    if (data == null) {
      break;
    }
    String[] tokens = data.split(" ");
    // Read the tokens back to front so the words closest to the start node come first.
    int position = tokens.length - 1;
    while (position >= 0 && taken != 5) {
      String token = tokens[position];
      String word =
          token.matches("^[a-zA-Z0-9]+$")
              ? token
              : token.replaceAll("[^\\p{Alpha}\\p{Digit}]+", "");
      collected.append(word.toLowerCase()).append(" ");
      taken++;
      position--;
    }
    current = current.previousSibling();
  }

  List<String> retList = new ArrayList<String>();
  String[] words = collected.toString().split(" ");
  for (int i = 0; i < words.length; i++) {
    retList.add(words[i]);
  }
  return retList;
}
@Override public int hashCode() { int result = parentNode != null ? parentNode.hashCode() : 0; // not children, or will block stack as they go back up to parent) result = 31 * result + (attributes != null ? attributes.hashCode() : 0); return result; }
private void checkUriConsistency(PageCompilingContext pc, Node element) { String uriAttrib = element.attr("action"); if (null == uriAttrib) uriAttrib = element.attr("src"); if (null == uriAttrib) uriAttrib = element.attr("href"); if (null != uriAttrib) { // Verify that such a uri exists in the page book, // only if it is contextual--ignore abs & relative URIs. if (uriAttrib.startsWith("/")) if (null == pageBook.nonCompilingGet(uriAttrib)) pc.warnings.add( CompileError.in(element.outerHtml()) .near(element.siblingIndex()) // TODO - line number .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION, uriAttrib)); } }
/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into this node's parent at
 * position {@code index}.
 *
 * <p>The parent is used as the parsing context when it is an {@link Element}; otherwise no
 * context is passed.
 *
 * @param index position in the parent's children at which to insert
 * @param html markup to parse; must not be null
 */
private void addSiblingHtml(int index, String html) {
  Validate.notNull(html);
  Validate.notNull(parentNode);

  Element context = null;
  if (parent() instanceof Element) {
    context = (Element) parent();
  }

  List<Node> parsed = Parser.parseFragment(html, context, baseUri());
  Node[] toInsert = parsed.toArray(new Node[parsed.size()]);
  parentNode.addChildren(index, toInsert);
}
/** Called to push a new lexical scope onto the stack. */ private boolean lexicalClimb(PageCompilingContext pc, Node node) { if (node.attr(ANNOTATION).length() > 1) { // Setup a new lexical scope (symbol table changes on each scope encountered). if (REPEAT_WIDGET.equalsIgnoreCase(node.attr(ANNOTATION_KEY)) || CHOOSE_WIDGET.equalsIgnoreCase(node.attr(ANNOTATION_KEY))) { String[] keyAndContent = {node.attr(ANNOTATION_KEY), node.attr(ANNOTATION_CONTENT)}; pc.lexicalScopes.push(new MvelEvaluatorCompiler(parseRepeatScope(pc, keyAndContent, node))); return true; } // Setup a new lexical scope for compiling against embedded pages (closures). final PageBook.Page embed = pageBook.forName(node.attr(ANNOTATION_KEY)); if (null != embed) { final Class<?> embedClass = embed.pageClass(); MvelEvaluatorCompiler compiler = new MvelEvaluatorCompiler(embedClass); checkEmbedAgainst( pc, compiler, Parsing.toBindMap(node.attr(ANNOTATION_CONTENT)), embedClass, node); pc.lexicalScopes.push(compiler); return true; } } return false; }
// hit when the node is first seen public void head(Node node, int depth) { String name = node.nodeName(); if (node instanceof TextNode) { append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. } else if (name.equals("li")) append("<li>"); else if (name.equals("ul")) { append("<ul>"); } }
/**
 * Strips everything matched by {@code hiddenMarker} from the {@code class} attribute of
 * {@code node} and of each of its ancestors. When one of these nodes is a {@code select}, the
 * same is done for every node in its subtree.
 *
 * @param node the node to start from; {@code null} does nothing
 */
static void markVisible(Node node) {
  for (Node current = node; current != null; current = current.parent()) {
    if (current.nodeName().equals("select")) {
      current.traverse(
          new NodeVisitor() {
            @Override
            public void head(Node n, int d) {
              n.attr("class", hiddenMarker.matcher(n.attr("class")).replaceAll(""));
            }

            @Override
            public void tail(Node n, int d) {}
          });
    }

    String cleanedClass = hiddenMarker.matcher(current.attr("class")).replaceAll("");
    current.attr("class", cleanedClass);
  }
}
/**
 * Decides whether {@code node} is a content node.
 *
 * <p>When {@code matchParent} is given, the node itself must match it. Otherwise, when
 * {@code matchResult} is given, at least one direct child must match it. With neither, the node
 * name is compared, ignoring case, against the entries of {@code content}.
 *
 * @param node the node to classify
 * @param matchResult description one of the node's children has to match, or null
 * @param matchParent description the node itself has to match, or null
 * @return whether the node is a content node
 */
public static boolean isContent(Node node, HtmlNode matchResult, HtmlNode matchParent) {
  if (matchParent != null) {
    return matches(matchParent, node);
  }

  if (matchResult != null) {
    boolean anyChildMatches = false;
    for (Node child : node.childNodes()) {
      if (matches(matchResult, child)) {
        anyChildMatches = true;
        break;
      }
    }
    return anyChildMatches;
  }

  String nodeName = node.nodeName();
  for (String contentName : content) {
    if (nodeName.equalsIgnoreCase(contentName)) {
      return true;
    }
  }
  return false;
}
/**
 * Drops this node from the DOM while keeping its children: the children are inserted into this
 * node's parent at this node's position, then this node is removed.
 *
 * <p>Example, given the html:<br>
 * {@code <div>One <span>Two <b>Three</b></span></div>}<br>
 * calling {@code element.unwrap()} on the {@code span} element produces:<br>
 * {@code <div>One Two <b>Three</b></div>}<br>
 * and returns the {@code "Two "} {@link TextNode}.
 *
 * <p>This node must have a parent.
 *
 * @return the node that was this node's first child before unwrapping, or null if it had no
 *     children
 * @see #remove()
 * @see #wrap(String)
 */
public Node unwrap() {
  Validate.notNull(parentNode);

  // Remember the first child before the children are moved to the parent.
  Node firstChild = childNodes.isEmpty() ? null : childNodes.get(0);

  parentNode.addChildren(siblingIndex, this.childNodesAsArray());
  this.remove();

  return firstChild;
}