Example #1
    /**
     * Handles one {@code source} node, branching on whether it is an Element, a TextNode, a
     * DataNode, or anything else. {@code depth} is not read in the lines shown here.
     *
     * <p>NOTE(review): this listing looks truncated (some branch bodies are empty and the closing
     * braces are not visible), so the comments describe only the statements that are present.
     */
    public void head(Node source, int depth) {
      if (source instanceof Element) {
        Element sourceEl = (Element) source;

        if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
          ElementMeta meta = createSafeElement(sourceEl);
          Element destChild = meta.el;

          // Add the attribute discards reported by createSafeElement to the running total.
          numDiscarded += meta.numAttribsDiscarded;
          // The newly created element becomes the current destination.
          destination = destChild;
        } else if (source
            != root) { // not a safe tag, so don't add. don't count root against discarded.
      } else if (source instanceof TextNode) {
        TextNode sourceText = (TextNode) source;
        // Build a new TextNode from the source's whole text and the source's base URI.
        // NOTE(review): destText is never used in the visible lines; confirm where it is attached.
        TextNode destText = new TextNode(sourceText.getWholeText(), source.baseUri());
      } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        // Data nodes are only handled when their parent's node name is a safe tag.
        // NOTE(review): source.parent() is dereferenced without a null check; confirm a DataNode
        // always has a parent when this is called.
        DataNode sourceData = (DataNode) source;
        // NOTE(review): destData is likewise unused in the visible lines.
        DataNode destData = new DataNode(sourceData.getWholeData(), source.baseUri());
      } else { // else, we don't care about comments, xml proc instructions, etc
 /**
  * Recursively processes {@code element}: classifies it with {@code classifyElement}, passes its
  * text children to {@code append}, and recurses into its element children.
  *
  * <p>NOTE(review): several branch bodies are empty and closing braces are not visible in this
  * listing, so it appears truncated; comments cover only the statements shown.
  */
 private void recurse(Element element) {
   ElementAction action = classifyElement(element);
   // NOTE(review): the body of this Whitespace/Sentence check is not visible here.
   if (action == ElementAction.Whitespace || action == ElementAction.Sentence) {
   for (Node childNode : element.childNodes()) {
     // n.b., cdata not possible if we are coming from TagSoup. If we also handle
     // real xhtml by directly parsing it, then we have another story on our hands.
     // though we could use canonical XML to get rid of them.
     // Text children are appended unless the enclosing element was classified as Banned.
     if (childNode instanceof TextNode && action != ElementAction.Banned) {
       TextNode textContent = (TextNode) childNode;
       String textString = textContent.text();
       append(textContent, textString);
     } else if (childNode instanceof Element) {
       // Element children are recursed into regardless of this element's action.
       recurse((Element) childNode);
   // Per-action handling after the children; only the Mark branch shows any statement.
   if (action == ElementAction.Whitespace) {
   } else if (action == ElementAction.Sentence) {
   } else if (action == ElementAction.Mark) {
     Mark mark = new Mark();
Example #3
 /**
  * Returns the text of {@code tn}, passed through {@code ltrim} and/or {@code rtrim} depending
  * on the node's neighbours.
  *
  * <p>{@code ltrim} is applied when the previous sibling is a block, when there is no previous
  * sibling and the parent is a block, or (only if {@code normalText}) when the previous sibling
  * is a TextNode whose text matches {@code EMPTY_MATCHER}. {@code rtrim} is applied under the
  * mirrored conditions on the next sibling, and only if the text is still non-empty.
  *
  * <p>NOTE(review): closing braces are not visible in this listing; {@code ltrim}/{@code rtrim}
  * presumably strip leading/trailing whitespace, but they are defined elsewhere, so confirm.
  *
  * @param tn the text node to read
  * @param normalText if true, start from {@code tn.text()}; otherwise from
  *     {@code tn.getWholeText()}
  * @return the possibly trimmed text
  */
 private String getTextNodeText(TextNode tn, boolean normalText) {
   String input = normalText ? tn.text() : tn.getWholeText();
   Node prev = tn.previousSibling();
   Node next = tn.nextSibling();
   boolean parentIsBlock = isBlock(tn.parent());
   // NOTE(review): prev may be null here; isBlock is assumed to tolerate null, so confirm.
   if (isBlock(prev)) {
     input = ltrim(input);
   } else if (prev == null && parentIsBlock) {
     // First child of a block parent.
     input = ltrim(input);
   } else if (normalText && prev instanceof TextNode) {
     TextNode tprev = (TextNode) prev;
     // Previous text sibling matches EMPTY_MATCHER, so it is treated like a boundary.
     if (EMPTY_MATCHER.matcher(tprev.text()).matches()) {
       input = ltrim(input);
   // Only bother with the trailing side if anything is left after the leading trim.
   if (input.length() > 0) {
     if (isBlock(next)) {
       input = rtrim(input);
     } else if (next == null && parentIsBlock) {
       // Last child of a block parent.
       input = rtrim(input);
     } else if (normalText && next instanceof TextNode) {
       TextNode tnext = (TextNode) next;
       if (EMPTY_MATCHER.matcher(tnext.text()).matches()) {
         input = rtrim(input);
   return input;
Example #4
 /**
  * Checks how an unterminated comment is parsed: for {@code <p>Hello<!-- <tr><td>} the first
  * {@code p} element is expected to hold the text "Hello" followed by a Comment node whose data
  * is the remaining input {@code " <tr><td>"}.
  *
  * <p>NOTE(review): no test annotation or closing brace is visible in this listing.
  */
 public void parsesUnterminatedComments() {
   String html = "<p>Hello<!-- <tr><td>";
   Document doc = Jsoup.parse(html);
   Element p = doc.getElementsByTag("p").get(0);
   // The comment must not contribute to the element's text.
   assertEquals("Hello", p.text());
   // Child 0 is expected to be the text node, child 1 the comment.
   TextNode text = (TextNode) p.childNode(0);
   assertEquals("Hello", text.getWholeText());
   Comment comment = (Comment) p.childNode(1);
   assertEquals(" <tr><td>", comment.getData());
Example #5
  /**
   * Checks that markup inside an HTML comment is kept as comment data rather than parsed as
   * elements: the comment's data is expected to be {@code " <table><tr><td></table> "} and the
   * following {@code <p>} is expected to contain the text "Hello".
   *
   * <p>NOTE(review): no test annotation or closing brace is visible in this listing.
   */
  public void parsesComments() {
    String html =
        "<html><head></head><body><!-- <table><tr><td></table> --><p>Hello</p></body></html>";
    Document doc = Jsoup.parse(html);

    // NOTE(review): treats the document's child element at index 1 as the body; confirm that
    // index matches the parsed structure.
    Element body = doc.child(1);
    // Child node 0 of the body is expected to be the comment.
    Comment comment = (Comment) body.childNode(0);
    assertEquals(" <table><tr><td></table> ", comment.getData());
    // child(0) is cast-free and yields the <p>, so the comment is presumably not counted among
    // element children.
    Element p = body.child(0);
    TextNode text = (TextNode) p.childNode(0);
    assertEquals("Hello", text.getWholeText());
  /**
   * Walks the child nodes of {@code e}, counting whitespace-separated words of each text node on
   * {@code c} and, once the count exceeds {@code PAGE_COUNT}, inserting a marker element whose
   * tag name is built from {@code "pageid=" + c.getPgCount()}. Element children are processed
   * recursively with the same {@code c}.
   *
   * <p>NOTE(review): this listing is visibly truncated (an empty {@code for} body, unused
   * locals, no closing braces), so how the built {@code nodes} list is written back into the
   * document cannot be seen here.
   *
   * @param e element whose children are scanned
   * @param c running counter; incremented in place
   * @return a fresh Count with the same values when {@code e} has no children, otherwise
   *     {@code c} itself
   */
  private static Count modify(Element e, Count c) {
    List<Node> o = e.childNodes();
    // Nothing to scan: hand back a copy of the current counts.
    if (o.size() == 0 && e.textNodes().size() == 0) return new Count(c.getCount(), c.getPgCount());
    for (Node n : o) {
      if (n instanceof TextNode) {
        TextNode nd = (TextNode) n;
        // Split the trimmed text on single whitespace characters; consecutive whitespace yields
        // empty strings, which are filtered by the length check below.
        String[] arr = nd.text().trim().split("\\s");
        // NOTE(review): txt is only referenced from commented-out code in the visible lines.
        String txt = "";
        List<Node> nodes = new ArrayList<Node>();
        // j stays 0 in the lines shown, so every nodes.add(j, ...) inserts at the front.
        int j = 0;
        TextNode ndTemp = new TextNode("", " ");
        nodes.add(j, ndTemp);
        for (int i = 0; i < arr.length; i++) {
          if (arr[i].length() > 0) c.incrementCount();
          if (c.getCount() > PAGE_COUNT) {
            // Over the threshold: pad the current head node, then push a marker element and a new
            // text node holding this word onto the front of the list.
            // NOTE(review): nothing visible resets the count here; confirm where that happens.
            ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " ");
            nodes.add(j, new Element(Tag.valueOf("pageid=" + c.getPgCount()), ""));
            nodes.add(j, new TextNode(" " + arr[i] + " ", ""));
            // "<!--page id="+c.getPgCount()+ "--!>" + " " +  arr[i]);
            // txt = txt + " " + "<!--page id="+c.getPgCount()+ "--!>"  + " " + arr[i]; //<div
            // style='visibility:hidden'>Page="+pageCount+"</div>

          } else {
            // txt = txt + " " + arr[i];
            // Under the threshold: append the word to the text node at the head of the list.
            ((TextNode) nodes.get(j)).text(((TextNode) nodes.get(j)).text() + " " + arr[i]);
        // More than one node means at least one marker was inserted for this text node.
        if (nodes.size() > 1) {
          Element etemp = new Element(Tag.valueOf("span"), "");
          // NOTE(review): the loop body is not visible in this listing.
          for (Node d : nodes) {
        // nd.text(ndTemp.text());

      } else if (n instanceof Element) {
        // Recurse with the shared counter; the returned value is not used in the visible lines.
        Count ctemp = modify((Element) n, c);

    return c;
Example #7
  /**
   * Runs every configured matcher against {@code element} and collects one attribute per
   * matching entry, keyed by the entry's key. Each matcher map extracts a different value:
   * decoded element text, the next sibling's outer HTML, the first text node's outer HTML, the
   * element's inner HTML, formatted plain text, or a named attribute.
   *
   * <p>NOTE(review): closing braces are not visible in this listing, so loop and branch
   * boundaries are inferred from indentation.
   *
   * @param element the element to test
   * @return a new map of extracted values; empty if no matcher matched
   */
  public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();
    // Matchers whose value is the element's decoded text.
    for (Entry<String, Matcher> entry : matchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), decode(element.text()));

    // Matchers whose value is the outer HTML of the node following the element, if any.
    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        Node textNode = element.nextSibling();
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));

    // Matchers whose value is the outer HTML of the element's first direct text node.
    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        // NOTE(review): get(0) on an empty list throws IndexOutOfBoundsException rather than
        // returning null, so the null check below cannot guard against an element without text
        // nodes; an isEmpty() check before get(0) is probably what was intended.
        TextNode textNode = element.textNodes().get(0);
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));

    // Matchers whose value is the element's inner HTML, stored without decode().
    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), element.html());

    // Matchers whose value is produced by the plain-text formatter.
    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));

    // Attribute matchers: each value is a pair of [Matcher, attribute name].
    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
      Object[] objects = entry.getValue();
      Matcher matcher = (Matcher) objects[0];
      String attr = (String) objects[1];
      if (matcher.test(element)) {
        attributes.put(entry.getKey(), element.attr(attr));
    return attributes;
 /**
  * Recursively walks the children of {@code e}, skipping any child for which
  * {@code unlikely(child)} is true, and appends a single space to {@code accum} before
  * descending into block elements (when the buffer is non-empty and does not already end in
  * whitespace) or {@code br} elements.
  *
  * <p>NOTE(review): the text-node branch only reads the text in the visible lines; the code that
  * appends it to {@code accum}, and the closing braces, are not shown.
  *
  * @param e element whose children are visited
  * @param accum buffer the text is accumulated into
  */
 void appendTextSkipHidden(Element e, StringBuilder accum) {
   for (Node child : e.childNodes()) {
     // Filtered children are skipped together with their whole subtree.
     if (unlikely(child)) continue;
     if (child instanceof TextNode) {
       TextNode textNode = (TextNode) child;
       String txt = textNode.text();
     } else if (child instanceof Element) {
       Element element = (Element) child;
       // Separate block content from whatever precedes it, without doubling whitespace.
       if (accum.length() > 0 && element.isBlock() && !lastCharIsWhitespace(accum))
         accum.append(" ");
       // A <br> always contributes a space (unless the block case above already fired).
       else if (element.tagName().equals("br")) accum.append(" ");
       appendTextSkipHidden(element, accum);
 /**
  * Extracts the reply time from {@code textNode} by searching its text with
  * {@code PATTERN_REPLY_TIME} and taking capture group 1.
  *
  * <p>NOTE(review): the listing ends after the group is read, so how {@code time} is applied to
  * {@code topicBuilder} is not visible here.
  *
  * @param topicBuilder builder presumably receiving the parsed time (unused in the visible lines)
  * @param textNode node whose text contains the reply time
  * @throws FatalException if the pattern is not found in the text
  */
 private static void parseReplyTime(Topic.Builder topicBuilder, TextNode textNode) {
   final String text = textNode.text();
   final Matcher matcher = PATTERN_REPLY_TIME.matcher(text);
   // find() rather than matches(): the pattern may occur anywhere within the text.
   if (!matcher.find()) {
     throw new FatalException("match reply time for topic failed: " + text);
   final String time = matcher.group(1);
  /**
   * Walks the DOM recursively, and converts elements into corresponding sitebricks widgets.
   *
   * <p>Each child of {@code node} is handled by type: elements are walked first and then
   * widgetized with their children's chain, text nodes become text widgets, comments and data
   * nodes become raw text widgets, and XML declarations are handled in a final branch.
   *
   * <p>NOTE(review): this listing is visibly truncated: catch bodies are empty, one expression
   * is left dangling, a statement's first line is missing in the XmlDeclaration branch, and
   * closing braces are not shown. Comments describe only what is present.
   *
   * @param pc compiling context carrying the current form and the lexical scope stack
   * @param node node whose children are converted
   * @return the chain of widgets built for the children of {@code node}
   */
  private <N extends Node> WidgetChain walk(PageCompilingContext pc, N node) {
    WidgetChain widgetChain = Chains.proceeding();
    for (Node n : node.childNodes()) {
      if (n instanceof Element) {
        final Element child = (Element) n;

        // push form if this is a form tag
        if (child.tagName().equals("form")) pc.form = (Element) n;

        // setup a lexical scope if we're going into a repeat widget (by reading the previous node)
        final boolean shouldPopScope = lexicalClimb(pc, child);

        // continue recursing down, perform a post-order, depth-first traversal of the DOM
        WidgetChain childsChildren;
        try {
          childsChildren = walk(pc, child);

          // process the widget itself into a Renderable with child tree
          widgetChain.addWidget(widgetize(pc, child, childsChildren));
        } finally {
          // Always undo the climb, even if walking or widgetizing the child threw.
          lexicalDescend(pc, child, shouldPopScope);

      } else if (n instanceof TextNode) {
        TextNode child = (TextNode) n;
        Renderable textWidget;

        // setup a lexical scope if we're going into a repeat widget (by reading the previous node)
        final boolean shouldPopScope = lexicalClimb(pc, child);

        // construct the text widget
        try {
          textWidget = registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek());

          // if there are no annotations, add the text widget to the chain
          if (!child.hasAttr(ANNOTATION_KEY)) {
          } else {
            // construct a new widget chain for this text node
            WidgetChain childsChildren = Chains.proceeding().addWidget(textWidget);

            // make a new widget for the annotation, making the text chain the child
            String widgetName = child.attr(ANNOTATION_KEY).toLowerCase();
            Renderable annotationWidget =

        } catch (ExpressionCompileException e) {

        // Unlike the element branch, the scope is popped here directly and outside any finally.
        if (shouldPopScope) pc.lexicalScopes.pop();

      } else if ((n instanceof Comment) || (n instanceof DataNode)) {
        // process as raw text widget
        try {
          widgetChain.addWidget(registry.textWidget(cleanHtml(n), pc.lexicalScopes.peek()));
        } catch (ExpressionCompileException e) {

      } else if (n instanceof XmlDeclaration) {
        try {
                  ((XmlDeclaration) n).getWholeDeclaration(), pc.lexicalScopes.peek()));
        } catch (ExpressionCompileException e) {

    // return computed chain, or a terminal
    return widgetChain;
Example #11
  /**
   * Appends the whole text of {@code textNode} to {@code accum}: unchanged when
   * {@code preserveWhitespace} is true for the node's parent, otherwise through
   * {@code StringUtil.appendNormalisedWhitespace}.
   *
   * <p>NOTE(review): the method's closing brace is not visible in this listing.
   *
   * @param accum buffer to append to
   * @param textNode node supplying the text
   */
  private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
    String text = textNode.getWholeText();

    // The third argument tells the normaliser whether accum already ends in whitespace.
    if (preserveWhitespace(textNode.parentNode())) accum.append(text);
    else StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsWhitespace(accum));