コード例 #1
0
  // @todo(dallison) handle rtl text
  /**
   * Appends the readable text of {@code node} and its descendants to {@code builder}, depth
   * first.
   *
   * <p>A text node contributes its text. Any other node contributes the value of its
   * {@code TITLE} attribute and then its {@code ALT_TEXT} attribute (each only when present),
   * followed by whatever its children contribute. Every appended piece is followed by one space.
   *
   * @param node the node to walk
   * @param builder the accumulator that receives the text
   */
  private static void recurseNode(Node node, StringBuilder builder) {
    // Base case: a text node is appended as-is and has nothing to recurse into.
    if (node instanceof TextNode) {
      TextNode textNode = (TextNode) node;
      builder.append(textNode.text()).append(" ");
      return;
    }

    // Title text comes first, then alt text; the trailing space separates elements.
    if (node.hasAttr(TITLE)) {
      builder.append(node.attr(TITLE)).append(" ");
    }
    if (node.hasAttr(ALT_TEXT)) {
      builder.append(node.attr(ALT_TEXT)).append(" ");
    }

    for (Node descendant : node.childNodes()) {
      recurseNode(descendant, builder);
    }
  }
コード例 #2
0
  /**
   * Collects a clone of every {@code img} element found under the given element.
   *
   * <p>Each image is adjusted in place before it is cloned: a {@code float-left} image gains the
   * {@code alignleft} class and a {@code float-right} image gains {@code alignright} (unless
   * already present), and every image except the first loses its {@code width} and
   * {@code height} attributes.
   *
   * @param aInContent the element to search
   * @return clones of the image nodes, in the order they were found; empty if there are none
   */
  private static Collection<Node> extractImageNodes(Element aInContent) {
    Collection<Node> lResult = new LinkedList<>();

    boolean lIsFirst = true;
    for (Element lImage : aInContent.getElementsByTag("img")) {
      // float-left takes precedence over float-right, exactly one alignment class is considered.
      String lAlignClass = null;
      if (lImage.hasClass("float-left")) {
        lAlignClass = "alignleft";
      } else if (lImage.hasClass("float-right")) {
        lAlignClass = "alignright";
      }
      if (lAlignClass != null && !lImage.hasClass(lAlignClass)) {
        lImage.addClass(lAlignClass);
      }

      // Only the first image keeps its explicit dimensions.
      if (!lIsFirst) {
        lImage.removeAttr("width");
        lImage.removeAttr("height");
      }
      lIsFirst = false;

      lResult.add(toNode(lImage).clone());
    }

    return lResult;
  }
コード例 #3
0
ファイル: Cleaner.java プロジェクト: haggisandchips/jsoup
    /**
     * Visits a source node and copies it under {@code destination} when it is allowed.
     *
     * <p>Elements with a safe tag are re-created via {@code createSafeElement}, appended, and
     * become the new {@code destination}. Text nodes are always copied. Data nodes are copied
     * only when their parent's node name is a safe tag. Everything else increments
     * {@code numDiscarded}, except an unsafe element that is the {@code root} itself.
     *
     * @param source the node being visited
     * @param depth the traversal depth (unused here)
     */
    public void head(Node source, int depth) {
      if (source instanceof Element) {
        Element sourceEl = (Element) source;

        if (!whitelist.isSafeTag(sourceEl.tagName())) {
          // Not a safe tag, so it is not added; the root is not counted as discarded.
          if (source != root) {
            numDiscarded++;
          }
          return;
        }

        // Safe: clone the element with only its safe attributes and descend into it.
        ElementMeta meta = createSafeElement(sourceEl);
        Element destChild = meta.el;
        destination.appendChild(destChild);

        numDiscarded += meta.numAttribsDiscarded;
        destination = destChild;
        return;
      }

      if (source instanceof TextNode) {
        TextNode sourceText = (TextNode) source;
        destination.appendChild(new TextNode(sourceText.getWholeText(), source.baseUri()));
        return;
      }

      if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        DataNode sourceData = (DataNode) source;
        destination.appendChild(new DataNode(sourceData.getWholeData(), source.baseUri()));
        return;
      }

      // Comments, XML processing instructions, etc. are dropped.
      numDiscarded++;
    }
コード例 #4
0
ファイル: ParseLayoutTest.java プロジェクト: jmauix/vaadin
  /**
   * Serializes the design and verifies that the {@code package-mapping} meta tags in the HTML
   * head carry the expected prefix and package name, and that exactly one such tag exists.
   */
  @Test
  public void designIsSerializedWithCorrectPrefixesAndPackageNames() throws IOException {
    String[] expectedPrefixes = {"my"};
    String[] expectedPackageNames = {"com.addon.mypackage"};

    ByteArrayOutputStream out = serializeDesign(ctx);
    Document doc = Jsoup.parse(out.toString("UTF-8"));

    // Walk the head and check every package-mapping meta tag ("prefix:package") in order.
    int matched = 0;
    for (Node child : doc.head().childNodes()) {
      if (!"meta".equals(child.nodeName())) {
        continue;
      }
      if (!"package-mapping".equals(child.attributes().get("name"))) {
        continue;
      }
      String[] parts = child.attributes().get("content").split(":");
      assertEquals("Unexpected prefix.", expectedPrefixes[matched], parts[0]);
      assertEquals("Unexpected package name.", expectedPackageNames[matched], parts[1]);
      matched++;
    }
    assertEquals("Unexpected number of prefix - package name pairs.", 1, matched);
  }
コード例 #5
0
  /**
   * Ensures that embed bound properties are writable.
   *
   * <p>For every key of {@code properties}, asks the compiler whether the property is writable.
   * A non-writable property, or a property whose check fails with an
   * {@link ExpressionCompileException}, is recorded in {@code pc.errors}.
   *
   * @param pc the compiling context that collects errors
   * @param compiler the evaluator compiler used for the writability check
   * @param properties the bound properties; only the keys are inspected
   * @param embedClass the embedded page class, used in the error message
   * @param node the node the error is reported against
   */
  private void checkEmbedAgainst(
      PageCompilingContext pc,
      EvaluatorCompiler compiler,
      Map<String, String> properties,
      Class<?> embedClass,
      Node node) {

    // TODO also type check them against expressions
    for (String property : properties.keySet()) {
      try {
        if (compiler.isWritable(property)) {
          continue;
        }

        String message =
            String.format(
                "Property %s#%s was not writable. Did you forget to create "
                    + "a setter or @Visible annotation?",
                embedClass.getSimpleName(), property);
        pc.errors.add(
            CompileError.in(node.outerHtml())
                // TODO we need better line number detection if there is whitespace between the
                // annotation and tag.
                .near(node.siblingIndex() - 1) // TODO -  line number of the annotation
                .causedBy(CompileErrors.PROPERTY_NOT_WRITEABLE, message));
      } catch (ExpressionCompileException ece) {
        pc.errors.add(
            CompileError.in(node.outerHtml())
                .near(node.siblingIndex()) // TODO - line number
                .causedBy(CompileErrors.ERROR_COMPILING_PROPERTY));
      }
    }
  }
コード例 #6
0
ファイル: Node.java プロジェクト: saeg/experiments
 /**
  * Detaches {@code out} from this node's child list and clears its parent reference.
  *
  * @param out the child to remove; must currently have this node as its parent
  */
 protected void removeChild(Node out) {
   Validate.isTrue(out.parentNode == this);

   int position = out.siblingIndex();
   childNodes.remove(position);
   // Refresh the remaining children's indices after the removal.
   reindexChildren();

   out.parentNode = null;
 }
コード例 #7
0
  /**
   * Returns the child list of the given node's parent.
   *
   * @param node the node whose siblings are wanted; must not be null
   * @return whatever {@code parent.childNodes()} returns, or {@code null} when the node has no
   *     parent
   */
  public List<Node> findSiblings(Node node) {
    Preconditions.checkNotNull(node);

    Node parent = node.parent();
    return parent == null ? null : parent.childNodes();
  }
コード例 #8
0
  /**
   * Tells whether the node looks like an unlikely candidate, judged by its {@code class} and
   * {@code style} attributes.
   *
   * @param e the node to inspect
   * @return {@code true} if the class attribute contains "caption" (case-insensitively), or if
   *     {@code unlikelyPattern} finds a match in the style or class attribute
   */
  boolean unlikely(Node e) {
    String clazz = e.attr("class");
    // Locale.ROOT keeps the comparison locale-independent: with the default locale a Turkish
    // environment lower-cases "I" to a dotless "ı", so "CAPTION" would never match "caption".
    if (clazz != null && clazz.toLowerCase(Locale.ROOT).contains("caption")) {
      return true;
    }

    String style = e.attr("style");
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
  }
コード例 #9
0
ファイル: Node.java プロジェクト: saeg/experiments
 /**
  * Appends the given nodes to the end of this node's child list, in order.
  *
  * <p>Each node is reparented to this node and given the index of the slot it was appended to.
  *
  * @param children the nodes to append
  */
 protected void addChildren(Node... children) {
   // Most used variant: appending avoids the indexed addChildren path, which (per the original
   // note) has to reindex children and copy arrays.
   for (int i = 0; i < children.length; i++) {
     Node appended = children[i];
     reparentChild(appended);
     childNodes.add(appended);
     appended.setSiblingIndex(childNodes.size() - 1);
   }
 }
コード例 #10
0
 /**
  * Looks up the given element in its parent's child list, by sibling index.
  *
  * @param aInElement the element to look up; its parent must not be null
  * @return the parent's child node at the element's sibling index
  * @throws RuntimeException if that node's name differs from the element's tag name
  */
 private static Node toNode(Element aInElement) {
   Node lCandidate = aInElement.parent().childNode(aInElement.siblingIndex());

   String lNodeName = lCandidate.nodeName();
   String lTagName = aInElement.tagName();
   if (lNodeName.equals(lTagName)) {
     return lCandidate;
   }
   throw new RuntimeException(lNodeName + " != " + lTagName);
 }
コード例 #11
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
 /**
  * Decides whether a node counts as empty.
  *
  * <p>A node is empty when it is {@code null}; when it is a comment, data, {@code style} or
  * {@code script} node; when {@code isHidden} reports it hidden; when filtering is requested and
  * {@code isFiltered} matches it; or when it is a text node whose string form is empty according
  * to {@code CommonUtil.isEmpty}.
  *
  * @param node the node to test, may be null
  * @param doFilter whether to also apply the {@code isFiltered} check
  * @return {@code true} if the node is considered empty
  */
 public static boolean isEmpty(Node node, boolean doFilter) {
   if (node == null) {
     return true;
   }

   String name = node.nodeName();
   if (name.equals("#comment")
       || name.equals("#data")
       || name.equals("style")
       || name.equals("script")) {
     return true;
   }
   if (isHidden(node)) {
     return true;
   }
   if (doFilter && isFiltered(node)) {
     return true;
   }
   return name.equals("#text") && CommonUtil.isEmpty(node.toString(), true);
 }
コード例 #12
0
ファイル: ScanHtml.java プロジェクト: InnovTeam/HtmlConverter
 /**
  * Recursively removes every comment node from the subtree below {@code node}.
  *
  * @param node the root of the subtree to clean
  */
 private static void removeComments(Node node) {
   int position = 0;
   while (position < node.childNodes().size()) {
     Node child = node.childNode(position);
     if (child.nodeName().equals("#comment")) {
       // Removing shifts the following siblings down, so the position stays where it is.
       child.remove();
       continue;
     }
     removeComments(child);
     position++;
   }
 }
コード例 #13
0
 /**
  * Builds a simplified copy of the given element.
  *
  * <p>{@code img}, {@code a} and {@code div} elements are re-tagged as {@code x}, and {@code em}
  * as {@code b}; any other element keeps its tag. Images are replaced by their {@code src}, and
  * links by a text that depends on their class (user mention, video with thumbnail, or the
  * {@code href}); links with class {@code post} or starting with {@code postimg} keep their
  * children instead. When no replacement text applies, the children are copied: element
  * children recursively, everything else as a text node holding the child's string form.
  *
  * @param el the element to simplify
  * @return a new element; {@code el} itself is not modified
  */
 private Element cleanupElement(Element el) {
   String name = el.nodeName();
   Tag newTag = null;
   String newText = null;

   if (name.equals("img")) {
     newTag = Tag.valueOf("x");
     newText = el.attr("src");
   } else if (name.equals("em")) {
     newTag = Tag.valueOf("b");
   } else if (name.equals("a")) {
     // Every link becomes an <x>; only the replacement text depends on the class.
     newTag = Tag.valueOf("x");
     String clazz = el.attr("class");
     if (clazz.equals("user")) {
       newText = "@" + el.text().trim();
     } else if (clazz.startsWith("postimg video")) {
       newText = "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src");
     } else if (!clazz.startsWith("postimg") && !clazz.equals("post")) {
       newText = el.attr("href");
     }
   } else if (name.equals("div")) {
     newTag = Tag.valueOf("x");
   }

   Element nel = new Element(newTag != null ? newTag : el.tag(), "");

   if (newText != null) {
     nel.appendChild(new TextNode(newText, ""));
     return nel;
   }

   for (Node child : el.childNodes()) {
     if (child instanceof Element) {
       nel.appendChild(cleanupElement((Element) child));
     } else {
       nel.appendChild(new TextNode(child.toString(), ""));
     }
   }
   return nel;
 }
コード例 #14
0
ファイル: Node.java プロジェクト: saeg/experiments
  /**
   * Replaces the child {@code out} of this node with {@code in}, keeping the same position.
   *
   * <p>If {@code in} already has a parent it is first removed from that parent. Afterwards
   * {@code in} has this node as its parent and {@code out} has none.
   *
   * @param out the child to replace; must currently have this node as its parent
   * @param in the node to put in its place; must not be null
   */
  protected void replaceChild(Node out, Node in) {
    Validate.isTrue(out.parentNode == this);
    Validate.notNull(in);
    if (in.parentNode != null) {
      in.parentNode.removeChild(in);
    }

    // Read the slot only after detaching `in`: if `in` was a sibling of `out`, its removal may
    // have changed out's index. A primitive int avoids needless boxing of the index.
    final int index = out.siblingIndex();
    childNodes.set(index, in);
    in.parentNode = this;
    in.setSiblingIndex(index);
    out.parentNode = null;
  }
コード例 #15
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
 /**
  * Counts how many levels up the nearest ancestor is whose node name
  * {@code NodeUtil.isProximityBlock} accepts.
  *
  * @param node the node to start from; the node itself is not considered
  * @return 1 for the parent, 2 for the grandparent, and so on; {@code Integer.MAX_VALUE} when no
  *     ancestor qualifies
  */
 public static int nearestBlock(Node node) {
   int distance = 1;
   for (Node ancestor = node.parent(); ancestor != null; ancestor = ancestor.parent()) {
     if (NodeUtil.isProximityBlock(ancestor.nodeName())) {
       return distance;
     }
     distance++;
   }
   return Integer.MAX_VALUE;
 }
コード例 #16
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
 /**
  * Tests whether a node matches the given reference description.
  *
  * <p>A non-empty reference {@code id} decides the result on its own, and so does a non-empty
  * reference {@code name} (both compared case-insensitively). Otherwise the tag name, several
  * attributes, the stripped inner text (for elements) and the class names are compared. When
  * {@code reference.any} is set, a single agreeing property is enough; otherwise every non-empty
  * reference property must agree and every class of the node must be one of the reference
  * classes.
  *
  * @param reference the description to match against
  * @param test the node to test, may be null
  * @return {@code true} if the node matches; always {@code false} for a null node
  */
 public static boolean matches(HtmlNode reference, Node test) {
   if (test == null) {
     return false;
   }
   if (!CommonUtil.isEmpty(reference.id)) {
     return reference.id.equalsIgnoreCase(test.attr("id"));
   }
   if (!CommonUtil.isEmpty(reference.name)) {
     return reference.name.equalsIgnoreCase(test.attr("name"));
   }

   // Each pair is {expected value from the reference, actual value from the node}.
   List<String[]> pairs = new ArrayList<String[]>();
   pairs.add(new String[] {reference.tagName, test.nodeName()});
   pairs.add(new String[] {reference.type, test.attr("type")});
   pairs.add(new String[] {reference.value, test.attr("value")});
   pairs.add(new String[] {reference.title, test.attr("title")});
   pairs.add(new String[] {reference.role, test.attr("role")});
   pairs.add(new String[] {reference.alt, test.attr("alt")});
   pairs.add(new String[] {reference.href, test.attr("href")});
   if (test instanceof Element) {
     String expectedText = CommonUtil.strip(reference.innerText, false);
     String actualText = CommonUtil.strip(((Element) test).text(), false);
     pairs.add(new String[] {expectedText, actualText});
   }

   String joinedRefClasses = CommonUtil.toString(reference.classes, " ");
   Collection<String> refClasses =
       new HashSet<String>(Arrays.asList(joinedRefClasses.toLowerCase().split("\\s")));
   Collection<String> testClasses =
       new HashSet<String>(Arrays.asList(test.attr("class").toLowerCase().split("\\s")));

   final boolean anyMode = reference.any;

   for (String[] pair : pairs) {
     String expected = pair[0];
     if (CommonUtil.isEmpty(expected)) {
       // The reference does not constrain this property.
       continue;
     }
     boolean same = expected.equalsIgnoreCase(pair[1]);
     if (anyMode && same) {
       return true;
     }
     if (!anyMode && !same) {
       return false;
     }
   }

   if (!refClasses.isEmpty()) {
     for (String testClass : testClasses) {
       boolean known = refClasses.contains(testClass);
       if (anyMode && known) {
         return true;
       }
       if (!anyMode && !known) {
         return false;
       }
     }
   }

   // Nothing agreed in "any" mode, or nothing disagreed in "all" mode.
   return !anyMode;
 }
コード例 #17
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
 /**
  * Decides whether a node is an item.
  *
  * <p>With a {@code matchResult} the node itself must match it. Otherwise, with a
  * {@code matchParent}, the node's parent must exist and match it. With neither, the node name
  * is compared case-insensitively against the {@code items} names.
  *
  * @param node the node to test
  * @param matchResult optional description the node must match, may be null
  * @param matchParent optional description the node's parent must match, may be null
  * @return {@code true} if the node qualifies as an item
  */
 public static boolean isItem(Node node, HtmlNode matchResult, HtmlNode matchParent) {
   if (matchResult != null) {
     return matches(matchResult, node);
   }
   if (matchParent != null) {
     Node parent = node.parent();
     return parent != null && matches(matchParent, parent);
   }

   String name = node.nodeName();
   for (String itemName : items) {
     if (name.equalsIgnoreCase(itemName)) {
       return true;
     }
   }
   return false;
 }
コード例 #18
0
  /**
   * Serializes a node back to markup, leaving out attributes whose key starts with an
   * underscore.
   *
   * <p>Elements are written tag by tag with their children serialized recursively; an element
   * without children whose tag reports {@code isEmpty()} is written in self-closing form. Text
   * nodes yield their whole text, comments yield nothing, XML declarations without children
   * yield nothing, childless data nodes yield their {@code data} attribute, and anything else
   * falls back to {@code outerHtml()}.
   *
   * @param node the node to serialize
   * @return the cleaned markup, possibly empty
   */
  private static String cleanHtml(final Node node) {
    if (node instanceof Element) {
      Element element = (Element) node;
      String tagName = element.tagName();

      StringBuilder html = new StringBuilder();
      html.append("<").append(tagName);
      for (Attribute attribute : element.attributes()) {
        String key = attribute.getKey();
        if (key.startsWith("_")) {
          continue;
        }
        html.append(" ").append(key).append("=\"").append(attribute.getValue()).append('"');
      }

      boolean selfClosing = element.childNodes().isEmpty() && element.tag().isEmpty();
      if (selfClosing) {
        return html.append(" />").toString();
      }

      html.append(">");
      for (Node child : element.childNodes()) {
        html.append(cleanHtml(child));
      }
      return html.append("</").append(tagName).append(">").toString();
    }

    if (node instanceof TextNode) {
      return ((TextNode) node).getWholeText();
    }

    if (node instanceof XmlDeclaration) {
      // HACK
      return node.childNodes().isEmpty() ? "" : node.outerHtml();
    }

    if (node instanceof Comment) {
      // HACK: elide comments for now.
      return "";
    }

    if (node instanceof DataNode && node.childNodes().isEmpty()) {
      // No child nodes are defined but we have to handle content if such exists, example
      // <script language="JavaScript">var a =  { name: "${user.name}"}</script>
      String content = node.attr("data");
      return Strings.empty(content) ? "" : content;
    }

    return node.outerHtml();
  }
コード例 #19
0
ファイル: AttributeCatcher.java プロジェクト: letorn/crawler
  /**
   * Runs every configured matcher against the element and collects the values they extract.
   *
   * <p>Each matcher group extracts a different value when its matcher accepts the element: the
   * decoded element text, the decoded outer HTML of the next sibling, the decoded outer HTML of
   * the element's first text node, the element's inner HTML, its plain-text rendering, or the
   * value of a named attribute. Results are keyed by the matcher's map key; a later group
   * overwrites an earlier one that uses the same key.
   *
   * @param element the element to test
   * @return the extracted values by key; empty if no matcher accepted the element
   */
  public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();
    for (Entry<String, Matcher> entry : matchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), decode(element.text()));
      }
    }

    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        Node textNode = element.nextSibling();
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }

    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
      // An element without text nodes has nothing to extract; calling get(0) on the empty list
      // would throw IndexOutOfBoundsException, so check for emptiness first.
      if (entry.getValue().test(element) && !element.textNodes().isEmpty()) {
        TextNode textNode = element.textNodes().get(0);
        attributes.put(entry.getKey(), decode(textNode.outerHtml()));
      }
    }

    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), element.html());
      }
    }

    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
      }
    }

    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
      // Each value is a {Matcher, attribute name} pair.
      Object[] objects = entry.getValue();
      Matcher matcher = (Matcher) objects[0];
      String attr = (String) objects[1];
      if (matcher.test(element)) {
        attributes.put(entry.getKey(), element.attr(attr));
      }
    }
    return attributes;
  }
コード例 #20
0
  /**
   * Collects up to five normalized words from the given node and its following siblings.
   *
   * <p>The text of each node comes from {@code getData} and is split on single spaces. A word
   * that is not purely ASCII alphanumeric has its non-alphanumeric characters stripped, and
   * every word is lower-cased. Collection stops after five words or when {@code getData}
   * returns null.
   *
   * @param nextSib the first node to read, may be null
   * @return the collected words in reading order, or {@code null} if {@code nextSib} is null
   */
  public List<String> getNextNeighbors(Node nextSib) {
    if (nextSib == null) {
      return null;
    }

    StringBuilder words = new StringBuilder();
    int taken = 0;
    Node current = nextSib;
    while (taken < 5) {
      String data = getData(current);
      if (data == null) {
        break;
      }
      for (String token : data.split(" ")) {
        if (taken == 5) {
          break;
        }
        String word =
            token.matches("^[a-zA-Z0-9]+$")
                ? token
                : token.replaceAll("[^\\p{Alpha}\\p{Digit}]+", "");
        words.append(word.toLowerCase()).append(" ");
        taken++;
      }
      current = current.nextSibling();
    }

    List<String> retList = new ArrayList<String>();
    for (String word : words.toString().split(" ")) {
      retList.add(word);
    }
    return retList;
  }
コード例 #21
0
ファイル: Node.java プロジェクト: saeg/experiments
  /**
   * Inserts the given node into the DOM directly after this node, as its next sibling.
   *
   * @param node the node to place after this one; must not be null
   * @return this node, so calls can be chained
   * @see #before(Node)
   */
  public Node after(Node node) {
    Validate.notNull(node);
    Validate.notNull(parentNode);

    int insertAt = siblingIndex() + 1;
    parentNode.addChildren(insertAt, node);
    return this;
  }
コード例 #22
0
  /**
   * Collects up to five normalized words from the given node and its preceding siblings,
   * walking backwards.
   *
   * <p>The text of each node comes from {@code getData} and is split on single spaces; the
   * words of a node are taken last to first. A word that is not purely ASCII alphanumeric has
   * its non-alphanumeric characters stripped, and every word is lower-cased. Collection stops
   * after five words or when {@code getData} returns null.
   *
   * @param prevSib the first node to read, may be null
   * @return the collected words, nearest first, or {@code null} if {@code prevSib} is null
   */
  public List<String> getPrevNeighbors(Node prevSib) {
    if (prevSib == null) {
      return null;
    }

    StringBuilder words = new StringBuilder();
    int taken = 0;
    Node current = prevSib;
    while (taken < 5) {
      String data = getData(current);
      if (data == null) {
        break;
      }
      String[] tokens = data.split(" ");
      int position = tokens.length - 1;
      while (position >= 0 && taken != 5) {
        String token = tokens[position];
        String word =
            token.matches("^[a-zA-Z0-9]+$")
                ? token
                : token.replaceAll("[^\\p{Alpha}\\p{Digit}]+", "");
        words.append(word.toLowerCase()).append(" ");
        taken++;
        position--;
      }
      current = current.previousSibling();
    }

    List<String> retList = new ArrayList<String>();
    for (String word : words.toString().split(" ")) {
      retList.add(word);
    }
    return retList;
  }
コード例 #23
0
ファイル: Node.java プロジェクト: saeg/experiments
 /**
  * Computes the hash from the parent node and the attributes only.
  *
  * <p>Children are deliberately left out: per the original note, they would recurse back up to
  * the parent.
  */
 @Override
 public int hashCode() {
   int parentHash = parentNode == null ? 0 : parentNode.hashCode();
   int attributesHash = attributes == null ? 0 : attributes.hashCode();
   return 31 * parentHash + attributesHash;
 }
コード例 #24
0
  /**
   * Warns when an element points at a contextual URI that the page book does not know.
   *
   * <p>The URI is taken from the first attribute the element actually has, in the order
   * {@code action}, {@code src}, {@code href}. Only URIs starting with "/" are checked.
   *
   * @param pc the compiling context that collects warnings
   * @param element the element whose URI attribute is checked
   */
  private void checkUriConsistency(PageCompilingContext pc, Node element) {
    // attr() yields an empty string rather than null for a missing attribute, so a null check
    // never falls through to the next candidate; pick the attribute with hasAttr() instead.
    String uriAttrib = null;
    for (String candidate : new String[] {"action", "src", "href"}) {
      if (element.hasAttr(candidate)) {
        uriAttrib = element.attr(candidate);
        break;
      }
    }

    if (null == uriAttrib) {
      return;
    }

    // Verify that such a uri exists in the page book,
    // only if it is contextual--ignore abs & relative URIs.
    if (uriAttrib.startsWith("/") && null == pageBook.nonCompilingGet(uriAttrib)) {
      pc.warnings.add(
          CompileError.in(element.outerHtml())
              .near(element.siblingIndex()) // TODO - line number
              .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION, uriAttrib));
    }
  }
コード例 #25
0
ファイル: Node.java プロジェクト: saeg/experiments
  /**
   * Parses an HTML fragment and inserts the resulting nodes into this node's parent at the
   * given child index.
   *
   * <p>The fragment is parsed in the context of the parent when the parent is an element, and
   * without a context otherwise.
   *
   * @param index the child index in the parent at which to insert
   * @param html the markup to parse; must not be null
   */
  private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    Element context = null;
    if (parent() instanceof Element) {
      context = (Element) parent();
    }

    List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    Node[] parsedArray = parsed.toArray(new Node[parsed.size()]);
    parentNode.addChildren(index, parsedArray);
  }
コード例 #26
0
  /**
   * Called to push a new lexical scope onto the stack.
   *
   * <p>A scope is pushed when the node carries an annotation longer than one character and the
   * annotation key either names the repeat/choose widget or resolves to an embedded page in the
   * page book. For an embedded page the bound properties are also checked for writability.
   *
   * @param pc the compiling context holding the scope stack
   * @param node the node being compiled
   * @return {@code true} if a scope was pushed, {@code false} otherwise
   */
  private boolean lexicalClimb(PageCompilingContext pc, Node node) {
    if (node.attr(ANNOTATION).length() <= 1) {
      return false;
    }

    String annotationKey = node.attr(ANNOTATION_KEY);

    // Setup a new lexical scope (symbol table changes on each scope encountered).
    boolean opensWidgetScope =
        REPEAT_WIDGET.equalsIgnoreCase(annotationKey)
            || CHOOSE_WIDGET.equalsIgnoreCase(annotationKey);
    if (opensWidgetScope) {
      String[] keyAndContent = {annotationKey, node.attr(ANNOTATION_CONTENT)};
      pc.lexicalScopes.push(new MvelEvaluatorCompiler(parseRepeatScope(pc, keyAndContent, node)));
      return true;
    }

    // Setup a new lexical scope for compiling against embedded pages (closures).
    final PageBook.Page embed = pageBook.forName(annotationKey);
    if (null == embed) {
      return false;
    }

    final Class<?> embedClass = embed.pageClass();
    MvelEvaluatorCompiler compiler = new MvelEvaluatorCompiler(embedClass);
    checkEmbedAgainst(
        pc, compiler, Parsing.toBindMap(node.attr(ANNOTATION_CONTENT)), embedClass, node);

    pc.lexicalScopes.push(compiler);
    return true;
  }
コード例 #27
0
 /**
  * Called when a node is first seen: emits the node's text, or an opening {@code <li>} /
  * {@code <ul>} marker for list nodes. Other nodes emit nothing.
  *
  * @param node the node being visited
  * @param depth the traversal depth (unused here)
  */
 public void head(Node node, int depth) {
   String name = node.nodeName();

   if (node instanceof TextNode) {
     // TextNodes carry all user-readable text in the DOM.
     TextNode textNode = (TextNode) node;
     append(textNode.text());
     return;
   }

   if (name.equals("li")) {
     append("<li>");
   } else if (name.equals("ul")) {
     append("<ul>");
   }
 }
コード例 #28
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
  /**
   * Strips whatever {@code hiddenMarker} matches from the {@code class} attribute of the node
   * and of every ancestor.
   *
   * <p>For a {@code select} node the marker is also stripped from every node in its subtree.
   *
   * @param node the node to start from, may be null
   */
  static void markVisible(Node node) {
    // Walk from the node up to the root.
    for (Node current = node; current != null; current = current.parent()) {
      if (current.nodeName().equals("select")) {
        current.traverse(
            new NodeVisitor() {
              @Override
              public void head(Node n, int d) {
                n.attr("class", hiddenMarker.matcher(n.attr("class")).replaceAll(""));
              }

              @Override
              public void tail(Node n, int d) {}
            });
      }
      String cleanedClass = hiddenMarker.matcher(current.attr("class")).replaceAll("");
      current.attr("class", cleanedClass);
    }
  }
コード例 #29
0
ファイル: NodeUtil.java プロジェクト: sawantuday/ScreenSlicer
 /**
  * Decides whether a node is a content container.
  *
  * <p>With a {@code matchParent} the node itself must match it. Otherwise, with a
  * {@code matchResult}, at least one direct child must match it. With neither, the node name
  * is compared case-insensitively against the {@code content} names.
  *
  * @param node the node to test
  * @param matchResult optional description one of the node's children must match, may be null
  * @param matchParent optional description the node itself must match, may be null
  * @return {@code true} if the node qualifies as content
  */
 public static boolean isContent(Node node, HtmlNode matchResult, HtmlNode matchParent) {
   if (matchParent != null) {
     return matches(matchParent, node);
   }

   if (matchResult != null) {
     boolean anyChildMatches = false;
     for (Node child : node.childNodes()) {
       if (matches(matchResult, child)) {
         anyChildMatches = true;
         break;
       }
     }
     return anyChildMatches;
   }

   String name = node.nodeName();
   for (String contentName : content) {
     if (name.equalsIgnoreCase(contentName)) {
       return true;
     }
   }
   return false;
 }
コード例 #30
0
ファイル: Node.java プロジェクト: saeg/experiments
  /**
   * Removes this node from the DOM while keeping its children, which move up into this node's
   * parent at the position this node occupied.
   *
   * <p>For example, with the input html:<br>
   * {@code <div>One <span>Two <b>Three</b></span></div>}<br>
   * calling {@code element.unwrap()} on the {@code span} element results in:<br>
   * {@code <div>One Two <b>Three</b></div>}<br>
   * and returns the {@code "Two "} {@link TextNode}.
   *
   * @return the node that was this node's first child before unwrapping, or null if this node
   *     had no children
   * @see #remove()
   * @see #wrap(String)
   */
  public Node unwrap() {
    Validate.notNull(parentNode);

    // Capture both values up front: moving the children and removing this node changes them.
    int insertAt = siblingIndex;
    Node formerFirstChild = childNodes.isEmpty() ? null : childNodes.get(0);

    parentNode.addChildren(insertAt, this.childNodesAsArray());
    this.remove();

    return formerFirstChild;
  }