Пример #1
0
  /**
   * Copies an attribute list into a map, leaving out every key contained in {@code SKIP_ATTR}.
   *
   * @param attributes the attribute list to read
   * @return a new, mutable {@link LinkedHashMap} with the remaining key/value pairs, in the order
   *     the attribute list yields them
   */
  public static Map<String, String> parseAttribs(Attributes attributes) {
    // Pre-size with a little headroom over the attribute count.
    final int initialCapacity = attributes.size() + 4;
    final Map<String, String> parsed = new LinkedHashMap<String, String>(initialCapacity);

    for (Attribute attribute : attributes.asList()) {
      final String key = attribute.getKey();
      if (SKIP_ATTR.contains(key)) {
        continue;
      }
      parsed.put(key, attribute.getValue());
    }

    return parsed;
  }
Пример #2
0
  /**
   * Recursively renders a node as markup.
   *
   * <ul>
   *   <li>Elements are written by hand: attributes whose key starts with {@code _} are left out,
   *       attribute values are written verbatim (no escaping), and an element with no child nodes
   *       whose tag reports {@code isEmpty()} is written in the self-closing {@code <tag />} form.
   *   <li>Text nodes contribute their whole text.
   *   <li>XML declarations without child nodes and all comments contribute nothing.
   *   <li>Data nodes without child nodes contribute the value of their {@code data} attribute.
   *   <li>Anything else falls back to {@code outerHtml()}.
   * </ul>
   *
   * @param node the node to render
   * @return the markup for {@code node}; may be the empty string
   */
  private static String cleanHtml(final Node node) {
    if (node instanceof Element) {
      final Element element = (Element) node;
      final String tagName = element.tagName();
      final StringBuilder html = new StringBuilder();

      html.append("<").append(tagName);
      for (Attribute attribute : element.attributes()) {
        final String key = attribute.getKey();
        if (key.startsWith("_")) {
          // Underscore-prefixed attributes are never emitted.
          continue;
        }
        html.append(" ").append(key).append("=\"").append(attribute.getValue()).append('"');
      }

      final boolean selfClosing = element.childNodes().isEmpty() && element.tag().isEmpty();
      if (selfClosing) {
        html.append(" />");
        return html.toString();
      }

      html.append(">");
      for (Node child : element.childNodes()) {
        html.append(cleanHtml(child));
      }
      html.append("</").append(tagName).append(">");
      return html.toString();
    }

    if (node instanceof TextNode) {
      return ((TextNode) node).getWholeText();
    }

    if (node instanceof XmlDeclaration) {
      // HACK: a declaration without child nodes is dropped entirely.
      return node.childNodes().isEmpty() ? "" : node.outerHtml();
    }

    if (node instanceof Comment) {
      // HACK: elide comments for now.
      return "";
    }

    if (node instanceof DataNode && node.childNodes().isEmpty()) {
      // No child nodes are defined, but raw content may still exist, for example
      // <script language="JavaScript">var a =  { name: "${user.name}"}</script>
      final String content = node.attr("data");
      return Strings.empty(content) ? "" : content;
    }

    return node.outerHtml();
  }
Пример #3
0
  /**
   * Parses the given HTML and removes {@code font-family:} and {@code font-size:} declarations
   * from every {@code style} attribute.
   *
   * <p>Each style value is trimmed and split on {@code ;}. Every piece that does not contain one of
   * the two property names (compared case-insensitively) is written back followed by {@code ;}.
   *
   * @param html the markup to process
   * @return the inner HTML of the parsed document's body after the style attributes were rewritten
   */
  @SuppressLint("DefaultLocale")
  private String improveHtml(final String html) {

    final Document parsed = Jsoup.parse(html);

    for (final Element element : parsed.getAllElements()) {
      if (!element.hasAttr("style")) {
        continue;
      }

      for (final Attribute attribute : element.attributes()) {
        if (!attribute.getKey().equals("style")) {
          continue;
        }

        final StringBuilder kept = new StringBuilder();
        for (final String declaration : attribute.getValue().trim().split(";")) {
          final String lowered = declaration.toLowerCase(Locale.ENGLISH);
          final boolean isFontRule =
              lowered.contains("font-family:") || lowered.contains("font-size:");
          if (!isFontRule) {
            kept.append(declaration).append(";");
          }
        }
        attribute.setValue(kept.toString());
      }
    }

    return parsed.body().html();
  }
Пример #4
0
  /**
   * Appends a predictable rendering of an element to {@code sb}: attributes are written in
   * alphabetical order with single-quoted values, a closing tag is always written, child elements
   * are rendered recursively and child text nodes are appended trimmed. Other kinds of child node
   * are ignored.
   *
   * @param producedElem the element to render
   * @param sb the builder that receives the markup
   * @return the full content of {@code sb} after the element was appended
   */
  private String elementToHtml(Element producedElem, StringBuilder sb) {
    final String tag = producedElem.tagName();

    // Collect the attribute keys and sort them so the output order is stable.
    ArrayList<String> sortedKeys = new ArrayList<String>();
    for (Attribute attribute : producedElem.attributes().asList()) {
      sortedKeys.add(attribute.getKey());
    }
    Collections.sort(sortedKeys);

    sb.append("<").append(tag);
    for (String key : sortedKeys) {
      sb.append(" ").append(key).append("='").append(producedElem.attr(key)).append("'");
    }
    sb.append(">");

    for (Node child : producedElem.childNodes()) {
      if (child instanceof TextNode) {
        sb.append(((TextNode) child).text().trim());
      } else if (child instanceof Element) {
        elementToHtml((Element) child, sb);
      }
    }

    sb.append("</").append(tag).append(">");
    return sb.toString();
  }
Пример #5
0
  /**
   * Checks whether an attribute's URL value starts with one of the given protocols.
   *
   * <p>The value is first resolved to an absolute URL via {@code el.absUrl}. If that yields an
   * empty string, the raw attribute value is used instead so that custom, unknown protocols can
   * still be matched. Unless {@code preserveRelativeLinks} is set, the attribute is updated with
   * the value that was tested, so the output carries the resolved URL.
   *
   * @param el element that owns the attribute; used to resolve the absolute URL
   * @param attr attribute under test; its value may be overwritten as described above
   * @param protocols protocols to accept, each compared as {@code protocol.toString() + ":"}
   * @return {@code true} if the value starts with one of the protocols, ignoring case
   */
  private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) {
    String value = el.absUrl(attr.getKey());
    if (value.length() == 0) {
      // Could not be made absolute: test the value as-is to allow custom unknown protocols.
      value = attr.getValue();
    }
    if (!preserveRelativeLinks) {
      attr.setValue(value);
    }

    // Lower-case once, with a fixed locale: the default-locale overload maps 'I' to a dotless
    // 'ı' under e.g. a Turkish locale, which would make "FILE:" fail to match "file:".
    final String lowerValue = value.toLowerCase(java.util.Locale.ROOT);
    for (Protocol protocol : protocols) {
      final String prefix = protocol.toString() + ":";
      if (lowerValue.startsWith(prefix)) {
        return true;
      }
    }
    return false;
  }
Пример #6
0
  /**
   * Decides whether this whitelist permits the given attribute on the given tag.
   *
   * <p>If the attribute is listed for the tag, the result depends only on the protocol rules for
   * that tag and attribute: no rule means allowed, otherwise the value must pass
   * {@code testValidProtocol}. If the attribute is not listed for the tag, the same check is
   * repeated once for the pseudo-tag {@code :all}.
   *
   * @param tagName tag to consider allowing the attribute in
   * @param el element under test, used to confirm the protocol
   * @param attr attribute under test
   * @return true if allowed
   */
  protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
    final TagName tag = TagName.valueOf(tagName);
    final AttributeKey key = AttributeKey.valueOf(attr.getKey());

    final boolean listedForTag = attributes.containsKey(tag) && attributes.get(tag).contains(key);
    if (!listedForTag) {
      // Nothing defined for this tag/attribute pair: fall back to the :all tag, once.
      if (tagName.equals(":all")) {
        return false;
      }
      return isSafeAttribute(":all", el, attr);
    }

    if (!protocols.containsKey(tag)) {
      // Attribute found and no protocols defined for the tag, so OK.
      return true;
    }

    final Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag);
    if (!attrProts.containsKey(key)) {
      // No protocol restriction for this particular attribute.
      return true;
    }
    return testValidProtocol(el, attr, attrProts.get(key));
  }
Пример #7
0
  /**
   * Converts an HTML string into the restricted markup this method's name refers to as ENML.
   *
   * <p>The input is parsed with jsoup and every element is visited from last to first:
   *
   * <ul>
   *   <li>Elements whose tag is on the prohibited list are removed. Their child elements are first
   *       appended to the end of the removed element's parent.
   *   <li>Attributes on the disabled list, and attributes whose name starts with {@code on},
   *       {@code sizcache} or {@code f-size}, are dropped.
   *   <li>{@code a} elements keep at most their {@code href}, and only when it starts with
   *       {@code http} (which also covers {@code https}) or {@code www}.
   *   <li>{@code img} elements that still have attributes keep only a {@code src} derived from
   *       their {@code xsrc} (values starting with {@code mnt:} are looked up through
   *       {@code ImageRecordDalHelper}) plus a fixed {@code style}.
   * </ul>
   *
   * <p>In the serialized result, characters above {@code 0xFFFD} and control characters other than
   * tab, line feed and carriage return are replaced by spaces, and a leading UTF-8 XML declaration
   * string is stripped.
   *
   * @param html the HTML to convert
   * @return the cleaned markup
   */
  public static String ConvertHtmlToEnml(String html) {
    String[] prohibitedArray =
        new String[] {
          "applet", "base", "basefont", "bgsound", "blink", "body", "button", "dir",
          "embed", "fieldset", "form", "frame", "frameset", "head", "html", "iframe",
          "ilayer", "input", "isindex", "label", "layer", "legend", "link", "marquee",
          "menu", "meta", "noframes", "noscript", "object", "optgroup", "option", "param",
          "plaintext", "script", "select", "style", "textarea", "xml", "image"
        };

    String[] disableAttributesArray =
        new String[] {"id", "class", "accesskey", "data", "dynsrc", "tabindex", "sizset"};

    List<String> prohibited = Arrays.asList(prohibitedArray);
    List<String> disableAttributes = Arrays.asList(disableAttributesArray);

    Document doc = Jsoup.parse(html);

    ImageRecordDalHelper helper = new ImageRecordDalHelper();
    try {
      Elements nodes = doc.getAllElements();
      // Walk backwards so descendants are handled before their ancestors.
      for (int j = nodes.size() - 1; j >= 0; j--) {
        Element node = nodes.get(j);

        // Remove prohibited elements, re-attaching their child elements to the parent first.
        if (prohibited.contains(node.tagName())) {
          if (node.childNodeSize() > 0) {
            for (Element child : node.children()) {
              node.parent().appendChild(child);
            }
          }
          node.remove();
        }

        // Remove disabled attributes (explicit list, on* handlers, sizcache*, f-size*).
        for (Attribute attribute : node.attributes().asList()) {
          String key = attribute.getKey();
          if (isDisallowedEnmlAttribute(key, disableAttributes)) {
            node.removeAttr(key);
          }
        }

        // Drop relative hrefs. The three negated prefix tests used to be joined with ||, which
        // is true for every string and therefore removed every href; "http" also matches
        // "https", so two tests joined with && are sufficient.
        if (node.tagName().equals("a") && node.hasAttr("href")) {
          String href = node.attr("href");
          if (!href.startsWith("http") && !href.startsWith("www")) {
            node.removeAttr("href");
          }
        }

        // Deal with cached img: replace with the online src and discard everything else.
        if (node.tagName().equals("img") && node.attributes().size() > 0) {
          if (node.hasAttr("xsrc")) {
            String xsrc = node.attr("xsrc");
            removeAllEnmlAttributes(node);
            if (xsrc.startsWith("mnt:")) {
              // NOTE(review): a missing image record would make this dereference fail — confirm
              // what GetImageRecordEntityByStoreName returns for unknown names.
              node.attr("src", helper.GetImageRecordEntityByStoreName(xsrc).OriginUrl);
            } else {
              node.attr("src", xsrc);
            }
          } else {
            removeAllEnmlAttributes(node);
          }

          // Better reading experience in mobile client and web client.
          node.attr("style", "max-height:100%; max-width:100%;");
        }

        // Anchors keep nothing but their (surviving) href.
        if (node.tagName().equals("a") && node.attributes().size() > 0) {
          boolean hasHref = node.hasAttr("href");
          String href = node.attr("href");
          removeAllEnmlAttributes(node);
          if (hasHref) {
            node.attr("href", href);
          }
        }
      }

      char[] xmlChar = doc.html().toCharArray();
      for (int i = 0; i < xmlChar.length; ++i) {
        char c = xmlChar[i];
        // Tab, line feed and carriage return are the only control characters kept. These must be
        // the escape sequences, not the letters 't', 'n' and 'r'.
        boolean allowedControl = c == '\t' || c == '\n' || c == '\r';
        if (c > 0xFFFD || (c < 0x20 && !allowedControl)) {
          xmlChar[i] = ' '; // replace with a space
        }
      }

      return new String(xmlChar).replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "");
    } finally {
      // Release the helper even when the conversion fails part-way.
      helper.Close();
    }
  }

  /** Returns true when an attribute with this name has to be dropped from every element. */
  private static boolean isDisallowedEnmlAttribute(String key, List<String> disableAttributes) {
    return disableAttributes.contains(key)
        || key.startsWith("on") // deal with on*
        || key.startsWith("sizcache")
        || key.startsWith("f-size");
  }

  /** Removes every attribute currently set on the element. */
  private static void removeAllEnmlAttributes(Element element) {
    // NOTE(review): relies on asList() handing back a copy that is safe to iterate while
    // attributes are removed — confirm for the jsoup version in use.
    for (Attribute attribute : element.attributes().asList()) {
      element.removeAttr(attribute.getKey());
    }
  }