/**
 * Copies the key/value pairs of an attribute list into a fresh, insertion-ordered map,
 * leaving out every attribute whose key is contained in {@code SKIP_ATTR}.
 *
 * @param attributes A list of attribs
 * @return a new mutable map holding the attributes that were not skipped, in the order they
 *     were encountered
 */
public static Map<String, String> parseAttribs(Attributes attributes) {
    // Sized a little above the attribute count, as in the original implementation.
    final int initialCapacity = attributes.size() + 4;
    final Map<String, String> parsed = new LinkedHashMap<String, String>(initialCapacity);

    for (Attribute attribute : attributes.asList()) {
        final String key = attribute.getKey();
        if (SKIP_ATTR.contains(key)) {
            continue;
        }
        parsed.put(key, attribute.getValue());
    }
    return parsed;
}
private static String cleanHtml(final Node node) { if (node instanceof Element) { Element element = ((Element) node); StringBuilder accum = new StringBuilder(); accum.append("<").append(element.tagName()); for (Attribute attribute : element.attributes()) { if (!(attribute.getKey().startsWith("_"))) { accum.append(" "); accum.append(attribute.getKey()); accum.append("=\""); accum.append(attribute.getValue()); accum.append('"'); } } if (element.childNodes().isEmpty() && element.tag().isEmpty()) { accum.append(" />"); } else { accum.append(">"); for (Node child : element.childNodes()) accum.append(cleanHtml(child)); accum.append("</").append(element.tagName()).append(">"); } return accum.toString(); } else if (node instanceof TextNode) { return ((TextNode) node).getWholeText(); } else if (node instanceof XmlDeclaration) { // HACK if (node.childNodes().isEmpty()) { return ""; } return node.outerHtml(); } else if (node instanceof Comment) { // HACK: elide comments for now. return ""; } else if (node instanceof DataNode && node.childNodes().isEmpty()) { // No child nodes are defined but we have to handle content if such exists, example // <script language="JavaScript">var a = { name: "${user.name}"}</script> String content = node.attr("data"); if (Strings.empty(content)) { return ""; } return content; } else { return node.outerHtml(); } }
/**
 * Parses the given HTML and strips font declarations from every inline {@code style}
 * attribute.
 *
 * <p>Each {@code style} value is split on {@code ';'}; pieces containing
 * {@code "font-family:"} or {@code "font-size:"} (compared case-insensitively) are dropped,
 * and every remaining piece is written back followed by a {@code ';'}.
 *
 * @param html the HTML to process
 * @return the inner HTML of the parsed document's body after the style clean-up
 */
@SuppressLint("DefaultLocale")
private String improveHtml(final String html) {
    final Document document = Jsoup.parse(html);

    for (final Element element : document.getAllElements()) {
        if (!element.hasAttr("style")) {
            continue;
        }
        for (final Attribute attribute : element.attributes()) {
            if (!"style".equals(attribute.getKey())) {
                continue;
            }
            final StringBuilder kept = new StringBuilder();
            for (final String declaration : attribute.getValue().trim().split(";")) {
                final String lowered = declaration.toLowerCase(Locale.ENGLISH);
                final boolean isFontRule =
                        lowered.contains("font-family:") || lowered.contains("font-size:");
                if (!isFontRule) {
                    kept.append(declaration).append(";");
                }
            }
            attribute.setValue(kept.toString());
        }
    }
    return document.body().html();
}
/**
 * Produce predictable html (attributes in alphabetical order), always include close tags.
 *
 * <p>Attribute values are wrapped in single quotes and written without escaping. Only element
 * and text children are emitted; text is trimmed and other node types are skipped.
 *
 * @param producedElem the element to serialize together with its descendants
 * @param sb buffer the markup is appended to; any existing content is kept
 * @return the full content of {@code sb} after the element has been appended
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    appendElementHtml(producedElem, sb);
    // Build the result string once, instead of once per nested element.
    return sb.toString();
}

/** Appends the predictable serialization of {@code elem} and its descendants to {@code sb}. */
private void appendElementHtml(Element elem, StringBuilder sb) {
    ArrayList<String> names = new ArrayList<String>();
    for (Attribute a : elem.attributes().asList()) {
        names.add(a.getKey());
    }
    Collections.sort(names);

    sb.append("<").append(elem.tagName());
    for (String attrName : names) {
        sb.append(" ")
                .append(attrName)
                .append("=")
                .append("\'")
                .append(elem.attr(attrName))
                .append("\'");
    }
    sb.append(">");

    for (Node child : elem.childNodes()) {
        if (child instanceof Element) {
            appendElementHtml((Element) child, sb);
        } else if (child instanceof TextNode) {
            String text = ((TextNode) child).text();
            sb.append(text.trim());
        }
    }

    sb.append("</").append(elem.tagName()).append(">");
}
private boolean testValidProtocol(Element el, Attribute attr, Set<Protocol> protocols) { // try to resolve relative urls to abs, and optionally update the attribute so output html has // abs. // rels without a baseuri get removed String value = el.absUrl(attr.getKey()); if (value.length() == 0) value = attr.getValue(); // if it could not be made abs, run as-is to allow custom unknown // protocols if (!preserveRelativeLinks) attr.setValue(value); for (Protocol protocol : protocols) { String prot = protocol.toString() + ":"; if (value.toLowerCase().startsWith(prot)) { return true; } } return false; }
/** * Test if the supplied attribute is allowed by this whitelist for this tag * * @param tagName tag to consider allowing the attribute in * @param el element under test, to confirm protocol * @param attr attribute under test * @return true if allowed */ protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) { TagName tag = TagName.valueOf(tagName); AttributeKey key = AttributeKey.valueOf(attr.getKey()); if (attributes.containsKey(tag)) { if (attributes.get(tag).contains(key)) { if (protocols.containsKey(tag)) { Map<AttributeKey, Set<Protocol>> attrProts = protocols.get(tag); // ok if not defined protocol; otherwise test return !attrProts.containsKey(key) || testValidProtocol(el, attr, attrProts.get(key)); } else { // attribute found, no protocols defined, so OK return true; } } } // no attributes defined for tag, try :all tag return !tagName.equals(":all") && isSafeAttribute(":all", el, attr); }
public static String ConvertHtmlToEnml(String html) // , Note note) { String[] prohibitedArray = new String[] { "applet", "base", "basefont", "bgsound", "blink", "body", "button", "dir", "embed", "fieldset", "form", "frame", "frameset", "head", "html", "iframe", "ilayer", "input", "isindex", "label", "layer", "legend", "link", "marquee", "menu", "meta", "noframes", "noscript", "object", "optgroup", "option", "param", "plaintext", "script", "select", "style", "textarea", "xml", "image" }; String[] disableAttributesArray = new String[] {"id", "class", "accesskey", "data", "dynsrc", "tabindex", "sizset"}; List<String> prohibited = Arrays.asList(prohibitedArray); List<String> disableAttributes = Arrays.asList(disableAttributesArray); Document doc = Jsoup.parse(html); // var imgs = new ImageRecordDbPersistence().LoadFromFile().ToList(); ImageRecordDalHelper helper = new ImageRecordDalHelper(); Elements nodes = doc.getAllElements(); int total = nodes.size() - 1; for (int j = total; j >= 0; j--) { // remove all prohibited node if (prohibited.contains(nodes.get(j).tagName())) { if (!(nodes.get(j).childNodeSize() > 0)) nodes.get(j).remove(); else { for (Element child : nodes.get(j).children()) nodes.get(j).parent().appendChild(child); nodes.get(j).remove(); } } // remove disabled attribute if (nodes.get(j).attributes().size() > 0) { int count = nodes.get(j).attributes().size() - 1; List<Attribute> attributes = nodes.get(j).attributes().asList(); // int count = disableAttributes.size(); for (int i = count; i >= 0; i--) { if (disableAttributes.contains(attributes.get(i).getKey())) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } // deal with on* if (attributes.get(i).getKey().startsWith("on")) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } if (attributes.get(i).getKey().startsWith("sizcache")) { nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } if (attributes.get(i).getKey().startsWith("f-size")) { 
nodes.get(j).removeAttr(attributes.get(i).getKey()); continue; } } } // deal with relative href if (nodes.get(j).tagName().equals("a")) { if (nodes.get(j).attributes().size() > 0 && nodes.get(j).hasAttr("href")) { String href = nodes.get(j).attr("href"); if (!href.startsWith("http") || !href.startsWith("https") || !href.startsWith("www")) { nodes.get(j).removeAttr("href"); } } } // deal with cached img, replace with online src if (nodes.get(j).tagName().equals("img") && nodes.get(j).attributes().size() > 0) { if (nodes.get(j).hasAttr("xsrc")) { if (nodes.get(j).attr("xsrc").startsWith("mnt:")) { String src = nodes.get(j).attr("xsrc"); for (Attribute attr : nodes.get(j).attributes().asList()) { nodes.get(j).removeAttr(attr.getKey()); } // nodes.get(j).attr("src", imgs.First(r => src.Contains(r.StoredName)).OriginUrl); nodes.get(j).attr("src", helper.GetImageRecordEntityByStoreName(src).OriginUrl); } else { String xsrc = nodes.get(j).attr("xsrc"); ; for (Attribute attr : nodes.get(j).attributes().asList()) { nodes.get(j).removeAttr(attr.getKey()); } nodes.get(j).attr("src", xsrc); } } else { for (Attribute attr : nodes.get(j).attributes().asList()) { nodes.get(j).removeAttr(attr.getKey()); } } // better reading experience in mobile client and web client nodes.get(j).attr("style", "max-height:100%; max-width:100%;"); } if (nodes.get(j).tagName().equals("a") && nodes.get(j).attributes().size() > 0) { if (nodes.get(j).hasAttr("href")) { String href = nodes.get(j).attr("href"); for (Attribute attr : nodes.get(j).attributes().asList()) { nodes.get(j).removeAttr(attr.getKey()); } nodes.get(j).attr("href", href); } else { for (Attribute attr : nodes.get(j).attributes().asList()) { nodes.get(j).removeAttr(attr.getKey()); } } } } char[] xmlChar = doc.html().toCharArray(); for (int i = 0; i < xmlChar.length; ++i) { if (xmlChar[i] > 0xFFFD) { // 或者直接替换掉0xb xmlChar[i] = ' '; // 用空格替换 } else if (xmlChar[i] < 0x20 && xmlChar[i] != 't' & xmlChar[i] != 'n' & xmlChar[i] != 'r') { // 
或者直接替换掉0xb xmlChar[i] = ' '; // 用空格替换 } } helper.Close(); return new String(xmlChar).replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", ""); }