/**
 * Converts HTML markup into plain text.
 *
 * <p>The markup is parsed into a {@link Source} and rendered through that source's
 * {@link TextExtractor}, which yields the textual content with whitespace compressed and with
 * attributes, scripts, and styles left out. The result is suitable, for instance, for feeding a
 * search index.
 *
 * @param html the HTML text; may be {@code null}
 * @return the plain text of {@code html}, or {@code null} when {@code html} is {@code null}
 */
@Override
public String extractText(String html) {
  // A null document has no text; pass the null through rather than parsing.
  return (html != null) ? new Source(html).getTextExtractor().toString() : null;
}
/** * 根据html获取摘要文本(取100字以内),和img图片uuid(图片取3张) * * @param htmlcontent * @return */ public static String[] getSummaryAndImgByHTML(String htmlcontent) { if (StringUtils.isEmpty(htmlcontent)) { return new String[] {null, null}; } int summaryCount = 100; int maxCount = 3; // 不能替换导致多个空格被改成一个空格. Source source = new Source(htmlcontent); // Renderer renderer=source.getRenderer(); // 修改解析不彻底bug.<http://bbs.mobby.cn/wechat-login.html> <http://bbs.mobby.cn/wechat-login.html> // 有的宝宝不会自己小便,经常尿湿裤子,每次幼儿园的老师都会告状“今天又尿裤子了”,听到这样的话家长也会比较尴尬。遇到这样的情况,家长该怎么办呢? // // renderer.setMaxLineLength(99999);//设置一行最长个数,默认76字符 // String text=renderer.toString(); TextExtractor textExtractor = new TextExtractor(source); String text = textExtractor.toString(); List<Element> imglist = source.getAllElements(HTMLElementName.IMG); String url = ProjectProperties.getProperty( "share_url_getEmot", "http://kd.wenjienet.com/px-rest/i/emoji/"); List listimguuids = new ArrayList(); for (Element img : imglist) { String srcV = img.getAttributeValue("src"); if (StringUtils.contains(srcV, url)) { // 过滤表情 continue; } String imguuid = PxStringUtil.imgUrlToUuid(srcV); if (StringUtils.isNotBlank(imguuid)) { listimguuids.add(imguuid); } if (listimguuids.size() >= maxCount) break; } String summary = PxStringUtil.getSubString(text, summaryCount); String imguuids = null; if (listimguuids.size() > 0) imguuids = StringUtils.join(listimguuids, ','); return new String[] {summary, imguuids}; }