예제 #1
0
  /**
   * Converts HTML markup into plain text by delegating to the text extractor of a {@link Source}
   * built from the input.
   *
   * <p>Whitespace is compressed and all attributes, scripts, and styles are removed, which makes
   * the result suitable for storing in a search index, for example.
   *
   * @param html the HTML text; may be {@code null}
   * @return the raw text of the HTML input, or {@code null} if {@code html} is {@code null}
   */
  @Override
  public String extractText(String html) {
    if (html != null) {
      // Parse the markup and render it back as plain text in one go.
      return new Source(html).getTextExtractor().toString();
    }
    return null;
  }
예제 #2
0
  /**
   * Builds a short text summary and a list of image UUIDs from an HTML fragment.
   *
   * <p>The summary is the text extracted from the HTML, shortened by {@code
   * PxStringUtil.getSubString} with a limit of 100. At most 3 image UUIDs are collected from the
   * {@code src} attributes of {@code <img>} elements; images whose {@code src} contains the
   * configured emoticon URL (property {@code share_url_getEmot}) are skipped, as are images
   * without a usable {@code src}.
   *
   * @param htmlcontent the HTML to summarize; may be {@code null} or empty
   * @return a two-element array: index 0 holds the summary text and index 1 holds the
   *     comma-separated image UUIDs, or {@code null} when no UUID was collected; both elements are
   *     {@code null} when {@code htmlcontent} is {@code null} or empty
   */
  public static String[] getSummaryAndImgByHTML(String htmlcontent) {
    if (StringUtils.isEmpty(htmlcontent)) {
      return new String[] {null, null};
    }

    final int summaryCount = 100;
    final int maxCount = 3;

    // Original author's note: the input is parsed as-is, because replacing content beforehand
    // caused runs of spaces to be collapsed into a single space.
    Source source = new Source(htmlcontent);

    // An earlier version rendered the text with Renderer (raising its max line length from the
    // default of 76 characters); per the original note it was replaced with TextExtractor to fix
    // an incomplete-parsing bug where link URLs showed up in the text as <http://...>.
    String text = new TextExtractor(source).toString();

    String emoticonUrl =
        ProjectProperties.getProperty(
            "share_url_getEmot", "http://kd.wenjienet.com/px-rest/i/emoji/");

    List<String> imgUuids = new ArrayList<String>();
    for (Element img : source.getAllElements(HTMLElementName.IMG)) {
      String src = img.getAttributeValue("src");
      if (StringUtils.isBlank(src)) {
        // <img> without a usable src: nothing to convert, and the helper is never handed null.
        continue;
      }
      if (StringUtils.contains(src, emoticonUrl)) {
        // Emoticon images are not content images; leave them out.
        continue;
      }
      String imgUuid = PxStringUtil.imgUrlToUuid(src);
      if (StringUtils.isNotBlank(imgUuid)) {
        imgUuids.add(imgUuid);
        if (imgUuids.size() >= maxCount) {
          break;
        }
      }
    }

    String summary = PxStringUtil.getSubString(text, summaryCount);
    String joinedUuids = imgUuids.isEmpty() ? null : StringUtils.join(imgUuids, ',');

    return new String[] {summary, joinedUuids};
  }