Esempio n. 1
0
 /**
  * Matches one run of characters that is collapsed to a single space: ASCII whitespace,
  * U+00A0 (no-break space) or U+3000 (ideographic space).
  * NOTE(review): the trailing {@code ( )+} alternative is already covered by the first one;
  * it may be the remnant of an HTML entity lost in transit - confirm before removing.
  */
 private static final java.util.regex.Pattern DAY_TITLE_WHITESPACE =
     java.util.regex.Pattern.compile("[\\s\u00A0\u3000]+|( )+");

 /**
  * Matches one leading marker to strip: "第" + 1-3 characters + "天", "D"/"d" followed by
  * digits, or a run of separator punctuation.
  * NOTE(review): the punctuation class lists ':', '!' and '?' twice; the duplicates look like
  * full-width variants that were lost to an encoding conversion - confirm against the
  * original file.
  */
 private static final java.util.regex.Pattern DAY_TITLE_PREFIX =
     java.util.regex.Pattern.compile("第.{1,3}天|[Dd]\\d+|[::\\.。!!??、]+");

 /**
  * Normalizes a day title by collapsing whitespace and removing leading day markers.
  *
  * <p>Whitespace runs are first collapsed to a single space and the result is passed through
  * {@code StringUtil.trim}. Then, for as long as the title starts with a day marker
  * ("第N天", "D1"/"d1") or with separator punctuation, that prefix is removed and the
  * remainder is trimmed again, so stacked prefixes are all stripped.
  *
  * @param dayTitle the raw title; may be {@code null}
  * @return the normalized title, or {@code null} when {@code dayTitle} is {@code null}
  */
 public static String normalizeDayTitle(String dayTitle) {
   if (dayTitle == null) {
     return null;
   }
   String title = StringUtil.trim(DAY_TITLE_WHITESPACE.matcher(dayTitle).replaceAll(" "));
   // lookingAt() anchors at the start only, so stripping does not depend on what follows
   // the prefix. Every alternative consumes at least one character, so the loop terminates.
   java.util.regex.Matcher prefix = DAY_TITLE_PREFIX.matcher(title);
   while (prefix.lookingAt()) {
     title = StringUtil.trim(title.substring(prefix.end()));
     prefix = DAY_TITLE_PREFIX.matcher(title);
   }
   return title;
 }
Esempio n. 2
0
  /**
   * Collects the text found under {@code node} as a list of non-empty lines.
   *
   * <p>Direct text children are appended to a buffer (passed through {@code StringUtil.trim}
   * and followed by one space); text children that are empty once whitespace, U+00A0,
   * U+3000 and literal {@code &nbsp}/{@code &nbsp;} are removed are skipped. All other
   * children are delegated to {@link #getInnerDescs(Node, StringBuilder)}. The buffer is then
   * split on line terminators and every non-empty line is returned in order. Lines are not
   * trimmed, so the space appended after each text fragment is kept.
   *
   * @param node the node whose children are scanned; may be {@code null}
   * @return a new mutable list of non-empty lines; empty when {@code node} is {@code null}
   *     or contains no text
   */
  public static List<String> getDescs(Node node) {
    List<String> result = new ArrayList<String>();
    if (node == null) {
      return result;
    }
    StringBuilder sb = new StringBuilder();
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      Node child = children.item(i);
      if ("#text".equals(child.getNodeName())) {
        String content = child.getTextContent();
        if (content.replaceAll("\\s|\u00A0|\u3000|&nbsp;?", "").isEmpty()) {
          continue;
        }
        sb.append(StringUtil.trim(content)).append(" ");
      } else {
        getInnerDescs(child, sb);
      }
    }
    // Split on the same terminators BufferedReader.readLine() recognises (CRLF, CR, LF).
    // Working on the in-memory string needs no reader, no close() and no exception handling.
    for (String line : sb.toString().split("\r\n|\r|\n")) {
      if (!line.isEmpty()) {
        result.add(line);
      }
    }
    return result;
  }
Esempio n. 3
0
 /**
  * Recursively appends the text found under {@code node} to {@code sb}.
  *
  * <p>{@code STYLE} elements (matched case-insensitively) are skipped together with their
  * content. When {@code HtmlUtil.isNewLineTag} accepts the upper-cased node name, a CRLF is
  * appended both before and after the node's content. Text children are passed through
  * {@code StringUtil.trim} and followed by one space; text children that are empty once
  * whitespace, U+00A0, U+3000 and literal {@code &nbsp}/{@code &nbsp;} are removed are
  * skipped. All other children are processed recursively.
  *
  * @param node the node to scan; {@code null} is ignored
  * @param sb the buffer that receives the extracted text
  */
 public static void getInnerDescs(Node node, StringBuilder sb) {
   if (node == null) {
     return;
   }
   String nodeName = node.getNodeName();
   // Case-insensitive, in line with the upper-casing applied for the newline-tag check below.
   if ("STYLE".equalsIgnoreCase(nodeName)) {
     return;
   }
   // Locale.ROOT keeps the mapping locale-independent (e.g. Turkish maps 'i' to dotted 'İ').
   boolean newLineTag = HtmlUtil.isNewLineTag(nodeName.toUpperCase(java.util.Locale.ROOT));
   if (newLineTag) {
     sb.append("\r\n");
   }
   NodeList children = node.getChildNodes();
   for (int i = 0; i < children.getLength(); i++) {
     Node child = children.item(i);
     if ("#text".equals(child.getNodeName())) {
       String content = child.getTextContent();
       if (content.replaceAll("\\s|\u00A0|\u3000|&nbsp;?", "").isEmpty()) {
         continue;
       }
       sb.append(StringUtil.trim(content)).append(" ");
     } else {
       getInnerDescs(child, sb);
     }
   }
   if (newLineTag) {
     sb.append("\r\n");
   }
 }