/**
 * Matches one run of blanks to be collapsed into a single space. Compiled once
 * instead of on every call. The type is fully qualified so this block does not
 * depend on the file's import list.
 */
private static final java.util.regex.Pattern DAY_TITLE_BLANKS =
        java.util.regex.Pattern.compile("[\\s\u00A0\u3000]+|( )+");

/**
 * Matches one leading marker to strip: a "第…天" day label (1 to 3 characters
 * between the two ideographs), a "D"/"d" followed by digits, or a run of
 * leading punctuation.
 */
private static final java.util.regex.Pattern DAY_TITLE_PREFIX =
        java.util.regex.Pattern.compile("^第.{1,3}天|^[Dd]\\d+|^[::\\.。!!??、]+");

/**
 * Normalizes a day title by collapsing blank runs and removing leading day markers.
 *
 * <p>Blank runs (regex whitespace, U+00A0, U+3000) are first collapsed to a single
 * space and the result is passed through {@code StringUtil.trim}. Leading markers
 * are then stripped one at a time, trimming after each removal, until the title no
 * longer starts with one. For example "第1天:D2 foo" loses "第1天", then ":", then "D2".
 *
 * <p>The marker test uses {@code lookingAt()} on the prefix pattern only. The
 * previous {@code String.matches} guards needed a trailing {@code .*} to cover the
 * rest of the title, so a title containing a line terminator that the blank
 * collapsing does not touch (NEL, LS, PS) kept its prefix.
 *
 * @param dayTitle the raw title; must not be null
 * @return the title without leading day markers or leading punctuation
 * @throws NullPointerException if {@code dayTitle} is null
 */
public static String normalizeDayTitle(String dayTitle) {
    dayTitle = StringUtil.trim(DAY_TITLE_BLANKS.matcher(dayTitle).replaceAll(" "));

    java.util.regex.Matcher prefix = DAY_TITLE_PREFIX.matcher(dayTitle);
    while (prefix.lookingAt()) {
        // Every alternative consumes at least one character, so the title shrinks
        // on each pass and the loop terminates.
        dayTitle = StringUtil.trim(dayTitle.substring(prefix.end()));
        prefix = DAY_TITLE_PREFIX.matcher(dayTitle);
    }
    return dayTitle;
}
/**
 * Collects the visible text under {@code node} as a list of non-empty lines.
 *
 * <p>Direct text children that contain something other than blanks are trimmed with
 * {@code StringUtil.trim} and appended followed by a single space. Every other child
 * is handed to {@link #getInnerDescs(Node, StringBuilder)}, which may insert
 * {@code "\r\n"} breaks around elements. The accumulated text is then split on
 * {@code \r\n}, {@code \r} or {@code \n}, and empty lines are dropped. Lines are not
 * trimmed afterwards, so a line normally ends with the separator space.
 *
 * <p>The text is split directly instead of being read back through a
 * {@code BufferedReader}. This yields the same lines and removes the empty
 * catch-all block that used to surround the reader.
 *
 * @param node the root whose children are scanned; may be null
 * @return the non-empty lines in document order; an empty list when {@code node}
 *         is null or yields no text
 */
public static List<String> getDescs(Node node) {
    List<String> result = new ArrayList<String>();
    if (node == null) {
        return result;
    }

    StringBuilder sb = new StringBuilder();
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        Node child = children.item(i);
        if ("#text".equals(child.getNodeName())) {
            String content = child.getTextContent();
            // Skip text nodes made up only of blanks (including U+00A0 / U+3000).
            if (content.replaceAll("\\s|\u00A0|\u3000| ?", "").isEmpty()) {
                continue;
            }
            sb.append(StringUtil.trim(content)).append(" ");
        } else {
            getInnerDescs(child, sb);
        }
    }

    // Same terminators BufferedReader.readLine() recognises; "\r\n" is listed first
    // so a CRLF pair counts as one break rather than two.
    String[] lines = sb.toString().split("\r\n|\r|\n");
    for (int i = 0; i < lines.length; i++) {
        if (!lines[i].isEmpty()) {
            result.add(lines[i]);
        }
    }
    return result;
}
/**
 * Recursively appends the text found under {@code node} to {@code sb}.
 *
 * <p>A {@code STYLE} element is skipped entirely. When
 * {@code HtmlUtil.isNewLineTag} accepts the node's upper-cased name, a
 * {@code "\r\n"} is appended both before and after the node's content. Text
 * children that contain something other than blanks are trimmed with
 * {@code StringUtil.trim} and appended followed by a single space; all other
 * children are processed recursively.
 *
 * <p>The node name is upper-cased once with {@code Locale.ROOT}, and that value is
 * used for both the {@code STYLE} check and the newline-tag check. The
 * {@code STYLE} check used to be case-sensitive while the newline-tag check was
 * not, and the default-locale {@code toUpperCase()} mangles names containing
 * "i" under a Turkish locale.
 *
 * @param node the node to walk; a null node is ignored
 * @param sb   the buffer receiving the extracted text; must not be null
 */
public static void getInnerDescs(Node node, StringBuilder sb) {
    if (node == null) {
        return;
    }

    // Fully qualified so this block does not depend on the file's import list.
    String tagName = node.getNodeName().toUpperCase(java.util.Locale.ROOT);
    if ("STYLE".equals(tagName)) {
        return;
    }

    if (HtmlUtil.isNewLineTag(tagName)) {
        sb.append("\r\n");
    }

    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
        Node child = children.item(i);
        if ("#text".equals(child.getNodeName())) {
            String content = child.getTextContent();
            // Skip text nodes made up only of blanks (including U+00A0 / U+3000).
            if (content.replaceAll("\\s|\u00A0|\u3000| ?", "").isEmpty()) {
                continue;
            }
            sb.append(StringUtil.trim(content)).append(" ");
        } else {
            getInnerDescs(child, sb);
        }
    }

    if (HtmlUtil.isNewLineTag(tagName)) {
        sb.append("\r\n");
    }
}