public static void dealOnePage(String url, int starNo) { try { Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection()); NodeList tableSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF")); parser = new Parser(new Lexer(tableSet.toHtml())); NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr")); parser = new Parser(new Lexer(tdSet.toHtml())); PrototypicalNodeFactory p = new PrototypicalNodeFactory(); p.registerTag(new SpanTag()); parser.setNodeFactory(p); NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span")); int index = 0; for (int i = 5; i < spanSet.size(); i = i + 5) { String str = spanSet.elementAt(i).toPlainTextString(); String now = "" + (starNo * 100 + index); index++; while (str.compareTo(now) != 0) { System.out.println(now); now = "" + (starNo * 100 + index); index++; } // System.out.println(str); } } catch (ParserException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } }
/** * @param <T> 标签类型 * @param html 需要解析的文本html * @param tagType 标签类型 class * @param attr 该标签应该有的树形 * @param value 属性的值 (Ϊnull ��Ϊ��ƥ��) * @return */ public static <T extends TagNode> List<T> parseTags( String html, final Class<T> tagType, final String attr, final String value, final boolean test) { Parser parser = new Parser(); try { PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); factory.registerTag(new PreTag()); parser.setNodeFactory(factory); parser.setInputHTML(html); NodeList tagList = parser.parse( new NodeFilter() { @Override public boolean accept(Node node) { if (test) logger.info(node.getClass()); if (node.getClass() == tagType) { if (attr == null) return true; T tn = (T) node; String attrv = tn.getAttribute(attr); if (value == null && attrv != null) { // || attrv.equals(value) return true; } if (test) logger.info(attrv); if (value != null && attrv != null && attrv.equals(value)) return true; } return false; } }); List<T> tags = new ArrayList<T>(); for (int i = 0; i < tagList.size(); i++) { tags.add((T) tagList.elementAt(i)); } return tags; } catch (ParserException e) { e.printStackTrace(); } return null; }
/**
 * Parses {@code html} with a filter that accepts every node and returns the resulting list.
 *
 * <p>A {@code PreTag} is registered on the node factory before parsing.
 *
 * @param html HTML text to parse
 * @return the nodes produced by the parser, or {@code null} when the parser throws a
 *     {@code ParserException} (the stack trace is printed)
 */
public static NodeList parseAllTags(String html) {
  NodeFilter acceptEverything =
      new NodeFilter() {
        @Override
        public boolean accept(Node node) {
          return true;
        }
      };

  try {
    Parser htmlParser = new Parser();
    PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
    nodeFactory.registerTag(new PreTag());
    htmlParser.setNodeFactory(nodeFactory);
    htmlParser.setInputHTML(html);
    return htmlParser.parse(acceptEverything);
  } catch (ParserException e) {
    e.printStackTrace();
    return null;
  }
}
/** * 生成预览内容 * * @param html * @param max_count * @return */ public static String preview(String html, int max_count) { if (html.length() <= max_count * 1.1) return html; Parser parser = new Parser(); StringBuffer prvContent = new StringBuffer(); try { parser.setEncoding(Globals.ENC_8859_1); parser.setInputHTML(html); parser.setNodeFactory(factory); NodeList nodes = parser.extractAllNodesThatMatch(nfilter); Node node = null; for (int i = 0; i < nodes.size(); i++) { if (prvContent.length() >= max_count) { if (node instanceof TagNode) { TagNode tmp_node = (TagNode) node; boolean isEnd = tmp_node.isEndTag(); if (!isEnd) { prvContent.setLength(prvContent.length() - tmp_node.getText().length() - 2); } } // 补齐所有未关闭的标签 Node parent = node; // System.out.println("current node is . "+parent.getText()); do { // System.out.println(parent.getClass().getName()+":"+parent.getText()); parent = parent.getParent(); // System.out.println("parent = "+parent); if (parent == null) break; if (!(parent instanceof TagNode)) continue; // System.out.println("Parent node is no ended. "+parent.getText()); prvContent.append(((TagNode) parent).getEndTag().toHtml()); } while (true); break; } node = nodes.elementAt(i); if (node instanceof TagNode) { TagNode tag = (TagNode) node; prvContent.append('<'); prvContent.append(tag.getText()); prvContent.append('>'); // System.out.println("TAG: " + '<'+tag.getText()+'>'); } else if (node instanceof TextNode) { int space = max_count - prvContent.length(); if (space > 10) { TextNode text = (TextNode) node; if (text.getText().length() < 10) prvContent.append(text.getText()); else prvContent.append( StringUtils.abbreviate(text.getText(), max_count - prvContent.length())); // System.out.println("TEXT: " + text.getText()); } } } return prvContent.toString(); } catch (ParserException e) { e.printStackTrace(); } finally { parser = null; } return html; }