/** * 对新闻URL进行解析并采集数据 * * @param url 新闻连接。 */ public void parser(String url) { String title = ""; // 新闻标题 String source = ""; // 新闻来源 String sourceTime = ""; // 新闻来源时间 // String author = ""; //新闻作者 String Content = ""; // 新闻内容 // String collectTime = ""; //新闻采集时间-系统时间 try { parser = new Parser(url); parser.setEncoding("GB2312"); // 标题 NodeFilter titleFilter = new TagNameFilter("h1"); NodeList titleNodeList = parser.parse(titleFilter); title = parserUtil.getNodeListText(titleNodeList); parser.reset(); // 每次获取都必须reset,不然后面获取不到数据 System.out.println(title); // 来源 NodeFilter sourceFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name")); NodeList sourceNodeList = parser.parse(sourceFilter); source = parserUtil.getNodeListText(sourceNodeList); parser.reset(); System.out.println(source); // 来源时间 NodeFilter sourceTimeFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date")); NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter); String str = parserUtil.getNodeListText(sourceTimeNodeList); sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", ""); parser.reset(); System.out.println(sourceTime); // 正文 NodeFilter ContentTimeFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")); NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter); NodeList childList = ContentTimeNodeList.elementAt(0).getChildren(); childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // 去掉非正文部分 // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题")); Content = parserUtil.getNodeListHTML(ContentTimeNodeList); // Content = ParserUtil.getPlainText(Content); System.out.println(Content); parser.reset(); } catch (ParserException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
/** * 获取新闻的内容 * * @param newsContentFilter * @param parser * @return content 新闻内容 */ public String getNewsContent(NodeFilter newsContentFilter, Parser parser) { String content = null; StringBuilder builder = new StringBuilder(); try { NodeList newsContentList = parser.parse(newsContentFilter); for (int i = 0; i < newsContentList.size(); i++) { Div newsContenTag = (Div) newsContentList.elementAt(i); builder = builder.append(newsContenTag.getStringText()); } content = builder.toString(); // 转换为String 类型。 if (content != null) { parser.reset(); parser = Parser.createParser(content, "utf8"); StringBean sb = new StringBean(); sb.setCollapse(true); parser.visitAllNodesWith(sb); content = sb.getStrings(); // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}"; content = content.replaceAll("\\\".*[a-z].*\\}", ""); content = content.replace("[我来说两句]", ""); } else { System.out.println("没有得到新闻内容!"); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } return content; }
/**
 * Runs every registered plugin over the page and merges the hits they report.
 *
 * <p>The parser is reset before each plugin is invoked. Plugins that return {@code null}
 * contribute nothing.
 *
 * @param htmlParser parser for the page being analysed
 * @param referringUrl URL passed through unchanged to each plugin
 * @return the union of all plugin results, or {@code null} when there are no plugins or no
 *     plugin returned a non-null set
 */
public Set<Hit> parseHits(Parser htmlParser, String referringUrl) {
    if (plugins.size() == 0) {
        return null;
    }

    Set<Hit> merged = null; // created lazily so "no results" stays null
    for (ParserPlugin plugin : plugins) {
        htmlParser.reset();
        Set<Hit> pluginHits = plugin.parseHits(htmlParser, referringUrl);
        if (pluginHits == null) {
            continue;
        }
        if (merged == null) {
            merged = new HashSet<Hit>();
        }
        merged.addAll(pluginHits);
    }
    return merged;
}
/** * Assign the underlying node filter for this wrapper. * * @param filter The filter to wrap. * @param context The parser to use for conditioning this filter. Some filters need contextual * information to provide to the user, i.e. for tag names or attribute names or values, so the * Parser context is provided. */ public void setNodeFilter(NodeFilter filter, Parser context) { Set set; mFilter = (TagNameFilter) filter; set = new HashSet(); context.reset(); try { for (NodeIterator iterator = context.elements(); iterator.hasMoreNodes(); ) addName(set, iterator.nextNode()); } catch (ParserException pe) { // oh well, we tried } for (Iterator iterator = set.iterator(); iterator.hasNext(); ) mName.addItem(iterator.next()); mName.setSelectedItem(mFilter.getName()); }