/** * 对新闻URL进行解析并采集数据 * * @param url 新闻连接。 */ public void parser(String url) { String title = ""; // 新闻标题 String source = ""; // 新闻来源 String sourceTime = ""; // 新闻来源时间 // String author = ""; //新闻作者 String Content = ""; // 新闻内容 // String collectTime = ""; //新闻采集时间-系统时间 try { parser = new Parser(url); parser.setEncoding("GB2312"); // 标题 NodeFilter titleFilter = new TagNameFilter("h1"); NodeList titleNodeList = parser.parse(titleFilter); title = parserUtil.getNodeListText(titleNodeList); parser.reset(); // 每次获取都必须reset,不然后面获取不到数据 System.out.println(title); // 来源 NodeFilter sourceFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name")); NodeList sourceNodeList = parser.parse(sourceFilter); source = parserUtil.getNodeListText(sourceNodeList); parser.reset(); System.out.println(source); // 来源时间 NodeFilter sourceTimeFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date")); NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter); String str = parserUtil.getNodeListText(sourceTimeNodeList); sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", ""); parser.reset(); System.out.println(sourceTime); // 正文 NodeFilter ContentTimeFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")); NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter); NodeList childList = ContentTimeNodeList.elementAt(0).getChildren(); childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // 去掉非正文部分 // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题")); Content = parserUtil.getNodeListHTML(ContentTimeNodeList); // Content = ParserUtil.getPlainText(Content); System.out.println(Content); parser.reset(); } catch (ParserException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
@Override public void crawl(Parser parser) throws ParserException { List<LCOdds> data = new ArrayList<LCOdds>(); NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT)); for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) { NodeList cells = it.nextNode().getChildren(); cells.keepAllNodesThatMatch(tdFilter); LCOdds lc = parseRow(cells); if (null != lc) { data.add(lc); } } // persist if (data.size() < 1) { log.warn(" -- [ 06_LC_2 ] data is empty !"); } storeData("lc_odds", data); }