/** * 对新闻URL进行解析并采集数据 * * @param url 新闻连接。 */ public void parser(String url) { String title = ""; // 新闻标题 String source = ""; // 新闻来源 String sourceTime = ""; // 新闻来源时间 // String author = ""; //新闻作者 String Content = ""; // 新闻内容 // String collectTime = ""; //新闻采集时间-系统时间 try { parser = new Parser(url); parser.setEncoding("GB2312"); // 标题 NodeFilter titleFilter = new TagNameFilter("h1"); NodeList titleNodeList = parser.parse(titleFilter); title = parserUtil.getNodeListText(titleNodeList); parser.reset(); // 每次获取都必须reset,不然后面获取不到数据 System.out.println(title); // 来源 NodeFilter sourceFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name")); NodeList sourceNodeList = parser.parse(sourceFilter); source = parserUtil.getNodeListText(sourceNodeList); parser.reset(); System.out.println(source); // 来源时间 NodeFilter sourceTimeFilter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date")); NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter); String str = parserUtil.getNodeListText(sourceTimeNodeList); sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", ""); parser.reset(); System.out.println(sourceTime); // 正文 NodeFilter ContentTimeFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")); NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter); NodeList childList = ContentTimeNodeList.elementAt(0).getChildren(); childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // 去掉非正文部分 // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题")); Content = parserUtil.getNodeListHTML(ContentTimeNodeList); // Content = ParserUtil.getPlainText(Content); System.out.println(Content); parser.reset(); } catch (ParserException e) { e.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
/** * 获取新闻的内容 * * @param newsContentFilter * @param parser * @return content 新闻内容 */ public String getNewsContent(NodeFilter newsContentFilter, Parser parser) { String content = null; StringBuilder builder = new StringBuilder(); try { NodeList newsContentList = parser.parse(newsContentFilter); for (int i = 0; i < newsContentList.size(); i++) { Div newsContenTag = (Div) newsContentList.elementAt(i); builder = builder.append(newsContenTag.getStringText()); } content = builder.toString(); // 转换为String 类型。 if (content != null) { parser.reset(); parser = Parser.createParser(content, "utf8"); StringBean sb = new StringBean(); sb.setCollapse(true); parser.visitAllNodesWith(sb); content = sb.getStrings(); // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}"; content = content.replaceAll("\\\".*[a-z].*\\}", ""); content = content.replace("[我来说两句]", ""); } else { System.out.println("没有得到新闻内容!"); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } return content; }
/** * 方法:获取对应的页面内容 * * @param htmlPageContent * @param preUrl * @throws ParserException * <p>Add By Ethan Lam At 2011-11-23 */ public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException { Parser parser = new Parser(); parser.setInputHTML(htmlPageContent); NodeFilter filter = new AndFilter( new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon")); NodeList nodeList = parser.parse(filter); NodeIterator it = nodeList.elements(); Div div = null; StringBuffer htmlContent = new StringBuffer(); while (it.hasMoreNodes()) { div = (Div) it.nextNode(); NodeList nl = div.getChildren(); if (nl == null) continue; NodeIterator sub = nl.elements(); while (sub.hasMoreNodes()) { Node t = sub.nextNode(); if (t instanceof ParagraphTag) { // LoggerUtil.info("fetchHtmlContent:",((ParagraphTag) t).getStringText()); htmlContent.append(((ParagraphTag) t).getStringText()); } } } if ("".equals(htmlContent.toString().trim())) return; Page page = new Page(); page.setUrl(preUrl); page.setSegment(htmlContent.toString()); LoggerUtil.info(preUrl + "获取到的页面内容:", htmlContent.toString()); pageSer.save(page); }
// 获取页面指定内容的Link public static List getLinksByConditions(String result, String coditions, String codeKind) { List links = null; Parser parser; NodeList nodelist; // 页面编码配置 To do by shengf parser = Parser.createParser(result, codeKind); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); try { links = new ArrayList(); nodelist = parser.parse(linkFilter); Node[] nodes = nodelist.toNodeArray(); int count = 1; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; if (link.toHtml().indexOf(coditions) != -1) { links.add(link); count++; if (count > CatchNum) { return links; } } } } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return links; }
/** Parses the examiner's HTML and returns every node (at any depth) matching the filter. */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
  Parser parser = new Parser(new Lexer(new Page(examiner.html())));
  NodeList allNodes = parser.parse(null); // no filter: full node tree
  return allNodes.extractAllNodesThatMatch(filter, true);
}
/**
 * Extracts the ASP.NET __VIEWSTATE hidden-input value from the page and caches it.
 *
 * @param html raw HTML of the page (GB2312 encoded)
 * @throws IllegalStateException if the page contains no __VIEWSTATE input
 *     (previously this failed with an opaque NPE/index error)
 */
public static void setViewState(String html) throws Exception {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter filter =
      new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));
  NodeList nodes = parser.parse(filter);
  if (nodes.size() == 0) {
    throw new IllegalStateException("__VIEWSTATE input not found in page");
  }
  InputTag node = (InputTag) nodes.elementAt(0);
  viewState = node.getAttribute("value");
}
/**
 * Extracts the ASP.NET __EVENTVALIDATION hidden-input value from the page and caches it.
 *
 * @param html raw HTML of the page (GB2312 encoded)
 * @throws ParserException if the HTML cannot be parsed
 * @throws IllegalStateException if the page contains no __EVENTVALIDATION input
 *     (unchecked, so the declared signature is unchanged)
 */
public static void setEventValidation(String html) throws ParserException {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter filter =
      new AndFilter(
          new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
  NodeList nodes = parser.parse(filter);
  if (nodes.size() == 0) {
    throw new IllegalStateException("__EVENTVALIDATION input not found in page");
  }
  InputTag node = (InputTag) nodes.elementAt(0);
  eventValidation = node.getAttribute("value");
}
/**
 * Returns the news publication date text.
 *
 * @param dateFilter filter selecting the date node(s)
 * @param parser parser positioned at the news page
 * @return text of the last matching node, or null when nothing matched or parsing failed
 */
public String getNewsDate(NodeFilter dateFilter, Parser parser) {
  String newsDate = null;
  try {
    Node[] matches = parser.parse(dateFilter).toNodeArray();
    for (Node match : matches) {
      newsDate = ((Div) match).getStringText(); // last match wins, as before
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return newsDate;
}
/**
 * Returns the news editor (author) text.
 *
 * @param newsauthorFilter filter selecting the author node(s)
 * @param parser parser positioned at the news page
 * @return text of the last matching node, or "" when nothing matched or parsing failed
 */
public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
  String newsAuthor = "";
  try {
    Node[] matches = parser.parse(newsauthorFilter).toNodeArray();
    for (Node match : matches) {
      newsAuthor = ((Div) match).getStringText(); // last match wins, as before
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return newsAuthor;
}
private String[] processBlog(InputStream in) throws BlogCrawlingException { // using a set here to avoid duplicates Set<String> linksToBlogs = new TreeSet<String>(); try { Page page = new Page(in, null); Parser parser = new Parser(new Lexer(page)); // register a filter to extract all the anchor tags TagNameFilter anchorTagsFilter = new TagNameFilter("a"); StringBuffer buf = new StringBuffer(); NodeList anchorTagsList = parser.parse(anchorTagsFilter); for (int i = 0; i < anchorTagsList.size(); i++) { Node node = anchorTagsList.elementAt(i); LinkTag tag = (LinkTag) node; String linkURL = tag.getLink(); if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) { // logger.info(" *BLOG Detected* ==> " + linkURL); System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL); linksToBlogs.add(linkURL); } else { System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL); } } String[] links = new String[linksToBlogs.size()]; int count = 0; for (String linksToBlog : linksToBlogs) { links[count++] = linksToBlog; } return links; } catch (ParserException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (UnsupportedEncodingException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (IOException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } }
/**
 * Returns the news title text.
 *
 * @param titleFilter filter selecting the heading node(s)
 * @param parser parser positioned at the news page
 * @return text of the last matching heading, or "" when nothing matched or parsing failed
 */
public String getTitle(NodeFilter titleFilter, Parser parser) {
  String titleName = "";
  try {
    Node[] headings = parser.parse(titleFilter).toNodeArray();
    for (Node heading : headings) {
      titleName = ((HeadingTag) heading).getStringText(); // last match wins, as before
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return titleName;
}
/**
 * Collects the "standardid" value from every data row of the standards table on
 * one result page into {@code map}; rows whose detail link is missing or has no
 * href are recorded in {@code error} for later retry.
 *
 * @param pageNo page number, used as key into {@code map} and {@code error}
 * @param html raw HTML of the result page (GB2312 encoded)
 */
private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
  Parser parser = Parser.createParser(html, "gb2312");
  // Target table: <table id="ctl00_ContentPlaceHolder1_StandardView">.
  AndFilter viewStateFilter =
      new AndFilter(
          new TagNameFilter("table"),
          new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));
  NodeList nodes = parser.parse(viewStateFilter);
  TableTag node = (TableTag) nodes.elementAt(0);
  TableRow[] rows = node.getRows();
  // Row 0 is the header; data rows start at index 1.
  for (int i = 1; i < rows.length; i++) {
    TableColumn[] cols = rows[i].getColumns();
    // Column 3 is expected to hold the detail link, nested as <td><div><a href=...>.
    // NOTE(review): the fixed child indices (1, then 2) assume a stable page layout — confirm.
    TableColumn col = cols[3];
    LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);
    if (tag == null) {
      // Link cell empty: remember the row index so this page/row can be retried.
      List<Integer> lst = error.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<Integer>();
      }
      lst.add(i);
      error.put(pageNo, lst);
      continue;
    }
    String href = tag.getAttribute("href");
    if (href == null) {
      // Anchor without an href: also treated as an error row.
      List<Integer> lst = error.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<Integer>();
      }
      lst.add(i);
      error.put(pageNo, lst);
      continue;
    }
    // Pull the standardid query parameter out of the href.
    // NOTE(review): assumes "standardid=" precedes the first "&" in the URL — confirm,
    // otherwise substring(start, end) would throw or capture the wrong span.
    int start = href.indexOf("standardid=");
    int end = href.indexOf("&");
    String standardId = href.substring(start, end).replaceAll("standardid=", "");
    List<String> lst = map.get(pageNo);
    if (lst == null) {
      lst = new ArrayList<String>();
    }
    lst.add(standardId);
    map.put(pageNo, lst);
  }
}
/** * @param <T> 标签类型 * @param html 需要解析的文本html * @param tagType 标签类型 class * @param attr 该标签应该有的树形 * @param value 属性的值 (Ϊnull ��Ϊ��ƥ��) * @return */ public static <T extends TagNode> List<T> parseTags( String html, final Class<T> tagType, final String attr, final String value, final boolean test) { Parser parser = new Parser(); try { PrototypicalNodeFactory factory = new PrototypicalNodeFactory(); factory.registerTag(new PreTag()); parser.setNodeFactory(factory); parser.setInputHTML(html); NodeList tagList = parser.parse( new NodeFilter() { @Override public boolean accept(Node node) { if (test) logger.info(node.getClass()); if (node.getClass() == tagType) { if (attr == null) return true; T tn = (T) node; String attrv = tn.getAttribute(attr); if (value == null && attrv != null) { // || attrv.equals(value) return true; } if (test) logger.info(attrv); if (value != null && attrv != null && attrv.equals(value)) return true; } return false; } }); List<T> tags = new ArrayList<T>(); for (int i = 0; i < tagList.size(); i++) { tags.add((T) tagList.elementAt(i)); } return tags; } catch (ParserException e) { e.printStackTrace(); } return null; }
/** Test a better method of modifying an HTML page. */ public void testPageModification() throws Exception { Parser parser = Parser.createParser(HTML_WITH_LINK, null); NodeList list = parser.parse(null); // no filter // make an inner class that does the same thing as the UrlModifyingVisitor NodeVisitor visitor = new NodeVisitor() { String linkPrefix = "localhost://"; public void visitTag(Tag tag) { if (tag instanceof LinkTag) ((LinkTag) tag).setLink(linkPrefix + ((LinkTag) tag).getLink()); else if (tag instanceof ImageTag) ((ImageTag) tag).setImageURL(linkPrefix + ((ImageTag) tag).getImageURL()); } }; list.visitAllNodesWith(visitor); String result = list.toHtml(); assertStringEquals("Expected HTML", MODIFIED_HTML, result); }
@Override public void crawl(Parser parser) throws ParserException { List<LCOdds> data = new ArrayList<LCOdds>(); NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT)); for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) { NodeList cells = it.nextNode().getChildren(); cells.keepAllNodesThatMatch(tdFilter); LCOdds lc = parseRow(cells); if (null != lc) { data.add(lc); } } // persist if (data.size() < 1) { log.warn(" -- [ 06_LC_2 ] data is empty !"); } storeData("lc_odds", data); }
/**
 * Extracts two text columns from each of the first ten data rows of the sixth
 * {@code <table>} on the page and appends them to {@code detailMap} under the key.
 *
 * @param key map key the extracted texts are accumulated under
 * @param text raw HTML of the detail page (GB2312 encoded)
 */
private static void addDetailToMap(String key, String text) throws Exception {
  Parser parser = Parser.createParser(text, "gb2312");
  TagNameFilter tableFiler = new TagNameFilter("table");
  NodeList nodes = parser.parse(tableFiler);
  // NOTE(review): hard-coded table index 5 and the fixed rows 1..10 window below
  // assume a stable page layout — confirm against the source site.
  TableTag node = (TableTag) nodes.elementAt(5);
  TableRow[] rows = node.getRows();
  for (int i = 1; i < 11; i++) {
    TableColumn[] cols = rows[i].getColumns();
    StringBuffer txt1 = new StringBuffer();
    StringBuffer txt2 = new StringBuffer();
    // Columns 1 and 3 each wrap their text in a child node at index 1;
    // only the text nodes are kept, joined with single spaces.
    NodeList span1 = cols[1].getChildren().elementAt(1).getChildren();
    for (int j = 0; j < span1.size(); j++) {
      if (span1.elementAt(j) instanceof TextNode) {
        txt1.append(span1.elementAt(j).getText()).append(" ");
      }
    }
    NodeList span2 = cols[3].getChildren().elementAt(1).getChildren();
    for (int j = 0; j < span2.size(); j++) {
      if (span2.elementAt(j) instanceof TextNode) {
        txt2.append(span2.elementAt(j).getText()).append(" ");
      }
    }
    // Accumulate both column texts (trimmed) under the key, creating the list lazily.
    List<String> lst = detailMap.get(key);
    if (lst == null) {
      lst = new ArrayList<String>();
    }
    lst.add(txt1.toString().trim());
    lst.add(txt2.toString().trim());
    detailMap.put(key, lst);
  }
}
// 土地交易单独处理 public static List getLinksByConditions2(String result, String coditions, String codeKind) { List links = null; Parser parser; NodeList nodelist; parser = Parser.createParser(result, codeKind); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); try { links = new ArrayList(); nodelist = parser.parse(linkFilter); Node[] nodes = nodelist.toNodeArray(); int count = 1; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; if ((link.toHtml().indexOf(coditions) != -1) && (link.getChildrenHTML().indexOf("查看") == -1)) { // System.out.println(link.toHtml()); // System.out.println(link.getLink()); // System.out.println("test:" + link.getChildrenHTML()); // Node nextNode = link.getParent().getNextSibling(); // System.out.println(nextNode.getChildren().toHtml().replaceAll("/r/n","").trim()); // nextNode = // nextNode.getNextSibling().getNextSibling(); // System.out.println(nextNode.getChildren().toHtml().replaceAll("/r/n","").trim()); links.add(link); count++; if (count > CatchNum) { return links; } } } } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return links; }
/** * 提取具有某个属性值的标签列表 * * @param html 被提取的html文本 * @param tagType 标签的类型 * @param attributeName 某个属性的名称 * @param attributeValue 属性应取的值 * @return */ public static <T extends TagNode> List<T> parseTags( String html, final Class<T> tagType, final String attributeName, final String attributeValue) { try { // 创建一个html解释器 Parser parser = new Parser(); parser.setInputHTML(html); NodeList tagList = parser.parse( new NodeFilter() { @Override public boolean accept(Node node) { if (node.getClass() == tagType) { T tn = (T) node; String attrValue = tn.getAttribute(attributeName); if (attrValue != null && attrValue.equals(attributeValue)) { return true; } } return false; } }); List<T> tags = new ArrayList<T>(); for (int i = 0; i < tagList.size(); i++) { T t = (T) tagList.elementAt(i); tags.add(t); } return tags; } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; }
/**
 * Extracts anchor links from the current page's content, resolves them against
 * the page URL, and queues those the crawler's filter rules accept.
 */
public void extractLinks() throws Exception {
  logger.debug("Extracting links " + pageInfo.getUrl());
  String content = pageInfo.getContent();
  if (content == null || content.length() == 0) {
    return;
  }
  URI baseUri = new URI(pageInfo.getUrl());
  Parser parser = new Parser();
  parser.setInputHTML(content);
  NodeList anchors = parser.parse(new TagNameFilter("A"));
  logger.debug("get links from " + pageInfo.getUrl() + " size : " + anchors.size());
  for (int i = 0; i < anchors.size(); i++) {
    LinkTag anchor = (LinkTag) anchors.elementAt(i);
    String href = anchor.extractLink();
    // Skip mashed-together URLs containing "http" more than once.
    if (href.indexOf("http") != href.lastIndexOf("http")) {
      continue;
    }
    try {
      String link = baseUri.resolve(href).toString();
      if (link != null && link.length() > 0) {
        for (FilterRule rule : crawler.getFilterRules()) {
          CrawlAction action = rule.judge(link);
          if (action == CrawlAction.STORE
              || action == CrawlAction.FOLLOW
              || action == CrawlAction.FOLLOW_STORE) {
            logger.debug("linkUri : " + link + " -- ca : " + action.toString());
            pageInfo.getLinks().add(link);
          }
        }
      }
    } catch (Exception ignored) {
      // Best-effort: malformed hrefs are simply skipped.
    }
  }
}
/**
 * Parses the HTML (with PreTag registered) and returns every node in the document.
 *
 * @param html HTML text to parse
 * @return all parsed nodes, or null when parsing fails
 */
public static NodeList parseAllTags(String html) {
  Parser parser = new Parser();
  try {
    PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
    factory.registerTag(new PreTag());
    parser.setNodeFactory(factory);
    parser.setInputHTML(html);
    // Match-everything filter: collects every node.
    NodeFilter acceptAll =
        new NodeFilter() {
          @Override
          public boolean accept(Node node) {
            return true;
          }
        };
    return parser.parse(acceptAll);
  } catch (ParserException e) {
    e.printStackTrace();
    return null;
  }
}
public static <T extends TagNode> List<T> parseTags( Parser parser, final Class<T> tagType, final String attr, final String value) { // Parser parser = new Parser(); try { // parser.setInputHTML(html); NodeList tagList = parser.parse( new NodeFilter() { @Override public boolean accept(Node node) { if (tagType == null || node.getClass() == tagType) { T tn = (T) node; String attrv = tn.getAttribute(attr); if (node instanceof Div) logger.info(attrv); if (value == null && attrv != null) { // || attrv.equals(value) return true; } if (value != null && attrv != null && attrv.equals(value)) return true; } return false; } }); List<T> tags = new ArrayList<T>(); for (int i = 0; i < tagList.size(); i++) { tags.add((T) tagList.elementAt(i)); } return tags; } catch (ParserException e) { e.printStackTrace(); } return null; }
/**
 * Processes target anchor nodes: finds all {@code <a target="_blank">} links in the
 * page, and enqueues those that pass the filter handler into the crawl queue.
 *
 * @param htmlPageContent raw HTML of the page
 * @param preUrl URL the page was fetched from (recorded as the link's referrer)
 */
public void dealLinkNodes(String htmlPageContent, String preUrl) {
  try {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter filter =
        new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
    NodeList nodeList = parser.parse(filter);
    LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
    NodeIterator it = nodeList.elements();
    while (it.hasMoreNodes()) {
      Node node = it.nextNode();
      if (node instanceof LinkTag) {
        LinkTag link = (LinkTag) node;
        if (!filterHandler.isLinkTagFilter(link)) {
          LoggerUtil.debug("ParserHandler ", link.getLink(), link.getLinkText());
          CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
        }
      }
    }
  } catch (Exception e) {
    // BUG FIX: the exception was silently swallowed, hiding every crawl failure.
    // Record it through the logger this method already uses.
    LoggerUtil.info("ParserHandler", "dealLinkNodes failed: " + e);
  }
}