public void checkprice() throws Exception {
    // System.out.println("checking Aptamil url [" + page.url + "]");
    URL url = new URL(page.url);
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    urlConnection.setConnectTimeout(Constant.connect_timeout);
    urlConnection.connect();
    Parser parser = new Parser(urlConnection);
    parser.setEncoding(Constant.ENCODE);
    NodeClassFilter div_filter = new NodeClassFilter(Div.class);
    OrFilter filters = new OrFilter();
    filters.setPredicates(new NodeFilter[] {div_filter});
    NodeList list = parser.extractAllNodesThatMatch(filters);
    for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof Div) {
            Div d = (Div) tag;
            String divclass = d.getAttribute("class");
            if ("pl_addToBasket".equalsIgnoreCase(divclass)) {
                // return getName(d);
            }
        }
    }
}
/**
 * Extracts the relevant page content from the given HTML.
 *
 * @param htmlPageContent
 * @param preUrl
 * @throws ParserException
 *     <p>Add By Ethan Lam At 2011-11-23
 */
public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter filter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList nodeList = parser.parse(filter);
    NodeIterator it = nodeList.elements();
    Div div = null;
    StringBuffer htmlContent = new StringBuffer();
    while (it.hasMoreNodes()) {
        div = (Div) it.nextNode();
        NodeList nl = div.getChildren();
        if (nl == null) continue;
        NodeIterator sub = nl.elements();
        while (sub.hasMoreNodes()) {
            Node t = sub.nextNode();
            if (t instanceof ParagraphTag) {
                // LoggerUtil.info("fetchHtmlContent:", ((ParagraphTag) t).getStringText());
                htmlContent.append(((ParagraphTag) t).getStringText());
            }
        }
    }
    if ("".equals(htmlContent.toString().trim())) return;
    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(htmlContent.toString());
    LoggerUtil.info(preUrl + "获取到的页面内容:", htmlContent.toString());
    pageSer.save(page);
}
/**
 * Gets the news content.
 *
 * @param newsContentFilter
 * @param parser
 * @return content the news body text
 */
public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();
    try {
        NodeList newsContentList = parser.parse(newsContentFilter);
        for (int i = 0; i < newsContentList.size(); i++) {
            Div newsContenTag = (Div) newsContentList.elementAt(i);
            builder = builder.append(newsContenTag.getStringText());
        }
        content = builder.toString(); // convert to a String
        if (content != null) {
            parser.reset();
            parser = Parser.createParser(content, "utf8");
            StringBean sb = new StringBean();
            sb.setCollapse(true);
            parser.visitAllNodesWith(sb);
            content = sb.getStrings();
            // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}";
            content = content.replaceAll("\\\".*[a-z].*\\}", "");
            content = content.replace("[我来说两句]", "");
        } else {
            System.out.println("没有得到新闻内容!");
        }
    } catch (ParserException ex) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return content;
}
// Get the links on the page that match the given condition
public static List getLinksByConditions(String result, String coditions, String codeKind) {
    List links = null;
    Parser parser;
    NodeList nodelist;
    // page encoding configuration, to do by shengf
    parser = Parser.createParser(result, codeKind);
    NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
    try {
        links = new ArrayList();
        nodelist = parser.parse(linkFilter);
        Node[] nodes = nodelist.toNodeArray();
        int count = 1;
        for (int i = 0; i < nodes.length; i++) {
            Node node = nodes[i];
            if (node instanceof LinkTag) {
                LinkTag link = (LinkTag) node;
                if (link.toHtml().indexOf(coditions) != -1) {
                    links.add(link);
                    count++;
                    if (count > CatchNum) {
                        return links;
                    }
                }
            }
        }
    } catch (ParserException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return links;
}
public void testUrlModificationWithVisitor() throws Exception {
    Parser parser = Parser.createParser(HTML_WITH_LINK, null);
    UrlModifyingVisitor visitor = new UrlModifyingVisitor("localhost://");
    parser.visitAllNodesWith(visitor);
    String result = visitor.getModifiedResult();
    assertStringEquals("Expected HTML", MODIFIED_HTML, result);
}
// If there is a <title> element on the start page, use that as our AU name.
String recomputeRegName() {
    if (!isStarted()) {
        // This can get invoked (several times, mostly from logging) before
        // enough mechanism has started to make it possible to resolve the CuUrl below.
        return null;
    }
    try {
        CachedUrl cu = makeCachedUrl(m_registryUrl);
        if (cu == null) return null;
        URL cuUrl = CuUrl.fromCu(cu);
        Parser parser = new Parser(cuUrl.toString());
        NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
        Node nodes[] = nodelst.toNodeArray();
        recomputeRegName = false;
        if (nodes.length < 1) return null;
        // Get the first title found
        TitleTag tag = (TitleTag) nodes[0];
        if (tag == null) return null;
        return tag.getTitle();
    } catch (MalformedURLException e) {
        log.warning("recomputeRegName", e);
        return null;
    } catch (ParserException e) {
        if (e.getThrowable() instanceof FileNotFoundException) {
            log.warning("recomputeRegName: " + e.getThrowable().toString());
        } else {
            log.warning("recomputeRegName", e);
        }
        return null;
    }
}
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
    String html = examiner.html();
    Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList list = parser.parse(null);
    NodeList matches = list.extractAllNodesThatMatch(filter, true);
    return matches;
}
public static void dealOnePage(String url, int starNo) {
    try {
        Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection());
        NodeList tableSet =
            parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));
        parser = new Parser(new Lexer(tableSet.toHtml()));
        NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));
        parser = new Parser(new Lexer(tdSet.toHtml()));
        PrototypicalNodeFactory p = new PrototypicalNodeFactory();
        p.registerTag(new SpanTag());
        parser.setNodeFactory(p);
        NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span"));
        int index = 0;
        for (int i = 5; i < spanSet.size(); i = i + 5) {
            String str = spanSet.elementAt(i).toPlainTextString();
            String now = "" + (starNo * 100 + index);
            index++;
            while (str.compareTo(now) != 0) {
                System.out.println(now);
                now = "" + (starNo * 100 + index);
                index++;
            }
            // System.out.println(str);
        }
    } catch (ParserException e) {
        e.printStackTrace();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
@Test
public void test5() {
    try {
        Parser parser = new Parser();
        parser.setURL("www.zhihu.com");
    } catch (Exception e) {
    }
}
// Get the links on a page; cobweb is used to filter/accept the links
public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding(cobweb.getCharSet());
        // Filter for <frame> tags, used to extract the frame tag's src attribute
        NodeFilter frameFilter =
            new NodeFilter() {
                public boolean accept(Node node) {
                    if (node.getText().startsWith("frame src=")) {
                        return true;
                    } else {
                        return false;
                    }
                }
            };
        // OrFilter that matches both <a> tags and <frame> tags
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // Get all tags that pass the filter
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink(); // URL
                if (cobweb.accept(linkUrl)) {
                    links.add(
                        // java.net.URLEncoder.encode(linkUrl));
                        linkUrl
                            .replaceAll("\\?", "\\%3F") // escape
                            .replaceAll("\\&", "\\%26")
                            .replaceAll("\\|", "\\%124")
                            .replaceAll("\\#", ""));
                }
            } else { // <frame> tag
                // Extract the link in the frame's src attribute, e.g. <frame src="test.html"/>
                String frame = tag.getText();
                int start = frame.indexOf("src=");
                frame = frame.substring(start);
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                String frameUrl = frame.substring(5, end - 1);
                if (cobweb.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
/**
 * Gets the scrolling brand information.
 *
 * @param path
 * @param city
 * @param fileName
 * @return
 */
public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
    Map<String, String> brandMap = new LinkedHashMap<String, String>();
    try {
        StringBuilder filePath = new StringBuilder();
        filePath.append(PATH);
        filePath.append(city);
        filePath.append(INCLUDE);
        filePath.append(fileName);
        filePath.append(STUFF);
        // start parsing
        Parser parser = new Parser(filePath.toString());
        // filter out <div> tags
        NodeFilter divFilter = new NodeClassFilter(Div.class);
        NodeList classList = parser.extractAllNodesThatMatch(divFilter);
        NodeList hrefList = null;
        NodeList imgList = null;
        Node picNode = null;
        Node hrefNode = null;
        Node imgNode = null;
        String classStr = "";
        String hrefStr = "";
        String imgStr = "";
        String imgClass = "";
        for (int i = 0; i < classList.size(); i++) {
            picNode = classList.elementAt(i);
            classStr = ((Div) picNode).getAttribute("class");
            if ("business_list_pic".equalsIgnoreCase(classStr)) {
                hrefList = picNode.getChildren();
                for (int j = 0; j < hrefList.size(); j++) {
                    hrefNode = hrefList.elementAt(j);
                    if (hrefNode instanceof LinkTag) {
                        hrefStr = ((LinkTag) hrefNode).getLink();
                        // extract the brand id from the link
                        hrefStr = MParseBrand.getBrandId(hrefStr);
                        imgList = hrefNode.getChildren();
                        for (int k = 0; k < imgList.size(); k++) {
                            imgNode = imgList.elementAt(k);
                            if (imgNode instanceof ImageTag) {
                                imgClass = ((ImageTag) imgNode).getAttribute("class");
                                if (null != imgClass) {
                                    imgStr = ((ImageTag) imgNode).getAttribute("src");
                                    if (null == imgStr) {
                                        imgStr = ((ImageTag) imgNode).getAttribute("original");
                                    }
                                }
                            }
                        }
                        brandMap.put(hrefStr, imgStr);
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return brandMap;
}
public static void setEventValidation(String html) throws ParserException {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(
            new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);
    eventValidation = node.getAttribute("value");
}
public static void setViewState(String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));
    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);
    viewState = node.getAttribute("value");
}
/** @throws ParserException */
private void parseHtml() throws ParserException {
    htmlTags = new ArrayList();
    Parser parser = new Parser();
    parser.setInputHTML(fDocument.get());
    for (NodeIterator e = parser.elements(); e.hasMoreNodes(); ) {
        Node node = e.nextNode();
        VHtmlNodeVisitor htmlNodeVisitor = new VHtmlNodeVisitor();
        node.accept(htmlNodeVisitor);
    }
}
public static List<String> getLinks(String url) throws ParserException {
    Parser htmlParser = new Parser(url);
    List<String> links = new LinkedList<String>();
    NodeList tagNodeList = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    for (int m = 0; m < tagNodeList.size(); m++) {
        LinkTag loopLinks = (LinkTag) tagNodeList.elementAt(m);
        String linkName = loopLinks.getLink();
        links.add(linkName);
    }
    return links;
}
/** Test regular expression matching: */
public void testRegularExpression() throws Exception {
    String target =
        "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";
    String html =
        "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";
    Lexer lexer;
    Parser parser;
    RegexFilter filter;
    NodeIterator iterator;
    int count;

    lexer = new Lexer(html);
    parser = new Parser(lexer);
    filter =
        new RegexFilter(
            "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
    count = 0;
    for (iterator = parser.extractAllNodesThatMatch(filter).elements(); iterator.hasMoreNodes(); ) {
        assertEquals("text wrong", target, iterator.nextNode().toHtml());
        count++;
    }
    assertEquals("wrong count", 1, count);
}
// Get the links on a page; filter is used to filter/accept the links
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        // parser.setEncoding("utf8");
        // Filter for <frame> tags, used to extract the link held in the frame tag's src attribute
        NodeFilter frameFilter =
            new NodeFilter() {
                /** */
                private static final long serialVersionUID = 1L;

                public boolean accept(Node node) {
                    if (node.getText().startsWith("iframe") && node.getText().contains("src=")) {
                        return true;
                    } else {
                        return false;
                    }
                }
            };
        // OrFilter that matches both <a> tags and <frame> tags
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // Get all tags that pass the filter
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink(); // the URL may appear in src, href, etc.
                if (filter.accept(linkUrl)) links.add(linkUrl);
            } else { // <frame> tag
                // Extract the link in the frame's src attribute, e.g. <frame src="test.html"/>
                String frame = tag.getText();
                int start = frame.indexOf("src=\"");
                frame = frame.substring(start);
                int end = frame.indexOf("\">");
                if (end == -1) {
                    end = frame.indexOf("?");
                }
                String frameUrl = frame.substring(5, end - 1);
                if (filter.accept(frameUrl)) links.add(frameUrl);
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
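// A minimal usage sketch for extracLinks above (not part of the original snippet). It assumes
// LinkFilter is the one-method callback used there, boolean accept(String url); the host name
// below is illustrative only.
public static void printSameHostLinks() {
    Set<String> links =
        extracLinks(
            "http://www.example.com/",
            new LinkFilter() {
                public boolean accept(String url) {
                    // keep only links that stay on the same host
                    return url.startsWith("http://www.example.com");
                }
            });
    for (String link : links) {
        System.out.println(link);
    }
}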
private String[] processBlog(InputStream in) throws BlogCrawlingException {
    // using a set here to avoid duplicates
    Set<String> linksToBlogs = new TreeSet<String>();
    try {
        Page page = new Page(in, null);
        Parser parser = new Parser(new Lexer(page));
        // register a filter to extract all the anchor tags
        TagNameFilter anchorTagsFilter = new TagNameFilter("a");
        StringBuffer buf = new StringBuffer();
        NodeList anchorTagsList = parser.parse(anchorTagsFilter);
        for (int i = 0; i < anchorTagsList.size(); i++) {
            Node node = anchorTagsList.elementAt(i);
            LinkTag tag = (LinkTag) node;
            String linkURL = tag.getLink();
            if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) {
                // logger.info(" *BLOG Detected* ==> " + linkURL);
                System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL);
                linksToBlogs.add(linkURL);
            } else {
                System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL);
            }
        }
        String[] links = new String[linksToBlogs.size()];
        int count = 0;
        for (String linksToBlog : linksToBlogs) {
            links[count++] = linksToBlog;
        }
        return links;
    } catch (ParserException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    } catch (IOException e) {
        e.printStackTrace();
        throw new BlogCrawlingException(e);
    }
}
/**
 * Assign the underlying node filter for this wrapper.
 *
 * @param filter The filter to wrap.
 * @param context The parser to use for conditioning this filter. Some filters need contextual
 *     information to provide to the user, i.e. for tag names or attribute names or values, so
 *     the Parser context is provided.
 */
public void setNodeFilter(NodeFilter filter, Parser context) {
    Set set;

    mFilter = (TagNameFilter) filter;
    set = new HashSet();
    context.reset();
    try {
        for (NodeIterator iterator = context.elements(); iterator.hasMoreNodes(); )
            addName(set, iterator.nextNode());
    } catch (ParserException pe) {
        // oh well, we tried
    }
    for (Iterator iterator = set.iterator(); iterator.hasNext(); )
        mName.addItem(iterator.next());
    mName.setSelectedItem(mFilter.getName());
}
private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter viewStateFilter =
        new AndFilter(
            new TagNameFilter("table"),
            new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));
    NodeList nodes = parser.parse(viewStateFilter);
    TableTag node = (TableTag) nodes.elementAt(0);
    TableRow[] rows = node.getRows();
    for (int i = 1; i < rows.length; i++) {
        TableColumn[] cols = rows[i].getColumns();
        TableColumn col = cols[3];
        LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);
        if (tag == null) {
            List<Integer> lst = error.get(pageNo);
            if (lst == null) {
                lst = new ArrayList<Integer>();
            }
            lst.add(i);
            error.put(pageNo, lst);
            continue;
        }
        String href = tag.getAttribute("href");
        if (href == null) {
            List<Integer> lst = error.get(pageNo);
            if (lst == null) {
                lst = new ArrayList<Integer>();
            }
            lst.add(i);
            error.put(pageNo, lst);
            continue;
        }
        int start = href.indexOf("standardid=");
        int end = href.indexOf("&");
        String standardId = href.substring(start, end).replaceAll("standardid=", "");
        List<String> lst = map.get(pageNo);
        if (lst == null) {
            lst = new ArrayList<String>();
        }
        lst.add(standardId);
        map.put(pageNo, lst);
    }
}
/**
 * Parses the news URL, extracts the news, and inserts it into the database.
 *
 * @param url the news link
 */
public void parser(String url) {
    try {
        parser = new Parser(url);
        // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"),
        //     new HasAttributeFilter("class", "TRS_PreAppend"));
        // parser.reset(); // remember to reset the parser after each use, otherwise you won't get the content you want
        NodeFilter innerFilter =
            new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
        NodeFilter xk = new HasParentFilter(innerFilter);
        NodeList nodes = parser.extractAllNodesThatMatch(xk);
        System.out.println(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            Node time = nodes.elementAt(i);
            // System.out.println(time.toPlainTextString().trim().replace(" ",
            //     "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", ""));
            System.out.println(
                replaceBlank(
                    time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", "")));
        }
    } catch (ParserException ex) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
}
public static void setInnerHTML(Element root, String html) {
    // remove the old root children
    OverrideNodeList<Node> list = (OverrideNodeList<Node>) root.getChildNodes();
    list.getList().clear();
    if (html != null) {
        Parser parser = Parser.createParser(html, "UTF-8");
        try {
            parser.visitAllNodesWith(new GwtNodeVisitor(root));
        } catch (ParserException e) {
            throw new RuntimeException(
                "error while parsing <" + root.getTagName() + "> element's innerHTML : " + html, e);
        }
    }
}
/**
 * Parses the news URL and collects its data.
 *
 * @param url the news link
 */
public void parser(String url) {
    String title = ""; // news title
    String source = ""; // news source
    String sourceTime = ""; // news source time
    // String author = ""; // news author
    String Content = ""; // news body
    // String collectTime = ""; // collection time (system time)
    try {
        parser = new Parser(url);
        parser.setEncoding("GB2312");
        // title
        NodeFilter titleFilter = new TagNameFilter("h1");
        NodeList titleNodeList = parser.parse(titleFilter);
        title = parserUtil.getNodeListText(titleNodeList);
        parser.reset(); // must reset after every extraction, otherwise later parses return no data
        System.out.println(title);
        // source
        NodeFilter sourceFilter =
            new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
        NodeList sourceNodeList = parser.parse(sourceFilter);
        source = parserUtil.getNodeListText(sourceNodeList);
        parser.reset();
        System.out.println(source);
        // source time
        NodeFilter sourceTimeFilter =
            new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
        NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter);
        String str = parserUtil.getNodeListText(sourceTimeNodeList);
        sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", "");
        parser.reset();
        System.out.println(sourceTime);
        // body
        NodeFilter ContentTimeFilter =
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
        NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter);
        NodeList childList = ContentTimeNodeList.elementAt(0).getChildren();
        childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // drop the non-body parts
        // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题"));
        Content = parserUtil.getNodeListHTML(ContentTimeNodeList);
        // Content = ParserUtil.getPlainText(Content);
        System.out.println(Content);
        parser.reset();
    } catch (ParserException e) {
        e.printStackTrace();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
public boolean checkprice() {
    System.out.println("checking amazon url:" + page.url);
    try {
        URL url = new URL(page.url);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        urlConnection.setConnectTimeout(Constant.connect_timeout);
        Parser parser = new Parser(urlConnection);
        parser.setEncoding(Constant.ENCODE);
        // OrFilter lastFilter = new OrFilter();
        // lastFilter.setPredicates(new NodeFilter[] {
        //     new NodeClassFilter(TableTag.class),
        //     new NodeClassFilter(Div.class) });
        //
        // NodeList list = parser.extractAllNodesThatMatch(lastFilter);
        NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
        System.out.println("size:" + list.size());
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof Div) {
                Div d = (Div) tag;
                String id = d.getAttribute("id");
                System.out.println(id);
                // guard against divs that have no id attribute
                if (id != null && id.startsWith("result_")) {
                    // found one product
                    try {
                        AmazonProduct product = new AmazonProduct();
                        product.name = d.getAttribute("name");
                        getPriceAndLabel(d, product);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
    return false;
}
public static void main(String[] args) throws Exception {
    RequestConfig requestConfig =
        RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
    CloseableHttpClient httpclient =
        HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    int count = 1;
    for (int i = 0; i <= 16; i++) {
        int index = i;
        // System.out.println(index);
        HttpGet httpGet = new HttpGet(url3 + index + url4);
        HttpResponse response = httpclient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        String htmls = null;
        if (entity != null) {
            htmls = EntityUtils.toString(entity).replace("\t", " ");
        }
        Parser parser = Parser.createParser(htmls, "utf-8");
        AndFilter dFilter =
            new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content"));
        NodeList nodes3 = parser.extractAllNodesThatMatch(dFilter);
        for (int k = 0; k < nodes3.size(); k++) {
            htmls = nodes3.elementAt(k).toHtml();
            parser = Parser.createParser(htmls, "utf-8");
            AndFilter ProfessionNameFilter =
                new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
            NodeList nodes4 = parser.extractAllNodesThatMatch(ProfessionNameFilter);
            for (int j = 0; j < nodes4.size(); j++) {
                LinkTag link = (LinkTag) nodes4.elementAt(j);
                // if(link.getAttribute("href").contains("http://www.ulster.ac.uk/")) {
                // .replaceAll("<span[\\s\\S]*/span>","")
                String temp = link.toHtml();
                System.out.println(
                    "{\"" + count + "\",\"http://www.chi.ac.uk/" + link.getAttribute("href")
                        + "\",\"" + html2Str(temp).replace("\r\n", "").trim() + "\",\"0\"},");
                count++;
            }
        }
    }
    // System.out.println("DONE.");
}
/**
 * Gets article links from an HTML fragment.
 *
 * @param content
 * @param pre
 * @throws Exception
 */
void docByHTML(String content, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setInputHTML(content);
    parser.setEncoding("GB2312");
    NodeFilter fileter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(fileter);
    if (list != null && list.size() > 0) {
        Parser p1 = new Parser();
        p1.setInputHTML(list.toHtml());
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
        if (linkList != null && linkList.size() > 0) {
            for (int i = 0; i < linkList.size(); i++) {
                LinkTag link = (LinkTag) linkList.elementAt(i);
                LinkBean bean = null;
                if (link.getLink().toLowerCase().startsWith(pre)
                    && !link.getLinkText().equalsIgnoreCase("详细内容")) {
                    if (null == articleDocCache.get(getKey(link.getLink()))) {
                        bean = new LinkBean();
                        bean.setLink(link.getLink());
                        bean.setName(link.getLinkText());
                        LINKHASH.put(link.getLink(), bean);
                    } else {
                        logger.info(">> 已存在 [" + link.getLink() + "] 地址");
                    }
                }
            }
        }
    }
}
/**
 * Gets article links from a URL.
 *
 * @param url
 * @param pre
 * @throws Exception
 */
void doc(String url, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");
    NodeFilter fileter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(fileter);
    if (list != null && list.size() > 0) {
        Parser p1 = new Parser();
        p1.setInputHTML(list.toHtml());
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
        if (linkList != null && linkList.size() > 0) {
            for (int i = 0; i < linkList.size(); i++) {
                LinkTag link = (LinkTag) linkList.elementAt(i);
                LinkBean bean = null;
                if (link.getLink().toLowerCase().startsWith(pre)
                    && !link.getLinkText().equalsIgnoreCase("详细内容")) {
                    bean = new LinkBean();
                    bean.setLink(link.getLink());
                    bean.setName(link.getLinkText());
                    LINKHASH.put(link.getLink(), bean);
                }
            }
        }
    }
}
/**
 * @param <T> the tag type
 * @param html the HTML text to parse
 * @param tagType the tag type class
 * @param attr the attribute the tag should have
 * @param value the attribute value (if null, any tag that has the attribute matches)
 * @param test if true, log the class and attribute of inspected nodes
 * @return
 */
public static <T extends TagNode> List<T> parseTags(
    String html, final Class<T> tagType, final String attr, final String value, final boolean test) {
    Parser parser = new Parser();
    try {
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new PreTag());
        parser.setNodeFactory(factory);
        parser.setInputHTML(html);
        NodeList tagList =
            parser.parse(
                new NodeFilter() {
                    @Override
                    public boolean accept(Node node) {
                        if (test) logger.info(node.getClass());
                        if (node.getClass() == tagType) {
                            if (attr == null) return true;
                            T tn = (T) node;
                            String attrv = tn.getAttribute(attr);
                            if (value == null && attrv != null) { // || attrv.equals(value)
                                return true;
                            }
                            if (test) logger.info(attrv);
                            if (value != null && attrv != null && attrv.equals(value)) return true;
                        }
                        return false;
                    }
                });
        List<T> tags = new ArrayList<T>();
        for (int i = 0; i < tagList.size(); i++) {
            tags.add((T) tagList.elementAt(i));
        }
        return tags;
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return null;
}
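// A minimal usage sketch for parseTags above (not part of the original snippet): it pulls every
// <div class="content"> out of a fragment. Assumes parseTags is visible from the call site; the
// HTML string and class name are illustrative only.
public static void printContentDivs() {
    String html = "<div class=\"content\">hello</div><div class=\"other\">skip</div>";
    List<Div> divs = parseTags(html, Div.class, "class", "content", false);
    for (Div d : divs) {
        System.out.println(d.getStringText());
    }
}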
/** Test a better method of modifying an HTML page. */
public void testPageModification() throws Exception {
    Parser parser = Parser.createParser(HTML_WITH_LINK, null);
    NodeList list = parser.parse(null); // no filter
    // make an inner class that does the same thing as the UrlModifyingVisitor
    NodeVisitor visitor =
        new NodeVisitor() {
            String linkPrefix = "localhost://";

            public void visitTag(Tag tag) {
                if (tag instanceof LinkTag)
                    ((LinkTag) tag).setLink(linkPrefix + ((LinkTag) tag).getLink());
                else if (tag instanceof ImageTag)
                    ((ImageTag) tag).setImageURL(linkPrefix + ((ImageTag) tag).getImageURL());
            }
        };
    list.visitAllNodesWith(visitor);
    String result = list.toHtml();
    assertStringEquals("Expected HTML", MODIFIED_HTML, result);
}
public void testSelectors() throws Exception {
    String html =
        "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>>moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";
    Lexer l;
    Parser p;
    CssSelectorNodeFilter it;
    NodeIterator i;
    int count;

    l = new Lexer(html);
    p = new Parser(l);
    it = new CssSelectorNodeFilter("li + li");
    count = 0;
    for (i = p.extractAllNodesThatMatch(it).elements(); i.hasMoreNodes(); ) {
        assertEquals("tag name wrong", "LI", ((Tag) i.nextNode()).getTagName());
        count++;
    }
    assertEquals("wrong count", 2, count);
}