/**
 * Parses the examiner's HTML and returns every node accepted by the given filter,
 * including matches nested inside other matches.
 *
 * @param filter the node filter to apply recursively
 * @return all matching nodes
 * @throws Exception if lexing or parsing fails
 */
private NodeList getMatchingTags(NodeFilter filter) throws Exception {
  Parser htmlParser = new Parser(new Lexer(new Page(examiner.html())));
  NodeList allNodes = htmlParser.parse(null);
  return allNodes.extractAllNodesThatMatch(filter, true);
}
/** * 对新闻URL进行解析提取新闻,同时将新闻插入到数据库中。 * * @param url 新闻连接。 */ public void parser(String url) { try { parser = new Parser(url); // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"), new // HasAttributeFilter("class", "TRS_PreAppend")); // parser.reset(); //记得每次用完parser后,要重置一次parser。要不然就得不到我们想要的内容了。 NodeFilter innerFilter = new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal")); NodeFilter xk = new HasParentFilter(innerFilter); NodeList nodes = parser.extractAllNodesThatMatch(xk); System.out.println(nodes.size()); for (int i = 0; i < nodes.size(); i++) { Node time = nodes.elementAt(i); // System.out.println(time.toPlainTextString().trim().replace(" ", // "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", "")); System.out.println( replaceBlank(time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", ""))); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } }
/**
 * Recursively drills into nodeP collecting its content nodes: LinkTags and
 * non-empty TextNodes, in document order. Script/style/select subtrees are
 * skipped entirely.
 *
 * <p>Side effects: increments tableNumber and, unless type is "1", rewrites
 * each collected link's href to route through the configured search base URL
 * with an encrypted original target. NOTE(review): the TableContext assembled
 * below is populated but never returned or stored — confirm whether that is
 * intentional.
 *
 * @param nodeP root node to drill into
 * @param type "1" leaves hrefs untouched; any other value rewrites them
 * @return the collected nodes, or null when nothing was found or on error
 */
@SuppressWarnings("unchecked")
protected List<Node> extractHtml(Node nodeP, String type) throws Exception {
  NodeList nodeList = nodeP.getChildren();
  if ((nodeList == null) || (nodeList.size() == 0)) {
    return null;
  }
  ArrayList tableList = new ArrayList();
  try {
    for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) {
      Node node = (Node) e.nextNode();
      if (node instanceof LinkTag) {
        tableList.add(node);
      } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) {
        // Deliberately skip non-content subtrees.
      } else if (node instanceof TextNode) {
        if (node.getText().length() > 0) {
          tableList.add(node);
        }
      } else {
        // Recurse into any other composite node and splice its results in.
        List tempList = extractHtml(node, type);
        if ((tempList != null) && (tempList.size() > 0)) {
          Iterator ti = tempList.iterator();
          while (ti.hasNext()) {
            tableList.add(ti.next());
          }
        }
      }
    }
  } catch (Exception e) {
    // Best-effort: any failure while walking children yields "no content".
    return null;
  }
  if ((tableList != null) && (tableList.size() > 0)) {
    TableContext tc = new TableContext();
    tc.setLinkList(new ArrayList());
    tc.setTextBuffer(new StringBuffer());
    tableNumber++;
    tc.setTableRow(tableNumber);
    Iterator ti = tableList.iterator();
    // Configured search URL used as the prefix for rewritten links.
    String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL);
    while (ti.hasNext()) {
      Node node = (Node) ti.next();
      if (node instanceof LinkTag) {
        LinkTag linkTag = (LinkTag) node;
        if (!"1".equalsIgnoreCase(type)) {
          linkTag.setAttribute(
              "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href")));
        }
        tc.getLinkList().add(linkTag);
      } else {
        tc.getTextBuffer().append(node.getText());
      }
    }
    return tableList;
  }
  return null;
}
// If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (seveeral times, mostly from logging) before // enough mechanism has started to make it possible to resolve the CuUrl // below. return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } }
/** * 方法:获取对应的页面内容 * * @param htmlPageContent * @param preUrl * @throws ParserException * <p>Add By Ethan Lam At 2011-11-23 */ public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException { Parser parser = new Parser(); parser.setInputHTML(htmlPageContent); NodeFilter filter = new AndFilter( new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon")); NodeList nodeList = parser.parse(filter); NodeIterator it = nodeList.elements(); Div div = null; StringBuffer htmlContent = new StringBuffer(); while (it.hasMoreNodes()) { div = (Div) it.nextNode(); NodeList nl = div.getChildren(); if (nl == null) continue; NodeIterator sub = nl.elements(); while (sub.hasMoreNodes()) { Node t = sub.nextNode(); if (t instanceof ParagraphTag) { // LoggerUtil.info("fetchHtmlContent:",((ParagraphTag) t).getStringText()); htmlContent.append(((ParagraphTag) t).getStringText()); } } } if ("".equals(htmlContent.toString().trim())) return; Page page = new Page(); page.setUrl(preUrl); page.setSegment(htmlContent.toString()); LoggerUtil.info(preUrl + "获取到的页面内容:", htmlContent.toString()); pageSer.save(page); }
// 获取页面指定内容的Link public static List getLinksByConditions(String result, String coditions, String codeKind) { List links = null; Parser parser; NodeList nodelist; // 页面编码配置 To do by shengf parser = Parser.createParser(result, codeKind); NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); try { links = new ArrayList(); nodelist = parser.parse(linkFilter); Node[] nodes = nodelist.toNodeArray(); int count = 1; for (int i = 0; i < nodes.length; i++) { Node node = nodes[i]; if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; if (link.toHtml().indexOf(coditions) != -1) { links.add(link); count++; if (count > CatchNum) { return links; } } } } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return links; }
public void checkprice() throws Exception { // System.out.println("checking Aptamil url [" + page.url + "]"); URL url = new URL(page.url); HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); urlConnection.setConnectTimeout(Constant.connect_timeout); urlConnection.connect(); Parser parser = new Parser(urlConnection); parser.setEncoding(Constant.ENCODE); NodeClassFilter div_filter = new NodeClassFilter(Div.class); OrFilter filters = new OrFilter(); filters.setPredicates(new NodeFilter[] {div_filter}); NodeList list = parser.extractAllNodesThatMatch(filters); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof Div) { Div d = (Div) tag; String divclass = d.getAttribute("class"); if ("pl_addToBasket".equalsIgnoreCase(divclass)) { // return getName(d); } } } }
/**
 * Parses the portal HTML and builds one news item per "subItem" table found
 * inside the "box_news" container: title, cleaned description, and absolute
 * article URL.
 *
 * <p>NOTE(review): the description node is located by walking exactly four
 * siblings past the title tag — this is layout-dependent; confirm against the
 * current markup if items start coming back empty.
 *
 * @param content raw page HTML
 * @return parsed news items in document order
 * @throws Exception if an expected tag or link is absent
 */
public List<Newsitem> parseContent(String content) throws Exception {
  List<Newsitem> newsitems = new ArrayList<Newsitem>();
  Tag newsDiv = this.extractTagByClassName(this.stripHtmlComments(content), "box_news");
  NodeList nodes = this.extractTagsByClassName(newsDiv.toHtml(), "subItem");
  for (int i = 0; i < nodes.size(); i++) {
    NewsitemImpl newsitem = new NewsitemImpl();
    Tag itemTable = (Tag) nodes.elementAt(i);
    Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
    newsitem.setTitle(titleTag.toPlainTextString());
    // The fourth sibling after the title holds the description span.
    Node descriptionSpan =
        titleTag.getNextSibling().getNextSibling().getNextSibling().getNextSibling();
    newsitem.setDescription(
        descriptionSpan
            .toPlainTextString()
            // Replace non-Latin-1 characters, then drop the trailing marker.
            .replaceAll("[^\\u0000-\\u00FF]", " ")
            .replace(" Read More...", "")
            .trim());
    // The first /index.php link inside the item is the article URL.
    Tag linkTag = (Tag) extractLinks(itemTable.toHtml(), "/index.php.*").elementAt(0);
    newsitem.setUrl(URL_PREFIX + linkTag.getAttribute("href"));
    newsitems.add(newsitem);
  }
  return newsitems;
}
/**
 * Extracts the news body matched by the given filter, flattens it to plain
 * text via a second parse pass, and strips leftover inline-script fragments
 * and the "[我来说两句]" marker.
 *
 * @param newsContentFilter filter selecting the content div(s)
 * @param parser parser already positioned on the news page
 * @return content the plain-text news body, or null when parsing failed
 *     before any content was collected
 */
public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
  String content = null;
  StringBuilder builder = new StringBuilder();
  try {
    NodeList newsContentList = parser.parse(newsContentFilter);
    for (int i = 0; i < newsContentList.size(); i++) {
      Div newsContenTag = (Div) newsContentList.elementAt(i);
      builder = builder.append(newsContenTag.getStringText());
    }
    content = builder.toString(); // Convert to String.
    if (content != null) {
      // Re-parse the collected fragment to collapse it to plain text.
      parser.reset();
      parser = Parser.createParser(content, "utf8");
      StringBean sb = new StringBean();
      sb.setCollapse(true);
      parser.visitAllNodesWith(sb);
      content = sb.getStrings();
      // Example of the script noise the regex below removes:
      // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}";
      content = content.replaceAll("\\\".*[a-z].*\\}", "");
      content = content.replace("[我来说两句]", "");
    } else {
      System.out.println("没有得到新闻内容!");
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return content;
}
/**
 * Extract the object <code>PARAM</code> tags from the child list.
 *
 * @return The list of object parameters (keys are upper-cased param names,
 *     values the corresponding VALUE attributes; both String objects).
 */
public HashMap createObjectParamsTable() {
  HashMap ret = new HashMap();
  NodeList kids = getChildren();
  if (null != kids) {
    for (int i = 0; i < kids.size(); i++) {
      // Fix: index the same list we measured. The original read the
      // `children` field directly, which only works while getChildren()
      // returns that exact backing field.
      Node node = kids.elementAt(i);
      if (node instanceof Tag) {
        Tag tag = (Tag) node;
        if (tag.getTagName().equals("PARAM")) {
          String paramName = tag.getAttribute("NAME");
          if (null != paramName && 0 != paramName.length()) {
            String paramValue = tag.getAttribute("VALUE");
            ret.put(paramName.toUpperCase(), paramValue);
          }
        }
      }
    }
  }
  return (ret);
}
/**
 * Wraps a parsed table-row node, creating a Cell for every TableColumn child.
 *
 * @param rowNode the row composite tag backing this row
 */
public Row(CompositeTag rowNode) {
  this.rowNode = rowNode;
  NodeList children = rowNode.getChildren();
  for (int idx = 0; idx < children.size(); idx++) {
    Node child = children.elementAt(idx);
    if (child instanceof TableColumn) {
      cells.add(new Cell((TableColumn) child));
    }
  }
}
// 获取一个网站上的链接,filter来过滤链接 public static Set<String> extracLinks(String url, Cobweb cobweb) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding(cobweb.getCharSet()); // 过滤<frame >标签的filter,用来提取frame 标签里的src 属性 NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤<a> 标签和<frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL if (cobweb.accept(linkUrl)) { links.add( // java.net.URLEncoder.encode(linkUrl)); linkUrl .replaceAll("\\?", "\\%3F") // 转码 .replaceAll("\\&", "\\%26") .replaceAll("\\|", "\\%124") .replaceAll("\\#", "")); } ; } else { // <frame>标签 // 提取frame 里src 属性的链接,如<frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); if (cobweb.accept(frameUrl)) { links.add(frameUrl); } } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/**
 * Stamps every Tag child of this row with a CSS class reflecting the
 * execution result (e.g. pass/fail styling).
 *
 * @param executionResult result whose string form becomes the class value
 */
private void setExecutionResult(ExecutionResult executionResult) {
  NodeList children = rowNode.getChildren();
  for (int idx = 0; idx < children.size(); idx++) {
    Node child = children.elementAt(idx);
    if (child instanceof Tag) {
      ((Tag) child).setAttribute("class", executionResult.toString(), '"');
    }
  }
}
/**
 * Get the list of options in this <code>SELECT</code> tag.
 *
 * @return The {@.html <OPTION>} tags contained by this tag.
 */
public OptionTag[] getOptionTags() {
  NodeList found = searchFor(OptionTag.class, true);
  OptionTag[] options = new OptionTag[found.size()];
  found.copyToNodeArray(options);
  return options;
}
/**
 * Parses the given HTML (gb2312) and stores the ASP.NET __VIEWSTATE
 * hidden-input value into the static viewState field.
 *
 * @param html page markup containing the __VIEWSTATE input
 * @throws Exception if parsing fails or the input tag is missing
 */
public static void setViewState(String html) throws Exception {
  Parser stateParser = Parser.createParser(html, "gb2312");
  AndFilter stateFilter =
      new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));
  InputTag stateInput = (InputTag) stateParser.parse(stateFilter).elementAt(0);
  viewState = stateInput.getAttribute("value");
}
/**
 * Wraps a parsed table tag, building a Row for every row or header-row child.
 *
 * @param tableNode the table tag backing this object
 */
public HtmlTable(TableTag tableNode) {
  this.tableNode = tableNode;
  NodeList children = tableNode.getChildren();
  for (int idx = 0; idx < children.size(); idx++) {
    Node child = children.elementAt(idx);
    boolean isRowLike = child instanceof TableRow || child instanceof TableHeader;
    if (isRowLike) {
      rows.add(new Row((CompositeTag) child));
    }
  }
}
/**
 * Parses the given HTML (gb2312) and stores the ASP.NET __EVENTVALIDATION
 * hidden-input value into the static eventValidation field.
 *
 * @param html page markup containing the __EVENTVALIDATION input
 * @throws ParserException if the markup cannot be parsed
 */
public static void setEventValidation(String html) throws ParserException {
  Parser fieldParser = Parser.createParser(html, "gb2312");
  AndFilter validationFilter =
      new AndFilter(
          new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
  InputTag validationInput = (InputTag) fieldParser.parse(validationFilter).elementAt(0);
  eventValidation = validationInput.getAttribute("value");
}
/**
 * Collects the href target of every anchor tag on the given page.
 *
 * @param url the page to scan
 * @return link URLs in document order
 * @throws ParserException if the page cannot be fetched or parsed
 */
public static List<String> getLinks(String url) throws ParserException {
  List<String> links = new LinkedList<String>();
  Parser pageParser = new Parser(url);
  NodeList anchors = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
  for (int idx = 0; idx < anchors.size(); idx++) {
    LinkTag anchor = (LinkTag) anchors.elementAt(idx);
    links.add(anchor.getLink());
  }
  return links;
}
/**
 * Add the tag name and it's children's tag names to the set of tag names.
 *
 * @param set The set to add to.
 * @param node The node to get the names from.
 */
protected void addName(Set set, Node node) {
  if (!(node instanceof Tag)) {
    return; // only tags contribute names
  }
  set.add(((Tag) node).getTagName());
  if (node instanceof CompositeTag) {
    NodeList children = ((CompositeTag) node).getChildren();
    if (null != children) {
      for (int i = 0; i < children.size(); i++) {
        addName(set, children.elementAt(i));
      }
    }
  }
}
/**
 * Parses the news page at the given URL and collects title, source,
 * source time and body HTML, printing each to stdout. Expects the
 * sina.com.cn layout: h1 title, span#media_name, span#pub_date,
 * div#artibody.
 *
 * @param url news page URL
 */
public void parser(String url) {
  String title = ""; // news title
  String source = ""; // news source
  String sourceTime = ""; // time published at the source
  // String author = ""; // news author
  String Content = ""; // news body
  // String collectTime = ""; // collection time (system time)
  try {
    parser = new Parser(url);
    parser.setEncoding("GB2312");
    // Title
    NodeFilter titleFilter = new TagNameFilter("h1");
    NodeList titleNodeList = parser.parse(titleFilter);
    title = parserUtil.getNodeListText(titleNodeList);
    parser.reset(); // must reset after every parse, or later parses find nothing
    System.out.println(title);
    // Source
    NodeFilter sourceFilter =
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
    NodeList sourceNodeList = parser.parse(sourceFilter);
    source = parserUtil.getNodeListText(sourceNodeList);
    parser.reset();
    System.out.println(source);
    // Source time: normalize the Chinese date markers to dashes/spaces.
    NodeFilter sourceTimeFilter =
        new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
    NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter);
    String str = parserUtil.getNodeListText(sourceTimeNodeList);
    sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", "");
    parser.reset();
    System.out.println(sourceTime);
    // Body
    NodeFilter ContentTimeFilter =
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
    NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter);
    NodeList childList = ContentTimeNodeList.elementAt(0).getChildren();
    // Drop nested divs — non-body parts such as related-topics boxes.
    childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div")));
    // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题"));
    Content = parserUtil.getNodeListHTML(ContentTimeNodeList);
    // Content = ParserUtil.getPlainText(Content);
    System.out.println(Content);
    parser.reset();
  } catch (ParserException e) {
    e.printStackTrace();
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/** Test tag name filtering. */
public void testTagName() throws ParserException {
  String guts = "<booty>Now is the time for all good men..</booty>";
  String html = "<html>" + guts + "</html>";
  createParser(html);
  NodeList list = parser.extractAllNodesThatMatch(new TagNameFilter("booty"));
  assertEquals("only one element", 1, list.size());
  Node match = list.elementAt(0);
  assertSuperType("should be Tag", Tag.class, match);
  // Tag names are reported upper-cased by the parser.
  assertStringEquals("name", "BOOTY", ((Tag) match).getTagName());
}
/** * 获取滚动品牌 * * @param path * @param city * @param fileName * @return */ public static Map<String, String> getBrandInfo(String path, String city, String fileName) { Map<String, String> brandMap = new LinkedHashMap<String, String>(); try { StringBuilder filePath = new StringBuilder(); filePath.append(PATH); filePath.append(city); filePath.append(INCLUDE); filePath.append(fileName); filePath.append(STUFF); // 开始解析 Parser parser = new Parser(filePath.toString()); // 过滤出<a></a>标签 NodeFilter divFilter = new NodeClassFilter(Div.class); NodeList classList = parser.extractAllNodesThatMatch(divFilter); NodeList hrefList = null; NodeList imgList = null; Node picNode = null; Node hrefNode = null; Node imgNode = null; String classStr = ""; String hrefStr = ""; String imgStr = ""; String imgClass = ""; for (int i = 0; i < classList.size(); i++) { picNode = classList.elementAt(i); classStr = ((Div) picNode).getAttribute("class"); if ("business_list_pic".equalsIgnoreCase(classStr)) { hrefList = picNode.getChildren(); for (int j = 0; j < hrefList.size(); j++) { hrefNode = hrefList.elementAt(j); if (hrefNode instanceof LinkTag) { hrefStr = ((LinkTag) hrefNode).getLink(); // 有用品牌id,获取到id hrefStr = MParseBrand.getBrandId(hrefStr); imgList = hrefNode.getChildren(); for (int k = 0; k < imgList.size(); k++) { imgNode = imgList.elementAt(k); if (imgNode instanceof ImageTag) { imgClass = ((ImageTag) imgNode).getAttribute("class"); if (null != imgClass) { imgStr = ((ImageTag) imgNode).getAttribute("src"); if (null == imgStr) { imgStr = ((ImageTag) imgNode).getAttribute("original"); } } } } brandMap.put(hrefStr, imgStr); } } } } } catch (Exception e) { e.printStackTrace(); } return brandMap; }
/**
 * Returns the news date text, taken from the last div matched by the filter.
 *
 * @param dateFilter filter selecting the date div
 * @param parser parser positioned on the news page
 * @return the date text, or null when nothing matched or parsing failed
 */
public String getNewsDate(NodeFilter dateFilter, Parser parser) {
  String newsDate = null;
  try {
    NodeList matches = parser.parse(dateFilter);
    int total = matches.size();
    for (int idx = 0; idx < total; idx++) {
      Div dateDiv = (Div) matches.elementAt(idx);
      newsDate = dateDiv.getStringText();
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return newsDate;
}
/**
 * Returns the news editor/author text, taken from the last div matched by
 * the filter.
 *
 * @param newsauthorFilter filter selecting the author element
 * @param parser parser positioned on the news page
 * @return the author text, or "" when nothing matched or parsing failed
 */
public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
  String newsAuthor = "";
  try {
    NodeList matches = parser.parse(newsauthorFilter);
    for (int idx = 0; idx < matches.size(); idx++) {
      Div authorDiv = (Div) matches.elementAt(idx);
      newsAuthor = authorDiv.getStringText();
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return newsAuthor;
}
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, LinkFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); // parser.setEncoding("utf8"); // 过滤 <frame >标签的 filter,用来提取 frame 标签里的 src 属性所表示的链接 NodeFilter frameFilter = new NodeFilter() { /** */ private static final long serialVersionUID = 1L; public boolean accept(Node node) { if (node.getText().startsWith("iframe") && node.getText().contains("src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤 <a> 标签和 <frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) // <a> 标签 { LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // url可能出现在src,href等属性中 if (filter.accept(linkUrl)) links.add(linkUrl); } else // <frame> 标签 { // 提取 frame 里 src 属性的链接如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src=\""); frame = frame.substring(start); int end = frame.indexOf("\">"); if (end == -1) { end = frame.indexOf("?"); } String frameUrl = frame.substring(5, end - 1); if (filter.accept(frameUrl)) links.add(frameUrl); } } } catch (ParserException e) { e.printStackTrace(); } return links; }
private String[] processBlog(InputStream in) throws BlogCrawlingException { // using a set here to avoid duplicates Set<String> linksToBlogs = new TreeSet<String>(); try { Page page = new Page(in, null); Parser parser = new Parser(new Lexer(page)); // register a filter to extract all the anchor tags TagNameFilter anchorTagsFilter = new TagNameFilter("a"); StringBuffer buf = new StringBuffer(); NodeList anchorTagsList = parser.parse(anchorTagsFilter); for (int i = 0; i < anchorTagsList.size(); i++) { Node node = anchorTagsList.elementAt(i); LinkTag tag = (LinkTag) node; String linkURL = tag.getLink(); if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) { // logger.info(" *BLOG Detected* ==> " + linkURL); System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL); linksToBlogs.add(linkURL); } else { System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL); } } String[] links = new String[linksToBlogs.size()]; int count = 0; for (String linksToBlog : linksToBlogs) { links[count++] = linksToBlog; } return links; } catch (ParserException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (UnsupportedEncodingException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } catch (IOException e) { e.printStackTrace(); throw new BlogCrawlingException(e); } }
/**
 * Returns the news title, taken from the last heading tag matched by the
 * filter.
 *
 * @param titleFilter filter selecting the title heading
 * @param parser parser positioned on the news page
 * @return the title text, or "" when nothing matched or parsing failed
 */
public String getTitle(NodeFilter titleFilter, Parser parser) {
  String titleName = "";
  try {
    NodeList matches = parser.parse(titleFilter);
    for (int idx = 0; idx < matches.size(); idx++) {
      HeadingTag heading = (HeadingTag) matches.elementAt(idx);
      titleName = heading.getStringText();
    }
  } catch (ParserException ex) {
    Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
  }
  return titleName;
}
/**
 * Sums the effective column count of a row: each table column contributes
 * its colspan attribute when present, otherwise 1.
 *
 * @param row the row whose columns are counted
 * @return the total number of columns spanned
 */
private int colspan(Row row) {
  NodeList children = row.rowNode.getChildren();
  int total = 0;
  for (int idx = 0; idx < children.size(); idx++) {
    Node child = children.elementAt(idx);
    if (!(child instanceof TableColumn)) {
      continue;
    }
    String span = ((TableColumn) child).getAttribute("colspan");
    total += (span != null) ? Integer.parseInt(span) : 1;
  }
  return total;
}
/** Test attribute filtering. */
public void testAttribute() throws ParserException {
  String guts =
      "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
  String html = "<html>" + guts + "</html>";
  createParser(html);
  NodeList list = parser.extractAllNodesThatMatch(new HasAttributeFilter("id"));
  assertEquals("only one element", 1, list.size());
  Node match = list.elementAt(0);
  assertType("should be LinkTag", LinkTag.class, match);
  // Only the first anchor carries an id attribute.
  assertEquals("attribute value", "target", ((LinkTag) match).getAttribute("id"));
}
/**
 * Parses one list page (gb2312) of the standards table
 * (id ctl00_ContentPlaceHolder1_StandardView) and records every standardid
 * found in the detail-link hrefs, grouped by page number in the static map.
 * Rows whose link or href is missing are recorded in the static error map.
 *
 * @param pageNo list-page number used as the map key
 * @param html raw page markup
 * @throws Exception if the table or an expected cell is missing
 */
private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
  Parser parser = Parser.createParser(html, "gb2312");
  AndFilter viewStateFilter =
      new AndFilter(
          new TagNameFilter("table"),
          new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));
  NodeList nodes = parser.parse(viewStateFilter);
  TableTag node = (TableTag) nodes.elementAt(0);
  TableRow[] rows = node.getRows();
  // Row 0 is the header; data rows start at index 1.
  for (int i = 1; i < rows.length; i++) {
    TableColumn[] cols = rows[i].getColumns();
    TableColumn col = cols[3];
    // The link sits at div > child(2) of the fourth column.
    // NOTE(review): if the structure differs, this chain throws before the
    // null check below can run — the check only guards a null element.
    LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);
    if (tag == null) {
      List<Integer> lst = error.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<Integer>();
      }
      lst.add(i);
      error.put(pageNo, lst);
      continue;
    }
    String href = tag.getAttribute("href");
    if (href == null) {
      List<Integer> lst = error.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<Integer>();
      }
      lst.add(i);
      error.put(pageNo, lst);
      continue;
    }
    // NOTE(review): indexOf("&") searches from position 0, which assumes
    // standardid is the first query parameter — confirm the URL shape.
    int start = href.indexOf("standardid=");
    int end = href.indexOf("&");
    String standardId = href.substring(start, end).replaceAll("standardid=", "");
    List<String> lst = map.get(pageNo);
    if (lst == null) {
      lst = new ArrayList<String>();
    }
    lst.add(standardId);
    map.put(pageNo, lst);
  }
}