/**
 * Get the scrolling brand information (brand id -> image URL).
 *
 * @param path
 * @param city
 * @param fileName
 * @return a map keyed by brand id with the brand image URL as the value
 */
public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
    Map<String, String> brandMap = new LinkedHashMap<String, String>();
    try {
        StringBuilder filePath = new StringBuilder();
        filePath.append(PATH);
        filePath.append(city);
        filePath.append(INCLUDE);
        filePath.append(fileName);
        filePath.append(STUFF);
        // Start parsing
        Parser parser = new Parser(filePath.toString());
        // Filter out the <div> tags
        NodeFilter divFilter = new NodeClassFilter(Div.class);
        NodeList classList = parser.extractAllNodesThatMatch(divFilter);
        NodeList hrefList = null;
        NodeList imgList = null;
        Node picNode = null;
        Node hrefNode = null;
        Node imgNode = null;
        String classStr = "";
        String hrefStr = "";
        String imgStr = "";
        String imgClass = "";
        for (int i = 0; i < classList.size(); i++) {
            picNode = classList.elementAt(i);
            classStr = ((Div) picNode).getAttribute("class");
            if ("business_list_pic".equalsIgnoreCase(classStr)) {
                hrefList = picNode.getChildren();
                for (int j = 0; j < hrefList.size(); j++) {
                    hrefNode = hrefList.elementAt(j);
                    if (hrefNode instanceof LinkTag) {
                        hrefStr = ((LinkTag) hrefNode).getLink();
                        // The link carries the brand id; extract it
                        hrefStr = MParseBrand.getBrandId(hrefStr);
                        imgList = hrefNode.getChildren();
                        for (int k = 0; k < imgList.size(); k++) {
                            imgNode = imgList.elementAt(k);
                            if (imgNode instanceof ImageTag) {
                                imgClass = ((ImageTag) imgNode).getAttribute("class");
                                if (null != imgClass) {
                                    imgStr = ((ImageTag) imgNode).getAttribute("src");
                                    if (null == imgStr) {
                                        // Lazy-loaded images keep the real URL in "original"
                                        imgStr = ((ImageTag) imgNode).getAttribute("original");
                                    }
                                }
                            }
                        }
                        brandMap.put(hrefStr, imgStr);
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return brandMap;
}
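// A minimal usage sketch, not from the original source: the city and file name below are
// hypothetical, and PATH/INCLUDE/STUFF are assumed to be configured elsewhere in the class.
public static void printBrandInfoExample() {
    Map<String, String> brands = getBrandInfo(null, "shanghai", "brand_list");
    for (Map.Entry<String, String> entry : brands.entrySet()) {
        System.out.println("brand id: " + entry.getKey() + ", image: " + entry.getValue());
    }
}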
/** Test attribute filtering. */
public void testAttribute() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list = parser.extractAllNodesThatMatch(new HasAttributeFilter("id"));
    assertEquals("only one element", 1, list.size());
    assertType("should be LinkTag", LinkTag.class, list.elementAt(0));
    LinkTag link = (LinkTag) list.elementAt(0);
    assertEquals("attribute value", "target", link.getAttribute("id"));
}
private int colspan(Row row) {
    NodeList rowNodes = row.rowNode.getChildren();
    int colspan = 0;
    for (int i = 0; i < rowNodes.size(); i++) {
        if (rowNodes.elementAt(i) instanceof TableColumn) {
            String s = ((TableColumn) rowNodes.elementAt(i)).getAttribute("colspan");
            if (s != null) {
                colspan += Integer.parseInt(s);
            } else {
                colspan++;
            }
        }
    }
    return colspan;
}
/**
 * Set the enclosed <code>PARAM</code> children.
 *
 * @param newObjectParams The new parameters.
 */
public void setObjectParams(HashMap newObjectParams) {
    NodeList kids;
    Node node;
    Tag tag;
    String paramName;
    String paramValue;
    List attributes;
    TextNode string;

    kids = getChildren();
    if (null == kids)
        kids = new NodeList();
    else
        // erase objectParams from kids
        for (int i = 0; i < kids.size(); ) {
            node = kids.elementAt(i);
            if (node instanceof Tag)
                if (((Tag) node).getTagName().equals("PARAM")) {
                    kids.remove(i);
                    // remove whitespace too
                    if (i < kids.size()) {
                        node = kids.elementAt(i);
                        if (node instanceof TextNode) {
                            string = (TextNode) node;
                            if (0 == string.getText().trim().length())
                                kids.remove(i);
                        }
                    }
                } else
                    i++;
            else
                i++;
        }

    // add newObjectParams to kids
    for (Iterator e = newObjectParams.entrySet().iterator(); e.hasNext(); ) {
        Map.Entry entry = (Entry) e.next();
        attributes = new ArrayList(); // should the tag copy the attributes?
        paramName = (String) entry.getKey();
        paramValue = (String) entry.getValue();
        attributes.add(new Attribute("PARAM", null));
        attributes.add(new Attribute(" "));
        attributes.add(new Attribute("VALUE", paramValue, '"'));
        attributes.add(new Attribute(" "));
        attributes.add(new Attribute("NAME", paramName.toUpperCase(), '"'));
        tag = new TagNode(null, 0, 0, attributes);
        kids.add(tag);
    }

    // set kids as new children
    setChildren(kids);
}
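// A minimal usage sketch, assuming this setter lives on an ObjectTag-like composite tag
// (org.htmlparser.tags.ObjectTag, which this snippet appears to come from). The parameter
// names and values are hypothetical.
public static void setParamsExample(ObjectTag object) {
    HashMap params = new HashMap();
    params.put("movie", "intro.swf");
    params.put("quality", "high");
    object.setObjectParams(params); // replaces any existing <PARAM> children
    System.out.println(object.toHtml());
}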
public static void dealOnePage(String url, int starNo) {
    try {
        Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection());
        NodeList tableSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));
        parser = new Parser(new Lexer(tableSet.toHtml()));
        NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));
        parser = new Parser(new Lexer(tdSet.toHtml()));
        PrototypicalNodeFactory p = new PrototypicalNodeFactory();
        p.registerTag(new SpanTag());
        parser.setNodeFactory(p);
        NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span"));
        int index = 0;
        for (int i = 5; i < spanSet.size(); i = i + 5) {
            String str = spanSet.elementAt(i).toPlainTextString();
            String now = "" + (starNo * 100 + index);
            index++;
            // advance the counter until it matches the number shown in this span,
            // printing the skipped values along the way
            while (str.compareTo(now) != 0) {
                System.out.println(now);
                now = "" + (starNo * 100 + index);
                index++;
            }
            // System.out.println(str);
        }
    } catch (ParserException e) {
        e.printStackTrace();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
public void checkprice() throws Exception {
    // System.out.println("checking Aptamil url [" + page.url + "]");
    URL url = new URL(page.url);
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    urlConnection.setConnectTimeout(Constant.connect_timeout);
    urlConnection.connect();
    Parser parser = new Parser(urlConnection);
    parser.setEncoding(Constant.ENCODE);
    NodeClassFilter div_filter = new NodeClassFilter(Div.class);
    OrFilter filters = new OrFilter();
    filters.setPredicates(new NodeFilter[] {div_filter});
    NodeList list = parser.extractAllNodesThatMatch(filters);
    for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof Div) {
            Div d = (Div) tag;
            String divclass = d.getAttribute("class");
            if ("pl_addToBasket".equalsIgnoreCase(divclass)) {
                // return getName(d);
            }
        }
    }
}
/**
 * Extract article links from an HTML string.
 *
 * @param content the HTML to parse
 * @param pre the URL prefix a link must start with to be kept
 * @throws Exception
 */
void docByHTML(String content, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setInputHTML(content);
    parser.setEncoding("GB2312");
    NodeFilter filter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(filter);
    if (list != null && list.size() > 0) {
        Parser p1 = new Parser();
        p1.setInputHTML(list.toHtml());
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
        if (linkList != null && linkList.size() > 0) {
            for (int i = 0; i < linkList.size(); i++) {
                LinkTag link = (LinkTag) linkList.elementAt(i);
                LinkBean bean = null;
                if (link.getLink().toLowerCase().startsWith(pre)
                        && !link.getLinkText().equalsIgnoreCase("详细内容")) {
                    if (null == articleDocCache.get(getKey(link.getLink()))) {
                        bean = new LinkBean();
                        bean.setLink(link.getLink());
                        bean.setName(link.getLinkText());
                        LINKHASH.put(link.getLink(), bean);
                    } else {
                        logger.info(">> link [" + link.getLink() + "] already exists");
                    }
                }
            }
        }
    }
}
/** Test node class filtering. */
public void testNodeClass() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<body>Now is the time for all good men..</body>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list = parser.extractAllNodesThatMatch(new NodeClassFilter(BodyTag.class));
    assertEquals("only one element", 1, list.size());
    assertType("should be BodyTag", BodyTag.class, list.elementAt(0));
    BodyTag body = (BodyTag) list.elementAt(0);
    assertEquals("only one child", 1, body.getChildCount());
    assertSuperType("should be Text", Text.class, body.getChildren().elementAt(0));
    assertStringEquals("html", guts, body.toHtml());
}
/**
 * Extract the body text of a news article.
 *
 * @param newsContentFilter filter that selects the content nodes
 * @param parser the parser positioned on the article page
 * @return content the news body text
 */
public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();
    try {
        NodeList newsContentList = parser.parse(newsContentFilter);
        for (int i = 0; i < newsContentList.size(); i++) {
            Div newsContenTag = (Div) newsContentList.elementAt(i);
            builder.append(newsContenTag.getStringText());
        }
        content = builder.toString(); // convert to a String
        if (content != null) {
            parser.reset();
            parser = Parser.createParser(content, "utf8");
            StringBean sb = new StringBean();
            sb.setCollapse(true);
            parser.visitAllNodesWith(sb);
            content = sb.getStrings();
            // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} } // showTurnAD444(intTurnAD444); }catch(e){}";
            content = content.replaceAll("\\\".*[a-z].*\\}", "");
            content = content.replace("[我来说两句]", "");
        } else {
            System.out.println("Failed to get the news content!");
        }
    } catch (ParserException ex) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return content;
}
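// A minimal usage sketch, not from the original source: it assumes the article body sits in
// <div id="artibody"> (as in the Sina parser further below) and the article URL is supplied
// by the caller.
public String fetchNewsContentExample(String articleUrl) throws ParserException {
    Parser parser = new Parser(articleUrl);
    parser.setEncoding("GB2312");
    NodeFilter contentFilter =
            new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
    return getNewsContent(contentFilter, parser);
}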
/**
 * Parse a news URL, extract the article, and insert it into the database.
 *
 * @param url the news link
 */
public void parser(String url) {
    try {
        parser = new Parser(url);
        // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "TRS_PreAppend"));
        // parser.reset(); // remember to reset the parser after each use, otherwise the next parse returns nothing
        NodeFilter innerFilter =
                new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
        NodeFilter xk = new HasParentFilter(innerFilter);
        NodeList nodes = parser.extractAllNodesThatMatch(xk);
        System.out.println(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            Node time = nodes.elementAt(i);
            // System.out.println(time.toPlainTextString().trim().replace(" ", "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", ""));
            System.out.println(
                    replaceBlank(time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", "")));
        }
    } catch (ParserException ex) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
}
/** Test string filtering. */
public void testString() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<body>Now is the <a id=target><b>time</b></a> for all good <time>men</time>..</body>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list = parser.extractAllNodesThatMatch(new StringFilter("Time"));
    assertEquals("only one element", 1, list.size());
    assertSuperType("should be String", Text.class, list.elementAt(0));
    assertStringEquals("name", "time", ((Text) list.elementAt(0)).getText());
    // test case sensitivity
    list = parser.extractAllNodesThatMatch(new StringFilter("Time", true));
    assertEquals("should be no elements", 0, list.size());
}
public List<Newsitem> parseContent(String content) throws Exception {
    List<Newsitem> newsitems = new ArrayList<Newsitem>();
    Tag newsDiv = this.extractTagByClassName(this.stripHtmlComments(content), "box_news");
    NodeList nodes = this.extractTagsByClassName(newsDiv.toHtml(), "subItem");
    for (int i = 0; i < nodes.size(); i++) {
        NewsitemImpl newsitem = new NewsitemImpl();
        Tag itemTable = (Tag) nodes.elementAt(i);
        Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
        newsitem.setTitle(titleTag.toPlainTextString());
        Node descriptionSpan =
                titleTag.getNextSibling().getNextSibling().getNextSibling().getNextSibling();
        newsitem.setDescription(
                descriptionSpan
                        .toPlainTextString()
                        .replaceAll("[^\\u0000-\\u00FF]", " ")
                        .replace(" Read More...", "")
                        .trim());
        Tag linkTag = (Tag) extractLinks(itemTable.toHtml(), "/index.php.*").elementAt(0);
        newsitem.setUrl(URL_PREFIX + linkTag.getAttribute("href"));
        newsitems.add(newsitem);
    }
    return newsitems;
}
/**
 * Extract article links from a URL.
 *
 * @param url the page to fetch
 * @param pre the URL prefix a link must start with to be kept
 * @throws Exception
 */
void doc(String url, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");
    NodeFilter filter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(filter);
    if (list != null && list.size() > 0) {
        Parser p1 = new Parser();
        p1.setInputHTML(list.toHtml());
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
        if (linkList != null && linkList.size() > 0) {
            for (int i = 0; i < linkList.size(); i++) {
                LinkTag link = (LinkTag) linkList.elementAt(i);
                LinkBean bean = null;
                if (link.getLink().toLowerCase().startsWith(pre)
                        && !link.getLinkText().equalsIgnoreCase("详细内容")) {
                    bean = new LinkBean();
                    bean.setLink(link.getLink());
                    bean.setName(link.getLinkText());
                    LINKHASH.put(link.getLink(), bean);
                }
            }
        }
    }
}
public static void main(String[] args) throws Exception {
    RequestConfig requestConfig =
            RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
    CloseableHttpClient httpclient =
            HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    int count = 1;
    for (int i = 0; i <= 16; i++) {
        int index = i;
        // System.out.println(index);
        HttpGet httpGet = new HttpGet(url3 + index + url4);
        HttpResponse response = httpclient.execute(httpGet);
        HttpEntity entity = response.getEntity();
        String htmls = null;
        if (entity != null) {
            htmls = EntityUtils.toString(entity).replace("\t", " ");
        }
        Parser parser = Parser.createParser(htmls, "utf-8");
        AndFilter dFilter =
                new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content"));
        NodeList nodes3 = parser.extractAllNodesThatMatch(dFilter);
        for (int k = 0; k < nodes3.size(); k++) {
            htmls = nodes3.elementAt(k).toHtml();
            parser = Parser.createParser(htmls, "utf-8");
            AndFilter ProfessionNameFilter =
                    new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
            NodeList nodes4 = parser.extractAllNodesThatMatch(ProfessionNameFilter);
            for (int j = 0; j < nodes4.size(); j++) {
                LinkTag link = (LinkTag) nodes4.elementAt(j);
                // if(link.getAttribute("href").contains("http://www.ulster.ac.uk/")) {
                // .replaceAll("<span[\\s\\S]*/span>","")
                String temp = link.toHtml();
                System.out.println(
                        "{\"" + count + "\",\"http://www.chi.ac.uk/" + link.getAttribute("href") + "\",\""
                                + html2Str(temp).replace("\r\n", "").trim() + "\",\"0\"},");
                count++;
            }
        }
    }
    // System.out.println("DONE.");
}
public Row(CompositeTag rowNode) {
    this.rowNode = rowNode;
    NodeList nodeList = rowNode.getChildren();
    for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (node instanceof TableColumn)
            cells.add(new Cell((TableColumn) node));
    }
}
private LCOdds parseRow(NodeList cells) {
    if (cells.size() == 8) {
        try {
            LCOdds lc = new LCOdds(Constants.PLAY_06_LC_2, "2,1");
            parseMatchCode(lc, cells.elementAt(0));
            parseOfftime(lc, formater, cells.elementAt(3));
            for (int i = 4; i <= 5; i++) {
                lc.addOdd(parseOdd(cells.elementAt(i)));
            }
            return lc;
        } catch (Exception e) {
            warn(log, e);
        }
    }
    return null;
}
// Extract the links on a page; the Cobweb filter decides which links to keep.
public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding(cobweb.getCharSet());
        // Filter for <frame> tags, used to pull the src attribute out of a frame tag
        NodeFilter frameFilter =
                new NodeFilter() {
                    public boolean accept(Node node) {
                        if (node.getText().startsWith("frame src=")) {
                            return true;
                        } else {
                            return false;
                        }
                    }
                };
        // OrFilter that matches both <a> tags and <frame> tags
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that pass the filter
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink(); // URL
                if (cobweb.accept(linkUrl)) {
                    links.add(
                            // java.net.URLEncoder.encode(linkUrl));
                            linkUrl
                                    .replaceAll("\\?", "\\%3F") // escape
                                    .replaceAll("\\&", "\\%26")
                                    .replaceAll("\\|", "\\%124")
                                    .replaceAll("\\#", ""));
                }
            } else {
                // <frame> tag: extract the link from the src attribute, e.g. <frame src="test.html"/>
                String frame = tag.getText();
                int start = frame.indexOf("src=");
                frame = frame.substring(start);
                int end = frame.indexOf(" ");
                if (end == -1) {
                    end = frame.indexOf(">");
                }
                String frameUrl = frame.substring(5, end - 1);
                if (cobweb.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
/** Test child filtering. */
public void testChild() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list = parser.extractAllNodesThatMatch(new HasChildFilter(new TagNameFilter("b")));
    assertEquals("only one element", 1, list.size());
    assertType("should be LinkTag", LinkTag.class, list.elementAt(0));
    LinkTag link = (LinkTag) list.elementAt(0);
    assertEquals("three children", 3, link.getChildCount());
    assertSuperType("should be TagNode", Tag.class, link.getChildren().elementAt(0));
    Tag tag = (Tag) link.getChildren().elementAt(0);
    assertStringEquals("name", "B", tag.getTagName());
}
private void setExecutionResult(ExecutionResult executionResult) {
    NodeList cells = rowNode.getChildren();
    for (int i = 0; i < cells.size(); i++) {
        Node cell = cells.elementAt(i);
        if (cell instanceof Tag) {
            Tag tag = (Tag) cell;
            tag.setAttribute("class", executionResult.toString(), '"');
        }
    }
}
public static void setEventValidation(String html) throws ParserException {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
            new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);
    eventValidation = node.getAttribute("value");
}
public static void setViewState(String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
            new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));
    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);
    viewState = node.getAttribute("value");
}
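// A minimal usage sketch, not from the original source: "html" is assumed to be an ASP.NET
// page already fetched as a string; the static viewState/eventValidation fields are read
// back by whatever code posts the next request.
public static void captureAspNetStateExample(String html) throws Exception {
    setViewState(html);        // stores the __VIEWSTATE hidden field value
    setEventValidation(html);  // stores the __EVENTVALIDATION hidden field value
}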
public HtmlTable(TableTag tableNode) {
    this.tableNode = tableNode;
    NodeList nodeList = tableNode.getChildren();
    for (int i = 0; i < nodeList.size(); i++) {
        Node node = nodeList.elementAt(i);
        if (node instanceof TableRow || node instanceof TableHeader) {
            rows.add(new Row((CompositeTag) node));
        }
    }
}
/** Test and filtering. */
public void testAnd() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<body>Now is the <a id=one><b>time</b></a> for all good <a id=two><b>men</b></a>..</body>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list =
            parser.extractAllNodesThatMatch(
                    new AndFilter(
                            new HasChildFilter(new TagNameFilter("b")),
                            new HasChildFilter(new StringFilter("men"))));
    assertEquals("only one element", 1, list.size());
    assertType("should be LinkTag", LinkTag.class, list.elementAt(0));
    LinkTag link = (LinkTag) list.elementAt(0);
    assertEquals("attribute value", "two", link.getAttribute("id"));
}
public static List<String> getLinks(String url) throws ParserException {
    Parser htmlParser = new Parser(url);
    List<String> links = new LinkedList<String>();
    NodeList tagNodeList = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    for (int m = 0; m < tagNodeList.size(); m++) {
        LinkTag loopLinks = (LinkTag) tagNodeList.elementAt(m);
        String linkName = loopLinks.getLink();
        links.add(linkName);
    }
    return links;
}
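// A minimal usage sketch (the URL below is hypothetical): print every link found on a page.
public static void printLinksExample() throws ParserException {
    for (String link : getLinks("http://example.com/")) {
        System.out.println(link);
    }
}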
private static void addDetailToMap(String key, String text) throws Exception {
    Parser parser = Parser.createParser(text, "gb2312");
    TagNameFilter tableFiler = new TagNameFilter("table");
    NodeList nodes = parser.parse(tableFiler);
    TableTag node = (TableTag) nodes.elementAt(5);
    TableRow[] rows = node.getRows();
    for (int i = 1; i < 11; i++) {
        TableColumn[] cols = rows[i].getColumns();
        StringBuffer txt1 = new StringBuffer();
        StringBuffer txt2 = new StringBuffer();
        NodeList span1 = cols[1].getChildren().elementAt(1).getChildren();
        for (int j = 0; j < span1.size(); j++) {
            if (span1.elementAt(j) instanceof TextNode) {
                txt1.append(span1.elementAt(j).getText()).append(" ");
            }
        }
        NodeList span2 = cols[3].getChildren().elementAt(1).getChildren();
        for (int j = 0; j < span2.size(); j++) {
            if (span2.elementAt(j) instanceof TextNode) {
                txt2.append(span2.elementAt(j).getText()).append(" ");
            }
        }
        List<String> lst = detailMap.get(key);
        if (lst == null) {
            lst = new ArrayList<String>();
        }
        lst.add(txt1.toString().trim());
        lst.add(txt2.toString().trim());
        detailMap.put(key, lst);
    }
}
/**
 * Add the tag name and its children's tag names to the set of tag names.
 *
 * @param set The set to add to.
 * @param node The node to get the names from.
 */
protected void addName(Set set, Node node) {
    NodeList children;

    if (node instanceof Tag) {
        set.add(((Tag) node).getTagName());
        if (node instanceof CompositeTag) {
            children = ((CompositeTag) node).getChildren();
            if (null != children)
                for (int i = 0; i < children.size(); i++)
                    addName(set, children.elementAt(i));
        }
    }
}
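// A minimal usage sketch, not from the original source: collect the distinct tag names used
// on a page. The URL is supplied by the caller; a null filter keeps every top-level node.
protected Set collectTagNamesExample(String url) throws ParserException {
    Set names = new HashSet();
    Parser parser = new Parser(url);
    NodeList nodes = parser.parse(null); // null filter: accept all nodes
    for (int i = 0; i < nodes.size(); i++)
        addName(names, nodes.elementAt(i));
    return names;
}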
/**
 * Parse a news URL and collect its data.
 *
 * @param url the news link
 */
public void parser(String url) {
    String title = ""; // news title
    String source = ""; // news source
    String sourceTime = ""; // publication time
    // String author = ""; // news author
    String Content = ""; // news body
    // String collectTime = ""; // collection time (system time)
    try {
        parser = new Parser(url);
        parser.setEncoding("GB2312");
        // Title
        NodeFilter titleFilter = new TagNameFilter("h1");
        NodeList titleNodeList = parser.parse(titleFilter);
        title = parserUtil.getNodeListText(titleNodeList);
        parser.reset(); // reset after every parse, otherwise the next parse returns nothing
        System.out.println(title);
        // Source
        NodeFilter sourceFilter =
                new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
        NodeList sourceNodeList = parser.parse(sourceFilter);
        source = parserUtil.getNodeListText(sourceNodeList);
        parser.reset();
        System.out.println(source);
        // Publication time
        NodeFilter sourceTimeFilter =
                new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
        NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter);
        String str = parserUtil.getNodeListText(sourceTimeNodeList);
        sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", "");
        parser.reset();
        System.out.println(sourceTime);
        // Body
        NodeFilter ContentTimeFilter =
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
        NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter);
        NodeList childList = ContentTimeNodeList.elementAt(0).getChildren();
        childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // drop the non-body parts
        // childList.keepAllNodesThatMatch(new RegexFilter(" 相关专题"));
        Content = parserUtil.getNodeListHTML(ContentTimeNodeList);
        // Content = ParserUtil.getPlainText(Content);
        System.out.println(Content);
        parser.reset();
    } catch (ParserException e) {
        e.printStackTrace();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
/** Test tag name filtering. */
public void testTagName() throws ParserException {
    String guts;
    String html;
    NodeList list;

    guts = "<booty>Now is the time for all good men..</booty>";
    html = "<html>" + guts + "</html>";
    createParser(html);
    list = parser.extractAllNodesThatMatch(new TagNameFilter("booty"));
    assertEquals("only one element", 1, list.size());
    assertSuperType("should be Tag", Tag.class, list.elementAt(0));
    assertStringEquals("name", "BOOTY", ((Tag) (list.elementAt(0))).getTagName());
}
/**
 * Accept tags with children acceptable to the filter.
 *
 * @param node The node to check.
 * @return <code>true</code> if the node has an acceptable child, <code>false</code> otherwise.
 */
public boolean accept(Node node) {
    CompositeTag tag;
    NodeList children;
    boolean ret;

    ret = false;
    if (node instanceof CompositeTag) {
        tag = (CompositeTag) node;
        children = tag.getChildren();
        if (null != children) {
            for (int i = 0; !ret && i < children.size(); i++)
                if (getChildFilter().accept(children.elementAt(i)))
                    ret = true;
            // do recursion after all children are checked
            // to get breadth first traversal
            if (!ret && getRecursive())
                for (int i = 0; !ret && i < children.size(); i++)
                    if (accept(children.elementAt(i)))
                        ret = true;
        }
    }
    return (ret);
}
/*
 * Get the date of a news article.
 */
public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String newsDate = null;
    try {
        NodeList dateList = parser.parse(dateFilter);
        for (int i = 0; i < dateList.size(); i++) {
            Div dateTag = (Div) dateList.elementAt(i);
            newsDate = dateTag.getStringText();
        }
    } catch (ParserException ex) {
        Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsDate;
}