/**
 * Checks parent links after parsing three custom tags in a row: each top-level tag reports no
 * parent, while the text child of the second tag reports that tag as its parent.
 */
public void testParentConnections() throws ParserException {
  final String openOnly = "<custom>";
  final String complete = "<custom>something</custom>";
  final String closeOnly = "</custom>";
  createParser(openOnly + complete + closeOnly);
  Tag[] prototypes = {new CustomTag(false), new AnotherTag(false)};
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(3);

  // First node: an unclosed custom tag, rendered with a synthesized end tag.
  CustomTag first = (CustomTag) node[0];
  assertStringEquals("first custom tag html", openOnly + "</custom>", first.toHtml());
  assertNull("first custom tag should have no parent", first.getParent());

  // Second node: a complete custom tag owning a single text child.
  CustomTag second = (CustomTag) node[1];
  assertStringEquals("second custom tag html", complete, second.toHtml());
  assertNull("second custom tag should have no parent", second.getParent());

  Node textChild = second.childAt(0);
  assertType("firstChild", Text.class, textChild);
  Node owner = textChild.getParent();
  assertNotNull("first child parent should not be null", owner);
  assertSame("parent and custom tag should be the same", second, owner);

  // Third node: the dangling end tag.
  Tag dangling = (Tag) node[2];
  assertStringEquals("third custom tag html", closeOnly, dangling.toHtml());
  assertNull("end tag should have no parent", dangling.getParent());
}
/**
 * Builds news items from page markup.
 *
 * <p>Looks up the element with class {@code box_news}, then every {@code subItem} element inside
 * it. For each one, the title comes from the {@code subItemtitle} element, the description from
 * the node four siblings after the title, and the URL from the first link matching
 * {@code /index.php.*}, prefixed with {@code URL_PREFIX}.
 *
 * @param content the page markup
 * @return one news item per {@code subItem} element, in document order
 * @throws Exception if any of the extraction helpers fails
 */
public List<Newsitem> parseContent(String content) throws Exception {
  Tag newsBox = this.extractTagByClassName(this.stripHtmlComments(content), "box_news");
  NodeList itemNodes = this.extractTagsByClassName(newsBox.toHtml(), "subItem");
  List<Newsitem> result = new ArrayList<Newsitem>();

  int index = 0;
  while (index < itemNodes.size()) {
    Tag itemTable = (Tag) itemNodes.elementAt(index);
    NewsitemImpl item = new NewsitemImpl();

    Tag titleTag = this.extractTagByClassName(itemTable.toHtml(), "subItemtitle");
    item.setTitle(titleTag.toPlainTextString());

    // The description sits four siblings after the title element.
    Node descriptionSpan = titleTag;
    for (int hop = 0; hop < 4; hop++) {
      descriptionSpan = descriptionSpan.getNextSibling();
    }
    String description = descriptionSpan.toPlainTextString();
    description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
    description = description.replace(" Read More...", "");
    item.setDescription(description.trim());

    Tag linkTag = (Tag) extractLinks(itemTable.toHtml(), "/index.php.*").elementAt(0);
    item.setUrl(URL_PREFIX + linkTag.getAttribute("href"));

    result.add(item);
    index++;
  }
  return result;
}
/**
 * Parses a custom composite tag wrapping a single {@code <Another/>} tag and checks the child
 * count, the child's type and html, and the character positions of both tags.
 */
public void testCompositeTagWithAnotherTagChild() throws ParserException {
  final String inner = "<Another/>";
  createParser("<Custom>" + inner + "</Custom>");
  Tag[] prototypes = {new CustomTag(), new AnotherTag(true)};
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(1);

  assertType("node", CustomTag.class, node[0]);
  CustomTag outer = (CustomTag) node[0];
  assertEquals("child count", 1, outer.getChildCount());
  assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());

  // Positions of the opening <Custom> tag and of its end tag.
  assertEquals("starting loc", 0, outer.getStartPosition());
  assertEquals("ending loc", 8, outer.getEndPosition());
  assertEquals("custom tag starting loc", 0, outer.getStartPosition());
  assertEquals("custom tag ending loc", 27, outer.getEndTag().getEndPosition());

  Node onlyChild = outer.childAt(0);
  assertType("child", AnotherTag.class, onlyChild);
  AnotherTag another = (AnotherTag) onlyChild;
  assertEquals("another tag start pos", 8, another.getStartPosition());
  assertEquals("another tag ending pos", 18, another.getEndPosition());
  assertEquals("custom end tag start pos", 18, outer.getEndTag().getStartPosition());
  assertStringEquals("child html", inner, onlyChild.toHtml());
}
/** * 递归钻取正文信息 * * @param nodeP * @return */ @SuppressWarnings("unchecked") protected List<Node> extractHtml(Node nodeP, String type) throws Exception { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { return null; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { tableList.add(node); } } else { List tempList = extractHtml(node, type); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if ((tableList != null) && (tableList.size() > 0)) { TableContext tc = new TableContext(); tc.setLinkList(new ArrayList()); tc.setTextBuffer(new StringBuffer()); tableNumber++; tc.setTableRow(tableNumber); Iterator ti = tableList.iterator(); // 得到设置的搜索URL String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL); while (ti.hasNext()) { Node node = (Node) ti.next(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag) node; if (!"1".equalsIgnoreCase(type)) { linkTag.setAttribute( "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href"))); } tc.getLinkList().add(linkTag); } else { tc.getTextBuffer().append(node.getText()); } } return tableList; } return null; }
/** * 对新闻URL进行解析提取新闻,同时将新闻插入到数据库中。 * * @param url 新闻连接。 */ public void parser(String url) { try { parser = new Parser(url); // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"), new // HasAttributeFilter("class", "TRS_PreAppend")); // parser.reset(); //记得每次用完parser后,要重置一次parser。要不然就得不到我们想要的内容了。 NodeFilter innerFilter = new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal")); NodeFilter xk = new HasParentFilter(innerFilter); NodeList nodes = parser.extractAllNodesThatMatch(xk); System.out.println(nodes.size()); for (int i = 0; i < nodes.size(); i++) { Node time = nodes.elementAt(i); // System.out.println(time.toPlainTextString().trim().replace(" ", // "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", "")); System.out.println( replaceBlank(time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", ""))); } } catch (ParserException ex) { Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex); } }
// 获取一个网站上的链接,filter来过滤链接 public static Set<String> extracLinks(String url, Cobweb cobweb) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding(cobweb.getCharSet()); // 过滤<frame >标签的filter,用来提取frame 标签里的src 属性 NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤<a> 标签和<frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL if (cobweb.accept(linkUrl)) { links.add( // java.net.URLEncoder.encode(linkUrl)); linkUrl .replaceAll("\\?", "\\%3F") // 转码 .replaceAll("\\&", "\\%26") .replaceAll("\\|", "\\%124") .replaceAll("\\#", "")); } ; } else { // <frame>标签 // 提取frame 里src 属性的链接,如<frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); if (cobweb.accept(frameUrl)) { links.add(frameUrl); } } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * 获取滚动品牌 * * @param path * @param city * @param fileName * @return */ public static Map<String, String> getBrandInfo(String path, String city, String fileName) { Map<String, String> brandMap = new LinkedHashMap<String, String>(); try { StringBuilder filePath = new StringBuilder(); filePath.append(PATH); filePath.append(city); filePath.append(INCLUDE); filePath.append(fileName); filePath.append(STUFF); // 开始解析 Parser parser = new Parser(filePath.toString()); // 过滤出<a></a>标签 NodeFilter divFilter = new NodeClassFilter(Div.class); NodeList classList = parser.extractAllNodesThatMatch(divFilter); NodeList hrefList = null; NodeList imgList = null; Node picNode = null; Node hrefNode = null; Node imgNode = null; String classStr = ""; String hrefStr = ""; String imgStr = ""; String imgClass = ""; for (int i = 0; i < classList.size(); i++) { picNode = classList.elementAt(i); classStr = ((Div) picNode).getAttribute("class"); if ("business_list_pic".equalsIgnoreCase(classStr)) { hrefList = picNode.getChildren(); for (int j = 0; j < hrefList.size(); j++) { hrefNode = hrefList.elementAt(j); if (hrefNode instanceof LinkTag) { hrefStr = ((LinkTag) hrefNode).getLink(); // 有用品牌id,获取到id hrefStr = MParseBrand.getBrandId(hrefStr); imgList = hrefNode.getChildren(); for (int k = 0; k < imgList.size(); k++) { imgNode = imgList.elementAt(k); if (imgNode instanceof ImageTag) { imgClass = ((ImageTag) imgNode).getAttribute("class"); if (null != imgClass) { imgStr = ((ImageTag) imgNode).getAttribute("src"); if (null == imgStr) { imgStr = ((ImageTag) imgNode).getAttribute("original"); } } } } brandMap.put(hrefStr, imgStr); } } } } } catch (Exception e) { e.printStackTrace(); } return brandMap; }
/**
 * Resets {@code htmlTags} to an empty list and walks every top-level node of the current
 * document text, handing each node to its own fresh {@code VHtmlNodeVisitor}.
 *
 * @throws ParserException if the document text cannot be parsed
 */
private void parseHtml() throws ParserException {
  htmlTags = new ArrayList();

  Parser parser = new Parser();
  parser.setInputHTML(fDocument.get());

  NodeIterator nodes = parser.elements();
  while (nodes.hasMoreNodes()) {
    nodes.nextNode().accept(new VHtmlNodeVisitor());
  }
}
/** * 提取网页中所有的IssueComment元素 * * @param source */ private List<IssueCommentEvent> processComment( NodeList nodeList, List<IssueCommentEvent> icList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().matches("div id=\"issuecomment-.*\".*+")) { IssueCommentEvent i = new IssueCommentEvent(); // TODO 解析comment工作 Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); i.setActor(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); i.setCommentBody(contentNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; i.setCreatedAt(time); System.out.println(time); } icList.add(i); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processComment(childList, icList); } } } return icList; }
/** * 处理开启pullrequest的需求 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"issue-")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("open"); Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); pullRequestEvent.setBody(commentNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author"); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processOpenPull(childList, pList); } } } return pList; }
/**
 * Retrieves the value of a table cell by appending the text of the cell's child nodes. For
 * composite tags like span or div the inner text is appended. A cell without children yields
 * the same result as an empty cell.
 *
 * @param cell the table cell
 * @return the concatenated text, trimmed and with every match of {@code " "} removed
 */
public static String getValue(TagNode cell) {
  StringBuilder value = new StringBuilder().append(EMPTY);
  // getChildren() is null for a tag without children; treat that as "no text".
  if (cell.getChildren() != null) {
    for (Node child : cell.getChildren().toNodeArray()) {
      if (child instanceof CompositeTag) {
        value.append(((CompositeTag) child).getStringText());
      } else {
        value.append(child.getText());
      }
    }
  }
  return value.toString().trim().replaceAll(" ", EMPTY);
}
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, LinkFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); // parser.setEncoding("utf8"); // 过滤 <frame >标签的 filter,用来提取 frame 标签里的 src 属性所表示的链接 NodeFilter frameFilter = new NodeFilter() { /** */ private static final long serialVersionUID = 1L; public boolean accept(Node node) { if (node.getText().startsWith("iframe") && node.getText().contains("src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤 <a> 标签和 <frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) // <a> 标签 { LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // url可能出现在src,href等属性中 if (filter.accept(linkUrl)) links.add(linkUrl); } else // <frame> 标签 { // 提取 frame 里 src 属性的链接如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src=\""); frame = frame.substring(start); int end = frame.indexOf("\">"); if (end == -1) { end = frame.indexOf("?"); } String frameUrl = frame.substring(5, end - 1); if (filter.accept(frameUrl)) links.add(frameUrl); } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/**
 * Parses a custom composite tag wrapping the text {@code Hello} and checks the child count,
 * the positions and line numbers of the tag, and the type and text of the child.
 */
public void testCompositeTagWithOneTextChild() throws ParserException {
  createParser("<Custom>" + "Hello" + "</Custom>");
  CustomTag outer = parseCustomTag(1);

  assertEquals("child count", 1, outer.getChildCount());
  assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());

  // Character offsets and line numbers of the opening tag.
  assertEquals("starting loc", 0, outer.getStartPosition());
  assertEquals("ending loc", 8, outer.getEndPosition());
  assertEquals("starting line position", 0, outer.getStartingLineNumber());
  assertEquals("ending line position", 0, outer.getEndingLineNumber());

  Node onlyChild = outer.childAt(0);
  assertType("child", Text.class, onlyChild);
  assertStringEquals("child text", "Hello", onlyChild.toPlainTextString());
}
/**
 * Parses a custom composite tag wrapping a single {@code <Hello>} tag and checks the child
 * count, the positions of the outer tag and its end tag, and the type and html of the child.
 */
public void testCompositeTagWithTagChild() throws ParserException {
  final String inner = "<Hello>";
  createParser("<Custom>" + inner + "</Custom>");
  CustomTag outer = parseCustomTag(1);

  assertEquals("child count", 1, outer.getChildCount());
  assertFalse("custom tag should not be xml end tag", outer.isEmptyXmlTag());

  // Positions of the opening <Custom> tag and of its end tag.
  assertEquals("starting loc", 0, outer.getStartPosition());
  assertEquals("ending loc", 8, outer.getEndPosition());
  assertEquals("custom tag starting loc", 0, outer.getStartPosition());
  assertEquals("custom tag ending loc", 24, outer.getEndTag().getEndPosition());

  Node onlyChild = outer.childAt(0);
  assertType("child", Tag.class, onlyChild);
  assertStringEquals("child html", inner, onlyChild.toHtml());
}
private List<String> getHotTourRefs() { List<String> refs = new ArrayList<String>(); try { Node[] nodes = getNodes(URL); for (Node node : nodes) { if (node.getText().contains("class=\"latestnews\"") && node.getText().length() > 40) { refs.add(URL.substring(0, URL.length() - 2) + node.getText().split("\"")[1]); } } } catch (ParserException e) { e .printStackTrace(); // To change body of catch statement use File | Settings | File // Templates. } return refs; }
/** * Given an input, makes it safe for HTML displaying. Removes any not allowed HTML tag or * attribute, as well unwanted JavaScript statements inside the tags. * * @param contents the input to analyze * @return the modified and safe string */ public String makeSafe(String contents) { if (contents == null || contents.length() == 0) { return contents; } StringBuffer sb = new StringBuffer(contents.length()); try { Lexer lexer = new Lexer(contents); Node node; while ((node = lexer.nextNode()) != null) { boolean isTextNode = node instanceof TextNode; if (isTextNode) { // Text nodes are raw data, so we just // strip off all possible HTML content String text = node.toHtml(); if (text.indexOf('>') > -1 || text.indexOf('<') > -1) { text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); text = text.replaceAll("\"", """); node.setText(text); } } if (isTextNode || (node instanceof Tag && this.isTagWelcome(node))) { sb.append(node.toHtml()); } else { String text = node.toHtml(); text = text.replaceAll("<", "<"); text = text.replaceAll(">", ">"); sb.append(text); } } } catch (Exception e) { throw new ForumException("Error while parsing HTML: " + e, e); } return sb.toString(); }
private void getPriceAndLabel(Node node, AmazonProduct product) throws Exception { NodeList childList = node.getChildren(); List<String> productvalue = new ArrayList<String>(); processNodeList(childList, productvalue); System.out.println(productvalue); product.label = productvalue.get(0); // String price = productvalue.get(3); // product.price = getprice(price); }
private String processHTML(Node node) { String html = node.toHtml(); // String html = node.getChildren().elementAt(3).toHtml(); // html = html + node.getChildren().elementAt(5).toHtml(); // html = html + node.getChildren().elementAt(9).toHtml(); // html = html // .replaceAll("ProductDisplay", // "http://www.abercrombie.com/webapp/wcs/stores/servlet/ProductDisplay"); // html = html.replace("//anf", "http://anf"); return html; }
/**
 * Compiles a template by lexing it node by node and feeding each node to an
 * {@code HTMLNodeVisitor}.
 *
 * @param template the template source
 * @param parserConfiguration configuration handed to the visitor
 * @return a compiled template wrapping the visitor's root node
 * @throws RuntimeException wrapping the {@code ParserException} if lexing fails
 */
@Override
public CompiledTemplate compile(
    final String template, final ParserConfiguration parserConfiguration) {
  Lexer lexer = new Lexer(new Page(new StringSource(template)));
  HTMLNodeVisitor visitor =
      new HTMLNodeVisitor(
          ehtAttributeprefix, expressionCompiler, inlineCompilers, parserConfiguration);

  visitor.beginParsing();
  try {
    Node current;
    while ((current = lexer.nextNode()) != null) {
      current.accept(visitor);
    }
  } catch (ParserException e) {
    throw new RuntimeException(e);
  }
  visitor.finishedParsing();

  return new CompiledTemplateImpl(visitor.getRootNode());
}
/**
 * Command line entry point: prints every node lexed from the given URL, or a usage message
 * when no argument is supplied. Parser failures are reported on standard output.
 *
 * @param args [0] The URL to parse.
 * @exception MalformedURLException If the provided URL cannot be resolved.
 * @exception ParserException If the parse fails.
 */
public static void main(String[] args) throws MalformedURLException, ParserException {
  if (args.length <= 0) {
    System.out.println("HTML Lexer v" + getVersion() + "\n");
    System.out.println();
    System.out.println("usage: java -jar htmllexer.jar <url>");
    return;
  }

  try {
    ConnectionManager manager = Page.getConnectionManager();
    Lexer lexer = new Lexer(manager.openConnection(args[0]));
    for (Node node = lexer.nextNode(false); node != null; node = lexer.nextNode(false)) {
      System.out.println(node.toString());
    }
  } catch (ParserException pe) {
    System.out.println(pe.getMessage());
    if (pe.getThrowable() != null) {
      System.out.println(pe.getThrowable().getMessage());
    }
  }
}
private void processNodeList(NodeList list, List<String> valueList) { // 迭代开始 SimpleNodeIterator iterator = list.elements(); while (iterator.hasMoreNodes()) { Node node = iterator.nextNode(); // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null == childList) { // 得到值节点的值 String result = node.toPlainTextString().trim(); // 若包含关键字,则简单打印出来文本 // System.out.println(result); if (result != null && !"".equals(result)) valueList.add(result); } // end if // 孩子节点不为空,继续迭代该孩子节点 else { processNodeList(childList, valueList); } // end else } // end wile }
/** * 处理对pullrequest的review时,comment的操作, 与processSubPullRequestReviewComment配合一起使用 * * @param nodeList * @param prList * @return */ public List<PullRequestReviewCommentEvent> processPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"diff-for-comment-")) { String discussionId = node.getText().split("\"")[1]; System.out.println(discussionId); NodeList subNodeList = node.getChildren(); prList = processSubPullRequestReviewComment(subNodeList, prList, discussionId); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processPullRequestReviewComment(childList, prList); } } } return prList; }
/** * 处理Reference了当前pullrequest的操作 * * @param source */ public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("ref"); Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\""); pullRequestEvent.setBody( anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString()); Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+"); Matcher artifactMatcher = artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText()); if (artifactMatcher.find()) { String anotherAtifact = artifactMatcher.group(); pullRequestEvent.setPullrequestBaseRef(anotherAtifact); System.out.println(anotherAtifact); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processReference(childList, pList); } } } return pList; }
/**
 * Given an input, analyze each HTML tag and remove unsecured attributes from them.
 *
 * @param contents The content to verify
 * @return the content, secure; {@code null} or empty input is returned unchanged
 * @throws ForumException if the input cannot be parsed
 */
public String ensureAllAttributesAreSafe(String contents) {
  // Same guard as makeSafe: there is nothing to analyze, and null must not cause an NPE.
  if (contents == null || contents.length() == 0) {
    return contents;
  }

  StringBuilder sb = new StringBuilder(contents.length());
  try {
    Lexer lexer = new Lexer(contents);
    Node node;
    while ((node = lexer.nextNode()) != null) {
      if (node instanceof Tag) {
        // Validate the tag's attributes before it is emitted.
        this.checkAndValidateAttributes((Tag) node, false);
      }
      sb.append(node.toHtml());
    }
  } catch (Exception e) {
    throw new ForumException("Problems while parsing HTML: " + e, e);
  }
  return sb.toString();
}
private boolean getName(Node node) { NodeList childList = node.getChildren(); List<String> productvalue = new ArrayList<String>(); processNodeList(childList, productvalue); // System.out.println(productvalue); int i = 0; while (i < productvalue.size()) { String Quantity = productvalue.get(i); if (Quantity.startsWith("Quantity")) { return true; } i = i + 1; } return false; }
/**
 * Accepts a node when it is a {@code LinkTag} whose URL contains the configured pattern. The
 * comparison is done on upper-cased strings unless the filter is case sensitive.
 *
 * @param node The node to check.
 * @return <code>true</code> if the node is a link with the pattern.
 */
public boolean accept(Node node) {
  if (!LinkTag.class.isAssignableFrom(node.getClass())) {
    return false;
  }
  String link = ((LinkTag) node).getLink();
  if (mCaseSensitive) {
    return link.indexOf(mPattern) > -1;
  }
  return link.toUpperCase().indexOf(mPattern.toUpperCase()) > -1;
}
/** * 处理移除里程碑动作 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processRemoveMileStone( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) { PullRequestEvent p = new PullRequestEvent(); p.setAction("removeMilestone"); Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); Pattern milestonePattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+"); Matcher milestoneMatcher = milestonePattern.matcher(milestoneNode.getText()); if (milestoneMatcher.find()) { String milestone = milestoneMatcher.group().split("\"")[0]; p.setBody(milestone); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } pList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processRemoveMileStone(childList, pList); } } } return pList; }
protected void emit(ReplayParseContext context, String pre, Node node, String post) throws IOException { OutputStream out = context.getOutputStream(); if (out != null) { // Charset charset = Charset.forName(context.getOutputCharset()); String charset = context.getOutputCharset(); if (pre != null) { out.write(pre.getBytes(charset)); } if (node != null) { out.write(node.toHtml(true).getBytes(charset)); } if (post != null) { out.write(post.getBytes(charset)); } } }
/** * 处理labeled操作 * * @param source */ public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("labeled"); List<Node> lableList = new ArrayList<Node>(); lableList = DownloadUtil.getLableList(node, "style=\"color:", lableList); String lables = ""; for (int i = 0; i < lableList.size(); i++) { lables += lableList.get(i).toPlainTextString(); if (i != lableList.size() - 1) { lables += ","; } } System.out.println(lables); pullRequestEvent.setPullrequestBaseLabels(lables); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processLabled(childList, pList); } } } return pList; }
/** * 处理取消指派某人操作 * * <p>跟之前一样,取消指派的是后面的家伙 * * @param nodeList * @param pList * @return */ private List<PullRequestEvent> processUnassigned( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) { PullRequestEvent pEvent = new PullRequestEvent(); pEvent.setAction("assigned"); Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); if (actorNode != null) { pEvent.setActor(actorNode.toPlainTextString()); } else { pEvent.setActor(assignedNode.toPlainTextString()); } System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pEvent.setCreatedAt(time); } pList.add(pEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processUnassigned(childList, pList); } } } return pList; }