/** * 提取网页中所有的IssueComment元素 * * @param source */ private List<IssueCommentEvent> processComment( NodeList nodeList, List<IssueCommentEvent> icList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().matches("div id=\"issuecomment-.*\".*+")) { IssueCommentEvent i = new IssueCommentEvent(); // TODO 解析comment工作 Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); i.setActor(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); i.setCommentBody(contentNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; i.setCreatedAt(time); System.out.println(time); } icList.add(i); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processComment(childList, icList); } } } return icList; }
/** * 处理开启pullrequest的需求 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"issue-")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("open"); Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); pullRequestEvent.setBody(commentNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author"); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processOpenPull(childList, pList); } } } return pList; }
public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) { SimpleNodeIterator sni2 = nodeList.elements(); while (sni2.hasMoreNodes()) { Node node2 = sni2.nextNode(); if (node2.getText().contains("div id=\"discussion_r")) { PullRequestReviewCommentEvent p = new PullRequestReviewCommentEvent(); // TODO 解析comment工作 p.setDiscussionId(discussionId); Node actorNode = DownloadUtil.getSomeChild(node2, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node2, "div class=\"comment-body"); p.setCommentBody(contentNode.toPlainTextString()); System.out.println(contentNode.toPlainTextString().trim()); Node timeNode = DownloadUtil.getSomeChild(node2, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } prList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node2.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processSubPullRequestReviewComment(childList, prList, discussionId); } } } return prList; }
/** * 提取网页中的删除操作 * * @param nodeList * @param dList * @return */ public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("discussion-item-head_ref_deleted")) { DeleteEvent d = new DeleteEvent(); // TODO 解析comment工作 Node deleteNode = DownloadUtil.getSomeChild(node, "span title=\""); d.setRef(deleteNode.getText().split("\"")[1]); System.out.println(deleteNode.getText().split("\"")[1]); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); d.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; d.setDeleteAt(time); } dList.add(d); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processDelete(childList, dList); } } } return dList; }
/** Test scan with data which is of diff nodes type */ public void testScan() throws ParserException { createParser( "<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>", "http://www.yahoo.com"); parser.setNodeFactory( new PrototypicalNodeFactory( new Tag[] { new LinkTag(), new ImageTag(), })); parseAndAssertNodeCount(1); assertTrue("Node should be a link node", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; // Get the link data and cross-check Node[] dataNode = new Node[10]; int i = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { dataNode[i++] = e.nextNode(); } assertEquals("Number of data nodes", new Integer(2), new Integer(i)); assertTrue("First data node should be an Image Node", dataNode[0] instanceof ImageTag); assertTrue("Second data node shouls be a String Node", dataNode[1] instanceof Text); // Check the contents of each data node ImageTag imageTag = (ImageTag) dataNode[0]; assertEquals("Image URL", "http://www.yahoo.com/abcd.jpg", imageTag.getImageURL()); Text stringNode = (Text) dataNode[1]; assertEquals("String Contents", "Hello World", stringNode.getText()); }
public void testLinkDataContents() throws ParserException { createParser( "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>", "http://transfer.go.com"); parser.setNodeFactory( new PrototypicalNodeFactory( new Tag[] { new LinkTag(), new ImageTag(), })); parseAndAssertNodeCount(1); assertTrue("Node 0 should be a link tag", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; assertEquals( "Link URL", "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689", linkTag.getLink()); assertEquals("Link Text", "", linkTag.getLinkText()); Node[] containedNodes = new Node[10]; int i = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { containedNodes[i++] = e.nextNode(); } assertEquals("There should be 5 contained nodes in the link tag", 5, i); assertTrue( "First contained node should be an image tag", containedNodes[0] instanceof ImageTag); ImageTag imageTag = (ImageTag) containedNodes[0]; assertEquals( "Image Location", "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif", imageTag.getImageURL()); assertEquals("Image Height", "60", imageTag.getAttribute("HEIGHT")); assertEquals("Image Width", "468", imageTag.getAttribute("WIDTH")); assertEquals("Image Border", "0", imageTag.getAttribute("BORDER")); assertEquals( "Image Alt", "See Signs in Theaters 8-2 - Starring Mel Gibson", imageTag.getAttribute("ALT")); assertTrue("Second contained node should be Tag", containedNodes[1] instanceof Tag); Tag tag1 = (Tag) containedNodes[1]; assertEquals( "Tag Contents", "font 
face=\"verdana,arial,helvetica\" SIZE=\"1\"", tag1.getText()); assertTrue("Third contained node should be Tag", containedNodes[2] instanceof Tag); Tag tag2 = (Tag) containedNodes[2]; assertEquals("Tag Contents", "b", tag2.getText()); assertTrue("Fourth contained node should be a Tag", containedNodes[3] instanceof Tag); Tag tag = (Tag) containedNodes[3]; assertTrue("Fourth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fourth Tag contents", "/b", tag.getText()); assertTrue("Fifth contained node should be a Tag", containedNodes[4] instanceof Tag); tag = (Tag) containedNodes[4]; assertTrue("Fifth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fifth Tag contents", "/font", tag.getText()); }
/**
 * Gets a frame by name. Names are compared without case sensitivity; both sides are converted
 * to uppercase with the locale provided.
 *
 * <p>Only {@link FrameTag} nodes returned by {@code getFrames()} are considered, and the first
 * match wins. A frame whose name is {@code null} is skipped rather than dereferenced.
 *
 * @param name The name of the frame to retrieve.
 * @param locale The locale to use when converting to uppercase.
 * @return The specified frame or <code>null</code> if it wasn't found.
 */
public FrameTag getFrame(String name, Locale locale) {
  String wanted = name.toUpperCase(locale);
  for (SimpleNodeIterator e = getFrames().elements(); e.hasMoreNodes(); ) {
    Node node = e.nextNode();
    if (node instanceof FrameTag) {
      FrameTag frame = (FrameTag) node;
      // NOTE(review): getFrameName() is not visible here; presumably it can be null for a
      // frame without a name attribute, so guard before upper-casing.
      String frameName = frame.getFrameName();
      if (frameName != null && frameName.toUpperCase(locale).equals(wanted)) {
        return frame;
      }
    }
  }
  return null;
}
/** * 处理对pullrequest的review时,comment的操作, 与processSubPullRequestReviewComment配合一起使用 * * @param nodeList * @param prList * @return */ public List<PullRequestReviewCommentEvent> processPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"diff-for-comment-")) { String discussionId = node.getText().split("\"")[1]; System.out.println(discussionId); NodeList subNodeList = node.getChildren(); prList = processSubPullRequestReviewComment(subNodeList, prList, discussionId); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processPullRequestReviewComment(childList, prList); } } } return prList; }
private void processNodeList(NodeList list, List<String> valueList) { // 迭代开始 SimpleNodeIterator iterator = list.elements(); while (iterator.hasMoreNodes()) { Node node = iterator.nextNode(); // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null == childList) { // 得到值节点的值 String result = node.toPlainTextString().trim(); // 若包含关键字,则简单打印出来文本 // System.out.println(result); if (result != null && !"".equals(result)) valueList.add(result); } // end if // 孩子节点不为空,继续迭代该孩子节点 else { processNodeList(childList, valueList); } // end else } // end wile }
/** * 处理Reference了当前pullrequest的操作 * * @param source */ public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("ref"); Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\""); pullRequestEvent.setBody( anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString()); Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+"); Matcher artifactMatcher = artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText()); if (artifactMatcher.find()) { String anotherAtifact = artifactMatcher.group(); pullRequestEvent.setPullrequestBaseRef(anotherAtifact); System.out.println(anotherAtifact); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processReference(childList, pList); } } } return pList; }
/** * 处理labeled操作 * * @param source */ public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("labeled"); List<Node> lableList = new ArrayList<Node>(); lableList = DownloadUtil.getLableList(node, "style=\"color:", lableList); String lables = ""; for (int i = 0; i < lableList.size(); i++) { lables += lableList.get(i).toPlainTextString(); if (i != lableList.size() - 1) { lables += ","; } } System.out.println(lables); pullRequestEvent.setPullrequestBaseLabels(lables); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processLabled(childList, pList); } } } return pList; }
public void testBadImageInLinkBug() throws ParserException { createParser( "<a href=\"registration.asp?EventID=1272\"><img border=\"0\" src=\"\\images\\register.gif\"</a>", "http://www.fedpage.com/Event.asp?EventID=1272"); parseAndAssertNodeCount(1); assertTrue("Node should be a HTMLLinkTag", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; // Get the image tag from the link Node insideNodes[] = new Node[10]; int j = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { insideNodes[j++] = e.nextNode(); } assertEquals("Number of contained internal nodes", 1, j); assertTrue(insideNodes[0] instanceof ImageTag); ImageTag imageTag = (ImageTag) insideNodes[0]; assertEquals( "Image Tag Location", "http://www.fedpage.com/images\\register.gif", imageTag.getImageURL()); }
/** * 处理取消指派某人操作 * * <p>跟之前一样,取消指派的是后面的家伙 * * @param nodeList * @param pList * @return */ private List<PullRequestEvent> processUnassigned( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) { PullRequestEvent pEvent = new PullRequestEvent(); pEvent.setAction("assigned"); Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); if (actorNode != null) { pEvent.setActor(actorNode.toPlainTextString()); } else { pEvent.setActor(assignedNode.toPlainTextString()); } System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pEvent.setCreatedAt(time); } pList.add(pEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processUnassigned(childList, pList); } } } return pList; }
/** * 处理移除里程碑动作 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processRemoveMileStone( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) { PullRequestEvent p = new PullRequestEvent(); p.setAction("removeMilestone"); Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); Pattern milestonePattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+"); Matcher milestoneMatcher = milestonePattern.matcher(milestoneNode.getText()); if (milestoneMatcher.find()) { String milestone = milestoneMatcher.group().split("\"")[0]; p.setBody(milestone); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } pList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processRemoveMileStone(childList, pList); } } } return pList; }
/**
 * Output a string representing this object tag.
 *
 * <p>The result lists the tag's attributes, then its parameters (or {@code No Params found.}
 * when there are none), then every child node that is not a {@code PARAM} tag under a
 * {@code Miscellaneous items :} heading.
 *
 * @return A string showing the contents of the object tag.
 */
@Override
public String toString() {
  StringBuilder ret = new StringBuilder(500);
  ret.append("Object Tag\n");
  ret.append("**********\n");
  ret.append("ClassId = ").append(getObjectClassId()).append("\n");
  ret.append("CodeBase = ").append(getObjectCodeBase()).append("\n");
  ret.append("CodeType = ").append(getObjectCodeType()).append("\n");
  ret.append("Data = ").append(getObjectData()).append("\n");
  ret.append("Height = ").append(getObjectHeight()).append("\n");
  ret.append("Standby = ").append(getObjectStandby()).append("\n");
  ret.append("Type = ").append(getObjectType()).append("\n");
  ret.append("Width = ").append(getObjectWidth()).append("\n");

  // The old test compared the entry iterator with null, which is never true, so the
  // "No Params found." line could not be produced. Test the map itself instead.
  HashMap parameters = getObjectParams();
  if (parameters == null || parameters.isEmpty()) {
    ret.append("No Params found.\n");
  } else {
    int cnt = 0;
    for (Iterator params = parameters.entrySet().iterator(); params.hasNext(); cnt++) {
      Map.Entry entry = (Map.Entry) params.next();
      String paramName = (String) entry.getKey();
      String paramValue = (String) entry.getValue();
      ret.append(cnt);
      ret.append(": Parameter name = ");
      ret.append(paramName);
      ret.append(", Parameter value = ");
      ret.append(paramValue);
      ret.append("\n");
    }
  }

  boolean found = false;
  for (SimpleNodeIterator e = children(); e.hasMoreNodes(); ) {
    Node node = e.nextNode();
    // PARAM children were already reported through the parameter map above.
    if (node instanceof Tag && ((Tag) node).getTagName().equals("PARAM")) {
      continue;
    }
    if (!found) {
      ret.append("Miscellaneous items :\n");
    } else {
      ret.append(" ");
    }
    found = true;
    ret.append(node.toString());
  }
  if (found) {
    ret.append("\n");
  }
  ret.append("End of Object Tag\n");
  ret.append("*****************\n");
  return ret.toString();
}
private void scanPage() throws IOException, ParserException, ParseException { URL u = new URL(this.url); HttpURLConnection conn = (HttpURLConnection) u.openConnection(); Parser parser = new Parser(conn); System.setProperty("sun.net.client.defaultConnectTimeout", "30000000"); // jdk1.4换成这个,连接超时 System.setProperty("sun.net.client.defaultReadTimeout", "30000000"); // jdk1.4换成这个,读操作超时 // con.setConnectTimeout(5000);//jdk 1.5换成这个,连接超时 // con.setReadTimeout(5000);//jdk 1.5换成这个,读操作超时 parser.setEncoding("UTF-8"); NodeFilter filter = new NodeClassFilter(CompositeTag.class); NodeList tags = parser.extractAllNodesThatMatch(filter); SimpleNodeIterator iter = tags.elements(); CompositeTag tag = null; while (iter.hasMoreNodes()) { tag = (CompositeTag) iter.nextNode(); String id = tag.getAttribute("id"); String cls = tag.getAttribute("class"); if ((tag instanceof LinkTag)) { LinkTag lt = (LinkTag) tag; if (cls == null) { continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Category")) { this.category = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Sub-Category")) { this.subCategory = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Brand")) { this.brand = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*PrForm*Free-Shipping")) { this.freight = "Free Shipping!"; } else if (cls.equalsIgnoreCase("link fn")) { this.pname = lt.getStringText(); continue; } } else if ((tag instanceof LabelTag)) { LabelTag lt = (LabelTag) tag; if ((id != null) && (id.startsWith("label")) && (cls != null) && (cls.startsWith("d"))) { String l = lt.getLabel(); l = l.replace("\n", ""); int idx = l.indexOf(40); if (idx > 0) { l = l.substring(0, idx); } this.dimNames.put(cls, l); } } else if (!(tag instanceof SelectTag)) { if ((tag instanceof Span)) { if ((id != null) && (id.equalsIgnoreCase("sku"))) { String sku = tag.getStringText(); this.pid = sku.substring(sku.indexOf(35) + 1); } } else if 
((tag instanceof Bullet)) { Bullet b = (Bullet) tag; String text = b.getStringText().trim(); if (text.startsWith("Weight")) { int idx = text.indexOf(":"); this.weight = text.substring(idx + 1).trim(); } } else if ((tag instanceof Div)) { Div div = (Div) tag; if (cls == null) { continue; } if (cls.equalsIgnoreCase("description")) { StringBuilder sb = new StringBuilder(); BulletList bullets = (BulletList) div.getChild(0); SimpleNodeIterator bls = bullets.elements(); while (bls.hasMoreNodes()) { Node n = bls.nextNode(); if ((n instanceof Bullet)) { Bullet bl = (Bullet) n; sb.append(bl.getStringText()); } } this.intro = sb.toString(); } } else if ((this.items == null) && ((tag instanceof ScriptTag))) { this.items = readScript((ScriptTag) tag); } } } }