/** * 提取网页中所有的IssueComment元素 * * @param source */ private List<IssueCommentEvent> processComment( NodeList nodeList, List<IssueCommentEvent> icList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().matches("div id=\"issuecomment-.*\".*+")) { IssueCommentEvent i = new IssueCommentEvent(); // TODO 解析comment工作 Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); i.setActor(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); i.setCommentBody(contentNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; i.setCreatedAt(time); System.out.println(time); } icList.add(i); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processComment(childList, icList); } } } return icList; }
/** * 处理开启pullrequest的需求 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"issue-")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("open"); Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); pullRequestEvent.setBody(commentNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author"); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processOpenPull(childList, pList); } } } return pList; }
public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) { SimpleNodeIterator sni2 = nodeList.elements(); while (sni2.hasMoreNodes()) { Node node2 = sni2.nextNode(); if (node2.getText().contains("div id=\"discussion_r")) { PullRequestReviewCommentEvent p = new PullRequestReviewCommentEvent(); // TODO 解析comment工作 p.setDiscussionId(discussionId); Node actorNode = DownloadUtil.getSomeChild(node2, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node2, "div class=\"comment-body"); p.setCommentBody(contentNode.toPlainTextString()); System.out.println(contentNode.toPlainTextString().trim()); Node timeNode = DownloadUtil.getSomeChild(node2, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } prList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node2.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processSubPullRequestReviewComment(childList, prList, discussionId); } } } return prList; }
/** * 提取网页中的删除操作 * * @param nodeList * @param dList * @return */ public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("discussion-item-head_ref_deleted")) { DeleteEvent d = new DeleteEvent(); // TODO 解析comment工作 Node deleteNode = DownloadUtil.getSomeChild(node, "span title=\""); d.setRef(deleteNode.getText().split("\"")[1]); System.out.println(deleteNode.getText().split("\"")[1]); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); d.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; d.setDeleteAt(time); } dList.add(d); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processDelete(childList, dList); } } } return dList; }
/** * 递归钻取正文信息 * * @param nodeP * @return */ @SuppressWarnings("unchecked") protected List<Node> extractHtml(Node nodeP, String type) throws Exception { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { return null; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { tableList.add(node); } } else { List tempList = extractHtml(node, type); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if ((tableList != null) && (tableList.size() > 0)) { TableContext tc = new TableContext(); tc.setLinkList(new ArrayList()); tc.setTextBuffer(new StringBuffer()); tableNumber++; tc.setTableRow(tableNumber); Iterator ti = tableList.iterator(); // 得到设置的搜索URL String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL); while (ti.hasNext()) { Node node = (Node) ti.next(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag) node; if (!"1".equalsIgnoreCase(type)) { linkTag.setAttribute( "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href"))); } tc.getLinkList().add(linkTag); } else { tc.getTextBuffer().append(node.getText()); } } return tableList; } return null; }
private List<String> getHotTourRefs() { List<String> refs = new ArrayList<String>(); try { Node[] nodes = getNodes(URL); for (Node node : nodes) { if (node.getText().contains("class=\"latestnews\"") && node.getText().length() > 40) { refs.add(URL.substring(0, URL.length() - 2) + node.getText().split("\"")[1]); } } } catch (ParserException e) { e .printStackTrace(); // To change body of catch statement use File | Settings | File // Templates. } return refs; }
// 获取一个网站上的链接,filter来过滤链接 public static Set<String> extracLinks(String url, Cobweb cobweb) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding(cobweb.getCharSet()); // 过滤<frame >标签的filter,用来提取frame 标签里的src 属性 NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤<a> 标签和<frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL if (cobweb.accept(linkUrl)) { links.add( // java.net.URLEncoder.encode(linkUrl)); linkUrl .replaceAll("\\?", "\\%3F") // 转码 .replaceAll("\\&", "\\%26") .replaceAll("\\|", "\\%124") .replaceAll("\\#", "")); } ; } else { // <frame>标签 // 提取frame 里src 属性的链接,如<frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); if (cobweb.accept(frameUrl)) { links.add(frameUrl); } } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * 处理对pullrequest的review时,comment的操作, 与processSubPullRequestReviewComment配合一起使用 * * @param nodeList * @param prList * @return */ public List<PullRequestReviewCommentEvent> processPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"diff-for-comment-")) { String discussionId = node.getText().split("\"")[1]; System.out.println(discussionId); NodeList subNodeList = node.getChildren(); prList = processSubPullRequestReviewComment(subNodeList, prList, discussionId); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processPullRequestReviewComment(childList, prList); } } } return prList; }
/** * 处理Reference了当前pullrequest的操作 * * @param source */ public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("ref"); Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\""); pullRequestEvent.setBody( anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString()); Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+"); Matcher artifactMatcher = artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText()); if (artifactMatcher.find()) { String anotherAtifact = artifactMatcher.group(); pullRequestEvent.setPullrequestBaseRef(anotherAtifact); System.out.println(anotherAtifact); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processReference(childList, pList); } } } return pList; }
/** * 处理labeled操作 * * @param source */ public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("labeled"); List<Node> lableList = new ArrayList<Node>(); lableList = DownloadUtil.getLableList(node, "style=\"color:", lableList); String lables = ""; for (int i = 0; i < lableList.size(); i++) { lables += lableList.get(i).toPlainTextString(); if (i != lableList.size() - 1) { lables += ","; } } System.out.println(lables); pullRequestEvent.setPullrequestBaseLabels(lables); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processLabled(childList, pList); } } } return pList; }
/**
 * Retrieves the value of a table cell by concatenating the text of its child nodes.
 *
 * <p>For composite tags such as {@code span} or {@code div} the inner text
 * ({@code getStringText()}) is appended; for any other node its own text is used.
 * The result is trimmed and then stripped of the characters matched by the final
 * {@code replaceAll}.
 *
 * @param cell the cell whose content is read
 * @return the concatenated cell text; the empty value when the cell has no children
 */
public static String getValue(TagNode cell) {
    // getChildren() is null for a cell without child nodes; treat that as "no content".
    Node[] children =
            cell.getChildren() == null ? new Node[0] : cell.getChildren().toNodeArray();

    StringBuilder value = new StringBuilder(EMPTY);
    for (Node child : children) {
        if (child instanceof CompositeTag) {
            value.append(((CompositeTag) child).getStringText());
        } else {
            value.append(child.getText());
        }
    }
    return value.toString().trim().replaceAll(" ", EMPTY);
}
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, LinkFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); // parser.setEncoding("utf8"); // 过滤 <frame >标签的 filter,用来提取 frame 标签里的 src 属性所表示的链接 NodeFilter frameFilter = new NodeFilter() { /** */ private static final long serialVersionUID = 1L; public boolean accept(Node node) { if (node.getText().startsWith("iframe") && node.getText().contains("src=")) { return true; } else { return false; } } }; // OrFilter 来设置过滤 <a> 标签和 <frame> 标签 OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); // 得到所有经过过滤的标签 NodeList list = parser.extractAllNodesThatMatch(linkFilter); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) // <a> 标签 { LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // url可能出现在src,href等属性中 if (filter.accept(linkUrl)) links.add(linkUrl); } else // <frame> 标签 { // 提取 frame 里 src 属性的链接如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src=\""); frame = frame.substring(start); int end = frame.indexOf("\">"); if (end == -1) { end = frame.indexOf("?"); } String frameUrl = frame.substring(5, end - 1); if (filter.accept(frameUrl)) links.add(frameUrl); } } } catch (ParserException e) { e.printStackTrace(); } return links; }
/** * 处理取消指派某人操作 * * <p>跟之前一样,取消指派的是后面的家伙 * * @param nodeList * @param pList * @return */ private List<PullRequestEvent> processUnassigned( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) { PullRequestEvent pEvent = new PullRequestEvent(); pEvent.setAction("assigned"); Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); if (actorNode != null) { pEvent.setActor(actorNode.toPlainTextString()); } else { pEvent.setActor(assignedNode.toPlainTextString()); } System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pEvent.setCreatedAt(time); } pList.add(pEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processUnassigned(childList, pList); } } } return pList; }
/** * 处理移除里程碑动作 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processRemoveMileStone( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) { PullRequestEvent p = new PullRequestEvent(); p.setAction("removeMilestone"); Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); Pattern milestonePattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+"); Matcher milestoneMatcher = milestonePattern.matcher(milestoneNode.getText()); if (milestoneMatcher.find()) { String milestone = milestoneMatcher.group().split("\"")[0]; p.setBody(milestone); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } pList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processRemoveMileStone(childList, pList); } } } return pList; }
// 获取一个网站上的链接,filter 用来过滤链接 public static Set<String> extracLinks(String url, NodeFilter filter) { Set<String> links = new HashSet<String>(); try { Parser parser = new Parser(url); parser.setEncoding("UTF-8"); @SuppressWarnings("serial") NodeFilter frameFilter = new NodeFilter() { public boolean accept(Node node) { if (node.getText().startsWith("frame src=")) { return true; } else { return false; } } }; OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter); NodeList list = parser.extractAllNodesThatMatch(linkFilter); System.out.println("length=" + list.size()); for (int i = 0; i < list.size(); i++) { Node tag = list.elementAt(i); if (tag instanceof LinkTag) { // <a> 标签 LinkTag link = (LinkTag) tag; String linkUrl = link.getLink(); // URL /* * if (filter.accept(linkUrl)) { links.add(linkUrl); } */ System.out.println("linkUrl=" + linkUrl); if (filter.accept(tag)) { links.add(linkUrl); } } else { // <frame> 标签 // 提取 frame 里 src 属性的链接,如 <frame src="test.html"/> String frame = tag.getText(); int start = frame.indexOf("src="); frame = frame.substring(start); int end = frame.indexOf(" "); if (end == -1) { end = frame.indexOf(">"); } String frameUrl = frame.substring(5, end - 1); // if (filter.accept(frameUrl)) { // links.add(frameUrl); // } System.out.println("frameUrl=" + frameUrl); if (filter.accept(tag)) { links.add(frameUrl); } } } /* * NodeFilter filter = new TagNameFilter("DIV"); NodeList nodes = * parser.extractAllNodesThatMatch(filter); if(nodes!=null) { for * (int i = 0; i < nodes.size(); i++) { Node textnode = (Node) * nodes.elementAt(i); * System.out.println("getText:"+textnode.getText()); * System.out.println * ("================================================="); } } */ /* * for(NodeIterator i = parser.elements (); i.hasMoreNodes(); ) { * Node node = i.nextNode(); * System.out.println("getText:"+node.getText()); * System.out.println("getPlainText:"+node.toPlainTextString()); * System.out.println("toHtml:"+node.toHtml()); * 
System.out.println("toHtml(true):"+node.toHtml(true)); * System.out.println("toHtml(false):"+node.toHtml(false)); * System.out.println("toString:"+node.toString()); * System.out.println * ("================================================="); } */ /* * TextExtractingVisitor visitor = new TextExtractingVisitor(); * parser.visitAllNodesWith(visitor); String textInPage = * visitor.getExtractedText(); System.out.println(textInPage); */ } catch (ParserException e) { e.printStackTrace(); } return links; }
/**
 * Processes one page of posts: collects the pictures of every post not seen before
 * and hands the post on for handling.
 *
 * <p>Posts already present in {@code Main.post_post_hash} are passed to
 * {@code handleNonDownloadPost} instead. Pictures whose address cannot be determined
 * (no match in a remark, missing {@code src}, or no {@code _suffix} before the file
 * extension to derive the thumbnail from) are skipped rather than aborting the page.
 * Any other failure is reported and ends processing of this page.
 *
 * @param address address of the page to process
 * @return {@code false} when the page contains no posts, {@code true} otherwise
 *     (including when an error interrupted processing)
 */
private static boolean handleURL(String address) {
    Main.status(String.format("Processing page \"%s\".", address));
    try {
        Node[] post_nodes = getPosts(address).toNodeArray();
        if (post_nodes.length == 0) {
            return false;
        }
        for (Node post_node : post_nodes) {
            if (!(post_node instanceof TagNode)) {
                continue;
            }
            TagNode post = (TagNode) post_node;
            // The numeric post id follows a five-character prefix in the id attribute.
            Post new_post = new Post(Long.parseLong(post.getAttribute("id").substring(5)));

            if (Main.post_post_hash.containsKey(new_post)) {
                new_post = post_post_hash.get(new_post);
                handleNonDownloadPost(new_post);
                continue;
            }

            // Single-photo posts: the picture address is embedded in a remark node.
            NodeList photo_posts = getPhotoPosts(post.getChildren());
            NodeList remarks = getRemarks(photo_posts);
            for (Node node : remarks.toNodeArray()) {
                Matcher matcher = lores.matcher(node.getText());
                if (!matcher.find()) {
                    // No picture address in this remark; nothing to add.
                    continue;
                }
                String media_url = matcher.group();
                // NOTE(review): presumably strips a fixed 17-character prefix and one
                // trailing character of the matched text — confirm against `lores`.
                media_url = media_url.substring(17, media_url.length() - 1);
                String thumb = deriveSquareThumbnailAddress(media_url);
                if (thumb == null) {
                    continue;
                }
                URL thumb_url = new URL(thumb);
                new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
            }

            // Photoset posts: the pictures live in a separate page loaded by an iframe.
            NodeList photoset_posts = getPhotosetPosts(post.getChildren());
            NodeList iframes = getIFrames(photoset_posts);
            for (Node node : iframes.toNodeArray()) {
                if (!(node instanceof TagNode)) {
                    continue;
                }
                String iframe_url = ((TagNode) node).getAttribute("src");
                if (iframe_url == null) {
                    continue;
                }
                Parser parser2 = new Parser(iframe_url);
                NodeList a_list = parser2.extractAllNodesThatMatch(new TagNameFilter("a"));
                Node[] a_array = a_list.toNodeArray();
                Node[] img_array =
                        a_list.extractAllNodesThatMatch(new TagNameFilter("img"), true)
                                .toNodeArray();
                // There may be fewer images than anchors; never index past either array.
                for (int i = 0; i < a_array.length && i < img_array.length; i++) {
                    String media_url = ((TagNode) img_array[i]).getAttribute("src");
                    if (media_url == null) {
                        continue;
                    }
                    String thumb = deriveSquareThumbnailAddress(media_url);
                    if (thumb == null) {
                        continue;
                    }
                    URL thumb_url = new URL(thumb);
                    new_post.pictures.add(new Picture(new URL(media_url), thumb_url));
                }
            }
            Main.handlePost(new_post);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        Main.status("Error handling post.");
    }
    return true;
}

/**
 * Derives the thumbnail address for a picture address by replacing the text between
 * the last underscore and the last dot with {@code _75sq}.
 *
 * @param media_url picture address
 * @return the thumbnail address, or {@code null} when the address has no underscore
 *     preceding its last dot
 */
private static String deriveSquareThumbnailAddress(String media_url) {
    int underscore = media_url.lastIndexOf("_");
    int dot = media_url.lastIndexOf(".");
    if (underscore < 0 || dot < underscore) {
        return null;
    }
    return media_url.replace(media_url.substring(underscore, dot), "_75sq");
}