/** * 提取网页中的删除操作 * * @param nodeList * @param dList * @return */ public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("discussion-item-head_ref_deleted")) { DeleteEvent d = new DeleteEvent(); // TODO 解析comment工作 Node deleteNode = DownloadUtil.getSomeChild(node, "span title=\""); d.setRef(deleteNode.getText().split("\"")[1]); System.out.println(deleteNode.getText().split("\"")[1]); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); d.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; d.setDeleteAt(time); } dList.add(d); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processDelete(childList, dList); } } } return dList; }
/** * 提取网页中所有的IssueComment元素 * * @param source */ private List<IssueCommentEvent> processComment( NodeList nodeList, List<IssueCommentEvent> icList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().matches("div id=\"issuecomment-.*\".*+")) { IssueCommentEvent i = new IssueCommentEvent(); // TODO 解析comment工作 Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); i.setActor(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); i.setCommentBody(contentNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; i.setCreatedAt(time); System.out.println(time); } icList.add(i); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processComment(childList, icList); } } } return icList; }
/** * 处理开启pullrequest的需求 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"issue-")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("open"); Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); pullRequestEvent.setBody(commentNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author"); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processOpenPull(childList, pList); } } } return pList; }
public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) { SimpleNodeIterator sni2 = nodeList.elements(); while (sni2.hasMoreNodes()) { Node node2 = sni2.nextNode(); if (node2.getText().contains("div id=\"discussion_r")) { PullRequestReviewCommentEvent p = new PullRequestReviewCommentEvent(); // TODO 解析comment工作 p.setDiscussionId(discussionId); Node actorNode = DownloadUtil.getSomeChild(node2, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node2, "div class=\"comment-body"); p.setCommentBody(contentNode.toPlainTextString()); System.out.println(contentNode.toPlainTextString().trim()); Node timeNode = DownloadUtil.getSomeChild(node2, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } prList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node2.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processSubPullRequestReviewComment(childList, prList, discussionId); } } } return prList; }
/**
 * Parses a news page into a list of news items.
 *
 * <p>Comments are stripped from {@code content}, the element with class {@code box_news} is
 * located, and every {@code subItem} element inside it produces one item with a title,
 * description and URL.
 *
 * @param content raw HTML of the page
 * @return one {@link Newsitem} per {@code subItem} element, in document order
 * @throws Exception propagated unchanged from the extraction helpers
 */
public List<Newsitem> parseContent(String content) throws Exception {
    List<Newsitem> result = new ArrayList<Newsitem>();

    String cleaned = this.stripHtmlComments(content);
    Tag newsBox = this.extractTagByClassName(cleaned, "box_news");
    NodeList items = this.extractTagsByClassName(newsBox.toHtml(), "subItem");

    for (int index = 0; index < items.size(); index++) {
        Tag item = (Tag) items.elementAt(index);
        String itemHtml = item.toHtml();
        NewsitemImpl entry = new NewsitemImpl();

        Tag title = this.extractTagByClassName(itemHtml, "subItemtitle");
        entry.setTitle(title.toPlainTextString());

        // The description sits four siblings after the title tag.
        Node cursor = title;
        for (int hop = 0; hop < 4; hop++) {
            cursor = cursor.getNextSibling();
        }
        String description = cursor.toPlainTextString();
        // Blank out everything outside U+0000..U+00FF, then drop the "Read More" suffix.
        description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
        description = description.replace(" Read More...", "");
        entry.setDescription(description.trim());

        NodeList links = extractLinks(itemHtml, "/index.php.*");
        Tag firstLink = (Tag) links.elementAt(0);
        entry.setUrl(URL_PREFIX + firstLink.getAttribute("href"));

        result.add(entry);
    }
    return result;
}
/** * 处理Reference了当前pullrequest的操作 * * @param source */ public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("ref"); Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\""); pullRequestEvent.setBody( anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString()); Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+"); Matcher artifactMatcher = artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText()); if (artifactMatcher.find()) { String anotherAtifact = artifactMatcher.group(); pullRequestEvent.setPullrequestBaseRef(anotherAtifact); System.out.println(anotherAtifact); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processReference(childList, pList); } } } return pList; }
/**
 * A custom composite tag wrapping a single text run must expose exactly one {@link Text}
 * child and report the positions of its opening tag.
 */
public void testCompositeTagWithOneTextChild() throws ParserException {
    createParser("<Custom>Hello</Custom>");
    CustomTag tag = parseCustomTag(1);

    assertEquals("child count", 1, tag.getChildCount());
    assertFalse("custom tag should not be xml end tag", tag.isEmptyXmlTag());

    // Offsets 0..8 span the opening "<Custom>" tag on the first (zero-based) line.
    assertEquals("starting loc", 0, tag.getStartPosition());
    assertEquals("ending loc", 8, tag.getEndPosition());
    assertEquals("starting line position", 0, tag.getStartingLineNumber());
    assertEquals("ending line position", 0, tag.getEndingLineNumber());

    Node onlyChild = tag.childAt(0);
    assertType("child", Text.class, onlyChild);
    assertStringEquals("child text", "Hello", onlyChild.toPlainTextString());
}
/** * 处理取消指派某人操作 * * <p>跟之前一样,取消指派的是后面的家伙 * * @param nodeList * @param pList * @return */ private List<PullRequestEvent> processUnassigned( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) { PullRequestEvent pEvent = new PullRequestEvent(); pEvent.setAction("assigned"); Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); if (actorNode != null) { pEvent.setActor(actorNode.toPlainTextString()); } else { pEvent.setActor(assignedNode.toPlainTextString()); } System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pEvent.setCreatedAt(time); } pList.add(pEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processUnassigned(childList, pList); } } } return pList; }
private void processNodeList(NodeList list, List<String> valueList) { // 迭代开始 SimpleNodeIterator iterator = list.elements(); while (iterator.hasMoreNodes()) { Node node = iterator.nextNode(); // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null == childList) { // 得到值节点的值 String result = node.toPlainTextString().trim(); // 若包含关键字,则简单打印出来文本 // System.out.println(result); if (result != null && !"".equals(result)) valueList.add(result); } // end if // 孩子节点不为空,继续迭代该孩子节点 else { processNodeList(childList, valueList); } // end else } // end wile }
/** * 处理labeled操作 * * @param source */ public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("labeled"); List<Node> lableList = new ArrayList<Node>(); lableList = DownloadUtil.getLableList(node, "style=\"color:", lableList); String lables = ""; for (int i = 0; i < lableList.size(); i++) { lables += lableList.get(i).toPlainTextString(); if (i != lableList.size() - 1) { lables += ","; } } System.out.println(lables); pullRequestEvent.setPullrequestBaseLabels(lables); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processLabled(childList, pList); } } } return pList; }
/** * 处理移除里程碑动作 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processRemoveMileStone( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) { PullRequestEvent p = new PullRequestEvent(); p.setAction("removeMilestone"); Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); Pattern milestonePattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+"); Matcher milestoneMatcher = milestonePattern.matcher(milestoneNode.getText()); if (milestoneMatcher.find()) { String milestone = milestoneMatcher.group().split("\"")[0]; p.setBody(milestone); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } pList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processRemoveMileStone(childList, pList); } } } return pList; }
/** 从课表处,分课表 */ public List<Courses> parseCourses(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { e.printStackTrace(); } NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { e.printStackTrace(); } List<Courses> list = new ArrayList<Courses>(); String schoolyear = ""; String semester = ""; for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); Courses courses = null; boolean isCourse = false; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); String temp = "学年学期:"; int start = info.indexOf(temp); int len = "2012-2013".length(); if (start != -1) { start = start + temp.length(); schoolyear = info.substring(start, start + len); // semester = info.substring(start+len+2); // 网络正常时候测试学期改为数字 semester = info.substring(start + len + 3, start + len + 4); if ("一".equals(semester)) { semester = "1"; } else if ("二".equals(semester)) { semester = "2"; } } if (k == 1 && info.indexOf("[") != -1) { courses = new Courses(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); courses.setCourseCode(courseCode); courses.setCoursesname(coursesname); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); courses.setCredit(credit); } if (k == 3 && isCourse) { courses.setType(info); } if (k == 4 && isCourse) { courses.setLeanType(info); } if (k == 5 && isCourse) { courses.setCheckType(info); } if (k == 6 && isCourse) { courses.setGetType(info); } if (k == 7 && isCourse) { // double score=Double.parseDouble(info); 
courses.setScore(info); } if (k == 8 && isCourse) { courses.setRemark(info); } } // end for k if (courses != null) { courses.setSchoolYear(schoolyear); courses.setSemester(semester); list.add(courses); } } // end for j } } return list; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter filter = new NodeClassFilter(TableTag.class); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); if (tag.getText().indexOf("[课程号]") == -1) { continue; } TableRow[] rows = tag.getRows(); for (int j = 1; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); System.out.println(info + "===" + k); switch (k) { case 1: int start = info.indexOf("["); int end = info.indexOf("]"); timeTable = new TimeTable(); timeTable.setCourseCode(info.substring(start + 1, end)); timeTable.setCourseName(info.substring(end + 1)); break; case 3: timeTable.setCredit(Double.parseDouble(info)); break; case 4: timeTable.setType(info); break; case 5: int t_start = info.indexOf("]"); timeTable.setTeacher(info.substring(t_start + 1)); break; case 8: List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } break; default: break; } } } // end for j } } return list; }
public List<TimeTable> parseTimeTables(String html) { Parser parser = new Parser(); try { parser.setInputHTML(html); parser.setEncoding("utf-8"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } List<TimeTable> list = new ArrayList<TimeTable>(); NodeFilter tagfilter = new NodeClassFilter(TableTag.class); NodeFilter idFilter = new HasAttributeFilter("id", "reportArea"); NodeFilter filter = new AndFilter(tagfilter, idFilter); NodeList nodeList = null; try { nodeList = parser.extractAllNodesThatMatch(filter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } for (int i = 0; i < nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow row = (TableRow) rows[j]; TableColumn[] columns = row.getColumns(); boolean isCourse = false; TimeTable timeTable = null; for (int k = 0; k < columns.length; k++) { Node columnNode = columns[k]; String info = columnNode.toPlainTextString().trim(); // System.out.println(info+"=="+k); if (k == 1 && info.indexOf("[") != -1) { timeTable = new TimeTable(); String courseCode = info.substring(1, 9); String coursesname = info.substring(10); timeTable.setCourseName(coursesname); timeTable.setCourseCode(courseCode); isCourse = true; } if (k == 2 && isCourse) { double credit = Double.parseDouble(info); timeTable.setCredit(credit); } if (k == 3 && isCourse) { timeTable.setType(info); } if (k == 4 && isCourse) { timeTable.setTeacher(info); } if (k == 5 && isCourse) { timeTable.setClassId(info); } if (k == 6 && isCourse) { timeTable.setClassNum(info); } if (k == 11 && isCourse) { List<TimeAndAdress> ta_list = praseStr(info); for (TimeAndAdress ta : ta_list) { timeTable.setAddress(ta.getAddress()); timeTable.setTime(ta.getTime()); timeTable.setCycle(ta.getCycle()); timeTable.setSingleDouble(ta.getSingleDouble()); 
timeTable.setWeek(ta.getWeek()); list.add(timeTable.clone()); } } } // end for k } // end for j } } return list; }
/**
 * Fetches a search-result page and rebuilds it as a simplified HTML fragment.
 *
 * <p>The page at {@code SearchHelper.SEARCH_URL_BAIDU + param} is downloaded and its
 * {@code body} re-parsed twice: once for {@code table} elements (the results) and once for
 * {@code p} elements (the pager).
 *
 * <ul>
 *   <li>Tables whose markup contains {@code div} are dropped.
 *   <li>A table containing the related-searches marker is wrapped in
 *       {@code <div id="rs">}.
 *   <li>Every other table becomes a {@code <div class="content">} block built from the
 *       nodes returned by {@code extractHtml}; links are wrapped in
 *       {@code <h3 class="t">}, except the cached-snapshot link, which is omitted.
 *   <li>Each paragraph whose markup contains {@code page} is appended inside
 *       {@code <p id="page">}.
 * </ul>
 *
 * <p>Any exception is printed and swallowed; the model then carries whatever markup had
 * been assembled up to that point.
 *
 * @param param appended verbatim to the search URL (NOTE(review): not URL-encoded here —
 *     confirm that callers encode it)
 * @param type passed through to {@code extractHtml}
 * @return a model whose content is the assembled fragment, possibly empty
 */
public ContentModel listHtml(String param, String type) {
    ContentModel model = new ContentModel();
    // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
    StringBuilder html = new StringBuilder();
    try {
        Parser parser = new Parser();
        parser.setURL(SearchHelper.SEARCH_URL_BAIDU + param);
        parser.setEncoding(parser.getEncoding());
        NodeList list = parser.extractAllNodesThatMatch(new TagNameFilter("body"));
        String body = list.toHtml();

        Parser content = new Parser();
        content.setInputHTML(body);
        content.setEncoding(parser.getEncoding());
        NodeList content_list = content.extractAllNodesThatMatch(new TagNameFilter("table"));
        for (int i = 0; i < content_list.size(); i++) {
            Node table = content_list.elementAt(i);
            String s = table.toHtml();
            if (s.indexOf("div") != -1) {
                continue;
            }
            if (s.indexOf("相关搜索") != -1) {
                html.append("<div id=\"rs\">").append(s).append("</div>");
                continue;
            }
            html.append("<div class=\"content\">");
            for (Node n : extractHtml(table, type)) {
                if (n instanceof LinkTag) {
                    if (n.toPlainTextString().equals("百度快照")) {
                        continue;
                    }
                    html.append("<h3 class=\"t\">").append(n.toHtml()).append("</h3>");
                } else {
                    html.append(n.toHtml());
                }
            }
            html.append("<br/></div><br>");
        }

        // Collect the pagination markup.
        Parser page = new Parser();
        page.setInputHTML(body);
        page.setEncoding(parser.getEncoding());
        NodeList page_list = page.extractAllNodesThatMatch(new TagNameFilter("p"));
        for (int i = 0; i < page_list.size(); i++) {
            String s = page_list.elementAt(i).toHtml();
            if (s.indexOf("page") == -1) {
                continue;
            }
            // Close with </p> to match the opening tag; the old code emitted a stray
            // </div> here, which could close a container of the embedding page.
            html.append("<p id=\"page\">").append(s).append("</p>");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    model.setContent(html.toString());
    return model;
}