/** * 方法:获取对应的页面内容 * * @param htmlPageContent * @param preUrl * @throws ParserException * <p>Add By Ethan Lam At 2011-11-23 */ public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException { Parser parser = new Parser(); parser.setInputHTML(htmlPageContent); NodeFilter filter = new AndFilter( new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon")); NodeList nodeList = parser.parse(filter); NodeIterator it = nodeList.elements(); Div div = null; StringBuffer htmlContent = new StringBuffer(); while (it.hasMoreNodes()) { div = (Div) it.nextNode(); NodeList nl = div.getChildren(); if (nl == null) continue; NodeIterator sub = nl.elements(); while (sub.hasMoreNodes()) { Node t = sub.nextNode(); if (t instanceof ParagraphTag) { // LoggerUtil.info("fetchHtmlContent:",((ParagraphTag) t).getStringText()); htmlContent.append(((ParagraphTag) t).getStringText()); } } } if ("".equals(htmlContent.toString().trim())) return; Page page = new Page(); page.setUrl(preUrl); page.setSegment(htmlContent.toString()); LoggerUtil.info(preUrl + "获取到的页面内容:", htmlContent.toString()); pageSer.save(page); }
/** * 提取网页中所有的IssueComment元素 * * @param source */ private List<IssueCommentEvent> processComment( NodeList nodeList, List<IssueCommentEvent> icList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().matches("div id=\"issuecomment-.*\".*+")) { IssueCommentEvent i = new IssueCommentEvent(); // TODO 解析comment工作 Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); i.setActor(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); i.setCommentBody(contentNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; i.setCreatedAt(time); System.out.println(time); } icList.add(i); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processComment(childList, icList); } } } return icList; }
/** * 处理开启pullrequest的需求 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"issue-")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("open"); Node commentNode = DownloadUtil.getSomeChild(node, "div class=\"comment-body"); pullRequestEvent.setBody(commentNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author"); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processOpenPull(childList, pList); } } } return pList; }
/** * 提取网页中的删除操作 * * @param nodeList * @param dList * @return */ public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("discussion-item-head_ref_deleted")) { DeleteEvent d = new DeleteEvent(); // TODO 解析comment工作 Node deleteNode = DownloadUtil.getSomeChild(node, "span title=\""); d.setRef(deleteNode.getText().split("\"")[1]); System.out.println(deleteNode.getText().split("\"")[1]); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); d.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; d.setDeleteAt(time); } dList.add(d); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processDelete(childList, dList); } } } return dList; }
public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) { SimpleNodeIterator sni2 = nodeList.elements(); while (sni2.hasMoreNodes()) { Node node2 = sni2.nextNode(); if (node2.getText().contains("div id=\"discussion_r")) { PullRequestReviewCommentEvent p = new PullRequestReviewCommentEvent(); // TODO 解析comment工作 p.setDiscussionId(discussionId); Node actorNode = DownloadUtil.getSomeChild(node2, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); System.out.println(actorNode.toPlainTextString()); Node contentNode = DownloadUtil.getSomeChild(node2, "div class=\"comment-body"); p.setCommentBody(contentNode.toPlainTextString()); System.out.println(contentNode.toPlainTextString().trim()); Node timeNode = DownloadUtil.getSomeChild(node2, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } prList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node2.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processSubPullRequestReviewComment(childList, prList, discussionId); } } } return prList; }
/** * 递归钻取正文信息 * * @param nodeP * @return */ @SuppressWarnings("unchecked") protected List<Node> extractHtml(Node nodeP, String type) throws Exception { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { return null; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes(); ) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { tableList.add(node); } } else { List tempList = extractHtml(node, type); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if ((tableList != null) && (tableList.size() > 0)) { TableContext tc = new TableContext(); tc.setLinkList(new ArrayList()); tc.setTextBuffer(new StringBuffer()); tableNumber++; tc.setTableRow(tableNumber); Iterator ti = tableList.iterator(); // 得到设置的搜索URL String baseUrl = Config.getSingleConfig(ConfigItem.SEARCH_BASE_URL); while (ti.hasNext()) { Node node = (Node) ti.next(); if (node instanceof LinkTag) { LinkTag linkTag = (LinkTag) node; if (!"1".equalsIgnoreCase(type)) { linkTag.setAttribute( "href", baseUrl + SearchHelper.encrypt(linkTag.getAttribute("href"))); } tc.getLinkList().add(linkTag); } else { tc.getTextBuffer().append(node.getText()); } } return tableList; } return null; }
@Override public void crawl(Parser parser) throws ParserException { List<LCOdds> data = new ArrayList<LCOdds>(); NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT)); for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) { NodeList cells = it.nextNode().getChildren(); cells.keepAllNodesThatMatch(tdFilter); LCOdds lc = parseRow(cells); if (null != lc) { data.add(lc); } } // persist if (data.size() < 1) { log.warn(" -- [ 06_LC_2 ] data is empty !"); } storeData("lc_odds", data); }
/** * 处理对pullrequest的review时,comment的操作, 与processSubPullRequestReviewComment配合一起使用 * * @param nodeList * @param prList * @return */ public List<PullRequestReviewCommentEvent> processPullRequestReviewComment( NodeList nodeList, List<PullRequestReviewCommentEvent> prList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div id=\"diff-for-comment-")) { String discussionId = node.getText().split("\"")[1]; System.out.println(discussionId); NodeList subNodeList = node.getChildren(); prList = processSubPullRequestReviewComment(subNodeList, prList, discussionId); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processPullRequestReviewComment(childList, prList); } } } return prList; }
private void processNodeList(NodeList list, List<String> valueList) { // 迭代开始 SimpleNodeIterator iterator = list.elements(); while (iterator.hasMoreNodes()) { Node node = iterator.nextNode(); // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null == childList) { // 得到值节点的值 String result = node.toPlainTextString().trim(); // 若包含关键字,则简单打印出来文本 // System.out.println(result); if (result != null && !"".equals(result)) valueList.add(result); } // end if // 孩子节点不为空,继续迭代该孩子节点 else { processNodeList(childList, valueList); } // end else } // end wile }
/** * 处理Reference了当前pullrequest的操作 * * @param source */ public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("ref"); Node anotherAtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\""); pullRequestEvent.setBody( anotherAtifactNode == null ? "" : anotherAtifactNode.toPlainTextString()); Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+"); Matcher artifactMatcher = artifactPattern.matcher(anotherAtifactNode == null ? "" : anotherAtifactNode.getText()); if (artifactMatcher.find()) { String anotherAtifact = artifactMatcher.group(); pullRequestEvent.setPullrequestBaseRef(anotherAtifact); System.out.println(anotherAtifact); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processReference(childList, pList); } } } return pList; }
/** * 处理labeled操作 * * @param source */ public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-labeled\"")) { PullRequestEvent pullRequestEvent = new PullRequestEvent(); pullRequestEvent.setAction("labeled"); List<Node> lableList = new ArrayList<Node>(); lableList = DownloadUtil.getLableList(node, "style=\"color:", lableList); String lables = ""; for (int i = 0; i < lableList.size(); i++) { lables += lableList.get(i).toPlainTextString(); if (i != lableList.size() - 1) { lables += ","; } } System.out.println(lables); pullRequestEvent.setPullrequestBaseLabels(lables); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pullRequestEvent.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pullRequestEvent.setCreatedAt(time); } pList.add(pullRequestEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processLabled(childList, pList); } } } return pList; }
/** * 处理取消指派某人操作 * * <p>跟之前一样,取消指派的是后面的家伙 * * @param nodeList * @param pList * @return */ private List<PullRequestEvent> processUnassigned( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) { PullRequestEvent pEvent = new PullRequestEvent(); pEvent.setAction("assigned"); Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\""); pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString()); Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); if (actorNode != null) { pEvent.setActor(actorNode.toPlainTextString()); } else { pEvent.setActor(assignedNode.toPlainTextString()); } System.out.println(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; pEvent.setCreatedAt(time); } pList.add(pEvent); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processUnassigned(childList, pList); } } } return pList; }
/** * 处理移除里程碑动作 * * @param nodeList * @param pList * @return */ public List<PullRequestEvent> processRemoveMileStone( NodeList nodeList, List<PullRequestEvent> pList) { SimpleNodeIterator sni = nodeList.elements(); while (sni.hasMoreNodes()) { Node node = sni.nextNode(); if (node.getText().contains("div class=\"discussion-item discussion-item-demilestoned\"")) { PullRequestEvent p = new PullRequestEvent(); p.setAction("removeMilestone"); Node milestoneNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\""); Pattern milestonePattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+"); Matcher milestoneMatcher = milestonePattern.matcher(milestoneNode.getText()); if (milestoneMatcher.find()) { String milestone = milestoneMatcher.group().split("\"")[0]; p.setBody(milestone); } Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\""); p.setActor(actorNode.toPlainTextString()); Node timeNode = DownloadUtil.getSomeChild(node, "datetime"); Pattern pattern = Pattern.compile("datetime=\".*\""); Matcher matcher = pattern.matcher(timeNode.getText()); if (matcher.find()) { String time = matcher.group().split("\"")[1]; p.setCreatedAt(time); } pList.add(p); } else { // 得到该节点的子节点列表 NodeList childList = node.getChildren(); // 孩子节点为空,说明是值节点 if (null != childList) { // 如果孩子结点不为空则递归调用 processRemoveMileStone(childList, pList); } } } return pList; }
/**
 * Processes the target hyperlink nodes of a page.
 *
 * <p>Every {@code a} element whose {@code target} attribute is {@code _blank} is examined; link
 * tags not rejected by {@code filterHandler.isLinkTagFilter} are logged and queued through
 * {@code CrawlQueue.getQueueManager().newNode} together with {@code preUrl}.
 *
 * <p>This method is best-effort: it never throws. Any failure is logged and the remaining links
 * of the page are abandoned.
 *
 * @param htmlPageContent HTML source to scan
 * @param preUrl URL passed along with every queued link
 */
public void dealLinkNodes(String htmlPageContent, String preUrl) {
  try {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter filter =
        new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
    NodeList nodeList = parser.parse(filter);
    LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
    if (nodeList == null) {
      // The log line above already allows for a null result; stop here rather than fail on it.
      return;
    }
    NodeIterator it = nodeList.elements();
    while (it.hasMoreNodes()) {
      Node node = it.nextNode();
      if (!(node instanceof LinkTag)) {
        continue;
      }
      LinkTag link = (LinkTag) node;
      if (!filterHandler.isLinkTagFilter(link)) {
        LoggerUtil.debug("ParserHandler ", link.getLink(), link.getLinkText());
        CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
      }
    }
  } catch (Exception e) {
    // Still best-effort, but no longer silent: record why link extraction was abandoned.
    LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e);
  }
}
/**
 * Finds the earlier conversation whose HTML response contains a form that posts to the request
 * URL of the given conversation.
 *
 * <p>Conversations preceding {@code samlId} in the model are examined from the most recent
 * backwards. For each one whose parsed response is an HTML node list, every form's
 * {@code action} attribute is compared with the request URL of {@code samlId}.
 *
 * @param samlId the conversation whose originating form is sought
 * @return the id of the closest preceding conversation containing a matching form, or
 *     {@code null} when there is none
 */
public ConversationID findCorrespondingHTMLFormConversation(ConversationID samlId) {
  ConversationModel conversationModel = this.model.getConversationModel();
  HttpUrl samlHttpUrl = conversationModel.getRequestUrl(samlId);
  int samlConversationIndex = conversationModel.getIndexOfConversation(samlId);
  for (int conversationIndex = samlConversationIndex - 1;
      conversationIndex >= 0;
      conversationIndex--) {
    ConversationID id = conversationModel.getConversationAt(conversationIndex);
    Response response = conversationModel.getResponse(id);
    HttpUrl httpUrl = conversationModel.getRequestUrl(id);
    Object parsedContent = Parser.parse(httpUrl, response);
    // instanceof is false for null, so this also skips responses that could not be parsed.
    if (!(parsedContent instanceof org.htmlparser.util.NodeList)) {
      continue;
    }
    org.htmlparser.util.NodeList htmlNodeList = (org.htmlparser.util.NodeList) parsedContent;
    org.htmlparser.util.NodeList forms = htmlNodeList.searchFor(FormTag.class);
    try {
      for (NodeIterator ni = forms.elements(); ni.hasMoreNodes(); ) {
        FormTag form = (FormTag) ni.nextNode();
        String formAction = form.getAttribute("action");
        if (formAction == null) {
          // A form without an action attribute cannot match; do not hand null to HttpUrl.
          continue;
        }
        try {
          if (samlHttpUrl.equals(new HttpUrl(formAction))) {
            return id;
          }
        } catch (MalformedURLException ex) {
          // Handled per form so one unparsable action does not hide later forms of this page.
          this._logger.log(Level.WARNING, "Malformed action url: {0}", ex.getMessage());
        }
      }
    } catch (ParserException ex) {
      this._logger.log(Level.WARNING, "Looking for forms, got ''{0}''", ex);
    }
  }
  return null;
}
private void scanPage() throws IOException, ParserException, ParseException { URL u = new URL(this.url); HttpURLConnection conn = (HttpURLConnection) u.openConnection(); Parser parser = new Parser(conn); System.setProperty("sun.net.client.defaultConnectTimeout", "30000000"); // jdk1.4换成这个,连接超时 System.setProperty("sun.net.client.defaultReadTimeout", "30000000"); // jdk1.4换成这个,读操作超时 // con.setConnectTimeout(5000);//jdk 1.5换成这个,连接超时 // con.setReadTimeout(5000);//jdk 1.5换成这个,读操作超时 parser.setEncoding("UTF-8"); NodeFilter filter = new NodeClassFilter(CompositeTag.class); NodeList tags = parser.extractAllNodesThatMatch(filter); SimpleNodeIterator iter = tags.elements(); CompositeTag tag = null; while (iter.hasMoreNodes()) { tag = (CompositeTag) iter.nextNode(); String id = tag.getAttribute("id"); String cls = tag.getAttribute("class"); if ((tag instanceof LinkTag)) { LinkTag lt = (LinkTag) tag; if (cls == null) { continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Category")) { this.category = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Sub-Category")) { this.subCategory = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*Breadcrumb*Brand")) { this.brand = lt.getStringText(); continue; } if (cls.startsWith("gae-click*Product-Page*PrForm*Free-Shipping")) { this.freight = "Free Shipping!"; } else if (cls.equalsIgnoreCase("link fn")) { this.pname = lt.getStringText(); continue; } } else if ((tag instanceof LabelTag)) { LabelTag lt = (LabelTag) tag; if ((id != null) && (id.startsWith("label")) && (cls != null) && (cls.startsWith("d"))) { String l = lt.getLabel(); l = l.replace("\n", ""); int idx = l.indexOf(40); if (idx > 0) { l = l.substring(0, idx); } this.dimNames.put(cls, l); } } else if (!(tag instanceof SelectTag)) { if ((tag instanceof Span)) { if ((id != null) && (id.equalsIgnoreCase("sku"))) { String sku = tag.getStringText(); this.pid = sku.substring(sku.indexOf(35) + 1); } } else if 
((tag instanceof Bullet)) { Bullet b = (Bullet) tag; String text = b.getStringText().trim(); if (text.startsWith("Weight")) { int idx = text.indexOf(":"); this.weight = text.substring(idx + 1).trim(); } } else if ((tag instanceof Div)) { Div div = (Div) tag; if (cls == null) { continue; } if (cls.equalsIgnoreCase("description")) { StringBuilder sb = new StringBuilder(); BulletList bullets = (BulletList) div.getChild(0); SimpleNodeIterator bls = bullets.elements(); while (bls.hasMoreNodes()) { Node n = bls.nextNode(); if ((n instanceof Bullet)) { Bullet bl = (Bullet) n; sb.append(bl.getStringText()); } } this.intro = sb.toString(); } } else if ((this.items == null) && ((tag instanceof ScriptTag))) { this.items = readScript((ScriptTag) tag); } } } }