Пример #1
0
  /**
   * Collects the delete events found in a parsed page.
   *
   * <p>The node list is walked depth-first. A node whose text contains {@code
   * discussion-item-head_ref_deleted} is converted into one {@link DeleteEvent} and is not searched
   * any further; every other node is descended into when it has children.
   *
   * @param nodeList nodes to scan
   * @param dList accumulator that receives the events; it is modified in place
   * @return the same {@code dList} instance that was passed in
   */
  public List<DeleteEvent> processDelete(NodeList nodeList, List<DeleteEvent> dList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current.getText().contains("discussion-item-head_ref_deleted")) {
        // Not a delete marker: keep searching the subtree. A null child list means a leaf node.
        NodeList children = current.getChildren();
        if (children != null) {
          processDelete(children, dList);
        }
        continue;
      }

      DeleteEvent event = new DeleteEvent();
      // TODO parse the comment

      // The ref is the first double-quoted value inside the matching span's tag text.
      Node refNode = DownloadUtil.getSomeChild(current, "span title=\"");
      String ref = refNode.getText().split("\"")[1];
      event.setRef(ref);
      System.out.println(ref);

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
      String author = authorNode.toPlainTextString();
      event.setActor(author);
      System.out.println(author);

      Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // Index 1 of the quote-split is the value between the first pair of quotes.
        event.setDeleteAt(timeMatcher.group().split("\"")[1]);
      }

      dList.add(event);
    }

    return dList;
  }
Пример #2
0
 /**
  * Collects every issue-comment element found in a parsed page.
  *
  * <p>A node whose complete tag text matches {@code div id="issuecomment-..."} yields one
  * {@link IssueCommentEvent}; such a node is not descended into. All other nodes are searched
  * recursively through their children.
  *
  * @param nodeList nodes to scan
  * @param icList accumulator that receives the events; it is modified in place
  * @return the same {@code icList} instance that was passed in
  */
 private List<IssueCommentEvent> processComment(
     NodeList nodeList, List<IssueCommentEvent> icList) {
   for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
     Node current = it.nextNode();
     boolean isComment = current.getText().matches("div id=\"issuecomment-.*\".*+");

     if (!isComment) {
       // A null child list means a leaf node; otherwise recurse into the subtree.
       NodeList children = current.getChildren();
       if (children != null) {
         processComment(children, icList);
       }
       continue;
     }

     IssueCommentEvent comment = new IssueCommentEvent();
     // TODO parse the comment

     Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
     comment.setActor(authorNode.toPlainTextString());

     Node bodyNode = DownloadUtil.getSomeChild(current, "div class=\"comment-body");
     comment.setCommentBody(bodyNode.toPlainTextString());

     Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
     Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
     if (timeMatcher.find()) {
       // Index 1 of the quote-split is the value between the first pair of quotes.
       String createdAt = timeMatcher.group().split("\"")[1];
       comment.setCreatedAt(createdAt);
       System.out.println(createdAt);
     }

     icList.add(comment);
   }
   return icList;
 }
Пример #3
0
  /**
   * Collects the "open" pull-request events found in a parsed page.
   *
   * <p>A node whose text contains {@code div id="issue-} yields one {@link PullRequestEvent} with
   * action {@code "open"}; such a node is not descended into. All other nodes are searched
   * recursively through their children.
   *
   * @param nodeList nodes to scan
   * @param pList accumulator that receives the events; it is modified in place
   * @return the same {@code pList} instance that was passed in
   */
  public List<PullRequestEvent> processOpenPull(NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current.getText().contains("div id=\"issue-")) {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList children = current.getChildren();
        if (children != null) {
          processOpenPull(children, pList);
        }
        continue;
      }

      PullRequestEvent opened = new PullRequestEvent();
      opened.setAction("open");

      Node bodyNode = DownloadUtil.getSomeChild(current, "div class=\"comment-body");
      opened.setBody(bodyNode.toPlainTextString());

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author");
      opened.setActor(authorNode.toPlainTextString());

      Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // Index 1 of the quote-split is the value between the first pair of quotes.
        opened.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }

      pList.add(opened);
    }
    return pList;
  }
Пример #4
0
  /**
   * Collects the review comments that belong to one discussion.
   *
   * <p>A node whose text contains {@code div id="discussion_r} yields one
   * {@link PullRequestReviewCommentEvent} tagged with {@code discussionId}; such a node is not
   * descended into. All other nodes are searched recursively through their children.
   *
   * @param nodeList nodes to scan
   * @param prList accumulator that receives the events; it is modified in place
   * @param discussionId identifier stored on every event created by this call
   * @return the same {@code prList} instance that was passed in
   */
  public List<PullRequestReviewCommentEvent> processSubPullRequestReviewComment(
      NodeList nodeList, List<PullRequestReviewCommentEvent> prList, String discussionId) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current.getText().contains("div id=\"discussion_r")) {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList children = current.getChildren();
        if (children != null) {
          processSubPullRequestReviewComment(children, prList, discussionId);
        }
        continue;
      }

      PullRequestReviewCommentEvent review = new PullRequestReviewCommentEvent();
      // TODO parse the comment
      review.setDiscussionId(discussionId);

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
      String author = authorNode.toPlainTextString();
      review.setActor(author);
      System.out.println(author);

      // The stored body is untrimmed; only the console echo is trimmed.
      Node bodyNode = DownloadUtil.getSomeChild(current, "div class=\"comment-body");
      String body = bodyNode.toPlainTextString();
      review.setCommentBody(body);
      System.out.println(body.trim());

      Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // Index 1 of the quote-split is the value between the first pair of quotes.
        review.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }

      prList.add(review);
    }
    return prList;
  }
Пример #5
0
  /**
   * Builds one news item per {@code subItem} element inside the {@code box_news} element of a
   * page.
   *
   * <p>For each item the title comes from the {@code subItemtitle} element, the description from
   * the node four siblings after the title, and the URL from the first link matching {@code
   * /index.php.*}, prefixed with {@code URL_PREFIX}.
   *
   * @param content raw page markup; HTML comments are stripped before parsing
   * @return the extracted news items, in document order
   * @throws Exception propagated from the extraction helpers
   */
  public List<Newsitem> parseContent(String content) throws Exception {
    String withoutComments = this.stripHtmlComments(content);
    Tag newsBox = this.extractTagByClassName(withoutComments, "box_news");
    NodeList itemNodes = this.extractTagsByClassName(newsBox.toHtml(), "subItem");

    List<Newsitem> result = new ArrayList<Newsitem>();
    for (int index = 0; index < itemNodes.size(); index++) {
      Tag itemTag = (Tag) itemNodes.elementAt(index);
      String itemHtml = itemTag.toHtml();

      Tag titleTag = this.extractTagByClassName(itemHtml, "subItemtitle");

      // The description sits four siblings after the title element.
      Node descriptionNode = titleTag;
      for (int hop = 0; hop < 4; hop++) {
        descriptionNode = descriptionNode.getNextSibling();
      }
      // Characters outside U+0000..U+00FF become spaces, then the "Read More" suffix is dropped.
      String description =
          descriptionNode.toPlainTextString().replaceAll("[^\\u0000-\\u00FF]", " ");
      description = description.replace("&nbsp;Read More...", "").trim();

      Tag firstLink = (Tag) extractLinks(itemHtml, "/index.php.*").elementAt(0);

      NewsitemImpl item = new NewsitemImpl();
      item.setTitle(titleTag.toPlainTextString());
      item.setDescription(description);
      item.setUrl(URL_PREFIX + firstLink.getAttribute("href"));
      result.add(item);
    }
    return result;
  }
Пример #6
0
  /**
   * Collects the events in which another artifact references the current pull request.
   *
   * <p>A node whose text contains {@code div class="discussion-item discussion-item-ref"} yields
   * one {@link PullRequestEvent} with action {@code "ref"}; such a node is not descended into. All
   * other nodes are searched recursively through their children.
   *
   * <p>Each looked-up child may be absent: a missing title link or author is stored as an empty
   * string, and a missing time element leaves the creation time unset.
   *
   * @param nodeList nodes to scan
   * @param pList accumulator that receives the events; it is modified in place
   * @return the same {@code pList} instance that was passed in
   */
  public List<PullRequestEvent> processReference(NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("div class=\"discussion-item discussion-item-ref\"")) {
        PullRequestEvent pullRequestEvent = new PullRequestEvent();
        pullRequestEvent.setAction("ref");

        Node anotherArtifactNode = DownloadUtil.getSomeChild(node, "class=\"title-link\"");
        boolean hasArtifact = anotherArtifactNode != null;
        pullRequestEvent.setBody(hasArtifact ? anotherArtifactNode.toPlainTextString() : "");

        // Looks for a four-segment slash-separated path inside the link's tag text.
        Pattern artifactPattern = Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/[a-z[0-9]]+");
        Matcher artifactMatcher =
            artifactPattern.matcher(hasArtifact ? anotherArtifactNode.getText() : "");
        if (artifactMatcher.find()) {
          String anotherArtifact = artifactMatcher.group();
          pullRequestEvent.setPullrequestBaseRef(anotherArtifact);
          System.out.println(anotherArtifact);
        }

        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        pullRequestEvent.setActor(actorNode == null ? "" : actorNode.toPlainTextString());

        // Guarded like the other lookups above: the child may be missing.
        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        if (timeNode != null) {
          Pattern pattern = Pattern.compile("datetime=\".*\"");
          Matcher matcher = pattern.matcher(timeNode.getText());
          if (matcher.find()) {
            // Index 1 of the quote-split is the value between the first pair of quotes.
            String time = matcher.group().split("\"")[1];
            pullRequestEvent.setCreatedAt(time);
          }
        }
        pList.add(pullRequestEvent);

      } else {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList childList = node.getChildren();
        if (null != childList) {
          processReference(childList, pList);
        }
      }
    }
    return pList;
  }
 /**
  * Parses a custom composite tag that wraps a single piece of text and checks its child count,
  * positions, line numbers and the type and text of its only child.
  */
 public void testCompositeTagWithOneTextChild() throws ParserException {
   String markup = "<Custom>Hello</Custom>";
   createParser(markup);
   CustomTag parsedTag = parseCustomTag(1);

   // Structure of the tag itself.
   assertEquals("child count", 1, parsedTag.getChildCount());
   assertFalse("custom tag should not be xml end tag", parsedTag.isEmptyXmlTag());

   // Character offsets and line numbers of the opening tag.
   assertEquals("starting loc", 0, parsedTag.getStartPosition());
   assertEquals("ending loc", 8, parsedTag.getEndPosition());
   assertEquals("starting line position", 0, parsedTag.getStartingLineNumber());
   assertEquals("ending line position", 0, parsedTag.getEndingLineNumber());

   // The single child must be the text node.
   Node onlyChild = parsedTag.childAt(0);
   assertType("child", Text.class, onlyChild);
   assertStringEquals("child text", "Hello", onlyChild.toPlainTextString());
 }
Пример #8
0
  /**
   * Collects the events in which somebody was unassigned.
   *
   * <p>A node whose text contains {@code class="discussion-item discussion-item-unassigned"}
   * yields one {@link PullRequestEvent}; such a node is not descended into. All other nodes are
   * searched recursively through their children.
   *
   * <p>The assignee is read from the {@code class="author"} child. The actor is read from the
   * {@code class="discussion-item-entity"} child and falls back to the assignee text when that
   * child is absent.
   *
   * @param nodeList nodes to scan
   * @param pList accumulator that receives the events; it is modified in place
   * @return the same {@code pList} instance that was passed in
   */
  private List<PullRequestEvent> processUnassigned(
      NodeList nodeList, List<PullRequestEvent> pList) {
    SimpleNodeIterator sni = nodeList.elements();
    while (sni.hasMoreNodes()) {
      Node node = sni.nextNode();
      if (node.getText().contains("class=\"discussion-item discussion-item-unassigned\"")) {
        PullRequestEvent pEvent = new PullRequestEvent();
        // NOTE(review): the action is "assigned" although this method handles unassignment;
        // kept unchanged because consumers may rely on it - confirm the intended value.
        pEvent.setAction("assigned");

        Node assignedNode = DownloadUtil.getSomeChild(node, "class=\"author\"");
        pEvent.setPullrequestAssgnee(assignedNode.toPlainTextString());

        // Fall back to the assignee when no separate entity child exists. The value is resolved
        // once so the console echo below can no longer dereference a null entity node.
        Node actorNode = DownloadUtil.getSomeChild(node, "class=\"discussion-item-entity\"");
        Node effectiveActorNode = actorNode != null ? actorNode : assignedNode;
        String actor = effectiveActorNode.toPlainTextString();
        pEvent.setActor(actor);
        System.out.println(actor);

        Node timeNode = DownloadUtil.getSomeChild(node, "datetime");
        Pattern pattern = Pattern.compile("datetime=\".*\"");
        Matcher matcher = pattern.matcher(timeNode.getText());
        if (matcher.find()) {
          // Index 1 of the quote-split is the value between the first pair of quotes.
          String time = matcher.group().split("\"")[1];
          pEvent.setCreatedAt(time);
        }
        pList.add(pEvent);

      } else {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList childList = node.getChildren();
        if (null != childList) {
          processUnassigned(childList, pList);
        }
      }
    }
    return pList;
  }
Пример #9
0
 /**
  * Gathers the trimmed, non-empty plain text of every leaf node under {@code list}.
  *
  * <p>A node is treated as a leaf when it has no child list; nodes with children are only
  * descended into and contribute no text of their own.
  *
  * @param list nodes to walk
  * @param valueList accumulator that receives the texts in traversal order; modified in place
  */
 private void processNodeList(NodeList list, List<String> valueList) {
   for (SimpleNodeIterator it = list.elements(); it.hasMoreNodes(); ) {
     Node current = it.nextNode();
     NodeList children = current.getChildren();

     if (children != null) {
       // Inner node: keep descending.
       processNodeList(children, valueList);
       continue;
     }

     // Leaf node: keep its text unless it is blank after trimming.
     String text = current.toPlainTextString().trim();
     if (!text.isEmpty()) {
       valueList.add(text);
     }
   }
 }
Пример #10
0
  /**
   * Collects the "labeled" events found in a parsed page.
   *
   * <p>A node whose text contains {@code class="discussion-item discussion-item-labeled"} yields
   * one {@link PullRequestEvent} with action {@code "labeled"}; such a node is not descended into.
   * All other nodes are searched recursively through their children. The label texts are stored as
   * one comma-separated string.
   *
   * @param nodeList nodes to scan
   * @param pList accumulator that receives the events; it is modified in place
   * @return the same {@code pList} instance that was passed in
   */
  public List<PullRequestEvent> processLabled(NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current.getText().contains("class=\"discussion-item discussion-item-labeled\"")) {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList children = current.getChildren();
        if (children != null) {
          processLabled(children, pList);
        }
        continue;
      }

      PullRequestEvent labeled = new PullRequestEvent();
      labeled.setAction("labeled");

      List<Node> labelNodes = new ArrayList<Node>();
      labelNodes = DownloadUtil.getLableList(current, "style=\"color:", labelNodes);

      // Join the label texts with commas, without a trailing separator.
      StringBuilder joined = new StringBuilder();
      boolean first = true;
      for (Node labelNode : labelNodes) {
        if (!first) {
          joined.append(",");
        }
        joined.append(labelNode.toPlainTextString());
        first = false;
      }
      String labels = joined.toString();
      System.out.println(labels);
      labeled.setPullrequestBaseLabels(labels);

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
      labeled.setActor(authorNode.toPlainTextString());

      Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // Index 1 of the quote-split is the value between the first pair of quotes.
        labeled.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }

      pList.add(labeled);
    }
    return pList;
  }
Пример #11
0
  /**
   * Collects the "milestone removed" events found in a parsed page.
   *
   * <p>A node whose text contains {@code div class="discussion-item discussion-item-demilestoned"}
   * yields one {@link PullRequestEvent} with action {@code "removeMilestone"}; such a node is not
   * descended into. All other nodes are searched recursively through their children.
   *
   * @param nodeList nodes to scan
   * @param pList accumulator that receives the events; it is modified in place
   * @return the same {@code pList} instance that was passed in
   */
  public List<PullRequestEvent> processRemoveMileStone(
      NodeList nodeList, List<PullRequestEvent> pList) {
    for (SimpleNodeIterator it = nodeList.elements(); it.hasMoreNodes(); ) {
      Node current = it.nextNode();

      if (!current
          .getText()
          .contains("div class=\"discussion-item discussion-item-demilestoned\"")) {
        // A null child list means a leaf node; otherwise recurse into the subtree.
        NodeList children = current.getChildren();
        if (children != null) {
          processRemoveMileStone(children, pList);
        }
        continue;
      }

      PullRequestEvent removed = new PullRequestEvent();
      removed.setAction("removeMilestone");

      // The body is the slash-separated path in the entity's tag text, cut at the first quote.
      Node entityNode = DownloadUtil.getSomeChild(current, "class=\"discussion-item-entity\"");
      Matcher pathMatcher =
          Pattern.compile("[a-zA-Z]+/[a-zA-Z]+/[a-zA-Z]+/.*+").matcher(entityNode.getText());
      if (pathMatcher.find()) {
        removed.setBody(pathMatcher.group().split("\"")[0]);
      }

      Node authorNode = DownloadUtil.getSomeChild(current, "class=\"author\"");
      removed.setActor(authorNode.toPlainTextString());

      Node timeNode = DownloadUtil.getSomeChild(current, "datetime");
      Matcher timeMatcher = Pattern.compile("datetime=\".*\"").matcher(timeNode.getText());
      if (timeMatcher.find()) {
        // Index 1 of the quote-split is the value between the first pair of quotes.
        removed.setCreatedAt(timeMatcher.group().split("\"")[1]);
      }

      pList.add(removed);
    }
    return pList;
  }
Пример #12
0
  /**
   * Extracts course records from the tables of an HTML page.
   *
   * <p>Every table row is scanned column by column:
   *
   * <ul>
   *   <li>A cell containing the school-year header text sets the current school year (the nine
   *       characters after the header) and semester (a single character, mapped to {@code "1"} or
   *       {@code "2"} when it is the matching Chinese numeral). Both values carry over to all
   *       following rows and tables until overwritten.
   *   <li>A row whose column 1 contains {@code [} starts a course: characters 1-8 are the course
   *       code and everything from index 10 is the name. Columns 2-8 then fill credit, type, learn
   *       type, check type, get type, score and remark.
   * </ul>
   *
   * <p>Parser failures are printed to standard error; when the tables cannot be extracted an empty
   * list is returned.
   *
   * @param html page markup to parse
   * @return the courses found, in document order; never {@code null}
   */
  public List<Courses> parseCourses(String html) {
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }
    NodeFilter filter = new NodeClassFilter(TableTag.class);
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
    }

    List<Courses> list = new ArrayList<Courses>();
    if (nodeList == null) {
      // Extraction failed (already reported above); there is nothing to scan.
      return list;
    }

    String schoolyear = "";
    String semester = "";
    for (int i = 0; i < nodeList.size(); i++) {
      if (nodeList.elementAt(i) instanceof TableTag) {
        TableTag tag = (TableTag) nodeList.elementAt(i);
        TableRow[] rows = tag.getRows();
        for (int j = 0; j < rows.length; j++) {
          TableRow row = rows[j];
          TableColumn[] columns = row.getColumns();
          Courses courses = null;
          boolean isCourse = false;
          for (int k = 0; k < columns.length; k++) {
            Node columnNode = columns[k];
            String info = columnNode.toPlainTextString().trim();
            String temp = "学年学期:";
            int start = info.indexOf(temp);
            int len = "2012-2013".length();
            if (start != -1) {
              start = start + temp.length();
              schoolyear = info.substring(start, start + len);
              // The semester is the single character three positions after the school year;
              // the Chinese numerals for one and two are normalised to digits.
              semester = info.substring(start + len + 3, start + len + 4);
              if ("一".equals(semester)) {
                semester = "1";
              } else if ("二".equals(semester)) {
                semester = "2";
              }
            }
            if (k == 1 && info.indexOf("[") != -1) {
              courses = new Courses();
              String courseCode = info.substring(1, 9);
              String coursesname = info.substring(10);
              courses.setCourseCode(courseCode);
              courses.setCoursesname(coursesname);
              isCourse = true;
            }
            if (k == 2 && isCourse) {
              double credit = Double.parseDouble(info);
              courses.setCredit(credit);
            }
            if (k == 3 && isCourse) {
              courses.setType(info);
            }
            if (k == 4 && isCourse) {
              courses.setLeanType(info);
            }
            if (k == 5 && isCourse) {
              courses.setCheckType(info);
            }
            if (k == 6 && isCourse) {
              courses.setGetType(info);
            }
            if (k == 7 && isCourse) {
              // The score is stored as text, not parsed as a number.
              courses.setScore(info);
            }
            if (k == 8 && isCourse) {
              courses.setRemark(info);
            }
          } // end for k
          if (courses != null) {
            courses.setSchoolYear(schoolyear);
            courses.setSemester(semester);
            list.add(courses);
          }
        } // end for j
      }
    }
    return list;
  }
Пример #13
0
  /**
   * Extracts timetable entries from the course tables of an HTML page.
   *
   * <p>Only tables whose tag text contains the course-number marker are read, and their first row
   * is skipped. Within a row, column 1 supplies the course code (between {@code [} and {@code ]})
   * and the name (after {@code ]}), column 3 the credit, column 4 the type, column 5 the teacher
   * (text after {@code ]}) and column 8 the schedule string. The schedule is split by {@code
   * praseStr}, and one cloned entry is added to the result per time/address item.
   *
   * <p>Parser failures are printed to standard error; when the tables cannot be extracted an empty
   * list is returned.
   *
   * @param html page markup to parse
   * @return the timetable entries found, in document order; never {@code null}
   */
  public List<TimeTable> parseTimeTables(String html) {
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    List<TimeTable> list = new ArrayList<TimeTable>();
    NodeFilter filter = new NodeClassFilter(TableTag.class);
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    if (nodeList == null) {
      // Extraction failed (already reported above); there is nothing to scan.
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (nodeList.elementAt(i) instanceof TableTag) {
        TableTag tag = (TableTag) nodeList.elementAt(i);
        if (tag.getText().indexOf("[课程号]") == -1) {
          continue;
        }
        TableRow[] rows = tag.getRows();
        // Row 0 is skipped; data rows start at index 1.
        for (int j = 1; j < rows.length; j++) {
          TableRow row = rows[j];
          TableColumn[] columns = row.getColumns();
          TimeTable timeTable = null;
          for (int k = 0; k < columns.length; k++) {
            Node columnNode = columns[k];
            String info = columnNode.toPlainTextString().trim();
            System.out.println(info + "===" + k);
            switch (k) {
              case 1:
                int start = info.indexOf("[");
                int end = info.indexOf("]");
                timeTable = new TimeTable();
                timeTable.setCourseCode(info.substring(start + 1, end));
                timeTable.setCourseName(info.substring(end + 1));
                break;
              case 3:
                timeTable.setCredit(Double.parseDouble(info));
                break;
              case 4:
                timeTable.setType(info);
                break;
              case 5:
                int t_start = info.indexOf("]");
                timeTable.setTeacher(info.substring(t_start + 1));
                break;
              case 8:
                // One result entry per time/address item; the shared row object is cloned after
                // each item's fields are copied in.
                List<TimeAndAdress> ta_list = praseStr(info);
                for (TimeAndAdress ta : ta_list) {
                  timeTable.setAddress(ta.getAddress());
                  timeTable.setTime(ta.getTime());
                  timeTable.setCycle(ta.getCycle());
                  timeTable.setSingleDouble(ta.getSingleDouble());
                  timeTable.setWeek(ta.getWeek());
                  list.add(timeTable.clone());
                }
                break;
              default:
                break;
            }
          }
        } // end for j
      }
    }
    return list;
  }
Пример #14
0
  /**
   * Extracts timetable entries from the table with id {@code reportArea} of an HTML page.
   *
   * <p>A row whose column 1 contains {@code [} starts a course: characters 1-8 are the course code
   * and everything from index 10 is the name. For such rows, columns 2-6 supply credit, type,
   * teacher, class id and class number, and column 11 supplies the schedule string. The schedule
   * is split by {@code praseStr}, and one cloned entry is added to the result per time/address
   * item.
   *
   * <p>Parser failures are printed to standard error; when the table cannot be extracted an empty
   * list is returned.
   *
   * @param html page markup to parse
   * @return the timetable entries found, in document order; never {@code null}
   */
  public List<TimeTable> parseTimeTables(String html) {
    Parser parser = new Parser();
    try {
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
    } catch (ParserException e) {
      e.printStackTrace();
    }

    List<TimeTable> list = new ArrayList<TimeTable>();
    NodeFilter tagfilter = new NodeClassFilter(TableTag.class);
    NodeFilter idFilter = new HasAttributeFilter("id", "reportArea");
    NodeFilter filter = new AndFilter(tagfilter, idFilter);
    NodeList nodeList = null;
    try {
      nodeList = parser.extractAllNodesThatMatch(filter);
    } catch (ParserException e) {
      e.printStackTrace();
    }
    if (nodeList == null) {
      // Extraction failed (already reported above); there is nothing to scan.
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (nodeList.elementAt(i) instanceof TableTag) {
        TableTag tag = (TableTag) nodeList.elementAt(i);
        TableRow[] rows = tag.getRows();
        for (int j = 0; j < rows.length; j++) {
          TableRow row = rows[j];
          TableColumn[] columns = row.getColumns();
          boolean isCourse = false;
          TimeTable timeTable = null;

          for (int k = 0; k < columns.length; k++) {
            Node columnNode = columns[k];
            String info = columnNode.toPlainTextString().trim();
            if (k == 1 && info.indexOf("[") != -1) {
              timeTable = new TimeTable();
              String courseCode = info.substring(1, 9);
              String coursesname = info.substring(10);
              timeTable.setCourseName(coursesname);
              timeTable.setCourseCode(courseCode);
              isCourse = true;
            }
            if (k == 2 && isCourse) {
              double credit = Double.parseDouble(info);
              timeTable.setCredit(credit);
            }
            if (k == 3 && isCourse) {
              timeTable.setType(info);
            }
            if (k == 4 && isCourse) {
              timeTable.setTeacher(info);
            }
            if (k == 5 && isCourse) {
              timeTable.setClassId(info);
            }
            if (k == 6 && isCourse) {
              timeTable.setClassNum(info);
            }
            if (k == 11 && isCourse) {
              // One result entry per time/address item; the shared row object is cloned after
              // each item's fields are copied in.
              List<TimeAndAdress> ta_list = praseStr(info);
              for (TimeAndAdress ta : ta_list) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                list.add(timeTable.clone());
              }
            }
          } // end for k
        } // end for j
      }
    }
    return list;
  }
Пример #15
0
  /**
   * Fetches a search result page and rebuilds it as simplified HTML.
   *
   * <p>The page at {@code SearchHelper.SEARCH_URL_BAIDU + param} is loaded and its {@code body}
   * is re-parsed twice:
   *
   * <ul>
   *   <li>Every {@code table} is inspected. Tables whose HTML contains {@code div} are skipped, the
   *       related-searches table is wrapped in {@code <div id="rs">}, and every other table is
   *       emitted as a {@code <div class="content">} built from the nodes returned by {@code
   *       extractHtml}. Links become {@code <h3 class="t">} headings, except the snapshot link,
   *       which is dropped.
   *   <li>Every {@code p} element whose HTML contains {@code page} is appended as the pagination
   *       section.
   * </ul>
   *
   * <p>Any exception is printed to standard error; the model then carries whatever markup had been
   * produced up to that point.
   *
   * @param param text appended to the search URL
   * @param type passed through to {@code extractHtml}
   * @return a model whose content is the generated HTML (possibly empty)
   */
  public ContentModel listHtml(String param, String type) {
    ContentModel model = new ContentModel();
    // The buffer never leaves this method, so the unsynchronized builder is sufficient.
    StringBuilder html = new StringBuilder();
    try {
      NodeFilter filter = new TagNameFilter("body");
      Parser parser = new Parser();
      parser.setURL(SearchHelper.SEARCH_URL_BAIDU + param);
      parser.setEncoding(parser.getEncoding());
      NodeList list = parser.extractAllNodesThatMatch(filter);
      String body = list.toHtml();

      Parser content = new Parser();
      content.setInputHTML(body);
      content.setEncoding(parser.getEncoding());
      NodeFilter content_filter = new TagNameFilter("table");
      NodeList content_list = content.extractAllNodesThatMatch(content_filter);
      for (int i = 0; i < content_list.size(); i++) {
        String s = content_list.elementAt(i).toHtml();
        if (s.indexOf("div") != -1) {
          continue;
        }

        if (s.indexOf("相关搜索") != -1) {
          html.append("<div id=\"rs\">" + s + "</div>");
          continue;
        }
        html.append("<div class=\"content\">");
        for (Node n : extractHtml(content_list.elementAt(i), type)) {
          if (n instanceof LinkTag) {
            if (n.toPlainTextString().equals("百度快照")) {
              continue;
            }
            html.append("<h3 class=\"t\">" + n.toHtml() + "</h3>");
          } else {
            html.append(n.toHtml());
          }
        }

        html.append("<br/></div><br>");
      }

      // Pagination: keep only the paragraphs that mention "page".
      Parser page = new Parser();
      page.setInputHTML(body);
      page.setEncoding(parser.getEncoding());
      NodeFilter page_filter = new TagNameFilter("p");
      NodeList page_list = page.extractAllNodesThatMatch(page_filter);
      for (int i = 0; i < page_list.size(); i++) {
        String s = page_list.elementAt(i).toHtml();
        if (s.indexOf("page") == -1) {
          continue;
        }
        // The wrapper is opened as <p>, so it is closed as </p>; a stray </div> here would be
        // unbalanced in the generated markup.
        html.append("<p id=\"page\">" + s + "</p>");
      }
    } catch (Exception e) {
      e.printStackTrace();
    }

    model.setContent(html.toString());
    return model;
  }