示例#1
0
  /**
   * 对新闻URL进行解析并采集数据
   *
   * @param url 新闻连接。
   */
  public void parser(String url) {
    String title = ""; // 新闻标题
    String source = ""; // 新闻来源
    String sourceTime = ""; // 新闻来源时间
    // String author = ""; //新闻作者
    String Content = ""; // 新闻内容
    // String collectTime = ""; //新闻采集时间-系统时间
    try {
      parser = new Parser(url);
      parser.setEncoding("GB2312");
      // 标题
      NodeFilter titleFilter = new TagNameFilter("h1");
      NodeList titleNodeList = parser.parse(titleFilter);
      title = parserUtil.getNodeListText(titleNodeList);
      parser.reset(); // 每次获取都必须reset,不然后面获取不到数据
      System.out.println(title);
      // 来源
      NodeFilter sourceFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
      NodeList sourceNodeList = parser.parse(sourceFilter);
      source = parserUtil.getNodeListText(sourceNodeList);
      parser.reset();
      System.out.println(source);
      // 来源时间
      NodeFilter sourceTimeFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
      NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter);
      String str = parserUtil.getNodeListText(sourceTimeNodeList);
      sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", "");
      parser.reset();
      System.out.println(sourceTime);

      // 正文
      NodeFilter ContentTimeFilter =
          new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
      NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter);
      NodeList childList = ContentTimeNodeList.elementAt(0).getChildren();
      childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // 去掉非正文部分
      // childList.keepAllNodesThatMatch(new RegexFilter("  相关专题"));

      Content = parserUtil.getNodeListHTML(ContentTimeNodeList);
      // Content = ParserUtil.getPlainText(Content);
      System.out.println(Content);
      parser.reset();

    } catch (ParserException e) {
      e.printStackTrace();
    } catch (Exception e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
  }
示例#2
0
  @Override
  public void crawl(Parser parser) throws ParserException {
    List<LCOdds> data = new ArrayList<LCOdds>();
    NodeList nl = parser.parse(new CssSelectorNodeFilter(ROOT));
    for (NodeIterator it = nl.elements(); it.hasMoreNodes(); ) {
      NodeList cells = it.nextNode().getChildren();
      cells.keepAllNodesThatMatch(tdFilter);

      LCOdds lc = parseRow(cells);

      if (null != lc) {
        data.add(lc);
      }
    }
    // persist
    if (data.size() < 1) {
      log.warn(" -- [ 06_LC_2 ] data is empty !");
    }
    storeData("lc_odds", data);
  }