Ejemplo n.º 1
0
  /**
   * Parses the news page at the given URL and prints the scraped fields to standard output.
   *
   * <p>The title ({@code h1}), the source ({@code span#media_name}), the publication time
   * ({@code span#pub_date}) and the article body ({@code div#artibody}) are extracted in that
   * order. Nothing is returned; any exception is caught and its stack trace is printed.
   *
   * @param url link to the news page.
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      parser.setEncoding("GB2312");

      // Title. The parser has to be reset after every parse, otherwise the following parses
      // do not find any data.
      NodeList headingNodes = parser.parse(new TagNameFilter("h1"));
      String title = parserUtil.getNodeListText(headingNodes);
      parser.reset();
      System.out.println(title);

      // Source (media name).
      NodeList mediaNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name")));
      String source = parserUtil.getNodeListText(mediaNodes);
      parser.reset();
      System.out.println(source);

      // Publication time: the year/month markers become dashes.
      // NOTE(review): the final replace strips every space, including the one that was just
      // substituted for the day marker - confirm that this is intended.
      NodeList pubDateNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date")));
      String rawPubDate = parserUtil.getNodeListText(pubDateNodes);
      String sourceTime =
          rawPubDate.replace("年", "-").replace("月", "-").replace("日", " ").replace(" ", "");
      parser.reset();
      System.out.println(sourceTime);

      // Article body: nested div children are dropped because they are not part of the text.
      NodeList bodyNodes =
          parser.parse(
              new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody")));
      NodeList bodyChildren = bodyNodes.elementAt(0).getChildren();
      bodyChildren.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div")));

      String content = parserUtil.getNodeListHTML(bodyNodes);
      System.out.println(content);
      parser.reset();
    } catch (Exception e) {
      // Parser failures and unexpected page structure are reported the same way.
      e.printStackTrace();
    }
  }
Ejemplo n.º 2
0
  /**
   * Extracts the plain-text body of a news article.
   *
   * <p>The inner HTML of every node accepted by {@code newsContentFilter} is concatenated, the
   * result is parsed again and flattened to text with a {@code StringBean}, and two known page
   * artifacts (a leftover script fragment and the "[我来说两句]" link text) are removed.
   *
   * @param newsContentFilter filter selecting the nodes that hold the article body; every
   *     accepted node is cast to {@code Div}
   * @param parser parser for the news page; it is reset once the nodes have been collected
   * @return the cleaned article text, an empty string when the filter matched no content, the
   *     unprocessed HTML when the second parsing pass fails, or {@code null} when the first
   *     parse fails
   */
  public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;

    try {
      NodeList newsContentList = parser.parse(newsContentFilter);
      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < newsContentList.size(); i++) {
        Div newsContentTag = (Div) newsContentList.elementAt(i);
        builder.append(newsContentTag.getStringText());
      }
      content = builder.toString();
      parser.reset();

      // toString() never returns null, so "nothing found" has to be detected as empty text.
      if (content.length() == 0) {
        System.out.println("没有得到新闻内容!");
        return content;
      }

      // Second pass: turn the collected HTML fragment into collapsed plain text.
      Parser textParser = Parser.createParser(content, "utf8");
      StringBean sb = new StringBean();
      sb.setCollapse(true);
      textParser.visitAllNodesWith(sb);
      content = sb.getStrings();

      // Removes leftover inline script text of the form:
      // ";} else{ document.getElementById('TurnAD444').innerHTML = "";} }
      // showTurnAD444(intTurnAD444); }catch(e){}
      content = content.replaceAll("\\\".*[a-z].*\\}", "");

      content = content.replace("[我来说两句]", "");
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }

    return content;
  }
Ejemplo n.º 3
0
  /**
   * Extracts the paragraph content of a page and saves it.
   *
   * <p>Collects the inner HTML of every {@code ParagraphTag} that is a direct child of a
   * {@code div} with class {@code blkContainerSblkCon}. When the collected text is not blank it
   * is stored as a {@code Page} for {@code preUrl} through {@code pageSer}; otherwise nothing is
   * saved. (Added by Ethan Lam, 2011-11-23.)
   *
   * @param htmlPageContent HTML source of the page
   * @param preUrl URL recorded on the saved page
   * @throws ParserException if the HTML cannot be parsed
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter containerFilter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));

    StringBuilder collected = new StringBuilder();
    for (NodeIterator containers = parser.parse(containerFilter).elements();
        containers.hasMoreNodes(); ) {
      NodeList children = ((Div) containers.nextNode()).getChildren();
      if (children != null) {
        for (NodeIterator childIt = children.elements(); childIt.hasMoreNodes(); ) {
          Node child = childIt.nextNode();
          if (child instanceof ParagraphTag) {
            collected.append(((ParagraphTag) child).getStringText());
          }
        }
      }
    }

    String segment = collected.toString();
    if (segment.trim().length() == 0) {
      return; // nothing worth saving
    }

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(segment);
    LoggerUtil.info(preUrl + "获取到的页面内容:", segment);
    pageSer.save(page);
  }
Ejemplo n.º 4
0
 /**
  * Collects the links of a page whose HTML contains the given fragment.
  *
  * <p>Collection stops as soon as {@code CatchNum} links have been gathered; at least one
  * matching link is always kept when there is one.
  *
  * @param result HTML source of the page
  * @param coditions text that the HTML of a link must contain
  * @param codeKind charset name handed to the parser
  * @return the matching {@code LinkTag} objects; empty when parsing fails
  */
 public static List getLinksByConditions(String result, String coditions, String codeKind) {
   List links = null;
   // Page encoding configuration - to do by shengf.
   Parser parser = Parser.createParser(result, codeKind);
   try {
     links = new ArrayList();
     Node[] candidates = parser.parse(new NodeClassFilter(LinkTag.class)).toNodeArray();
     for (Node candidate : candidates) {
       if (!(candidate instanceof LinkTag)) {
         continue;
       }
       LinkTag link = (LinkTag) candidate;
       if (link.toHtml().contains(coditions)) {
         links.add(link);
         if (links.size() >= CatchNum) {
           return links;
         }
       }
     }
   } catch (ParserException e) {
     e.printStackTrace();
   }
   return links;
 }
Ejemplo n.º 5
0
 /**
  * Parses the HTML supplied by {@code examiner} without a filter and then extracts, recursively,
  * every node the given filter accepts.
  *
  * @param filter filter applied to the parsed node list
  * @return the nodes accepted by {@code filter}
  * @throws Exception if the HTML cannot be obtained or parsed
  */
 private NodeList getMatchingTags(NodeFilter filter) throws Exception {
   Lexer lexer = new Lexer(new Page(examiner.html()));
   NodeList allNodes = new Parser(lexer).parse(null);
   return allNodes.extractAllNodesThatMatch(filter, true);
 }
Ejemplo n.º 6
0
  /**
   * Reads the {@code value} attribute of the {@code input} element with id {@code __VIEWSTATE}
   * and stores it in {@code viewState}.
   *
   * <p>NOTE(review): the first match is used without a check, so a page without that element
   * presumably fails with a NullPointerException - confirm against callers.
   *
   * @param html page source, parsed with the gb2312 charset
   * @throws Exception if the page cannot be parsed
   */
  public static void setViewState(String html) throws Exception {
    NodeList matches =
        Parser.createParser(html, "gb2312")
            .parse(
                new AndFilter(
                    new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE")));
    InputTag viewStateInput = (InputTag) matches.elementAt(0);
    viewState = viewStateInput.getAttribute("value");
  }
Ejemplo n.º 7
0
  /**
   * Reads the {@code value} attribute of the {@code input} element with id
   * {@code __EVENTVALIDATION} and stores it in {@code eventValidation}.
   *
   * <p>NOTE(review): the first match is used without a check, so a page without that element
   * presumably fails with a NullPointerException - confirm against callers.
   *
   * @param html page source, parsed with the gb2312 charset
   * @throws ParserException if the page cannot be parsed
   */
  public static void setEventValidation(String html) throws ParserException {
    NodeList matches =
        Parser.createParser(html, "gb2312")
            .parse(
                new AndFilter(
                    new TagNameFilter("input"),
                    new HasAttributeFilter("id", "__EVENTVALIDATION")));
    InputTag validationInput = (InputTag) matches.elementAt(0);
    eventValidation = validationInput.getAttribute("value");
  }
Ejemplo n.º 8
0
  /**
   * Gets the date of a news article.
   *
   * @param dateFilter filter selecting the node(s) that hold the date; every accepted node is
   *     cast to {@code Div}
   * @param parser parser for the news page
   * @return the inner HTML of the last node accepted by the filter, or {@code null} when nothing
   *     matched or parsing failed
   */
  public String getNewsDate(NodeFilter dateFilter, Parser parser) {
    String newsDate = null;
    try {
      NodeList dateNodes = parser.parse(dateFilter);
      int index = 0;
      while (index < dateNodes.size()) {
        // Later matches overwrite earlier ones, so the last match wins.
        newsDate = ((Div) dateNodes.elementAt(index)).getStringText();
        index++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsDate;
  }
Ejemplo n.º 9
0
  /**
   * Gets the responsible editor of a news article, that is, its author.
   *
   * @param newsauthorFilter filter selecting the node(s) that hold the author; every accepted
   *     node is cast to {@code Div}
   * @param parser parser for the news page
   * @return the inner HTML of the last node accepted by the filter, or an empty string when
   *     nothing matched or parsing failed
   */
  public String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
    String newsAuthor = "";
    try {
      NodeList authorNodes = parser.parse(newsauthorFilter);
      int index = 0;
      while (index < authorNodes.size()) {
        // Later matches overwrite earlier ones, so the last match wins.
        newsAuthor = ((Div) authorNodes.elementAt(index)).getStringText();
        index++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return newsAuthor;
  }
Ejemplo n.º 10
0
  /**
   * Extracts from an HTML stream the anchor URLs that {@code blogDetector} classifies as blogs.
   *
   * <p>Every anchor is reported on standard output together with its classification.
   *
   * @param in stream holding the HTML page
   * @return the distinct blog URLs, in the iteration order of the backing {@code TreeSet}
   * @throws BlogCrawlingException wrapping any parser, encoding or I/O failure
   */
  private String[] processBlog(InputStream in) throws BlogCrawlingException {

    // A set is used here to avoid duplicates.
    Set<String> blogLinks = new TreeSet<String>();

    try {
      Parser parser = new Parser(new Lexer(new Page(in, null)));

      // Extract all the anchor tags.
      NodeList anchors = parser.parse(new TagNameFilter("a"));

      for (int i = 0; i < anchors.size(); i++) {
        String linkURL = ((LinkTag) anchors.elementAt(i)).getLink();
        boolean isBlog = blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG;
        String label = isBlog ? "] *BLOG Detected* ==> " : "] *Non-BLOG Detected* ==> ";
        System.out.println("[" + myNumber + label + linkURL);
        if (isBlog) {
          blogLinks.add(linkURL);
        }
      }

      return blogLinks.toArray(new String[blogLinks.size()]);

    } catch (ParserException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (IOException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    }
  }
Ejemplo n.º 11
0
  /**
   * Gets the title of a news article.
   *
   * @param titleFilter filter selecting the heading node(s); every accepted node is cast to
   *     {@code HeadingTag}
   * @param parser parser for the news page
   * @return the inner HTML of the last node accepted by the filter, or an empty string when
   *     nothing matched or parsing failed
   */
  public String getTitle(NodeFilter titleFilter, Parser parser) {
    String titleName = "";
    try {
      NodeList headings = parser.parse(titleFilter);
      int index = 0;
      while (index < headings.size()) {
        // Later matches overwrite earlier ones, so the last match wins.
        titleName = ((HeadingTag) headings.elementAt(index)).getStringText();
        index++;
      }
    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
    return titleName;
  }
Ejemplo n.º 12
0
  /**
   * Collects the standard ids listed on one result page.
   *
   * <p>Locates the table with id {@code ctl00_ContentPlaceHolder1_StandardView}, skips its first
   * row, and for every other row reads the link located in the fourth column. The value of the
   * link's {@code standardid=} parameter (up to the next {@code &amp;}, or to the end of the href
   * when there is none) is appended to the list stored in {@code map} under {@code pageNo}.
   *
   * <p>Rows whose link, {@code href} attribute or {@code standardid=} parameter is missing are
   * recorded by row index in {@code error} under {@code pageNo} and skipped.
   *
   * @param pageNo page number used as the key in {@code map} and {@code error}
   * @param html page source, parsed with the gb2312 charset
   * @throws Exception if the page cannot be parsed or the table does not have the expected
   *     structure
   */
  private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
    final String idParam = "standardid=";
    final String separator = "&amp;";

    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter tableFilter =
        new AndFilter(
            new TagNameFilter("table"),
            new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));

    NodeList nodes = parser.parse(tableFilter);
    TableTag table = (TableTag) nodes.elementAt(0);

    TableRow[] rows = table.getRows();
    // Row 0 is skipped (presumably the header row - confirm against the page).
    for (int i = 1; i < rows.length; i++) {
      TableColumn col = rows[i].getColumns()[3];
      LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);

      String href = (tag == null) ? null : tag.getAttribute("href");
      int start = (href == null) ? -1 : href.indexOf(idParam);
      if (start < 0) {
        // No usable id in this row: remember the row instead of failing the whole page.
        List<Integer> failedRows = error.get(pageNo);
        if (failedRows == null) {
          failedRows = new ArrayList<Integer>();
        }
        failedRows.add(i);
        error.put(pageNo, failedRows);
        continue;
      }

      // Look for the separator only after the parameter so that earlier query parameters or a
      // missing separator cannot produce an invalid substring range.
      int valueStart = start + idParam.length();
      int end = href.indexOf(separator, valueStart);
      if (end < 0) {
        end = href.length();
      }
      String standardId = href.substring(valueStart, end);

      List<String> ids = map.get(pageNo);
      if (ids == null) {
        ids = new ArrayList<String>();
      }
      ids.add(standardId);
      map.put(pageNo, ids);
    }
  }
Ejemplo n.º 13
0
  /**
   * Extracts the tags of one exact class from an HTML text, optionally restricted by attribute.
   *
   * <p>A node is accepted when its class is exactly {@code tagType} and one of the following
   * holds: {@code attr} is {@code null}; the tag has the attribute and {@code value} is
   * {@code null}; or the attribute value equals {@code value}.
   *
   * @param <T> tag type
   * @param html HTML text to parse
   * @param tagType class of the tags to extract
   * @param attr name of the attribute the tag should have, or {@code null} for no restriction
   * @param value required attribute value, or {@code null} to accept any value as long as the
   *     attribute is present
   * @param test when {@code true}, the node classes and attribute values examined are logged
   * @return the matching tags, or {@code null} when parsing fails
   */
  public static <T extends TagNode> List<T> parseTags(
      String html,
      final Class<T> tagType,
      final String attr,
      final String value,
      final boolean test) {
    Parser parser = new Parser();
    try {
      PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
      nodeFactory.registerTag(new PreTag());
      parser.setNodeFactory(nodeFactory);
      parser.setInputHTML(html);

      NodeFilter tagFilter =
          new NodeFilter() {

            @Override
            public boolean accept(Node node) {
              if (test) logger.info(node.getClass());
              if (node.getClass() != tagType) {
                return false;
              }
              if (attr == null) {
                return true;
              }
              String actual = ((T) node).getAttribute(attr);
              if (actual != null && value == null) {
                return true; // attribute present, any value is fine
              }
              if (test) logger.info(actual);
              return actual != null && actual.equals(value);
            }
          };
      NodeList matched = parser.parse(tagFilter);

      List<T> tags = new ArrayList<T>();
      int index = 0;
      while (index < matched.size()) {
        tags.add((T) matched.elementAt(index));
        index++;
      }
      return tags;
    } catch (ParserException e) {
      e.printStackTrace();
      return null;
    }
  }
  /**
   * Tests modifying an HTML page by visiting the parsed node list with an anonymous
   * {@code NodeVisitor} that prefixes link and image URLs, instead of a dedicated visitor class.
   */
  public void testPageModification() throws Exception {
    NodeList nodes = Parser.createParser(HTML_WITH_LINK, null).parse(null); // no filter
    // An inner class that does the same thing as the UrlModifyingVisitor.
    nodes.visitAllNodesWith(
        new NodeVisitor() {
          String prefix = "localhost://";

          public void visitTag(Tag tag) {
            if (tag instanceof LinkTag) {
              LinkTag link = (LinkTag) tag;
              link.setLink(prefix + link.getLink());
            } else if (tag instanceof ImageTag) {
              ImageTag image = (ImageTag) tag;
              image.setImageURL(prefix + image.getImageURL());
            }
          }
        });
    assertStringEquals("Expected HTML", MODIFIED_HTML, nodes.toHtml());
  }
Ejemplo n.º 15
0
  /**
   * Parses every node selected by the {@code ROOT} CSS selector into an {@code LCOdds} row and
   * stores the collected rows under {@code "lc_odds"}.
   *
   * <p>For each selected node only the children accepted by {@code tdFilter} are handed to
   * {@code parseRow}; rows for which it returns {@code null} are skipped. A warning is logged
   * when no row was produced, and the (possibly empty) list is stored regardless.
   *
   * @param parser parser for the page to crawl
   * @throws ParserException if the page cannot be parsed
   */
  @Override
  public void crawl(Parser parser) throws ParserException {
    List<LCOdds> parsedRows = new ArrayList<LCOdds>();
    NodeIterator rows = parser.parse(new CssSelectorNodeFilter(ROOT)).elements();
    while (rows.hasMoreNodes()) {
      NodeList cells = rows.nextNode().getChildren();
      cells.keepAllNodesThatMatch(tdFilter);

      LCOdds odds = parseRow(cells);
      if (odds != null) {
        parsedRows.add(odds);
      }
    }
    // persist
    if (parsedRows.isEmpty()) {
      log.warn(" -- [ 06_LC_2 ] data is empty !");
    }
    storeData("lc_odds", parsedRows);
  }
Ejemplo n.º 16
0
  /**
   * Extracts detail texts from the sixth table of a page and appends them to {@code detailMap}.
   *
   * <p>For rows 1 to 10 of that table, the text nodes found under the second child of columns 1
   * and 3 are each joined with single spaces and trimmed; both resulting strings are appended,
   * in that order, to the list stored under {@code key}.
   *
   * @param key key of the list in {@code detailMap}
   * @param text page source, parsed with the gb2312 charset
   * @throws Exception if the page cannot be parsed or does not have the expected structure
   */
  private static void addDetailToMap(String key, String text) throws Exception {
    NodeList tables = Parser.createParser(text, "gb2312").parse(new TagNameFilter("table"));
    TableTag detailTable = (TableTag) tables.elementAt(5);
    TableRow[] rows = detailTable.getRows();

    final int[] sourceColumns = {1, 3};
    for (int rowIndex = 1; rowIndex <= 10; rowIndex++) {
      TableColumn[] cols = rows[rowIndex].getColumns();

      // Both texts are computed before the map is touched.
      String[] texts = new String[sourceColumns.length];
      for (int c = 0; c < sourceColumns.length; c++) {
        NodeList spanChildren = cols[sourceColumns[c]].getChildren().elementAt(1).getChildren();
        StringBuilder joined = new StringBuilder();
        for (int j = 0; j < spanChildren.size(); j++) {
          if (spanChildren.elementAt(j) instanceof TextNode) {
            joined.append(spanChildren.elementAt(j).getText()).append(" ");
          }
        }
        texts[c] = joined.toString().trim();
      }

      List<String> details = detailMap.get(key);
      if (details == null) {
        details = new ArrayList<String>();
      }
      for (String detail : texts) {
        details.add(detail);
      }
      detailMap.put(key, details);
    }
  }
Ejemplo n.º 17
0
 /**
  * Variant of the link collection used for land transactions, which are handled separately.
  *
  * <p>A link is kept when its HTML contains {@code coditions} and its inner HTML does not
  * contain "查看". Collection stops as soon as {@code CatchNum} links have been gathered; at
  * least one matching link is always kept when there is one.
  *
  * @param result HTML source of the page
  * @param coditions text that the HTML of a link must contain
  * @param codeKind charset name handed to the parser
  * @return the matching {@code LinkTag} objects; empty when parsing fails
  */
 public static List getLinksByConditions2(String result, String coditions, String codeKind) {
   List links = null;
   Parser parser = Parser.createParser(result, codeKind);
   try {
     links = new ArrayList();
     Node[] candidates = parser.parse(new NodeClassFilter(LinkTag.class)).toNodeArray();
     for (Node candidate : candidates) {
       if (!(candidate instanceof LinkTag)) {
         continue;
       }
       LinkTag link = (LinkTag) candidate;
       if (link.toHtml().contains(coditions) && !link.getChildrenHTML().contains("查看")) {
         links.add(link);
         if (links.size() >= CatchNum) {
           return links;
         }
       }
     }
   } catch (ParserException e) {
     e.printStackTrace();
   }
   return links;
 }
Ejemplo n.º 18
0
  /**
   * Extracts the tags that carry a given attribute value.
   *
   * <p>A node is accepted when its class is exactly {@code tagType} and its
   * {@code attributeName} attribute is present and equal to {@code attributeValue}.
   *
   * @param html HTML text to extract from
   * @param tagType class of the tags to extract
   * @param attributeName name of the attribute to test
   * @param attributeValue value the attribute must have
   * @return the matching tags, or {@code null} when parsing fails
   */
  public static <T extends TagNode> List<T> parseTags(
      String html,
      final Class<T> tagType,
      final String attributeName,
      final String attributeValue) {
    try {
      // Create an HTML parser for the text.
      Parser parser = new Parser();
      parser.setInputHTML(html);

      NodeFilter attributeFilter =
          new NodeFilter() {
            @Override
            public boolean accept(Node node) {
              if (node.getClass() != tagType) {
                return false;
              }
              String actual = ((T) node).getAttribute(attributeName);
              return actual != null && actual.equals(attributeValue);
            }
          };
      NodeList matched = parser.parse(attributeFilter);

      List<T> tags = new ArrayList<T>();
      int index = 0;
      while (index < matched.size()) {
        tags.add((T) matched.elementAt(index));
        index++;
      }
      return tags;
    } catch (ParserException e) {
      e.printStackTrace();
      return null;
    }
  }
Ejemplo n.º 19
0
  /**
   * Extracts the outgoing links of the current page into {@code pageInfo.getLinks()}.
   *
   * <p>Every {@code A} tag of the page content is resolved against the page URL. Links whose
   * href contains "http" more than once are skipped. A resolved link is added once for every
   * filter rule of {@code crawler} that judges it as STORE, FOLLOW or FOLLOW_STORE.
   * NOTE(review): when several rules accept the same link it is added several times - confirm
   * whether {@code getLinks()} de-duplicates.
   *
   * <p>Failures while resolving or judging a single link do not abort the extraction; they are
   * logged at debug level and the link is skipped.
   *
   * @throws Exception if the page URL is not a valid URI or the content cannot be parsed
   */
  public void extractLinks() throws Exception {
    logger.debug("Extracting links " + pageInfo.getUrl());
    String content = pageInfo.getContent();
    if (content == null || content.length() == 0) {
      return;
    }
    URI uri = new URI(pageInfo.getUrl());

    Parser parser = new Parser();
    parser.setInputHTML(content);
    NodeList nodeList = parser.parse(new TagNameFilter("A"));
    logger.debug("get links from " + pageInfo.getUrl() + " size : " + nodeList.size());
    for (int i = 0; i < nodeList.size(); i++) {
      LinkTag tag = (LinkTag) nodeList.elementAt(i);
      String linkHref = tag.extractLink();
      // Skip hrefs that embed a second URL.
      if (linkHref.indexOf("http") != linkHref.lastIndexOf("http")) {
        continue;
      }
      try {
        String link = uri.resolve(linkHref).toString();
        if (link != null && link.length() > 0) {
          for (FilterRule fr : crawler.getFilterRules()) {
            CrawlAction ca = fr.judge(link);
            if (ca == CrawlAction.STORE
                || ca == CrawlAction.FOLLOW
                || ca == CrawlAction.FOLLOW_STORE) {
              logger.debug("linkUri : " + link + " -- ca : " + ca.toString());
              pageInfo.getLinks().add(link);
            }
          }
        }
      } catch (Exception e) {
        // Best effort: one bad link must not stop the others, but leave a trace of it.
        logger.debug("Skipping link " + linkHref + " : " + e);
      }
    }
  }
Ejemplo n.º 20
0
  /**
   * Parses an HTML text and returns every node of it.
   *
   * <p>The parser uses a {@code PrototypicalNodeFactory} on which a {@code PreTag} has been
   * registered, and a filter that accepts every node.
   *
   * @param html HTML text to parse
   * @return all parsed nodes, or {@code null} when parsing fails
   */
  public static NodeList parseAllTags(String html) {
    Parser parser = new Parser();
    try {
      PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
      nodeFactory.registerTag(new PreTag());
      parser.setNodeFactory(nodeFactory);
      parser.setInputHTML(html);

      NodeFilter acceptEverything =
          new NodeFilter() {
            @Override
            public boolean accept(Node node) {
              return true;
            }
          };
      return parser.parse(acceptEverything);
    } catch (ParserException e) {
      e.printStackTrace();
      return null;
    }
  }
Ejemplo n.º 21
0
  /**
   * Extracts the tags of a prepared parser that carry a given attribute.
   *
   * <p>A node is accepted when it is a tag, its class is exactly {@code tagType} (or
   * {@code tagType} is {@code null}), it has the attribute {@code attr}, and either
   * {@code value} is {@code null} or the attribute value equals {@code value}. Nodes that are
   * not tags (text, remarks) are never accepted. The attribute value of every examined
   * {@code Div} is logged.
   *
   * @param <T> tag type
   * @param parser parser already set up with the input to scan
   * @param tagType class of the tags to extract, or {@code null} to accept any tag class
   * @param attr name of the attribute the tag must have
   * @param value required attribute value, or {@code null} to accept any value
   * @return the matching tags, or {@code null} when parsing fails
   */
  public static <T extends TagNode> List<T> parseTags(
      Parser parser, final Class<T> tagType, final String attr, final String value) {
    try {
      NodeList tagList =
          parser.parse(
              new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                  // With tagType == null every node reaches this filter; only tags have
                  // attributes, so reject the rest before casting.
                  if (!(node instanceof TagNode)) {
                    return false;
                  }
                  if (tagType != null && node.getClass() != tagType) {
                    return false;
                  }
                  String attrv = ((TagNode) node).getAttribute(attr);
                  if (node instanceof Div) logger.info(attrv);
                  if (attrv == null) {
                    return false;
                  }
                  return value == null || attrv.equals(value);
                }
              });

      List<T> tags = new ArrayList<T>();
      for (int i = 0; i < tagList.size(); i++) {
        tags.add((T) tagList.elementAt(i));
      }
      return tags;
    } catch (ParserException e) {
      e.printStackTrace();
    }

    return null;
  }
Ejemplo n.º 22
0
  /**
   * Processes the target hyperlink nodes of a page.
   *
   * <p>Every {@code a} tag with {@code target="_blank"} that is not rejected by
   * {@code filterHandler.isLinkTagFilter} is handed to the crawl queue as a new node, together
   * with {@code preUrl}.
   *
   * <p>This method never throws: any failure is logged and the remaining links are abandoned.
   *
   * @param htmlPageContent HTML source of the page
   * @param preUrl URL passed along with every queued link
   */
  public void dealLinkNodes(String htmlPageContent, String preUrl) {
    try {
      Parser parser = new Parser();
      parser.setInputHTML(htmlPageContent);
      NodeFilter filter =
          new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"));
      NodeList nodeList = parser.parse(filter);
      LoggerUtil.info("ParserHandler", "爬虫得到新的节点个数:" + (nodeList != null ? nodeList.size() : 0));
      if (nodeList == null) {
        return;
      }
      NodeIterator it = nodeList.elements();
      while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        if (!(node instanceof LinkTag)) {
          continue;
        }
        LinkTag link = (LinkTag) node;
        if (!filterHandler.isLinkTagFilter(link)) {
          LoggerUtil.debug("ParserHandler  ", link.getLink(), link.getLinkText());
          CrawlQueue.getQueueManager().newNode(link.getLinkText(), link.getLink(), preUrl);
        }
      }
    } catch (Exception e) {
      // Still best effort, but a failure must at least be visible in the log.
      LoggerUtil.info("ParserHandler", "dealLinkNodes failed for " + preUrl + ": " + e);
    }
  }