Example #1
  public void checkprice() throws Exception {

    // System.out.println("checking Aptamil url [" + page.url + "]");
    URL url = new URL(page.url);
    HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
    urlConnection.setConnectTimeout(Constant.connect_timeout);
    urlConnection.connect();

    Parser parser = new Parser(urlConnection);

    parser.setEncoding(Constant.ENCODE);
    NodeClassFilter div_filter = new NodeClassFilter(Div.class);

    // An OrFilter over a single predicate is equivalent to using div_filter directly.
    OrFilter filters = new OrFilter();
    filters.setPredicates(new NodeFilter[] {div_filter});

    NodeList list = parser.extractAllNodesThatMatch(filters);

    for (int i = 0; i < list.size(); i++) {
      Node tag = list.elementAt(i);
      if (tag instanceof Div) {
        Div d = (Div) tag;
        String divclass = d.getAttribute("class");
        if ("pl_addToBasket".equalsIgnoreCase(divclass)) {
          // return getName(d);
        }
      }
    }
  }
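
A more direct sketch of the same lookup (BasketDivSketch and the URL are hypothetical, not part of the original): the single-predicate OrFilter above can be dropped, and the class check can be pushed into the filter chain. Note HasAttributeFilter matches the attribute value exactly, where the loop above compares case-insensitively.

  import org.htmlparser.Parser;
  import org.htmlparser.filters.AndFilter;
  import org.htmlparser.filters.HasAttributeFilter;
  import org.htmlparser.filters.NodeClassFilter;
  import org.htmlparser.tags.Div;
  import org.htmlparser.util.NodeList;

  public class BasketDivSketch {
    public static void main(String[] args) throws Exception {
      Parser parser = new Parser("http://example.com/product"); // hypothetical URL
      // match only <div class="pl_addToBasket"> nodes, so no manual loop is needed
      NodeList divs =
          parser.extractAllNodesThatMatch(
              new AndFilter(
                  new NodeClassFilter(Div.class),
                  new HasAttributeFilter("class", "pl_addToBasket")));
      if (divs.size() > 0) {
        System.out.println(((Div) divs.elementAt(0)).getStringText());
      }
    }
  }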
Example #2
  /**
   * Method: fetch the matching page content.
   *
   * @param htmlPageContent
   * @param preUrl
   * @throws ParserException
   *     <p>Add By Ethan Lam At 2011-11-23
   */
  public void fetchHtmlContent(String htmlPageContent, String preUrl) throws ParserException {
    Parser parser = new Parser();
    parser.setInputHTML(htmlPageContent);
    NodeFilter filter =
        new AndFilter(
            new TagNameFilter("div"), new HasAttributeFilter("class", "blkContainerSblkCon"));
    NodeList nodeList = parser.parse(filter);
    NodeIterator it = nodeList.elements();
    Div div = null;
    StringBuffer htmlContent = new StringBuffer();
    while (it.hasMoreNodes()) {
      div = (Div) it.nextNode();
      NodeList nl = div.getChildren();
      if (nl == null) continue;
      NodeIterator sub = nl.elements();
      while (sub.hasMoreNodes()) {
        Node t = sub.nextNode();
        if (t instanceof ParagraphTag) {
          //	        		    LoggerUtil.info("fetchHtmlContent:",((ParagraphTag) t).getStringText());
          htmlContent.append(((ParagraphTag) t).getStringText());
        }
      }
    }
    if ("".equals(htmlContent.toString().trim())) return;

    Page page = new Page();
    page.setUrl(preUrl);
    page.setSegment(htmlContent.toString());
    LoggerUtil.info(preUrl + " page content fetched:", htmlContent.toString());
    pageSer.save(page);
  }
Example #3
File: AreaTest.java  Project: zhaoccx/LS
  /**
   * Fetch the news article content.
   *
   * @param newsContentFilter
   * @param parser
   * @return content the news body text
   */
  public String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
    String content = null;
    StringBuilder builder = new StringBuilder();

    try {
      NodeList newsContentList = parser.parse(newsContentFilter);
      for (int i = 0; i < newsContentList.size(); i++) {
        Div newsContentTag = (Div) newsContentList.elementAt(i);
        builder = builder.append(newsContentTag.getStringText());
      }
      content = builder.toString(); // convert to String
      if (content != null) {
        parser.reset();
        parser = Parser.createParser(content, "utf8");
        StringBean sb = new StringBean();
        sb.setCollapse(true);
        parser.visitAllNodesWith(sb);
        content = sb.getStrings();
        // String s = "\";} else{ document.getElementById('TurnAD444').innerHTML = \"\";} }
        // showTurnAD444(intTurnAD444); }catch(e){}";

        content = content.replaceAll("\\\".*[a-z].*\\}", ""); // strip leftover JavaScript like the sample above

        content = content.replace("[我来说两句]", "");

      } else {
        System.out.println("没有得到新闻内容!");
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }

    return content;
  }
Example #4
 // Fetch the links on a page whose HTML contains the given condition string
 public static List getLinksByConditions(String result, String conditions, String codeKind) {
   List links = null;
   Parser parser;
   NodeList nodelist;
   // Page encoding configuration. To do by shengf
   parser = Parser.createParser(result, codeKind);
   NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
   try {
     links = new ArrayList();
     nodelist = parser.parse(linkFilter);
     Node[] nodes = nodelist.toNodeArray();
     int count = 1;
     for (int i = 0; i < nodes.length; i++) {
       Node node = nodes[i];
       if (node instanceof LinkTag) {
         LinkTag link = (LinkTag) node;
        if (link.toHtml().indexOf(conditions) != -1) {
           links.add(link);
           count++;
           if (count > CatchNum) { // stop once CatchNum links have been collected
             return links;
           }
         }
       }
     }
   } catch (ParserException e) {
     e.printStackTrace();
   }
   return links;
 }
Example #5
 public void testUrlModificationWithVisitor() throws Exception {
   Parser parser = Parser.createParser(HTML_WITH_LINK, null);
   UrlModifyingVisitor visitor = new UrlModifyingVisitor("localhost://");
   parser.visitAllNodesWith(visitor);
   String result = visitor.getModifiedResult();
   assertStringEquals("Expected HTML", MODIFIED_HTML, result);
 }
Example #6
 // If there is a <title> element on the start page, use that as our AU
 // name.
 String recomputeRegName() {
   if (!isStarted()) {
      // This can get invoked (several times, mostly from logging) before
     // enough mechanism has started to make it possible to resolve the CuUrl
     // below.
     return null;
   }
   try {
     CachedUrl cu = makeCachedUrl(m_registryUrl);
     if (cu == null) return null;
     URL cuUrl = CuUrl.fromCu(cu);
     Parser parser = new Parser(cuUrl.toString());
     NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class));
      Node[] nodes = nodelst.toNodeArray();
     recomputeRegName = false;
     if (nodes.length < 1) return null;
     // Get the first title found
     TitleTag tag = (TitleTag) nodes[0];
     if (tag == null) return null;
     return tag.getTitle();
   } catch (MalformedURLException e) {
     log.warning("recomputeRegName", e);
     return null;
   } catch (ParserException e) {
     if (e.getThrowable() instanceof FileNotFoundException) {
       log.warning("recomputeRegName: " + e.getThrowable().toString());
     } else {
       log.warning("recomputeRegName", e);
     }
     return null;
   }
 }
Example #7
 private NodeList getMatchingTags(NodeFilter filter) throws Exception {
   String html = examiner.html();
   Parser parser = new Parser(new Lexer(new Page(html)));
    NodeList list = parser.parse(null); // no filter: collect every node
   NodeList matches = list.extractAllNodesThatMatch(filter, true);
   return matches;
 }
Example #8
  public static void dealOnePage(String url, int startNo) {
    try {
      Parser parser = new Parser((HttpURLConnection) (new URL(url)).openConnection());
      NodeList tableSet =
          parser.extractAllNodesThatMatch(new HasAttributeFilter("bgcolor", "#DDE1FF"));
      parser = new Parser(new Lexer(tableSet.toHtml()));
      // Note: HasAttributeFilter("tr") matches nodes that have an attribute named
      // "tr"; to select <tr> tags a TagNameFilter("tr") was probably intended.
      NodeList tdSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("tr"));
      parser = new Parser(new Lexer(tdSet.toHtml()));

      PrototypicalNodeFactory p = new PrototypicalNodeFactory();
      p.registerTag(new SpanTag());
      parser.setNodeFactory(p);

      // Same caveat: a TagNameFilter("span") was probably intended here.
      NodeList spanSet = parser.extractAllNodesThatMatch(new HasAttributeFilter("span"));
      int index = 0;
      // Every fifth span apparently holds a running sequence number (starting at
      // startNo * 100); numbers missing from the sequence are printed.
      for (int i = 5; i < spanSet.size(); i = i + 5) {
        String str = spanSet.elementAt(i).toPlainTextString();
        String now = "" + (startNo * 100 + index);
        index++;
        while (str.compareTo(now) != 0) {
          System.out.println(now);
          now = "" + (startNo * 100 + index);
          index++;
        }
        // System.out.println(str);
      }
    } catch (ParserException e) {
      e.printStackTrace();
    } catch (MalformedURLException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
Example #9
 @Test
 public void test5() {
   try {
     Parser parser = new Parser();
     parser.setURL("www.zhihu.com");
    } catch (Exception e) {
      // Swallowed in the original: "www.zhihu.com" has no scheme, so setURL
      // likely fails here and the test passes vacuously.
      e.printStackTrace();
    }
 }
Example #10
  // Extract the links on a page; the filter decides which links to keep
  public static Set<String> extracLinks(String url, Cobweb cobweb) {
    Set<String> links = new HashSet<String>();

    try {
      Parser parser = new Parser(url);
      parser.setEncoding(cobweb.getCharSet());

      // Filter matching <frame> tags, used to extract the src attribute
      NodeFilter frameFilter =
          new NodeFilter() {
            public boolean accept(Node node) {
              return node.getText().startsWith("frame src=");
            }
          };

      // OrFilter combining the <a> tag filter and the <frame> filter
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      // collect all tags that pass the filter
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) { // <a> tag
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink(); // URL
          if (cobweb.accept(linkUrl)) {
            links.add( // java.net.URLEncoder.encode(linkUrl));
                linkUrl
                    .replaceAll("\\?", "\\%3F") // 转码
                    .replaceAll("\\&", "\\%26")
                    .replaceAll("\\|", "\\%124")
                    .replaceAll("\\#", ""));
          }
          ;
        } else { // <frame> tag
          // Extract the link from the frame's src attribute, e.g. <frame src="test.html"/>
          // (assumes a double-quoted src, as in the example)
          String frame = tag.getText();
          int start = frame.indexOf("src=") + 5; // first character of the URL
          int end = frame.indexOf('"', start); // closing quote
          if (end == -1) {
            end = frame.length();
          }
          String frameUrl = frame.substring(start, end);
          if (cobweb.accept(frameUrl)) {
            links.add(frameUrl);
          }
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
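
The hand-rolled encoding above writes '|' as %124, its decimal character code, where the conventional percent-encoding is %7C. A minimal sketch of the standard route hinted at by the commented-out URLEncoder call (LinkEncoding is a hypothetical helper; URLEncoder applies form encoding, so spaces become '+'):

  import java.io.UnsupportedEncodingException;
  import java.net.URLEncoder;

  final class LinkEncoding {
    static String encodeForStorage(String linkUrl) {
      try {
        return URLEncoder.encode(linkUrl, "UTF-8"); // standard percent-encoding
      } catch (UnsupportedEncodingException e) {
        throw new AssertionError("UTF-8 is always available", e);
      }
    }
  }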
Example #11
 /**
  * Fetch the scrolling brand list.
  *
  * @param path
  * @param city
  * @param fileName
  * @return a map from brand id to brand image URL
  */
 public static Map<String, String> getBrandInfo(String path, String city, String fileName) {
   Map<String, String> brandMap = new LinkedHashMap<String, String>();
   try {
     StringBuilder filePath = new StringBuilder();
     filePath.append(PATH);
     filePath.append(city);
     filePath.append(INCLUDE);
     filePath.append(fileName);
     filePath.append(STUFF);
      // start parsing
      Parser parser = new Parser(filePath.toString());
      // filter out <div> tags (the original comment said <a>, but the filter matches divs)
      NodeFilter divFilter = new NodeClassFilter(Div.class);
     NodeList classList = parser.extractAllNodesThatMatch(divFilter);
     NodeList hrefList = null;
     NodeList imgList = null;
     Node picNode = null;
     Node hrefNode = null;
     Node imgNode = null;
     String classStr = "";
     String hrefStr = "";
     String imgStr = "";
     String imgClass = "";
     for (int i = 0; i < classList.size(); i++) {
       picNode = classList.elementAt(i);
       classStr = ((Div) picNode).getAttribute("class");
       if ("business_list_pic".equalsIgnoreCase(classStr)) {
         hrefList = picNode.getChildren();
         for (int j = 0; j < hrefList.size(); j++) {
           hrefNode = hrefList.elementAt(j);
           if (hrefNode instanceof LinkTag) {
              hrefStr = ((LinkTag) hrefNode).getLink(); // the href carries the brand id
              hrefStr = MParseBrand.getBrandId(hrefStr); // extract the id
             imgList = hrefNode.getChildren();
             for (int k = 0; k < imgList.size(); k++) {
               imgNode = imgList.elementAt(k);
               if (imgNode instanceof ImageTag) {
                 imgClass = ((ImageTag) imgNode).getAttribute("class");
                 if (null != imgClass) {
                   imgStr = ((ImageTag) imgNode).getAttribute("src");
                   if (null == imgStr) {
                     imgStr = ((ImageTag) imgNode).getAttribute("original");
                   }
                 }
               }
             }
             brandMap.put(hrefStr, imgStr);
           }
         }
       }
     }
   } catch (Exception e) {
     e.printStackTrace();
   }
   return brandMap;
 }
Example #12
  public static void setEventValidation(String html) throws ParserException {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(
            new TagNameFilter("input"), new HasAttributeFilter("id", "__EVENTVALIDATION"));
    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);

    eventValidation = node.getAttribute("value");
  }
Example #13
  public static void setViewState(String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter filter =
        new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", "__VIEWSTATE"));

    NodeList nodes = parser.parse(filter);
    InputTag node = (InputTag) nodes.elementAt(0);

    viewState = node.getAttribute("value");
  }
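
Examples #12 and #13 differ only in the id of the hidden input they read. A small helper of this shape would cover both (a sketch; HiddenFieldReader is hypothetical, the gb2312 charset is carried over from the originals, and returning null for a missing field is my choice, where the originals would throw):

  import org.htmlparser.Parser;
  import org.htmlparser.filters.AndFilter;
  import org.htmlparser.filters.HasAttributeFilter;
  import org.htmlparser.filters.TagNameFilter;
  import org.htmlparser.tags.InputTag;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;

  final class HiddenFieldReader {
    /** Returns the value of the <input> with the given id, or null if absent. */
    static String valueOf(String html, String inputId) throws ParserException {
      Parser parser = Parser.createParser(html, "gb2312");
      NodeList nodes =
          parser.parse(
              new AndFilter(new TagNameFilter("input"), new HasAttributeFilter("id", inputId)));
      if (nodes.size() == 0) {
        return null;
      }
      return ((InputTag) nodes.elementAt(0)).getAttribute("value");
    }
  }

With it, setEventValidation reduces to eventValidation = HiddenFieldReader.valueOf(html, "__EVENTVALIDATION").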
Example #14
 /** @throws ParserException */
 private void parseHtml() throws ParserException {
   htmlTags = new ArrayList();
   Parser parser = new Parser();
   parser.setInputHTML(fDocument.get());
   for (NodeIterator e = parser.elements(); e.hasMoreNodes(); ) {
     Node node = e.nextNode();
     VHtmlNodeVisitor htmlNodeVisitor = new VHtmlNodeVisitor();
     node.accept(htmlNodeVisitor);
   }
 }
Example #15
 public static List<String> getLinks(String url) throws ParserException {
   Parser htmlParser = new Parser(url);
   List<String> links = new LinkedList<String>();
   NodeList tagNodeList = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
   for (int m = 0; m < tagNodeList.size(); m++) {
     LinkTag loopLinks = (LinkTag) tagNodeList.elementAt(m);
     String linkName = loopLinks.getLink();
     links.add(linkName);
   }
   return links;
 }
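
A hypothetical call to getLinks, from a method that declares throws ParserException (example.com is a placeholder; network access is assumed):

  for (String link : getLinks("http://example.com/")) {
    System.out.println(link);
  }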
Example #16
  /** Test regular expression matching: */
  public void testRegularExpression() throws Exception {
    String target =
        "\n"
            + "\n"
            + "Most recently, in the Western Conference final, the Flames knocked off \n"
            + "the San Jose Sharks, the Pacific Division champions, to become the first \n"
            + "Canadian team to reach the Stanley Cup Championship series since 1994.";

    String html =
        "<html><head><title>CBC Sports Online: NHL Playoffs</title></head>"
            + "<body><h1>CBC SPORTS ONLINE</h1>\n"
            + "The Calgary Flames have already defeated three NHL division winners \n"
            + "during their improbable playoff run. If they are to hoist the Stanley \n"
            + "Cup they'll have to go through one more. <p><table ALIGN=\"Right\" width=196 CELLPADDING=0 cellspacing=0 hspace=4> <tr><td><img src=\"/gfx/topstory/sports/iginla_j0524.jpg\" width=194 height=194 hspace=3 border=1><br>\n"
            + "\n"
            + "<font SIZE=\"1\" FACE=\"verdana,arial\">\n"
            + "Jarome Iginla skates during the Flames' practice on Monday. Calgary takes on the Tampa Bay Lightning in the Stanley Cup finals beginning Tuesday night in Tampa\n"
            + "</font></td></tr></table>\n"
            + "\n"
            + "\n"
            + "In the post-season's first round, the Flames defeated the Vancouver \n"
            + "Canucks, the Northwest Division winners, in seven tough games. <p>\n"
            + "\n"
            + "In Round 2 it was the Detroit Red Wings, who not only won the Central \n"
            + "Division, but also boasted the NHL's best overall record during the \n"
            + "regular season, who fell to the Flames. <p>"
            + target
            + "<p>\n"
            + "\n"
            + "Up next for the Flames is the Tampa Bay Lighting -- the runaway winners \n"
            + "of the NHL's Southeast Division and the Eastern Conference's best team \n"
            + "during the regular season. <p>\n"
            + "\n"
            + "The Lighting advanced by beating the Philadelphia Flyers in the Eastern \n"
            + "Conference final. <p>\n"
            + "</body></html>\n";
    Lexer lexer;
    Parser parser;
    RegexFilter filter;
    NodeIterator iterator;
    int count;

    lexer = new Lexer(html);
    parser = new Parser(lexer);
    filter =
        new RegexFilter(
            // a 19xx/20xx year, optionally followed by -MM-DD style month and day
            "(19|20)\\d\\d([- \\\\/.](0[1-9]|1[012])[- \\\\/.](0[1-9]|[12][0-9]|3[01]))?");
    count = 0;
    for (iterator = parser.extractAllNodesThatMatch(filter).elements(); iterator.hasMoreNodes(); ) {
      assertEquals("text wrong", target, iterator.nextNode().toHtml());
      count++;
    }
    assertEquals("wrong count", 1, count);
  }
Example #17
  // Extract the links on a page; the filter decides which links to keep
  public static Set<String> extracLinks(String url, LinkFilter filter) {

    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      // parser.setEncoding("utf8");
      // Filter matching <iframe> tags (the original comment said <frame>), used to extract the src link
      NodeFilter frameFilter =
          new NodeFilter() {
            /** */
            private static final long serialVersionUID = 1L;

            public boolean accept(Node node) {
              return node.getText().startsWith("iframe") && node.getText().contains("src=");
            }
          };
      // OrFilter combining the <a> tag filter and the frame filter
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
      // collect all tags that pass the filter
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag) // <a> tag
        {
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink(); // the url may appear in src, href, etc.
          if (filter.accept(linkUrl)) links.add(linkUrl);
        } else // <iframe> tag
        {
          // Extract the link from the src attribute, e.g. <iframe src="test.html"/>
          // (assumes a double-quoted src value)
          String frame = tag.getText();
          int start = frame.indexOf("src=\"") + 5; // first character of the URL
          int end = frame.indexOf('"', start); // closing quote
          if (end == -1) {
            end = frame.length();
          }
          String frameUrl = frame.substring(start, end);
          if (filter.accept(frameUrl)) links.add(frameUrl);
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
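
Both frame-handling branches in Examples #10 and #17 slice the URL out of raw tag text. When the matched node is an actual Tag, the attribute can be read directly through the library's Tag.getAttribute; a minimal sketch (SrcExtractor is hypothetical, and the null fallback is my choice):

  import org.htmlparser.Node;
  import org.htmlparser.Tag;

  final class SrcExtractor {
    /** Returns the tag's src attribute, or null for non-tags or a missing attribute. */
    static String srcOf(Node node) {
      if (node instanceof Tag) {
        return ((Tag) node).getAttribute("src");
      }
      return null; // fall back to manual text slicing for non-tag nodes
    }
  }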
Example #18
  private String[] processBlog(InputStream in) throws BlogCrawlingException {

    // using a set here to avoid duplicates
    Set<String> linksToBlogs = new TreeSet<String>();

    try {

      Page page = new Page(in, null);
      Parser parser = new Parser(new Lexer(page));

      // register a filter to extract all the anchor tags
      TagNameFilter anchorTagsFilter = new TagNameFilter("a");

      NodeList anchorTagsList = parser.parse(anchorTagsFilter);

      for (int i = 0; i < anchorTagsList.size(); i++) {
        Node node = anchorTagsList.elementAt(i);
        LinkTag tag = (LinkTag) node;
        String linkURL = tag.getLink();

        if (blogDetector.identifyURL(linkURL, null) != Constants.NOT_A_BLOG) {
          // logger.info(" *BLOG Detected* ==> " + linkURL);
          System.out.println("[" + myNumber + "] *BLOG Detected* ==> " + linkURL);
          linksToBlogs.add(linkURL);
        } else {
          System.out.println("[" + myNumber + "] *Non-BLOG Detected* ==> " + linkURL);
        }
      }

      // copy the de-duplicated links into an array
      String[] links = linksToBlogs.toArray(new String[linksToBlogs.size()]);

      return links;

    } catch (ParserException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    } catch (IOException e) {
      e.printStackTrace();
      throw new BlogCrawlingException(e);
    }
  }
Example #19
  /**
   * Assign the underlying node filter for this wrapper.
   *
   * @param filter The filter to wrap.
   * @param context The parser to use for conditioning this filter. Some filters need contextual
   *     information to provide to the user, i.e. for tag names or attribute names or values, so the
   *     Parser context is provided.
   */
  public void setNodeFilter(NodeFilter filter, Parser context) {
    Set set;

    mFilter = (TagNameFilter) filter;
    set = new HashSet();
    context.reset();
    try {
      for (NodeIterator iterator = context.elements(); iterator.hasMoreNodes(); )
        addName(set, iterator.nextNode());
    } catch (ParserException pe) {
      // oh well, we tried
    }
    for (Iterator iterator = set.iterator(); iterator.hasNext(); ) mName.addItem(iterator.next());
    mName.setSelectedItem(mFilter.getName());
  }
Example #20
  private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter viewStateFilter =
        new AndFilter(
            new TagNameFilter("table"),
            new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));

    NodeList nodes = parser.parse(viewStateFilter);
    TableTag node = (TableTag) nodes.elementAt(0);

    TableRow[] rows = node.getRows();
    for (int i = 1; i < rows.length; i++) {
      TableColumn[] cols = rows[i].getColumns();
      TableColumn col = cols[3];
      // the link sits at a fixed position: 4th column -> 2nd child div -> 3rd child
      LinkTag tag = (LinkTag) ((Div) col.getChildren().elementAt(1)).getChildren().elementAt(2);
      if (tag == null) {
        List<Integer> lst = error.get(pageNo);
        if (lst == null) {
          lst = new ArrayList<Integer>();
        }
        lst.add(i);
        error.put(pageNo, lst);
        continue;
      }

      String href = tag.getAttribute("href");
      if (href == null) {
        List<Integer> lst = error.get(pageNo);
        if (lst == null) {
          lst = new ArrayList<Integer>();
        }
        lst.add(i);
        error.put(pageNo, lst);
        continue;
      }
      int start = href.indexOf("standardid=");
      int end = href.indexOf("&amp;");

      String standardId = href.substring(start, end).replaceAll("standardid=", "");

      List<String> lst = map.get(pageNo);
      if (lst == null) {
        lst = new ArrayList<String>();
      }
      lst.add(standardId);
      map.put(pageNo, lst);
    }
  }
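
The standardid extraction above takes a substring that still carries the key and strips it afterwards with replaceAll. A sketch of the same parse in one pass (standardIdOf is a hypothetical helper):

  // Extract the value of the standardid query parameter from an href.
  static String standardIdOf(String href) {
    int start = href.indexOf("standardid=");
    if (start < 0) {
      return null;
    }
    start += "standardid=".length();
    int end = href.indexOf("&amp;", start); // hrefs here are HTML-escaped
    return (end < 0) ? href.substring(start) : href.substring(start, end);
  }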
Example #21
File: AreaTest.java  Project: zhaoccx/LS
  /**
   * Parse the news URL, extract the article, and insert it into the database.
   *
   * @param url the news link
   */
  public void parser(String url) {
    try {
      parser = new Parser(url);
      // NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"), new
      // HasAttributeFilter("class", "TRS_PreAppend"));

      // parser.reset(); // Remember to reset the parser after each use, or later parses return nothing.

      NodeFilter innerFilter =
          new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "MsoNormal"));
      NodeFilter xk = new HasParentFilter(innerFilter);
      NodeList nodes = parser.extractAllNodesThatMatch(xk);
      System.out.println(nodes.size());
      for (int i = 0; i < nodes.size(); i++) {
        Node time = nodes.elementAt(i);
        // System.out.println(time.toPlainTextString().trim().replace("&nbsp;",
        // "").replaceAll("[\\t\\n\\r]", "").replaceAll(" ", ""));
        System.out.println(
            replaceBlank(time.getLastChild().getText().replaceAll("span", "").replaceAll(" ", "")));
      }

    } catch (ParserException ex) {
      Logger.getLogger(AreaTest.class.getName()).log(Level.SEVERE, null, ex);
    }
  }
Example #22
  public static void setInnerHTML(Element root, String html) {

    // remove the old root's children
    OverrideNodeList<Node> list = (OverrideNodeList<Node>) root.getChildNodes();
    list.getList().clear();

    if (html != null) {
      Parser parser = Parser.createParser(html, "UTF-8");
      try {
        parser.visitAllNodesWith(new GwtNodeVisitor(root));
      } catch (ParserException e) {
        throw new RuntimeException(
            "error while parsing <" + root.getTagName() + "> element's innerHTML : " + html, e);
      }
    }
  }
Example #23
  /**
   * Parse the news URL and collect its data.
   *
   * @param url the news link
   */
  public void parser(String url) {
    String title = ""; // 新闻标题
    String source = ""; // 新闻来源
    String sourceTime = ""; // 新闻来源时间
    // String author = ""; //新闻作者
    String Content = ""; // 新闻内容
    // String collectTime = ""; //新闻采集时间-系统时间
    try {
      parser = new Parser(url);
      parser.setEncoding("GB2312");
      // Title
      NodeFilter titleFilter = new TagNameFilter("h1");
      NodeList titleNodeList = parser.parse(titleFilter);
      title = parserUtil.getNodeListText(titleNodeList);
      parser.reset(); // must reset after every parse, or later parses return no data
      System.out.println(title);
      // Source
      NodeFilter sourceFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "media_name"));
      NodeList sourceNodeList = parser.parse(sourceFilter);
      source = parserUtil.getNodeListText(sourceNodeList);
      parser.reset();
      System.out.println(source);
      // Publication time
      NodeFilter sourceTimeFilter =
          new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "pub_date"));
      NodeList sourceTimeNodeList = parser.parse(sourceTimeFilter);
      String str = parserUtil.getNodeListText(sourceTimeNodeList);
      // normalize the Chinese date format (年/月/日) to dashes
      sourceTime = str.replace("年", "-").replace("月", "-").replace("日", " ").replace("&nbsp;", "");
      parser.reset();
      System.out.println(sourceTime);

      // Body
      NodeFilter ContentTimeFilter =
          new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "artibody"));
      NodeList ContentTimeNodeList = parser.parse(ContentTimeFilter);
      NodeList childList = ContentTimeNodeList.elementAt(0).getChildren();
      childList.keepAllNodesThatMatch(new NotFilter(new TagNameFilter("div"))); // strip non-body parts
      // childList.keepAllNodesThatMatch(new RegexFilter("  相关专题"));

      content = parserUtil.getNodeListHTML(ContentTimeNodeList);
      // content = ParserUtil.getPlainText(content);
      System.out.println(content);
      parser.reset();

    } catch (ParserException e) {
      e.printStackTrace();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Example #24
  public boolean checkprice() {
    System.out.println("checking amazon url:" + page.url);
    try {

      URL url = new URL(page.url);
      HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
      urlConnection.setConnectTimeout(Constant.connect_timeout);

      Parser parser = new Parser(urlConnection);
      parser.setEncoding(Constant.ENCODE);

      // OrFilter lastFilter = new OrFilter();
      // lastFilter.setPredicates(new NodeFilter[] {
      // new NodeClassFilter(TableTag.class),
      // new NodeClassFilter(Div.class) });
      //
      // NodeList list = parser.extractAllNodesThatMatch(lastFilter);

      NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(Div.class));
      System.out.println("size:" + list.size());

      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);

        if (tag instanceof Div) {
          Div d = (Div) tag;
          String id = d.getAttribute("id");
          System.out.println(id);

          if (id != null && id.startsWith("result_")) { // guard: some divs have no id
            // found one product
            try {
              AmazonProduct product = new AmazonProduct();
              product.name = d.getAttribute("name");
              getPriceAndLabel(d, product);

            } catch (Exception e) {
              e.printStackTrace();
            }
          }
        }
      }

    } catch (Exception e) {
      System.out.println(e.getMessage());
    }
    return false; // the original never returns true
  }
Example #25
  public static void main(String[] args) throws Exception {
    RequestConfig requestConfig =
        RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
    CloseableHttpClient httpclient =
        HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    int count = 1;
    for (int i = 0; i <= 16; i++) {
      int index = i;
      // System.out.println(index);
      HttpGet httpGet = new HttpGet(url3 + index + url4);
      HttpResponse response = httpclient.execute(httpGet);
      HttpEntity entity = response.getEntity();
      String htmls = null;
      if (entity != null) {
        htmls = EntityUtils.toString(entity).replace("\t", " ");
      }
      if (htmls == null) {
        continue; // createParser cannot take a null document; skip empty responses
      }
      Parser parser = Parser.createParser(htmls, "utf-8");
      AndFilter dFilter =
          new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "field-content"));
      NodeList nodes3 = parser.extractAllNodesThatMatch(dFilter);
      for (int k = 0; k < nodes3.size(); k++) {
        htmls = nodes3.elementAt(k).toHtml();
        parser = Parser.createParser(htmls, "utf-8");
        AndFilter ProfessionNameFilter =
            new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("href"));
        NodeList nodes4 = parser.extractAllNodesThatMatch(ProfessionNameFilter);
        for (int j = 0; j < nodes4.size(); j++) {
          LinkTag link = (LinkTag) nodes4.elementAt(j);
          // if(link.getAttribute("href").contains("http://www.ulster.ac.uk/"))
          { // .replaceAll("<span[\\s\\S]*/span>","")
            String temp = link.toHtml();

            System.out.println(
                "{\""
                    + count
                    + "\",\"http://www.chi.ac.uk/"
                    + link.getAttribute("href")
                    + "\",\""
                    + html2Str(temp).replace("\r\n", "").trim()
                    + "\",\"0\"},");
            count++;
          }
        }
      }
    }
    // System.out.println("DONE.");
  }
Example #26
  /**
   * Collect article links from the given HTML.
   *
   * @param content the page HTML
   * @param pre required link prefix (matched against the lowercased link)
   * @throws Exception
   */
  void docByHTML(String content, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setInputHTML(content);
    parser.setEncoding("GB2312");

    NodeFilter filter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(filter);
    if (list != null && list.size() > 0) {
      // Re-parsing the matched links' HTML with the same filter is redundant, but kept from the original.
      Parser p1 = new Parser();
      p1.setInputHTML(list.toHtml());
      NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
      NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
      if (linkList != null && linkList.size() > 0) {
        for (int i = 0; i < linkList.size(); i++) {
          LinkTag link = (LinkTag) linkList.elementAt(i);
          LinkBean bean = null;
          if (link.getLink().toLowerCase().startsWith(pre)
              && !link.getLinkText().equalsIgnoreCase("详细内容")) { // skip "full story" links
            if (null == articleDocCache.get(getKey(link.getLink()))) {
              bean = new LinkBean();
              bean.setLink(link.getLink());
              bean.setName(link.getLinkText());
              LINKHASH.put(link.getLink(), bean);
            } else {
              logger.info(">> 已存在 [" + link.getLink() + "] 地址");
            }
          }
        }
      }
    }
  }
Example #27
  /**
   * Collect article links from the given URL.
   *
   * @param url the listing page URL
   * @param pre required link prefix (matched against the lowercased link)
   * @throws Exception
   */
  void doc(String url, String pre) throws Exception {
    Parser parser = new Parser();
    parser.setURL(url);
    parser.setEncoding("GB2312");

    NodeFilter filter = new NodeClassFilter(LinkTag.class);
    NodeList list = parser.extractAllNodesThatMatch(filter);
    if (list != null && list.size() > 0) {
      Parser p1 = new Parser();
      p1.setInputHTML(list.toHtml());
      NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
      NodeList linkList = p1.extractAllNodesThatMatch(linkFilter);
      if (linkList != null && linkList.size() > 0) {
        for (int i = 0; i < linkList.size(); i++) {
          LinkTag link = (LinkTag) linkList.elementAt(i);
          LinkBean bean = null;
          if (link.getLink().toLowerCase().startsWith(pre)
              && !link.getLinkText().equalsIgnoreCase("详细内容")) { // skip "full story" links
            bean = new LinkBean();
            bean.setLink(link.getLink());
            bean.setName(link.getLinkText());
            LINKHASH.put(link.getLink(), bean);
          }
        }
      }
    }
  }
Example #28
  /**
   * @param <T> the tag type
   * @param html the HTML text to parse
   * @param tagType the tag type's class
   * @param attr the attribute the tag should have
   * @param value the attribute value (if null, any present value matches)
   * @param test if true, log each candidate node for debugging
   * @return the matching tags, or null on parse failure
   */
  public static <T extends TagNode> List<T> parseTags(
      String html,
      final Class<T> tagType,
      final String attr,
      final String value,
      final boolean test) {
    Parser parser = new Parser();
    try {
      PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
      factory.registerTag(new PreTag());
      parser.setNodeFactory(factory);
      parser.setInputHTML(html);
      NodeList tagList =
          parser.parse(
              new NodeFilter() {

                @Override
                public boolean accept(Node node) {
                  if (test) logger.info(node.getClass());
                  if (node.getClass() == tagType) {
                    if (attr == null) return true;
                    T tn = (T) node;
                    String attrv = tn.getAttribute(attr);
                    if (value == null && attrv != null) { // || attrv.equals(value)
                      return true;
                    }
                    if (test) logger.info(attrv);
                    if (value != null && attrv != null && attrv.equals(value)) return true;
                  }
                  return false;
                }
              });

      List<T> tags = new ArrayList<T>();
      for (int i = 0; i < tagList.size(); i++) {
        tags.add((T) tagList.elementAt(i));
      }
      return tags;
    } catch (ParserException e) {
      e.printStackTrace();
    }

    return null;
  }
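
A hypothetical call to parseTags, collecting <div class="content"> tags (Div is htmlparser's div tag class; the HTML literal is made up):

  List<Div> contentDivs =
      parseTags("<div class=\"content\">hello</div>", Div.class, "class", "content", false);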
Example #29
  /** Test a better method of modifying an HTML page. */
  public void testPageModification() throws Exception {
    Parser parser = Parser.createParser(HTML_WITH_LINK, null);
    NodeList list = parser.parse(null); // no filter
    // make an inner class that does the same thing as the UrlModifyingVisitor
    NodeVisitor visitor =
        new NodeVisitor() {
          String linkPrefix = "localhost://";

          public void visitTag(Tag tag) {
            if (tag instanceof LinkTag)
              ((LinkTag) tag).setLink(linkPrefix + ((LinkTag) tag).getLink());
            else if (tag instanceof ImageTag)
              ((ImageTag) tag).setImageURL(linkPrefix + ((ImageTag) tag).getImageURL());
          }
        };
    list.visitAllNodesWith(visitor);
    String result = list.toHtml();
    assertStringEquals("Expected HTML", MODIFIED_HTML, result);
  }
Example #30
  public void testSelectors() throws Exception {
    String html =
        "<html><head><title>sample title</title></head><body inserterr=\"true\" yomama=\"false\"><h3 id=\"heading\">big </invalid>heading</h3><ul id=\"things\"><li><br word=\"broken\"/>&gt;moocow<li><applet/>doohickey<li class=\"last\"><b class=\"item\">final<br>item</b></ul></body></html>";
    Lexer l;
    Parser p;
    CssSelectorNodeFilter it;
    NodeIterator i;
    int count;

    l = new Lexer(html);
    p = new Parser(l);
    it = new CssSelectorNodeFilter("li + li"); // adjacent sibling: every <li> directly preceded by an <li>
    count = 0;
    for (i = p.extractAllNodesThatMatch(it).elements(); i.hasMoreNodes(); ) {
      assertEquals("tag name wrong", "LI", ((Tag) i.nextNode()).getTagName());
      count++;
    }
    assertEquals("wrong count", 2, count);
  }
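
For comparison, the <b class="item"> element in the same document should be reachable with a class selector, assuming CssSelectorNodeFilter supports the tag.class form (a sketch, not from the original test):

  CssSelectorNodeFilter classFilter = new CssSelectorNodeFilter("b.item");
  NodeList matches = Parser.createParser(html, null).extractAllNodesThatMatch(classFilter);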