Пример #1
0
  /**
   * Collects the text of the stylesheets linked from the document's HEAD and stores it on the
   * document as a {@code CSSData} resource under the key {@code "CSS.DATA"}.
   *
   * <p>Among the nodes yielded by the HEAD node's iterator, only {@code LINK} nodes whose
   * {@code type} attribute equals {@code text/css} (case-insensitive) and that have a non-null
   * {@code href} value are followed. Each href is resolved against {@code address}, fetched with
   * {@code loadContent}, decoded as UTF-8 and appended to the {@code CSSData}.
   *
   * @param address URL of the page; used as the base when resolving stylesheet links
   * @param document parsed page that receives the {@code CSSData} resource
   * @throws Exception if {@code address} cannot be parsed as a URL, or whatever resolving or
   *     loading a stylesheet propagates
   */
  private static void loadCSS(String address, HTMLDocument document) throws Exception {
    // Registered first, so the resource is present even when no stylesheet is found.
    CSSData cssData = new CSSData();
    document.putResource("CSS.DATA", cssData);

    NodePath headPath = pathParser.toPath("HEAD");
    HTMLNode head = extractor.lookNode(document.getRoot(), headPath);
    // NOTE(review): assumes lookNode yields null when the page has no HEAD - confirm.
    if (head == null) return;

    // The base URL is the same for every link, so it is parsed once instead of per iteration.
    URL baseUrl = new URL(address);
    URLUtils urlUtils = new URLUtils();

    NodeIterator iterator = head.iterator();
    while (iterator.hasNext()) {
      HTMLNode node = iterator.next();
      if (!node.isNode(Name.LINK)) continue;

      Attributes attributes = node.getAttributes();
      Attribute type = attributes.get("type");
      if (type == null) continue;
      if (!"text/css".equalsIgnoreCase(type.getValue())) continue;

      Attribute href = attributes.get("href");
      if (href == null) continue;
      String link = href.getValue();
      if (link == null) continue;

      // Resolve the href against the page URL before fetching it.
      link = urlUtils.createURL(baseUrl, link);

      System.out.println(link);
      byte[] bytes = loadContent(link);

      // Charset constant avoids the by-name lookup and its checked UnsupportedEncodingException.
      String css = new String(bytes, java.nio.charset.StandardCharsets.UTF_8);
      cssData.addValue(css);
    }
  }
Пример #2
0
  /**
   * Downloads a web page, cleans up its {@code CONTENT} nodes, loads its linked stylesheets, and
   * saves the text of the node chosen by {@code WebPageDataSearcher} to a file. An empty file is
   * written when the searcher returns no node.
   *
   * @param args optional overrides: {@code args[0]} is the page URL and {@code args[1]} is the
   *     output file path; the built-in sample URL and output path are used for any that are absent
   * @throws Exception whatever URL parsing, page loading, parsing or saving propagates
   */
  public static void main(String[] args) throws Exception {
    // Alternative sample page: http://vnmedia.vn/newsdetail.asp?NewsId=154558&CatId=58
    String address = "http://vnexpress.net/GL/Xa-hoi/2009/02/3BA0B4AB/";
    String outputPath = "F:\\Temp2\\web\\output\\extract.htm";
    // Command-line values, when given, replace the sample defaults above.
    if (args != null && args.length > 0) address = args[0];
    if (args != null && args.length > 1) outputPath = args[1];

    webClient.setURL(address, new URL(address));
    HTMLDocument document = HTMLParser.createDocument(loadContent(address), "utf-8");

    // Rewrite the value of every CONTENT node: decode, trim, then normalize to Unicode NFC.
    RefsDecoder decoder = new RefsDecoder();
    NodeIterator iterator = document.getRoot().iterator();
    while (iterator.hasNext()) {
      HTMLNode contentNode = iterator.next();
      if (!contentNode.isNode(Name.CONTENT)) continue;
      // NOTE(review): RefsDecoder presumably expands HTML character references - confirm.
      char[] chars = decoder.decode(contentNode.getValue());

      chars = CharsUtil.cutAndTrim(chars, 0, chars.length);
      String normalized = Normalizer.normalize(new String(chars), Normalizer.Form.NFC);
      contentNode.setValue(normalized.toCharArray());
    }

    loadCSS(address, document);

    NodePath bodyPath = pathParser.toPath("BODY");
    HTMLNode body = extractor.lookNode(document.getRoot(), bodyPath);

    WebPageDataSearcher dataSearcher = new WebPageDataSearcher(document);
    HTMLNode node = dataSearcher.search(body);

    File file = new File(outputPath);
    byte[] bytes = new byte[0];
    if (node != null) bytes = node.getTextValue().getBytes(Application.CHARSET);
    RWData.getInstance().save(file, bytes);
  }