/**
 * Fetches the stylesheets linked from the document's HEAD and stores their text on the document.
 *
 * <p>A fresh {@code CSSData} is registered on {@code document} under the resource key
 * {@code "CSS.DATA"}. Every {@code LINK} node found while iterating the HEAD node that has a
 * {@code type} attribute equal (ignoring case) to {@code text/css} and a non-null {@code href}
 * is resolved against {@code address}, downloaded with {@code loadContent}, decoded as UTF-8 and
 * appended to that {@code CSSData}. Links without a {@code type} attribute are skipped.
 *
 * @param address  address of the page; used as the base when resolving each {@code href}
 * @param document parsed page whose HEAD is scanned and which receives the {@code CSSData}
 * @throws Exception if resolving or downloading a stylesheet fails
 */
private static void loadCSS(String address, HTMLDocument document) throws Exception {
  CSSData cssData = new CSSData();
  document.putResource("CSS.DATA", cssData);

  NodePath headPath = pathParser.toPath("HEAD");
  HTMLNode head = extractor.lookNode(document.getRoot(), headPath);
  // Defensive: presumably lookNode yields null when the page has no HEAD (confirm against
  // the extractor). The empty CSSData stays registered so later lookups still find it.
  if (head == null) {
    return;
  }

  URLUtils urlUtils = new URLUtils();
  // The base URL is the same for every link: build it once, on first use, so a page
  // without stylesheet links never constructs it (same as the original behavior).
  URL baseURL = null;

  NodeIterator iterator = head.iterator();
  while (iterator.hasNext()) {
    HTMLNode node = iterator.next();
    if (!node.isNode(Name.LINK)) {
      continue;
    }

    Attributes attributes = node.getAttributes();
    Attribute type = attributes.get("type");
    if (type == null || !"text/css".equalsIgnoreCase(type.getValue())) {
      continue;
    }

    Attribute href = attributes.get("href");
    if (href == null) {
      continue;
    }
    String link = href.getValue();
    if (link == null) {
      continue;
    }

    if (baseURL == null) {
      baseURL = new URL(address);
    }
    link = urlUtils.createURL(baseURL, link);
    System.out.println(link);

    byte[] bytes = loadContent(link);
    cssData.addValue(new String(bytes, "utf-8"));
  }
}
public static void main(String[] args) throws Exception { String address = "http://vnexpress.net/GL/Xa-hoi/2009/02/3BA0B4AB/"; webClient.setURL(address, new URL(address)); // String address = "http://vnmedia.vn/newsdetail.asp?NewsId=154558&CatId=58"; java.net.URL url = new java.net.URL(address); HTMLDocument document = HTMLParser.createDocument(loadContent(address), "utf-8"); RefsDecoder decoder = new RefsDecoder(); NodeIterator iterator = document.getRoot().iterator(); while (iterator.hasNext()) { HTMLNode node = iterator.next(); if (!node.isNode(Name.CONTENT)) continue; char[] chars = node.getValue(); chars = decoder.decode(chars); chars = CharsUtil.cutAndTrim(chars, 0, chars.length); chars = java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray(); node.setValue(chars); } loadCSS(address, document); NodePath nodePath = pathParser.toPath("BODY"); HTMLNode body = extractor.lookNode(document.getRoot(), nodePath); WebPageDataSearcher dataSearcher = new WebPageDataSearcher(document); HTMLNode node = dataSearcher.search(body); File file = new File("F:\\Temp2\\web\\output\\extract.htm"); byte[] bytes = new byte[0]; if (node != null) bytes = node.getTextValue().getBytes(Application.CHARSET); RWData.getInstance().save(file, bytes); }