/**
 * Collects the text of the external style sheets linked from the document head.
 *
 * <p>A fresh {@code CSSData} is stored on the document under the resource key
 * {@code "CSS.DATA"}. Every {@code LINK} node yielded by the head's iterator whose
 * {@code type} attribute equals {@code text/css} (case-insensitive) and that has a
 * non-null {@code href} value is resolved against {@code address}, fetched through
 * {@code loadContent}, decoded as UTF-8 and appended to that {@code CSSData}.
 *
 * @param address  URL of the page; used as the base when resolving each {@code href}
 * @param document parsed document whose head is scanned and which receives the resource
 * @throws Exception if the address cannot be parsed as a URL while resolving a link,
 *                   or if anything thrown by the path lookup or download propagates
 */
private static void loadCSS(String address, HTMLDocument document) throws Exception {
    CSSData cssData = new CSSData();
    document.putResource("CSS.DATA", cssData);

    NodePath nodePath = pathParser.toPath("HEAD");
    HTMLNode head = extractor.lookNode(document.getRoot(), nodePath);
    // Guard against a failed lookup; the (empty) CSSData stays registered.
    // NOTE(review): assumes lookNode reports "not found" as null - confirm.
    if (head == null) return;

    URLUtils urlUtils = new URLUtils();
    // Built on first use so that an unparsable address only fails when there is
    // actually a link to resolve, and is not re-parsed for every style sheet.
    URL baseUrl = null;

    NodeIterator iterator = head.iterator();
    while (iterator.hasNext()) {
        HTMLNode node = iterator.next();
        if (!node.isNode(Name.LINK)) continue;

        Attributes attributes = node.getAttributes();
        Attribute attribute = attributes.get("type");
        if (attribute == null) continue;
        if (!"text/css".equalsIgnoreCase(attribute.getValue())) continue;

        attribute = attributes.get("href");
        if (attribute == null) continue;
        String link = attribute.getValue();
        if (link == null) continue;

        if (baseUrl == null) baseUrl = new URL(address);
        link = urlUtils.createURL(baseUrl, link);
        // Trace of each resolved style sheet URL (kept from the original).
        System.out.println(link);

        byte[] bytes = loadContent(link);
        String css = new String(bytes, "utf-8");
        cssData.addValue(css);
    }
}
/**
 * Tells whether the given node contains form markup.
 *
 * @param node node whose iterator is scanned
 * @return {@code true} as soon as the iterator yields a {@code FORM} node or a node
 *         accepted by {@link #isFormElement(HTMLNode)}; {@code false} if it yields none
 */
boolean hasForm(HTMLNode node) {
    for (NodeIterator walker = node.iterator(); walker.hasNext();) {
        HTMLNode candidate = walker.next();
        if (candidate.isNode(Name.FORM) || isFormElement(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Strips the {@code src} attribute from every {@code IFRAME} in the subtree.
 *
 * <p>The node itself is handled first, then each child returned by
 * {@code getChildren()} is processed recursively.
 *
 * @param node root of the subtree to clean
 */
protected void removeIFrameSource(HTMLNode node) {
    if (node.isNode(Name.IFRAME)) {
        node.getAttributes().remove("src");
    }

    List<HTMLNode> children = node.getChildren();
    if (children == null || children.isEmpty()) {
        return;
    }
    for (HTMLNode child : children) {
        removeIFrameSource(child);
    }
}
/**
 * Counts the words of the text found under a node, leaving out link text.
 *
 * <p>Every node yielded by the node's iterator is considered. A node is skipped when
 * {@code getAncestor(n, Name.A, 0, 5)} finds an {@code A} node, i.e. when the node
 * itself or one of its five nearest parents is an anchor. For the remaining
 * {@code CONTENT} nodes the result of {@code textCounter.countWord} over the whole
 * text value is added to the total.
 *
 * @param node subtree to measure; may be {@code null}
 * @return the accumulated word count, or {@code 0} for a {@code null} node
 */
private int countWord(HTMLNode node) {
    if (node == null) {
        return 0;
    }

    int total = 0;
    for (NodeIterator walker = node.iterator(); walker.hasNext();) {
        HTMLNode current = walker.next();
        if (getAncestor(current, Name.A, 0, 5) != null) {
            continue;
        }
        if (!current.isNode(Name.CONTENT)) {
            continue;
        }
        String text = current.getTextValue();
        total += textCounter.countWord(text, 0, text.length());
    }
    return total;
}
/**
 * Decides whether a node is, or contains, a list made up mostly of links.
 *
 * <p>When {@code isListNode} accepts the node's children, the {@code A} nodes yielded
 * by the node's iterator are counted and the node qualifies if that count is at least
 * the number of direct children minus three. Otherwise the check is repeated on each
 * child and the node qualifies if any child does.
 *
 * @param node node to examine
 * @return {@code true} if the node or one of its descendants qualifies;
 *         {@code false} otherwise, including when the node has no child list
 */
private boolean isLinkContainer(HTMLNode node) {
    List<HTMLNode> children = node.getChildren();
    if (children == null) {
        return false;
    }

    if (!isListNode(children)) {
        // Not a list at this level: look for one further down.
        for (HTMLNode child : children) {
            if (isLinkContainer(child)) {
                return true;
            }
        }
        return false;
    }

    int anchors = 0;
    NodeIterator walker = node.iterator();
    while (walker.hasNext()) {
        if (walker.next().isNode(Name.A)) {
            anchors++;
        }
    }
    return anchors >= children.size() - 3;
}
public void remove(HTMLNode root, HTMLNode first, HTMLNode last) { boolean remove = true; List<HTMLNode> removes = new ArrayList<HTMLNode>(); NodeIterator nodeIterator = root.iterator(); // System.out.println(new String(first.getValue())); while (nodeIterator.hasNext()) { HTMLNode node = nodeIterator.next(); switch (node.getName()) { case UL: if (isLinkContainer(node)) removes.add(node); break; case DIV: case TD: if (isLinkDiv(node)) removes.add(node); break; case CONTENT: // System.out.println(new String(iterNode.getValue())); if (node == first) remove = false; if (remove) { removes.add(node); // System.out.println(new String(node.getValue())); } if (node == last) remove = true; break; default: break; } } // System.out.println(new String(last.getValue())); for (int i = 0; i < removes.size(); i++) { HTMLNode node = removes.get(i); HTMLNode parent = node.getParent(); // System.out.println(parent.getTextValue()); while (parent != null) { // System.out.println(parent.getTextValue()); node.setValue(new char[] {}); parent.removeChild(node); int word = countWord(parent); if (word > 15) break; node = parent; parent = node.getParent(); } } }
/**
 * Tells whether a node is one of the form controls recognised by this class.
 *
 * @param node node to classify
 * @return {@code true} when the node's name is {@code INPUT}, {@code TEXTAREA},
 *         {@code SELECT}, {@code LABEL} or {@code BUTTON}; {@code false} otherwise
 */
private boolean isFormElement(HTMLNode node) {
    boolean formControl;
    switch (node.getName()) {
    case INPUT:
    case TEXTAREA:
    case SELECT:
    case LABEL:
    case BUTTON:
        formControl = true;
        break;
    default:
        formControl = false;
        break;
    }
    return formControl;
}
/**
 * Compares the structure of two subtrees, ignoring attribute values and text.
 *
 * <p>Two nodes have the same format when they have the same name, the same number of
 * attributes, attributes at equal positions are either both {@code null} or carry the
 * same name (case-insensitive), and their child lists are either both {@code null} or
 * have equal size with pairwise same-format children.
 *
 * @param node1 first subtree
 * @param node2 second subtree
 * @return {@code true} if both subtrees have the same format
 */
private boolean equalsFormat(HTMLNode node1, HTMLNode node2) {
    if (node1.getName() != node2.getName()) {
        return false;
    }

    Attributes left = node1.getAttributes();
    Attributes right = node2.getAttributes();
    int attributeCount = left.size();
    if (attributeCount != right.size()) {
        return false;
    }
    for (int i = 0; i < attributeCount; i++) {
        Attribute a = left.get(i);
        Attribute b = right.get(i);
        if (a == null || b == null) {
            // A missing attribute only matches another missing attribute.
            if (a != b) {
                return false;
            }
            continue;
        }
        if (!a.getName().equalsIgnoreCase(b.getName())) {
            return false;
        }
    }

    List<HTMLNode> kids1 = node1.getChildren();
    List<HTMLNode> kids2 = node2.getChildren();
    if (kids1 == null || kids2 == null) {
        return kids1 == kids2;
    }
    if (kids1.size() != kids2.size()) {
        return false;
    }
    for (int i = 0; i < kids1.size(); i++) {
        if (!equalsFormat(kids1.get(i), kids2.get(i))) {
            return false;
        }
    }
    return true;
}
public static void main(String[] args) throws Exception { String address = "http://vnexpress.net/GL/Xa-hoi/2009/02/3BA0B4AB/"; webClient.setURL(address, new URL(address)); // String address = "http://vnmedia.vn/newsdetail.asp?NewsId=154558&CatId=58"; java.net.URL url = new java.net.URL(address); HTMLDocument document = HTMLParser.createDocument(loadContent(address), "utf-8"); RefsDecoder decoder = new RefsDecoder(); NodeIterator iterator = document.getRoot().iterator(); while (iterator.hasNext()) { HTMLNode node = iterator.next(); if (!node.isNode(Name.CONTENT)) continue; char[] chars = node.getValue(); chars = decoder.decode(chars); chars = CharsUtil.cutAndTrim(chars, 0, chars.length); chars = java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray(); node.setValue(chars); } loadCSS(address, document); NodePath nodePath = pathParser.toPath("BODY"); HTMLNode body = extractor.lookNode(document.getRoot(), nodePath); WebPageDataSearcher dataSearcher = new WebPageDataSearcher(document); HTMLNode node = dataSearcher.search(body); File file = new File("F:\\Temp2\\web\\output\\extract.htm"); byte[] bytes = new byte[0]; if (node != null) bytes = node.getTextValue().getBytes(Application.CHARSET); RWData.getInstance().save(file, bytes); }
/**
 * Decides whether a block consists almost entirely of links.
 *
 * <p>First, every {@code A} node yielded by the node's iterator that
 * {@code linkNodeChecker.isValid(new CheckModel(n), 0)} rejects is collected. Then the
 * node is iterated again via {@code node.iterator(collected)} and
 * {@link #countWord(HTMLNode)} is summed over the {@code CONTENT} nodes it yields.
 * NOTE(review): {@code iterator(List)} presumably skips the listed nodes - confirm.
 *
 * @param node block to examine
 * @return {@code true} when more than one link was collected and the summed word
 *         count is below five; {@code false} otherwise, including when the node has
 *         no child list
 */
private boolean isLinkDiv(HTMLNode node) {
    if (node.getChildren() == null) {
        return false;
    }

    List<HTMLNode> rejectedLinks = new ArrayList<HTMLNode>();
    for (NodeIterator walker = node.iterator(); walker.hasNext();) {
        HTMLNode candidate = walker.next();
        if (!candidate.isNode(Name.A)) {
            continue;
        }
        if (linkNodeChecker.isValid(new CheckModel(candidate), 0)) {
            continue;
        }
        rejectedLinks.add(candidate);
    }

    int words = 0;
    for (NodeIterator walker = node.iterator(rejectedLinks); walker.hasNext();) {
        HTMLNode candidate = walker.next();
        if (candidate.isNode(Name.CONTENT)) {
            words += countWord(candidate);
        }
    }

    return words < 5 && rejectedLinks.size() > 1;
}
/**
 * Finds the nearest node with the given name on the path from a node to the root.
 *
 * <p>The search starts at {@code node} itself (counted as depth {@code level}) and
 * follows {@code getParent()}, giving up once the depth exceeds {@code max} or there
 * is no further parent.
 *
 * @param node  node to start from; may be {@code null}
 * @param name  name to look for
 * @param level depth assigned to {@code node}
 * @param max   greatest depth that is still examined
 * @return the first matching node, or {@code null} if none is found within the limit
 */
private HTMLNode getAncestor(HTMLNode node, Name name, int level, int max) {
    HTMLNode current = node;
    for (int depth = level; depth <= max && current != null; depth++) {
        if (current.isNode(name)) {
            return current;
        }
        current = current.getParent();
    }
    return null;
}