/**
 * Recursively checks whether two nodes share the same layout: identical
 * node name, the same number of attributes with pairwise matching attribute
 * names (case-insensitive, compared position by position), and children that
 * match under the same rule.
 *
 * <p>Attribute values and node content are not inspected.
 *
 * @param node1 the first node to compare
 * @param node2 the second node to compare
 * @return {@code true} when both nodes and all of their descendants line up
 *         as described above, {@code false} otherwise
 */
private boolean equalsFormat(HTMLNode node1, HTMLNode node2) {
  // Names are compared by identity, not by equals().
  if (node1.getName() != node2.getName()) {
    return false;
  }

  Attributes leftAttrs = node1.getAttributes();
  Attributes rightAttrs = node2.getAttributes();
  if (leftAttrs.size() != rightAttrs.size()) {
    return false;
  }
  for (int idx = 0; idx < leftAttrs.size(); idx++) {
    Attribute left = leftAttrs.get(idx);
    Attribute right = rightAttrs.get(idx);
    if (left == null || right == null) {
      // At least one slot is empty: acceptable only if both are.
      if (left != right) {
        return false;
      }
      continue;
    }
    if (!left.getName().equalsIgnoreCase(right.getName())) {
      return false;
    }
  }

  List<HTMLNode> leftKids = node1.getChildren();
  List<HTMLNode> rightKids = node2.getChildren();
  if (leftKids == null || rightKids == null) {
    // Two childless nodes match; a childless node never matches a parent.
    return leftKids == rightKids;
  }

  int total = leftKids.size();
  if (total != rightKids.size()) {
    return false;
  }
  for (int idx = 0; idx < total; idx++) {
    if (!equalsFormat(leftKids.get(idx), rightKids.get(idx))) {
      return false;
    }
  }
  return true;
}
/**
 * Walks the tree rooted at {@code node} and removes the {@code "src"}
 * attribute from every {@code IFRAME} node encountered, including
 * {@code node} itself.
 *
 * @param node root of the subtree to process; must not be {@code null}
 */
protected void removeIFrameSource(HTMLNode node) {
  if (node.isNode(Name.IFRAME)) {
    node.getAttributes().remove("src");
  }

  List<HTMLNode> kids = node.getChildren();
  if (kids == null) {
    return;
  }

  // Plain index walk; an empty list simply never enters the loop.
  int idx = 0;
  while (idx < kids.size()) {
    removeIFrameSource(kids.get(idx));
    idx++;
  }
}
/**
 * Decides whether {@code node}, or one of its descendants, is a container
 * made up mostly of links.
 *
 * <p>When {@code isListNode} accepts the node's children, the nodes visited
 * by {@code node.iterator()} are scanned for {@code A} nodes, and the node
 * qualifies if the child count exceeds the anchor count by at most 3.
 * Otherwise each child is examined recursively and the first qualifying
 * child makes the whole node qualify.
 *
 * <p>NOTE(review): with the tolerance of 3, any accepted list of three or
 * fewer children qualifies even with no anchors at all - confirm that this
 * is the intended heuristic.
 *
 * @param node the node to inspect
 * @return {@code true} if the node or a descendant is a link container;
 *         {@code false} otherwise, including when the node has no children
 */
private boolean isLinkContainer(HTMLNode node) {
  List<HTMLNode> kids = node.getChildren();
  if (kids == null) {
    return false;
  }

  if (!isListNode(kids)) {
    // Not a list here: look for a link container further down.
    for (HTMLNode kid : kids) {
      if (isLinkContainer(kid)) {
        return true;
      }
    }
    return false;
  }

  int anchors = 0;
  for (NodeIterator walker = node.iterator(); walker.hasNext();) {
    if (walker.next().isNode(Name.A)) {
      anchors++;
    }
  }
  // Same test as "anchors >= kids.size() - 3".
  return kids.size() - anchors <= 3;
}
/**
 * Decides whether {@code node} is essentially a block of links with very
 * little text of its own.
 *
 * <p>First pass: every {@code A} node visited by {@code node.iterator()}
 * that {@code linkNodeChecker} reports as not valid is collected. Second
 * pass: the node is iterated again with that collection passed to
 * {@code node.iterator(...)} (presumably so those links are skipped -
 * confirm against {@code HTMLNode.iterator}), and the words of every
 * {@code CONTENT} node are summed via {@code countWord}.
 *
 * @param node the node to inspect
 * @return {@code true} when fewer than 5 words were counted and more than
 *         one link was collected; {@code false} otherwise, including when
 *         the node has no children
 */
private boolean isLinkDiv(HTMLNode node) {
  if (node.getChildren() == null) {
    return false;
  }

  List<HTMLNode> rejectedLinks = new ArrayList<HTMLNode>();
  for (NodeIterator walker = node.iterator(); walker.hasNext();) {
    HTMLNode current = walker.next();
    if (!current.isNode(Name.A)) {
      continue;
    }
    if (linkNodeChecker.isValid(new CheckModel(current), 0)) {
      continue;
    }
    rejectedLinks.add(current);
  }

  int words = 0;
  for (NodeIterator walker = node.iterator(rejectedLinks); walker.hasNext();) {
    HTMLNode current = walker.next();
    if (current.isNode(Name.CONTENT)) {
      words += countWord(current);
    }
  }

  return words < 5 && rejectedLinks.size() > 1;
}