/**
 * Reports whether a node should be treated as carrying no content.
 *
 * <p>A node counts as empty when any of the following holds, checked in this order:
 * it is {@code null}; its node name is {@code #comment}, {@code #data}, {@code style}
 * or {@code script}; {@code isHidden} accepts it; {@code doFilter} is set and
 * {@code isFiltered} accepts it; or it is a {@code #text} node whose string form is
 * empty according to {@code CommonUtil.isEmpty(..., true)}.
 *
 * @param node the node to inspect; may be {@code null}
 * @param doFilter whether {@code isFiltered} is consulted at all
 * @return {@code true} if the node is considered empty
 */
public static boolean isEmpty(Node node, boolean doFilter) {
  if (node == null) {
    return true;
  }

  final String kind = node.nodeName();
  if (kind.equals("#comment")
      || kind.equals("#data")
      || kind.equals("style")
      || kind.equals("script")) {
    return true;
  }

  if (isHidden(node)) {
    return true;
  }

  // The filter check is skipped entirely unless the caller asked for it.
  if (doFilter && isFiltered(node)) {
    return true;
  }

  // Only text nodes are judged by their textual content.
  return kind.equals("#text") && CommonUtil.isEmpty(node.toString(), true);
}
/**
 * Checks whether a result's URL is filtered, by delegating to
 * {@code UrlUtil.isUrlFiltered}.
 *
 * <p>The result's {@code url} is passed along with its {@code urlNode} parsed through
 * {@code CommonUtil.parseFragment(..., false)}. The first and last arguments of the
 * delegate are passed as {@code null}.
 *
 * @param result the result whose {@code url} and {@code urlNode} are examined
 * @param whitelist forwarded unchanged to {@code UrlUtil.isUrlFiltered}
 * @param patterns forwarded unchanged to {@code UrlUtil.isUrlFiltered}
 * @param urlNodes forwarded unchanged to {@code UrlUtil.isUrlFiltered}
 * @return whatever {@code UrlUtil.isUrlFiltered} returns for these arguments
 */
public static boolean isResultFiltered(
    Result result,
    String[] whitelist,
    String[] patterns,
    HtmlNode[] urlNodes) {
  return UrlUtil.isUrlFiltered(
      null,
      result.url,
      CommonUtil.parseFragment(result.urlNode, false),
      whitelist,
      patterns,
      urlNodes,
      null);
}
/**
 * Decides whether a DOM node matches the description held by a reference node.
 *
 * <p>Checks are applied in this order:
 * <ol>
 *   <li>A {@code null} test node never matches.</li>
 *   <li>If {@code reference.id} is non-empty, the result is solely the case-insensitive
 *       comparison of that id with the test node's {@code id} attribute.</li>
 *   <li>Otherwise, if {@code reference.name} is non-empty, the result is solely the
 *       case-insensitive comparison with the test node's {@code name} attribute.</li>
 *   <li>Otherwise tag name, {@code type}, {@code value}, {@code title}, {@code role},
 *       {@code alt}, {@code href} and (for {@code Element} nodes only) the stripped inner
 *       text are compared case-insensitively. Empty reference fields are ignored.</li>
 *   <li>Finally, if the reference lists at least one class, the lower-cased class tokens
 *       of the test node are checked against the reference's class tokens.</li>
 * </ol>
 *
 * <p>When {@code reference.any} is true, the first matching field or class yields
 * {@code true} and the absence of any match yields {@code false}. When it is false, the
 * first mismatch yields {@code false} and the absence of any mismatch yields {@code true}.
 *
 * @param reference the description to match against
 * @param test the node being tested; may be {@code null}
 * @return {@code true} if the test node matches the reference
 */
public static boolean matches(HtmlNode reference, Node test) {
  if (test == null) {
    return false;
  }
  // id and name are treated as decisive: when present, nothing else is consulted.
  if (!CommonUtil.isEmpty(reference.id)) {
    return reference.id.equalsIgnoreCase(test.attr("id"));
  }
  if (!CommonUtil.isEmpty(reference.name)) {
    return reference.name.equalsIgnoreCase(test.attr("name"));
  }

  // Pairs of {expected value from reference, actual value from test}.
  List<String[]> toMatch = new ArrayList<String[]>();
  toMatch.add(new String[] {reference.tagName, test.nodeName()});
  toMatch.add(new String[] {reference.type, test.attr("type")});
  toMatch.add(new String[] {reference.value, test.attr("value")});
  toMatch.add(new String[] {reference.title, test.attr("title")});
  toMatch.add(new String[] {reference.role, test.attr("role")});
  toMatch.add(new String[] {reference.alt, test.attr("alt")});
  toMatch.add(new String[] {reference.href, test.attr("href")});
  if (test instanceof Element) {
    toMatch.add(
        new String[] {
          CommonUtil.strip(reference.innerText, false),
          CommonUtil.strip(((Element) test).text(), false)
        });
  }

  // Build the reference class set without empty tokens. String.split returns a
  // one-element array holding "" when given an empty string, which previously made
  // this set non-empty and defeated the "reference has no classes" guard below:
  // a class-less reference then rejected every node with a class (all-mode) and
  // spuriously matched nodes with an empty class attribute (any-mode).
  // A null joined string is likewise treated as "no classes".
  String refClassesString = CommonUtil.toString(reference.classes, " ");
  Collection<String> refClasses = new HashSet<String>();
  if (refClassesString != null) {
    for (String token : refClassesString.toLowerCase().split("\\s")) {
      if (!token.isEmpty()) {
        refClasses.add(token);
      }
    }
  }

  // Test-side tokenization is intentionally left unchanged: an empty class attribute
  // still yields the single token "", which is never in the reference set, so such a
  // node keeps failing an all-mode match against a reference that lists classes.
  // NOTE(review): assumes attr("class") returns "" rather than null when the attribute
  // is absent - confirm against the Node implementation in use.
  Collection<String> testClasses =
      new HashSet<String>(Arrays.asList(test.attr("class").toLowerCase().split("\\s")));

  for (String[] pair : toMatch) {
    if (reference.any) {
      if (!CommonUtil.isEmpty(pair[0]) && pair[0].equalsIgnoreCase(pair[1])) {
        return true;
      }
    } else {
      if (!CommonUtil.isEmpty(pair[0]) && !pair[0].equalsIgnoreCase(pair[1])) {
        return false;
      }
    }
  }

  // Classes are only compared when the reference actually specifies some.
  // NOTE(review): in all-mode this requires every class on the test node to appear in
  // the reference (test classes are a subset of reference classes), not the reverse.
  // Kept as in the original - confirm this direction is intended.
  if (!refClasses.isEmpty()) {
    for (String testClass : testClasses) {
      if (reference.any) {
        if (refClasses.contains(testClass)) {
          return true;
        }
      } else {
        if (!refClasses.contains(testClass)) {
          return false;
        }
      }
    }
  }

  // Any-mode found no match (false); all-mode found no mismatch (true).
  return !reference.any;
}
/**
 * Extracts the first substring of a node's {@code class} attribute that matches the
 * {@code nodeMarker} pattern.
 *
 * @param node the node to inspect; may be {@code null}
 * @return the matched text, or {@code null} if the node is {@code null}, its
 *     {@code class} attribute is empty per {@code CommonUtil.isEmpty}, or the pattern
 *     is not found in it
 */
static String classId(Node node) {
  if (node == null) {
    return null;
  }

  String classAttr = node.attr("class");
  if (CommonUtil.isEmpty(classAttr)) {
    return null;
  }

  Matcher marker = nodeMarker.matcher(classAttr);
  return marker.find() ? marker.group() : null;
}
/**
 * Parses the given markup with {@code CommonUtil.parse(string, url, false)} and hands the
 * resulting document's child nodes to {@code NodeUtil.clean}.
 *
 * @param string the markup to parse
 * @param url forwarded to {@code CommonUtil.parse} (presumably the document's base URL -
 *     confirm against that helper)
 * @return the parsed document, returned after its child nodes were passed to
 *     {@code NodeUtil.clean}
 */
public static Document clean(String string, String url) {
  Document parsed = CommonUtil.parse(string, url, false);
  NodeUtil.clean(parsed.childNodes());
  return parsed;
}