Beispiel #1
0
 /**
  * Decides whether a node carries no content worth keeping.
  *
  * <p>A node counts as empty when it is {@code null}, when its node name is one of
  * {@code #comment}, {@code #data}, {@code style} or {@code script}, when
  * {@code isHidden} reports it as hidden, when filtering is requested and
  * {@code isFiltered} reports it as filtered, or when it is a {@code #text} node whose
  * string form is empty according to {@code CommonUtil.isEmpty(..., true)}.
  *
  * @param node the node to inspect; may be {@code null}
  * @param doFilter whether the {@code isFiltered} check should also be applied
  * @return {@code true} if any of the conditions above holds
  */
 public static boolean isEmpty(Node node, boolean doFilter) {
   if (node == null) {
     return true;
   }
   final String name = node.nodeName();
   // Node kinds that are skipped regardless of what they contain.
   if (name.equals("#comment")
       || name.equals("#data")
       || name.equals("style")
       || name.equals("script")) {
     return true;
   }
   if (isHidden(node)) {
     return true;
   }
   if (doFilter && isFiltered(node)) {
     return true;
   }
   // Anything else is empty only if it is a text node with empty string form.
   return name.equals("#text") && CommonUtil.isEmpty(node.toString(), true);
 }
Beispiel #2
0
 /**
  * Checks whether a result is filtered by delegating to {@code UrlUtil.isUrlFiltered}.
  *
  * <p>The result's {@code url} and the fragment parsed from its {@code urlNode} are
  * forwarded together with the supplied whitelist, patterns and URL nodes. The first and
  * last arguments of the delegate are passed as {@code null}.
  * NOTE(review): the meaning of those two {@code null} arguments and of the
  * {@code false} flag given to {@code CommonUtil.parseFragment} is not visible here -
  * confirm against those helpers.
  *
  * @param result the result whose {@code url} and {@code urlNode} are examined
  * @param whitelist forwarded unchanged to {@code UrlUtil.isUrlFiltered}
  * @param patterns forwarded unchanged to {@code UrlUtil.isUrlFiltered}
  * @param urlNodes forwarded unchanged to {@code UrlUtil.isUrlFiltered}
  * @return whatever {@code UrlUtil.isUrlFiltered} returns for these arguments
  */
 public static boolean isResultFiltered(
     Result result, String[] whitelist, String[] patterns, HtmlNode[] urlNodes) {
   return UrlUtil.isUrlFiltered(
       null,
       result.url,
       CommonUtil.parseFragment(result.urlNode, false),
       whitelist,
       patterns,
       urlNodes,
       null);
 }
Beispiel #3
0
 /**
  * Tests whether a DOM node matches a reference node description.
  *
  * <p>Matching proceeds in this order:
  *
  * <ol>
  *   <li>A {@code null} test node never matches.
  *   <li>If the reference has a non-empty {@code id}, the result is solely the
  *       case-insensitive comparison with the test node's {@code id} attribute.
  *   <li>Otherwise, if the reference has a non-empty {@code name}, the result is solely the
  *       case-insensitive comparison with the test node's {@code name} attribute.
  *   <li>Otherwise tag name, {@code type}, {@code value}, {@code title}, {@code role},
  *       {@code alt}, {@code href} and (for elements) the stripped inner text are compared
  *       case-insensitively, followed by the CSS classes. Reference fields that are empty
  *       are ignored.
  * </ol>
  *
  * <p>When {@code reference.any} is set, the first agreeing field or shared class yields
  * {@code true} and the absence of any agreement yields {@code false}. When it is not set,
  * the first disagreeing field yields {@code false}; if the reference declares classes, the
  * test node must carry at least one class and every one of its classes must be among the
  * reference's classes.
  *
  * <p>Class names are split on runs of whitespace and empty tokens are discarded, so a
  * reference without classes places no constraint on (and never matches via) the class
  * attribute.
  *
  * @param reference the description to match against; must not be {@code null}
  * @param test the node under test; may be {@code null}
  * @return {@code true} if the test node matches the reference as described above
  */
 public static boolean matches(HtmlNode reference, Node test) {
   if (test == null) {
     return false;
   }
   // An id or a name on the reference is decisive on its own.
   if (!CommonUtil.isEmpty(reference.id)) {
     return reference.id.equalsIgnoreCase(test.attr("id"));
   }
   if (!CommonUtil.isEmpty(reference.name)) {
     return reference.name.equalsIgnoreCase(test.attr("name"));
   }

   // Each entry is {expected value from the reference, actual value from the test node}.
   List<String[]> toMatch = new ArrayList<String[]>();
   toMatch.add(new String[] {reference.tagName, test.nodeName()});
   toMatch.add(new String[] {reference.type, test.attr("type")});
   toMatch.add(new String[] {reference.value, test.attr("value")});
   toMatch.add(new String[] {reference.title, test.attr("title")});
   toMatch.add(new String[] {reference.role, test.attr("role")});
   toMatch.add(new String[] {reference.alt, test.attr("alt")});
   toMatch.add(new String[] {reference.href, test.attr("href")});
   if (test instanceof Element) {
     toMatch.add(
         new String[] {
           CommonUtil.strip(reference.innerText, false),
           CommonUtil.strip(((Element) test).text(), false)
         });
   }

   for (String[] pair : toMatch) {
     if (CommonUtil.isEmpty(pair[0])) {
       // The reference does not constrain this field.
       continue;
     }
     boolean same = pair[0].equalsIgnoreCase(pair[1]);
     if (reference.any) {
       if (same) {
         return true;
       }
     } else if (!same) {
       return false;
     }
   }

   Collection<String> refClasses =
       splitClassNames(CommonUtil.toString(reference.classes, " "));
   if (!refClasses.isEmpty()) {
     Collection<String> testClasses = splitClassNames(test.attr("class"));
     if (reference.any) {
       for (String testClass : testClasses) {
         if (refClasses.contains(testClass)) {
           return true;
         }
       }
     } else if (testClasses.isEmpty() || !refClasses.containsAll(testClasses)) {
       // A node without classes cannot satisfy a reference that declares some, and any
       // class outside the reference's set is a mismatch.
       return false;
     }
   }
   return !reference.any;
 }

 /**
  * Splits a whitespace-separated class list into a set of lower-cased class names.
  *
  * <p>Splitting on {@code "\\s+"} and skipping empty tokens matters because
  * {@code "".split(...)} yields a single empty string, which would otherwise be treated as
  * a real class name.
  *
  * @param classNames the raw class list; may be {@code null} or empty
  * @return the distinct, non-empty, lower-cased class names; never {@code null}
  */
 private static Collection<String> splitClassNames(String classNames) {
   Collection<String> tokens = new HashSet<String>();
   if (classNames == null) {
     return tokens;
   }
   // Locale.ROOT keeps the lowering locale-independent, in line with equalsIgnoreCase above.
   for (String token : classNames.toLowerCase(java.util.Locale.ROOT).split("\\s+")) {
     if (!token.isEmpty()) {
       tokens.add(token);
     }
   }
   return tokens;
 }
Beispiel #4
0
 /**
  * Extracts the first portion of a node's {@code class} attribute that matches the
  * {@code nodeMarker} pattern.
  *
  * @param node the node to read the {@code class} attribute from; may be {@code null}
  * @return the matched text, or {@code null} if the node is {@code null}, its class
  *     attribute is empty per {@code CommonUtil.isEmpty}, or the pattern is not found
  */
 static String classId(Node node) {
   if (node == null) {
     return null;
   }
   String classAttr = node.attr("class");
   if (CommonUtil.isEmpty(classAttr)) {
     return null;
   }
   Matcher found = nodeMarker.matcher(classAttr);
   return found.find() ? found.group(0) : null;
 }
Beispiel #5
0
 /**
  * Parses the given markup into a document and runs {@code NodeUtil.clean} over its
  * top-level child nodes.
  *
  * @param string the markup handed to {@code CommonUtil.parse}
  * @param url the URL handed to {@code CommonUtil.parse} alongside the markup
  * @return the parsed document after its child nodes have been passed to
  *     {@code NodeUtil.clean}
  */
 public static Document clean(String string, String url) {
   final Document parsed = CommonUtil.parse(string, url, false);
   NodeUtil.clean(parsed.childNodes());
   return parsed;
 }