// @todo(dallison) handle rtl text
  /**
   * Walks {@code node} and its descendants depth-first, collecting text into {@code builder}.
   *
   * <p>A text node contributes its text and is not descended into. Any other node contributes
   * the values of its {@code TITLE} and {@code ALT_TEXT} attributes (when present, in that
   * order) before its children are visited. Every appended fragment is followed by one space.
   *
   * @param node node to start from
   * @param builder accumulator that receives the collected text
   */
  private static void recurseNode(Node node, StringBuilder builder) {
    // Boundary case: plain text ends the recursion.
    if (node instanceof TextNode) {
      TextNode textNode = (TextNode) node;
      builder.append(textNode.text()).append(" ");
      return;
    }

    // Title text comes first.
    if (node.hasAttr(TITLE)) {
      builder.append(node.attr(TITLE)).append(" ");
    }

    // Alt text follows; the trailing space separates it from whatever comes next.
    if (node.hasAttr(ALT_TEXT)) {
      builder.append(node.attr(ALT_TEXT)).append(" ");
    }

    // Descend into the children in document order.
    for (Node descendant : node.childNodes()) {
      recurseNode(descendant, builder);
    }
  }
  /**
   * Flags a node as "unlikely" based on its {@code class} and {@code style} attributes.
   *
   * @param e node whose attributes are inspected
   * @return {@code true} if the class attribute contains "caption" (ignoring case), or if
   *     {@code unlikelyPattern} finds a match in the style or class attribute
   */
  boolean unlikely(Node e) {
    String clazz = e.attr("class");

    // Lower-case with Locale.ROOT so the check does not depend on the JVM's default locale:
    // under a Turkish default locale "CAPTION" lower-cases to "captıon" (dotless i) and would
    // no longer contain "caption".
    if (clazz != null && clazz.toLowerCase(Locale.ROOT).contains("caption")) {
      return true;
    }

    String style = e.attr("style");
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
  }
예제 #3
0
  /**
   * Checks a form field against the page that its enclosing form posts to, recording compile
   * warnings on {@code pc} for any problem found.
   *
   * <p>Nothing is checked unless {@code pc.form} is set and its {@code action} starts with "/".
   * An action with no registered page yields an {@code UNRESOLVABLE_FORM_ACTION} warning. For
   * {@code input} and {@code textarea} elements that are not skippable, the {@code name}
   * attribute is compiled as an expression against the target page class; a compile failure
   * yields an {@code UNRESOLVABLE_FORM_BINDING} warning.
   *
   * @param pc compiling context that supplies the current form and collects warnings
   * @param element node being inspected
   */
  private void checkFormFields(PageCompilingContext pc, Node element) {
    if (null == pc.form) return;

    String action = pc.form.attr("action");

    // Only look at contextual uris (i.e. hosted by us).
    // TODO - relative, not starting with '/'
    if (null == action || (!action.startsWith("/"))) return;

    final PageBook.Page page = pageBook.get(action);

    // Only look at pages we actually have registered.
    if (null == page) {
      pc.warnings.add(
          CompileError.in(element.outerHtml())
              .near(line(element))
              .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION));

      return;
    }

    // If we're inside a form do a throw-away compile against the target page.
    if ("input".equals(element.nodeName()) || "textarea".equals(element.nodeName())) {
      String name = element.attr("name");

      // Skip submits and buttons.
      if (skippable(element.attr("type"))) return;

      // TODO Skip empty?
      // NOTE(review): attr() presumably returns an empty string rather than null for a missing
      // attribute, in which case this branch never fires - confirm against the Node API.
      if (null == name) {
        pc.warnings.add(
            CompileError.in(element.outerHtml())
                .near(line(element))
                .causedBy(CompileErrors.FORM_MISSING_NAME));

        return;
      }

      // Compile expression path.
      try {
        new MvelEvaluatorCompiler(page.pageClass()).compile(name);

      } catch (ExpressionCompileException e) {
        // TODO Very hacky, needed to strip out xmlns attribution.
        // Report the element's line, as the other warnings in this method do, instead of its
        // sibling index.
        pc.warnings.add(
            CompileError.in(element.outerHtml())
                .near(line(element))
                .causedBy(CompileErrors.UNRESOLVABLE_FORM_BINDING, e));
      }
    }
  }
예제 #4
0
 /**
  * Tests whether {@code test} fits the description held by {@code reference}.
  *
  * <p>A null {@code test} never matches. If the reference has a non-empty id, the result is
  * decided solely by a case-insensitive comparison with the node's {@code id} attribute; failing
  * that, a non-empty reference name is compared with the {@code name} attribute in the same way.
  * Otherwise the tag name, the type/value/title/role/alt/href attributes, the stripped text (for
  * elements only) and the class names are compared. With {@code reference.any} set, the first
  * agreeing comparison returns {@code true}; without it, the first disagreeing comparison
  * returns {@code false}. When no comparison decides, the result is {@code !reference.any}.
  *
  * @param reference description to match against
  * @param test node under test, may be null
  * @return whether the node matches
  */
 public static boolean matches(HtmlNode reference, Node test) {
   if (test == null) {
     return false;
   }

   // An id, and failing that a name, settles the question on its own.
   if (!CommonUtil.isEmpty(reference.id)) {
     return reference.id.equalsIgnoreCase(test.attr("id"));
   }
   if (!CommonUtil.isEmpty(reference.name)) {
     return reference.name.equalsIgnoreCase(test.attr("name"));
   }

   // Each entry is {expected, actual}; entries with an empty expected value are ignored below.
   List<String[]> pairs = new ArrayList<String[]>();
   pairs.add(new String[] {reference.tagName, test.nodeName()});
   pairs.add(new String[] {reference.type, test.attr("type")});
   pairs.add(new String[] {reference.value, test.attr("value")});
   pairs.add(new String[] {reference.title, test.attr("title")});
   pairs.add(new String[] {reference.role, test.attr("role")});
   pairs.add(new String[] {reference.alt, test.attr("alt")});
   pairs.add(new String[] {reference.href, test.attr("href")});
   if (test instanceof Element) {
     String expectedText = CommonUtil.strip(reference.innerText, false);
     String actualText = CommonUtil.strip(((Element) test).text(), false);
     pairs.add(new String[] {expectedText, actualText});
   }

   // Lower-cased class names of the reference and of the node, split on whitespace.
   String joinedRefClasses = CommonUtil.toString(reference.classes, " ");
   String[] refTokens = joinedRefClasses.toLowerCase().split("\\s");
   Collection<String> refClasses = new HashSet<String>(Arrays.asList(refTokens));
   String[] testTokens = test.attr("class").toLowerCase().split("\\s");
   Collection<String> testClasses = new HashSet<String>(Arrays.asList(testTokens));

   final boolean any = reference.any;

   for (String[] pair : pairs) {
     if (CommonUtil.isEmpty(pair[0])) {
       continue;
     }
     boolean same = pair[0].equalsIgnoreCase(pair[1]);
     // "any" mode stops at the first hit, "all" mode at the first miss.
     if (same == any) {
       return any;
     }
   }

   if (!refClasses.isEmpty()) {
     for (String testClass : testClasses) {
       boolean known = refClasses.contains(testClass);
       if (known == any) {
         return any;
       }
     }
   }

   return !any;
 }
예제 #5
0
  /**
   * Warns when an element points at a contextual uri (one starting with "/") for which the page
   * book has no entry.
   *
   * <p>The uri is taken from the first of the {@code action}, {@code src} and {@code href}
   * attributes that has a value. A missing attribute is treated the same as an empty one, so an
   * element without {@code action} still has its {@code src} or {@code href} checked; a null-only
   * test would stop at the first attribute when the Node API reports missing attributes as "".
   *
   * @param pc compiling context that collects warnings
   * @param element node whose uri attribute is verified
   */
  private void checkUriConsistency(PageCompilingContext pc, Node element) {
    String uriAttrib = null;
    for (String key : new String[] {"action", "src", "href"}) {
      String candidate = element.attr(key);
      if (null != candidate && !candidate.isEmpty()) {
        uriAttrib = candidate;
        break;
      }
    }

    // Verify that such a uri exists in the page book,
    // only if it is contextual--ignore abs & relative URIs.
    if (null == uriAttrib || !uriAttrib.startsWith("/")) {
      return;
    }

    if (null == pageBook.nonCompilingGet(uriAttrib)) {
      pc.warnings.add(
          CompileError.in(element.outerHtml())
              .near(element.siblingIndex()) // TODO - line number
              .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION, uriAttrib));
    }
  }
예제 #6
0
  /**
   * Called to push a new lexical scope onto the stack.
   *
   * <p>A scope is pushed only for nodes whose {@code ANNOTATION} attribute is longer than one
   * character, and only when the annotation key names the repeat or choose widget, or names a
   * page known to the page book (an embedded page).
   *
   * @param pc compiling context that owns the scope stack
   * @param node node being entered
   * @return {@code true} if a scope was pushed, {@code false} otherwise
   */
  private boolean lexicalClimb(PageCompilingContext pc, Node node) {
    if (node.attr(ANNOTATION).length() <= 1) {
      return false;
    }

    final String annotationKey = node.attr(ANNOTATION_KEY);

    // Repeat/choose widgets change the symbol table, so they get a scope of their own.
    final boolean scopingWidget =
        REPEAT_WIDGET.equalsIgnoreCase(annotationKey)
            || CHOOSE_WIDGET.equalsIgnoreCase(annotationKey);
    if (scopingWidget) {
      String[] keyAndContent = {annotationKey, node.attr(ANNOTATION_CONTENT)};
      pc.lexicalScopes.push(new MvelEvaluatorCompiler(parseRepeatScope(pc, keyAndContent, node)));
      return true;
    }

    // Otherwise the key may name an embedded page (closure), compiled against its own class.
    final PageBook.Page embed = pageBook.forName(annotationKey);
    if (null == embed) {
      return false;
    }

    final Class<?> embedClass = embed.pageClass();
    MvelEvaluatorCompiler compiler = new MvelEvaluatorCompiler(embedClass);
    checkEmbedAgainst(
        pc, compiler, Parsing.toBindMap(node.attr(ANNOTATION_CONTENT)), embedClass, node);
    pc.lexicalScopes.push(compiler);
    return true;
  }
예제 #7
0
  /**
   * Removes every {@code hiddenMarker} match from the {@code class} attribute of {@code node}
   * and of each of its ancestors.
   *
   * <p>When a node on that path is a {@code select}, its entire subtree is cleaned as well.
   *
   * @param node node to start from; null is a no-op
   */
  static void markVisible(Node node) {
    // Climb from the node to the root of its tree.
    for (Node current = node; current != null; current = current.parent()) {
      if (current.nodeName().equals("select")) {
        current.traverse(
            new NodeVisitor() {
              @Override
              public void head(Node n, int d) {
                String cleaned = hiddenMarker.matcher(n.attr("class")).replaceAll("");
                n.attr("class", cleaned);
              }

              @Override
              public void tail(Node n, int d) {}
            });
      }

      String cleaned = hiddenMarker.matcher(current.attr("class")).replaceAll("");
      current.attr("class", cleaned);
    }
  }
  /**
   * Hit when the node is first seen; emits a textual marker depending on the node's name.
   *
   * <ul>
   *   <li>{@code li}: a newline
   *   <li>{@code select}: "[SELECT]" followed by {@code tab}
   *   <li>{@code option}: the text gathered by a {@code TextNodeVisitor}, wrapped in braces
   *   <li>{@code input}: "[INPUT]" plus the {@code maxLength} attribute, only when
   *       {@code type} equals "input"
   *   <li>{@code span}: the gathered text, prefixed with ":" and followed by a space
   * </ul>
   *
   * @param node node being entered
   * @param depth depth of the node (unused)
   */
  public void head(Node node, int depth) {
    // Dispatch on the node name instead of on the serialized markup: toString()/outerHtml()
    // were only used to look at the opening tag, which the node name already provides.
    String name = node.nodeName();
    if (name.equals("li")) {
      append('\n');
    } else if (name.equals("select")) {
      append1("[SELECT]");
      append(tab);
    } else if (name.equals("option")) {
      // append1(node.attr("value")+":");append1(" ");
      TextNodeVisitor textVisitor = new TextNodeVisitor();
      node.traverse(textVisitor);
      append1("{" + textVisitor.toString() + "}");
    } else if (name.equals("input")) {
      // NOTE(review): "input" is not a standard value for the type attribute; possibly "text"
      // was intended - confirm before changing.
      if (node.attr("type").equals("input")) {
        append1("[INPUT]" + node.attr("maxLength"));
      }
    } else if (name.equals("span")) {
      TextNodeVisitor textVisitor = new TextNodeVisitor();
      node.traverse(textVisitor);
      append1(":" + textVisitor.toString() + " ");
    }
  }
예제 #9
0
  /**
   * Tags nodes as filtered by appending a marker to their {@code class} attribute.
   *
   * <p>In lenient mode only {@code node} itself receives {@code FILTERED_LENIENT_MARKER}, unless
   * {@code isFilteredLenient} already holds for it. Otherwise every node visited by traversing
   * {@code node} receives {@code FILTERED_MARKER}, unless {@code isFiltered} already holds.
   *
   * @param node node to mark
   * @param lenient whether to apply the lenient marker to the node alone
   */
  static void markFiltered(Node node, final boolean lenient) {
    if (!lenient) {
      node.traverse(
          new NodeVisitor() {
            @Override
            public void head(Node n, int d) {
              if (isFiltered(n)) {
                return;
              }
              String marked = n.attr("class") + " " + FILTERED_MARKER + " ";
              n.attr("class", marked);
            }

            @Override
            public void tail(Node n, int d) {}
          });
      return;
    }

    if (isFilteredLenient(node)) {
      return;
    }
    String marked = node.attr("class") + " " + FILTERED_LENIENT_MARKER + " ";
    node.attr("class", marked);
  }
예제 #10
0
 /**
  * Extracts the first {@code nodeMarker} match from the node's {@code class} attribute.
  *
  * @param node node to inspect, may be null
  * @return the matched text, or null when the node is null, has an empty class attribute, or
  *     the attribute contains no match
  */
 static String classId(Node node) {
   if (node == null) {
     return null;
   }

   String classAttr = node.attr("class");
   if (CommonUtil.isEmpty(classAttr)) {
     return null;
   }

   Matcher marker = nodeMarker.matcher(classAttr);
   return marker.find() ? marker.group() : null;
 }
예제 #11
0
  /**
   * Renders a node back to markup, leaving out attributes whose key starts with "_" and
   * dropping comments.
   *
   * <p>Elements are rebuilt tag by tag with their children rendered recursively; text nodes
   * yield their whole text; XML declarations without child nodes and all comments yield an empty
   * string; data nodes without child nodes yield their {@code data} attribute. Anything else is
   * rendered via {@code outerHtml()}.
   *
   * @param node node to render
   * @return the cleaned markup
   */
  private static String cleanHtml(final Node node) {
    if (node instanceof Element) {
      final Element element = (Element) node;
      final String tagName = element.tagName();
      final StringBuilder html = new StringBuilder("<").append(tagName);

      for (Attribute attribute : element.attributes()) {
        if (attribute.getKey().startsWith("_")) {
          continue;
        }
        html.append(" ")
            .append(attribute.getKey())
            .append("=\"")
            .append(attribute.getValue())
            .append('"');
      }

      // Self-close only when there are no children and the tag itself is an empty tag.
      if (element.childNodes().isEmpty() && element.tag().isEmpty()) {
        return html.append(" />").toString();
      }

      html.append(">");
      for (Node child : element.childNodes()) {
        html.append(cleanHtml(child));
      }
      return html.append("</").append(tagName).append(">").toString();
    }

    if (node instanceof TextNode) {
      return ((TextNode) node).getWholeText();
    }

    if (node instanceof XmlDeclaration) {
      // HACK
      return node.childNodes().isEmpty() ? "" : node.outerHtml();
    }

    if (node instanceof Comment) {
      // HACK: elide comments for now.
      return "";
    }

    if (node instanceof DataNode && node.childNodes().isEmpty()) {
      // No child nodes are defined but we have to handle content if such exists, example
      // <script language="JavaScript">var a =  { name: "${user.name}"}</script>
      final String content = node.attr("data");
      return Strings.empty(content) ? "" : content;
    }

    return node.outerHtml();
  }
예제 #12
0
 /**
  * Reads the line number stored on a node in its {@code LINE_NUMBER_ATTRIBUTE} attribute.
  *
  * @param node node carrying the attribute
  * @return the attribute value parsed as a decimal integer
  * @throws NumberFormatException if the attribute value is not a parsable integer
  */
 private static int line(Node node) {
   // parseInt yields the primitive directly; valueOf would box and then auto-unbox.
   return Integer.parseInt(node.attr(LINE_NUMBER_ATTRIBUTE));
 }
예제 #13
0
 /**
  * Set an attribute value on this element. If this element already has an attribute with the key,
  * its value is updated; otherwise, a new attribute is added.
  *
  * <p>Delegates to the superclass {@code attr(String, String)} and then returns {@code this}, so
  * calls can be chained on the element.
  *
  * @param attributeKey name of the attribute to set
  * @param attributeValue value to store under that name
  * @return this element
  */
 public Element attr(String attributeKey, String attributeValue) {
   super.attr(attributeKey, attributeValue);
   return this;
 }
  /**
   * Extracts the events listed on the May 2005 month page.
   *
   * <p>For every day number from 1 to 31 the day's container ({@code div#<day>_May_2005} inside
   * the page content) is selected. Each list item below it becomes one event dated with that
   * day. A list item that itself contains a nested list is treated as a news story: every item of
   * the nested list becomes an event, and the story is attached when the item's first child node
   * carries a title and a valid wiki link.
   *
   * @param doc parsed month page
   * @return the extracted events, in document order
   */
  protected ArrayList<Event> parseMonthPage(Document doc) {

    ArrayList<Event> events = new ArrayList<Event>();

    final String contentQuery =
        "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr";
    for (int i = 1; i <= 31; i++) {
      // Build each day's selector from the fixed prefix. Appending to one running query would
      // nest the day selectors inside each other, so no day after the first could match.
      String query = contentQuery + " div#" + i + "_May_2005";
      Elements days = doc.select(query);
      for (Element eachday : days) { // Expected to loop once: the div holds the whole day
        String actualDate = null;
        // The id has the form <day>_May_<year>; convert it to YYYY-MM-DD so that dates can be
        // searched in the database.
        String modifiedDate = eachday.attr("id");
        int firstoccur = modifiedDate.indexOf("_");
        String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9);
        String day = modifiedDate.substring(0, 2).replace('_', ' ').trim();
        if (day.length() == 1) {
          day = "0" + day;
        }
        actualDate = year + "-05-" + day;
        try {
          Date.valueOf(actualDate);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
          continue;
        }

        // The children alternate between the date heading and the event lists (with or without
        // a news story).
        Elements individual = eachday.children();
        for (Element dateplustext : individual) {
          if (dateplustext.tagName().equals("ul")) {
            // Complete news under a given date; each item may or may not be a news story.
            Elements stories = dateplustext.children();
            for (Element li : stories) {
              // Either only <a> tags (no news story) or <a> plus <ul> (news story).
              Elements uls = li.children();
              boolean hasUL = false;
              for (Element ul : uls) {
                // A nested ul means this item is a news story.
                if (ul.tagName().equals("ul")) {
                  hasUL = true;
                  // First child node of the item is the story; attached to each event below.
                  Node storyNode = li.childNode(0);
                  // The nested list holds one li per event.
                  Elements eventsNodes = ul.children();
                  for (Element eventNode : eventsNodes) {
                    Event event = extractDescriptionAndLinks(eventNode);
                    try {
                      event.setDate(Date.valueOf(actualDate));
                    } catch (Exception ex) {
                      ex.printStackTrace();
                      System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                      continue;
                    }
                    // News story
                    if (!storyNode.attr("title").isEmpty() && !storyNode.attr("href").isEmpty()) {
                      if (isValidWikiURL(storyNode.attr("href"))) {
                        Story story = new Story();
                        // story.setName(st.attr("title"));
                        story.setName(getEntityName(storyNode.attr("href")));
                        story.setWikipediaUrl(getEntityURL(storyNode.attr("href")));
                        event.setStory(story);
                      }
                    }
                    events.add(event);
                  }
                }
              }
              if (!hasUL) { // event does not have a story
                Event event = extractDescriptionAndLinks(li);
                try {
                  event.setDate(Date.valueOf(actualDate));
                } catch (Exception ex) {
                  ex.printStackTrace();
                  System.err.println("ERROR: date format is wrong!!!! date = " + actualDate);
                  continue;
                }
                events.add(event);
              }
            }
          }
        }
      }
    }
    return events;
  }
예제 #15
0
 /**
  * Reports whether the node's {@code class} attribute contains {@code HIDDEN_MARKER}.
  *
  * @param node node to inspect
  * @return {@code true} if the marker occurs in the class attribute
  */
 public static boolean isHidden(Node node) {
   final String classAttr = node.attr("class");
   return classAttr.indexOf(HIDDEN_MARKER) >= 0;
 }
예제 #16
0
 /**
  * Reports whether the node's {@code class} attribute contains {@code FILTERED_MARKER}.
  *
  * @param node node to inspect
  * @return {@code true} if the marker occurs in the class attribute
  */
 private static boolean isFiltered(Node node) {
   final String classAttr = node.attr("class");
   return classAttr.indexOf(FILTERED_MARKER) >= 0;
 }
예제 #17
0
 /**
  * Reports whether the node's {@code class} attribute contains {@code FILTERED_MARKER} or
  * {@code FILTERED_LENIENT_MARKER}.
  *
  * @param node node to inspect
  * @return {@code true} if either marker occurs in the class attribute
  */
 public static boolean isFilteredLenient(Node node) {
   final String classAttr = node.attr("class");
   if (classAttr.indexOf(FILTERED_MARKER) >= 0) {
     return true;
   }
   return classAttr.indexOf(FILTERED_LENIENT_MARKER) >= 0;
 }
 /**
  * Reports whether the node is a {@code div} element whose {@code class} attribute contains
  * "v-app".
  *
  * @param node node to inspect
  * @return {@code true} only for an {@code Element} named "div" that has a class attribute
  *     containing "v-app"
  */
 public boolean isMainDiv(Node node) {
   if (!(node instanceof Element)) {
     return false;
   }
   if (!node.nodeName().equals("div")) {
     return false;
   }
   return node.hasAttr("class") && node.attr("class").contains("v-app");
 }