// @todo(dallison) handle rtl text private static void recurseNode(Node node, StringBuilder builder) { // boundry case for text if (node instanceof TextNode) { builder.append(((TextNode) node).text()); builder.append(" "); return; } // check for title text if (node.hasAttr(TITLE)) { builder.append(node.attr(TITLE)); builder.append(" "); } // if the current node has alt text append it if (node.hasAttr(ALT_TEXT)) { builder.append(node.attr(ALT_TEXT)); // Add trailing white space to separate elements builder.append(" "); } // recurse into the child nodes. for (Node child : node.childNodes()) { recurseNode(child, builder); } }
/**
 * Tells whether the node should be treated as "unlikely".
 *
 * <p>A node qualifies when its {@code class} attribute contains "caption" (compared
 * case-insensitively), or when {@code unlikelyPattern} finds a match in its {@code style}
 * or {@code class} attribute.
 *
 * @param e the node to inspect
 * @return {@code true} if any of the checks above matches
 */
boolean unlikely(Node e) {
  String clazz = e.attr("class");
  // Lower-case with Locale.ROOT rather than the default locale: under e.g. a Turkish
  // default locale "CAPTION" lower-cases to "captıon" (dotless i) and the check would miss.
  if (clazz != null && clazz.toLowerCase(Locale.ROOT).contains("caption")) {
    return true;
  }
  String style = e.attr("style");
  return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}
private void checkFormFields(PageCompilingContext pc, Node element) { if (null == pc.form) return; String action = pc.form.attr("action"); // Only look at contextual uris (i.e. hosted by us). // TODO - relative, not starting with '/' if (null == action || (!action.startsWith("/"))) return; final PageBook.Page page = pageBook.get(action); // Only look at pages we actually have registered. if (null == page) { pc.warnings.add( CompileError.in(element.outerHtml()) .near(line(element)) .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION)); return; } // If we're inside a form do a throw-away compile against the target page. if ("input".equals(element.nodeName()) || "textarea".equals(element.nodeName())) { String name = element.attr("name"); // Skip submits and buttons. if (skippable(element.attr("type"))) return; // TODO Skip empty? if (null == name) { pc.warnings.add( CompileError.in(element.outerHtml()) .near(line(element)) .causedBy(CompileErrors.FORM_MISSING_NAME)); return; } // Compile expression path. try { new MvelEvaluatorCompiler(page.pageClass()).compile(name); } catch (ExpressionCompileException e) { // TODO Very hacky, needed to strip out xmlns attribution. pc.warnings.add( CompileError.in(element.outerHtml()) .near(element.siblingIndex()) // TODO - line number .causedBy(CompileErrors.UNRESOLVABLE_FORM_BINDING, e)); } } }
/**
 * Compares a parsed node against a reference description.
 *
 * <p>Resolution order:
 *
 * <ol>
 *   <li>a {@code null} test node never matches;
 *   <li>if the reference has an id, only the id is compared (case-insensitively);
 *   <li>otherwise, if the reference has a name, only the name is compared;
 *   <li>otherwise tag name, type, value, title, role, alt, href and (for elements) the
 *       stripped inner text are compared pairwise, followed by the class names.
 * </ol>
 *
 * <p>With {@code reference.any} set, the first agreeing non-empty property or class yields
 * {@code true}; without it, the first disagreeing one yields {@code false}. If nothing
 * decides the outcome the result is {@code !reference.any}.
 *
 * @param reference the expected properties
 * @param test the node to check, may be {@code null}
 * @return whether {@code test} satisfies {@code reference}
 */
public static boolean matches(HtmlNode reference, Node test) {
  if (test == null) {
    return false;
  }

  // An id, and failing that a name, decides the outcome on its own.
  if (!CommonUtil.isEmpty(reference.id)) {
    return reference.id.equalsIgnoreCase(test.attr("id"));
  }
  if (!CommonUtil.isEmpty(reference.name)) {
    return reference.name.equalsIgnoreCase(test.attr("name"));
  }

  // Each entry is {expected, actual}.
  List<String[]> pairs = new ArrayList<String[]>();
  pairs.add(new String[] {reference.tagName, test.nodeName()});
  pairs.add(new String[] {reference.type, test.attr("type")});
  pairs.add(new String[] {reference.value, test.attr("value")});
  pairs.add(new String[] {reference.title, test.attr("title")});
  pairs.add(new String[] {reference.role, test.attr("role")});
  pairs.add(new String[] {reference.alt, test.attr("alt")});
  pairs.add(new String[] {reference.href, test.attr("href")});
  if (test instanceof Element) {
    String expectedText = CommonUtil.strip(reference.innerText, false);
    String actualText = CommonUtil.strip(((Element) test).text(), false);
    pairs.add(new String[] {expectedText, actualText});
  }

  // Class names are compared as lower-cased, whitespace-separated tokens.
  String[] refTokens = CommonUtil.toString(reference.classes, " ").toLowerCase().split("\\s");
  Collection<String> refClasses = new HashSet<String>(Arrays.asList(refTokens));
  String[] testTokens = test.attr("class").toLowerCase().split("\\s");
  Collection<String> testClasses = new HashSet<String>(Arrays.asList(testTokens));

  final boolean any = reference.any;

  for (String[] pair : pairs) {
    String expected = pair[0];
    if (CommonUtil.isEmpty(expected)) {
      continue; // property not constrained by the reference
    }
    boolean same = expected.equalsIgnoreCase(pair[1]);
    if (any && same) {
      return true;
    }
    if (!any && !same) {
      return false;
    }
  }

  if (!refClasses.isEmpty()) {
    for (String testClass : testClasses) {
      boolean known = refClasses.contains(testClass);
      if (any && known) {
        return true;
      }
      if (!any && !known) {
        return false;
      }
    }
  }

  return !any;
}
private void checkUriConsistency(PageCompilingContext pc, Node element) { String uriAttrib = element.attr("action"); if (null == uriAttrib) uriAttrib = element.attr("src"); if (null == uriAttrib) uriAttrib = element.attr("href"); if (null != uriAttrib) { // Verify that such a uri exists in the page book, // only if it is contextual--ignore abs & relative URIs. if (uriAttrib.startsWith("/")) if (null == pageBook.nonCompilingGet(uriAttrib)) pc.warnings.add( CompileError.in(element.outerHtml()) .near(element.siblingIndex()) // TODO - line number .causedBy(CompileErrors.UNRESOLVABLE_FORM_ACTION, uriAttrib)); } }
/** Called to push a new lexical scope onto the stack. */ private boolean lexicalClimb(PageCompilingContext pc, Node node) { if (node.attr(ANNOTATION).length() > 1) { // Setup a new lexical scope (symbol table changes on each scope encountered). if (REPEAT_WIDGET.equalsIgnoreCase(node.attr(ANNOTATION_KEY)) || CHOOSE_WIDGET.equalsIgnoreCase(node.attr(ANNOTATION_KEY))) { String[] keyAndContent = {node.attr(ANNOTATION_KEY), node.attr(ANNOTATION_CONTENT)}; pc.lexicalScopes.push(new MvelEvaluatorCompiler(parseRepeatScope(pc, keyAndContent, node))); return true; } // Setup a new lexical scope for compiling against embedded pages (closures). final PageBook.Page embed = pageBook.forName(node.attr(ANNOTATION_KEY)); if (null != embed) { final Class<?> embedClass = embed.pageClass(); MvelEvaluatorCompiler compiler = new MvelEvaluatorCompiler(embedClass); checkEmbedAgainst( pc, compiler, Parsing.toBindMap(node.attr(ANNOTATION_CONTENT)), embedClass, node); pc.lexicalScopes.push(compiler); return true; } } return false; }
/**
 * Removes everything matched by {@code hiddenMarker} from the {@code class} attribute of
 * {@code node} and of each of its ancestors, walking up until there is no parent left.
 *
 * <p>Whenever a visited node is a {@code select}, the same clean-up is also applied to
 * every node in its subtree.
 *
 * @param node the node to start from; {@code null} is a no-op
 */
static void markVisible(Node node) {
  for (Node current = node; current != null; current = current.parent()) {
    if (current.nodeName().equals("select")) {
      // A select is cleaned together with everything below it.
      current.traverse(
          new NodeVisitor() {
            @Override
            public void head(Node n, int d) {
              n.attr("class", hiddenMarker.matcher(n.attr("class")).replaceAll(""));
            }

            @Override
            public void tail(Node n, int d) {}
          });
    }
    String cleaned = hiddenMarker.matcher(current.attr("class")).replaceAll("");
    current.attr("class", cleaned);
  }
}
// hit when the node is first seen public void head(Node node, int depth) { String name = node.nodeName(); if (name.equals("li")) append('\n'); else if (node.toString().startsWith("<select")) { append1("[SELECT]"); append(tab); } else if (node.outerHtml().startsWith("<option")) { // append1(node.attr("value")+":");append1(" "); TextNodeVisitor textVisitor = new TextNodeVisitor(); node.traverse(textVisitor); append1("{" + textVisitor.toString() + "}"); } else if (node.outerHtml().startsWith("<input")) { if (node.attr("type").equals("input")) append1("[INPUT]" + node.attr("maxLength")); } else if (node.outerHtml().startsWith("<span")) { TextNodeVisitor textVisitor = new TextNodeVisitor(); node.traverse(textVisitor); append1(":" + textVisitor.toString() + " "); } }
/**
 * Tags nodes as filtered by appending a marker to their {@code class} attribute.
 *
 * <p>In lenient mode only {@code node} itself receives {@code FILTERED_LENIENT_MARKER},
 * and only if {@link #isFilteredLenient(Node)} does not already hold for it. Otherwise
 * every node in the subtree rooted at {@code node} that is not yet filtered receives
 * {@code FILTERED_MARKER}.
 *
 * @param node the node (or subtree root) to mark
 * @param lenient whether to apply the lenient, single-node marking
 */
static void markFiltered(Node node, final boolean lenient) {
  if (!lenient) {
    node.traverse(
        new NodeVisitor() {
          @Override
          public void head(Node n, int d) {
            if (isFiltered(n)) {
              return;
            }
            n.attr("class", n.attr("class") + " " + FILTERED_MARKER + " ");
          }

          @Override
          public void tail(Node n, int d) {}
        });
    return;
  }

  if (isFilteredLenient(node)) {
    return;
  }
  node.attr("class", node.attr("class") + " " + FILTERED_LENIENT_MARKER + " ");
}
/**
 * Extracts the first substring of the node's {@code class} attribute that matches
 * {@code nodeMarker}.
 *
 * @param node the node to inspect, may be {@code null}
 * @return the matched text, or {@code null} when the node is {@code null}, its class
 *     attribute is empty, or the pattern does not match
 */
static String classId(Node node) {
  if (node == null) {
    return null;
  }
  String className = node.attr("class");
  if (CommonUtil.isEmpty(className)) {
    return null;
  }
  Matcher marker = nodeMarker.matcher(className);
  return marker.find() ? marker.group(0) : null;
}
private static String cleanHtml(final Node node) { if (node instanceof Element) { Element element = ((Element) node); StringBuilder accum = new StringBuilder(); accum.append("<").append(element.tagName()); for (Attribute attribute : element.attributes()) { if (!(attribute.getKey().startsWith("_"))) { accum.append(" "); accum.append(attribute.getKey()); accum.append("=\""); accum.append(attribute.getValue()); accum.append('"'); } } if (element.childNodes().isEmpty() && element.tag().isEmpty()) { accum.append(" />"); } else { accum.append(">"); for (Node child : element.childNodes()) accum.append(cleanHtml(child)); accum.append("</").append(element.tagName()).append(">"); } return accum.toString(); } else if (node instanceof TextNode) { return ((TextNode) node).getWholeText(); } else if (node instanceof XmlDeclaration) { // HACK if (node.childNodes().isEmpty()) { return ""; } return node.outerHtml(); } else if (node instanceof Comment) { // HACK: elide comments for now. return ""; } else if (node instanceof DataNode && node.childNodes().isEmpty()) { // No child nodes are defined but we have to handle content if such exists, example // <script language="JavaScript">var a = { name: "${user.name}"}</script> String content = node.attr("data"); if (Strings.empty(content)) { return ""; } return content; } else { return node.outerHtml(); } }
/**
 * Reads the line number stored on a node in its {@code LINE_NUMBER_ATTRIBUTE} attribute.
 *
 * @param node the node to read from
 * @return the attribute value parsed as a decimal integer
 * @throws NumberFormatException if the attribute value is not a parsable integer
 */
private static int line(Node node) {
  String raw = node.attr(LINE_NUMBER_ATTRIBUTE);
  return Integer.parseInt(raw);
}
/**
 * Sets an attribute value on this element by delegating to the superclass implementation,
 * then returns this element so that calls can be chained.
 *
 * @param attributeKey the attribute key
 * @param attributeValue the attribute value
 * @return this element
 */
public Element attr(String attributeKey, String attributeValue) {
  super.attr(attributeKey, attributeValue);
  return this;
}
protected ArrayList<Event> parseMonthPage(Document doc) { ArrayList<Event> events = new ArrayList<Event>(); String query = "div#content.mw-body div#bodyContent div#mw-content-text.mw-content-ltr"; for (int i = 1; i <= 31; i++) { query = query + " div#" + i + "_May_2005"; Elements days = doc.select(query); for (Element eachday : days) { // This will loop only once because it is the whole text String actualDate = null; String modifiedDate = eachday.attr( "id"); // This is essential to do because wikipedia present dates in weird manner // and if we want to faciliate search using dates in our database then they // should be present in this format YYYY-MM-DD int firstoccur = modifiedDate.indexOf("_"); String year = modifiedDate.substring(firstoccur + 5, firstoccur + 9); String day = modifiedDate.substring(0, 2).replace('_', ' ').trim(); if (day.length() == 1) { day = "0" + day; } actualDate = year + "-05-" + day; try { Date.valueOf(actualDate); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } Elements individual = eachday.children(); for (Element dateplustext : individual) { // This consists of alternate date and events (with or withour newsStory) if (dateplustext.tagName().equals("ul")) { // Complete news under a given date Elements stories = dateplustext .children(); // This contains different stories (newsStory may be present or // not) for (Element li : stories) { Elements uls = li .children(); // These are either <a> tags if it doesn't have a newsStory or it // is <a> and <ul> tag if it contains a newsStory boolean hasUL = false; for (Element ul : uls) { if (ul.tagName() .equals("ul")) { // If li has ul then it implies that it contains a news story hasUL = true; // news story is there Node storyNode = li.childNode( 0); // this the story .. 
it is used later at the end for each event Elements eventsNodes = ul .children(); // Now we get inside the ul element which containd different // li elements for (Element eventNode : eventsNodes) { // Here we are picking one li Event event = extractDescriptionAndLinks(eventNode); try { event.setDate(Date.valueOf(actualDate)); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } // News story if (!storyNode.attr("title").isEmpty() && !storyNode.attr("href").isEmpty()) { if (isValidWikiURL(storyNode.attr("href"))) { Story story = new Story(); // story.setName(st.attr("title")); story.setName(getEntityName(storyNode.attr("href"))); story.setWikipediaUrl(getEntityURL(storyNode.attr("href"))); event.setStory(story); } } events.add(event); } } } if (!hasUL) { // event does not have a story Event event = extractDescriptionAndLinks(li); try { event.setDate(Date.valueOf(actualDate)); } catch (Exception ex) { ex.printStackTrace(); System.err.println("ERROR: date format is wrong!!!! date = " + actualDate); continue; } events.add(event); } } } } } } return events; }
/**
 * Tells whether the node's {@code class} attribute contains {@code HIDDEN_MARKER}.
 *
 * @param node the node to inspect
 * @return {@code true} if the marker occurs in the class attribute
 */
public static boolean isHidden(Node node) {
  String classes = node.attr("class");
  return classes.indexOf(HIDDEN_MARKER) >= 0;
}
/**
 * Tells whether the node's {@code class} attribute contains {@code FILTERED_MARKER}.
 *
 * @param node the node to inspect
 * @return {@code true} if the marker occurs in the class attribute
 */
private static boolean isFiltered(Node node) {
  String classes = node.attr("class");
  return classes.indexOf(FILTERED_MARKER) >= 0;
}
/**
 * Tells whether the node's {@code class} attribute contains either {@code FILTERED_MARKER}
 * or {@code FILTERED_LENIENT_MARKER}.
 *
 * @param node the node to inspect
 * @return {@code true} if at least one of the two markers occurs in the class attribute
 */
public static boolean isFilteredLenient(Node node) {
  String classes = node.attr("class");
  if (classes.indexOf(FILTERED_MARKER) >= 0) {
    return true;
  }
  return classes.indexOf(FILTERED_LENIENT_MARKER) >= 0;
}
/**
 * Tells whether the node is an {@link Element} named {@code div} whose {@code class}
 * attribute is present and contains "v-app".
 *
 * @param node the node to inspect
 * @return {@code true} only if all of the conditions above hold
 */
public boolean isMainDiv(Node node) {
  if (!(node instanceof Element)) {
    return false;
  }
  if (!node.nodeName().equals("div")) {
    return false;
  }
  if (!node.hasAttr("class")) {
    return false;
  }
  return node.attr("class").contains("v-app");
}