/**
 * Builds the list of news items found in an HTML page.
 *
 * <p>HTML comments are stripped first, the element with class {@code box_news} is located, and
 * every element with class {@code subItem} inside it becomes one news item. The title comes from
 * the {@code subItemtitle} element, the description from the node four siblings after the title,
 * and the URL from the first link matching {@code /index.php.*}, prefixed with {@code URL_PREFIX}.
 *
 * @param content the raw HTML of the page
 * @return the news items in document order
 * @throws Exception if any of the extraction helpers fails
 */
public List<Newsitem> parseContent(String content) throws Exception {
  String withoutComments = this.stripHtmlComments(content);
  Tag newsContainer = this.extractTagByClassName(withoutComments, "box_news");
  NodeList itemNodes = this.extractTagsByClassName(newsContainer.toHtml(), "subItem");

  List<Newsitem> result = new ArrayList<Newsitem>();
  for (int index = 0; index < itemNodes.size(); index++) {
    NewsitemImpl item = new NewsitemImpl();
    Tag itemTag = (Tag) itemNodes.elementAt(index);
    String itemHtml = itemTag.toHtml();

    Tag titleTag = this.extractTagByClassName(itemHtml, "subItemtitle");
    item.setTitle(titleTag.toPlainTextString());

    // The description lives four siblings after the title element.
    Node descriptionNode = titleTag;
    for (int hop = 0; hop < 4; hop++) {
      descriptionNode = descriptionNode.getNextSibling();
    }
    String description = descriptionNode.toPlainTextString();
    // Anything outside U+0000..U+00FF is replaced by a space.
    description = description.replaceAll("[^\\u0000-\\u00FF]", " ");
    description = description.replace(" Read More...", "");
    item.setDescription(description.trim());

    Tag firstLink = (Tag) extractLinks(itemHtml, "/index.php.*").elementAt(0);
    item.setUrl(URL_PREFIX + firstLink.getAttribute("href"));

    result.add(item);
  }
  return result;
}
/**
 * Parses {@code <custom><custom>something</custom></custom>} with non-self-nesting custom tags and
 * checks that the three resulting top-level nodes have no parent, while the text inside the
 * second custom tag has that tag as its parent.
 */
public void testParentConnections() throws ParserException {
  String opener = "<custom>";
  String nested = "<custom>something</custom>";
  String closer = "</custom>";
  createParser(opener + nested + closer);
  Tag[] prototypes = {
    new CustomTag(false), new AnotherTag(false),
  };
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(3);

  CustomTag first = (CustomTag) node[0];
  assertStringEquals("first custom tag html", opener + "</custom>", first.toHtml());
  assertNull("first custom tag should have no parent", first.getParent());

  CustomTag second = (CustomTag) node[1];
  assertStringEquals("second custom tag html", nested, second.toHtml());
  assertNull("second custom tag should have no parent", second.getParent());

  Node textChild = second.childAt(0);
  assertType("firstChild", Text.class, textChild);
  Node textParent = textChild.getParent();
  assertNotNull("first child parent should not be null", textParent);
  assertSame("parent and custom tag should be the same", second, textParent);

  Tag trailingEnd = (Tag) node[2];
  assertStringEquals("third custom tag html", closer, trailingEnd.toHtml());
  assertNull("end tag should have no parent", trailingEnd.getParent());
}
/**
 * Creates a row backed by a new {@code TableRow} tag that starts with an empty child list and an
 * end tag named after the row tag (lower-cased, prefixed with "/").
 */
public Row() {
  rowNode = (TableRow) newTag(TableRow.class);
  rowNode.setChildren(new NodeList());

  Tag closingTag = new TableRow();
  String closingName = "/" + rowNode.getTagName().toLowerCase();
  closingTag.setTagName(closingName);
  rowNode.setEndTag(closingTag);
}
/**
 * Extract the object <code>PARAM</code> tags from the child list.
 *
 * <p>Only direct children that are tags named <code>PARAM</code> with a non-empty
 * <code>NAME</code> attribute contribute an entry. The key is the upper-cased name and the value
 * is whatever <code>getAttribute("VALUE")</code> returns for that tag.
 *
 * @return The list of object parameters (keys and values are String objects); empty when there
 *     are no children.
 */
public HashMap createObjectParamsTable() {
  HashMap ret = new HashMap();
  NodeList kids = getChildren();
  if (null == kids) {
    return ret;
  }
  for (int i = 0; i < kids.size(); i++) {
    // Read from the same list that was null-checked and sized above, not from the raw field.
    Node node = kids.elementAt(i);
    if (!(node instanceof Tag)) {
      continue;
    }
    Tag tag = (Tag) node;
    if (!tag.getTagName().equals("PARAM")) {
      continue;
    }
    String paramName = tag.getAttribute("NAME");
    if (null != paramName && 0 != paramName.length()) {
      String paramValue = tag.getAttribute("VALUE");
      ret.put(paramName.toUpperCase(), paramValue);
    }
  }
  return ret;
}
/**
 * Stamps the execution result onto every tag child of the row by setting its {@code class}
 * attribute (double-quoted) to the result's string form. Non-tag children are left alone.
 *
 * @param executionResult the result whose {@code toString()} becomes the class value
 */
private void setExecutionResult(ExecutionResult executionResult) {
  NodeList cells = rowNode.getChildren();
  for (int position = 0; position < cells.size(); position++) {
    Node candidate = cells.elementAt(position);
    if (!(candidate instanceof Tag)) {
      continue;
    }
    ((Tag) candidate).setAttribute("class", executionResult.toString(), '"');
  }
}
/**
 * Tells whether a tag is on the list of welcome tags. For a welcome tag, its attributes are also
 * validated (against the welcome-attribute list) and unwanted ones are removed.
 *
 * @param node The tag node to analyze; it is cast to {@code Tag}
 * @return true if it is a valid tag.
 */
private boolean isTagWelcome(Node node) {
  Tag candidate = (Tag) node;
  boolean welcome = welcomeTags.contains(candidate.getTagName());
  if (welcome) {
    this.checkAndValidateAttributes(candidate, true);
  }
  return welcome;
}
/**
 * Regression test for a report against the freshmeat page (reported by Mazlan Mat) about an
 * anchor without attributes wrapping plain text. As noted by Derrick, this is legal HTML: the
 * anchor must parse as a single tag with exactly one text child.
 */
public void testFreshMeatBug() throws ParserException {
  String markup = "<a>Revision</a>";
  createParser(markup, "http://www.yahoo.com");
  parseAndAssertNodeCount(1);

  assertTrue("Node 0 should be a tag", node[0] instanceof Tag);
  Tag anchor = (Tag) node[0];
  assertEquals("Tag Contents", markup, anchor.toHtml());

  assertEquals("Node 0 should have one child", 1, anchor.getChildren().size());
  assertTrue(
      "The child should be a string node", anchor.getChildren().elementAt(0) instanceof Text);

  Text onlyChild = (Text) anchor.getChildren().elementAt(0);
  assertEquals("Text Contents", "Revision", onlyChild.getText());
}
/**
 * Accept tags that carry the configured attribute and, when a value prefix is configured, whose
 * attribute value starts with that prefix.
 *
 * @param node The node to check.
 * @return <code>true</code> if the node is a tag with the attribute (and a value starting with
 *     the configured prefix, if one is set), <code>false</code> otherwise.
 */
public boolean accept(Node node) {
  if (!(node instanceof Tag)) {
    return false;
  }
  Attribute attribute = ((Tag) node).getAttributeEx(mAttribute);
  if (null == attribute) {
    return false;
  }
  if (null == mValue) {
    // No prefix configured: presence of the attribute is enough.
    return true;
  }
  // An attribute can exist without a value; it cannot match a prefix, and dereferencing the
  // missing value would throw a NullPointerException.
  String value = attribute.getValue();
  return null != value && value.startsWith(mValue);
}
/**
 * Parses an anchor that wraps an image followed by empty font and bold tags, with LinkTag and
 * ImageTag registered as prototypes, and checks: the link URL, the empty link text, that the link
 * holds five child nodes (image, font, b, /b, /font), and the image's location, height, width,
 * border and alt attributes.
 *
 * <p>NOTE(review): the expected "font ..." text literal straddles a physical line break in this
 * file - confirm the character at that break against the original test data.
 */
public void testLinkDataContents() throws ParserException { createParser( "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>", "http://transfer.go.com"); parser.setNodeFactory( new PrototypicalNodeFactory( new Tag[] { new LinkTag(), new ImageTag(), })); parseAndAssertNodeCount(1); assertTrue("Node 0 should be a link tag", node[0] instanceof LinkTag); LinkTag linkTag = (LinkTag) node[0]; assertEquals( "Link URL", "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689", linkTag.getLink()); assertEquals("Link Text", "", linkTag.getLinkText()); Node[] containedNodes = new Node[10]; int i = 0; for (SimpleNodeIterator e = linkTag.children(); e.hasMoreNodes(); ) { containedNodes[i++] = e.nextNode(); } assertEquals("There should be 5 contained nodes in the link tag", 5, i); assertTrue( "First contained node should be an image tag", containedNodes[0] instanceof ImageTag); ImageTag imageTag = (ImageTag) containedNodes[0]; assertEquals( "Image Location", "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif", imageTag.getImageURL()); assertEquals("Image Height", "60", imageTag.getAttribute("HEIGHT")); assertEquals("Image Width", "468", imageTag.getAttribute("WIDTH")); assertEquals("Image Border", "0", imageTag.getAttribute("BORDER")); assertEquals( "Image Alt", "See Signs in Theaters 8-2 - Starring Mel Gibson", imageTag.getAttribute("ALT")); assertTrue("Second contained node should be Tag", containedNodes[1] instanceof Tag); Tag tag1 = (Tag) containedNodes[1]; assertEquals( "Tag Contents", "font 
face=\"verdana,arial,helvetica\" SIZE=\"1\"", tag1.getText()); assertTrue("Third contained node should be Tag", containedNodes[2] instanceof Tag); Tag tag2 = (Tag) containedNodes[2]; assertEquals("Tag Contents", "b", tag2.getText()); assertTrue("Fourth contained node should be a Tag", containedNodes[3] instanceof Tag); Tag tag = (Tag) containedNodes[3]; assertTrue("Fourth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fourth Tag contents", "/b", tag.getText()); assertTrue("Fifth contained node should be a Tag", containedNodes[4] instanceof Tag); tag = (Tag) containedNodes[4]; assertTrue("Fifth contained node should be an EndTag", tag.isEndTag()); assertEquals("Fifth Tag contents", "/font", tag.getText()); }
/** * Look up the index of an attribute by Namespace name. * * @param uri The Namespace URI, or the empty string if the name has no Namespace URI. * @param localName The attribute's local name. * @return The index of the attribute, or -1 if it does not appear in the list. */ public int getIndex(String uri, String localName) { Vector attributes; int size; Attribute attribute; String string; int ret; ret = -1; attributes = mTag.getAttributesEx(); if (null != attributes) { size = attributes.size(); for (int i = 1; i < size; i++) { attribute = (Attribute) attributes.elementAt(i); string = attribute.getName(); if (null != string) // not whitespace { mSupport.processName(string, mParts, true); if (uri.equals(mParts[0]) & localName.equalsIgnoreCase(mParts[1])) { ret = i; i = size; // exit fast } } } } return (ret); }
/**
 * Tallies a visited tag: for every entry of {@code tagsToBeFound} whose name matches the tag's
 * name (ignoring case), the matching counter is incremented and the tag is stored in the matching
 * collection.
 *
 * @param tag the tag being visited
 */
public void visitTag(Tag tag) {
  for (int slot = 0; slot < tagsToBeFound.length; slot++) {
    boolean matches = tag.getTagName().equalsIgnoreCase(tagsToBeFound[slot]);
    if (!matches) {
      continue;
    }
    count[slot]++;
    tags[slot].add(tag);
  }
}
/**
 * Tallies a visited end tag, but only when end-tag checking is enabled: for every entry of {@code
 * tagsToBeFound} whose name matches the tag's name (ignoring case), the matching end-tag counter
 * is incremented and the tag is stored in the matching end-tag collection.
 *
 * @param tag the end tag being visited
 */
public void visitEndTag(Tag tag) {
  if (endTagCheck) {
    for (int slot = 0; slot < tagsToBeFound.length; slot++) {
      boolean matches = tag.getTagName().equalsIgnoreCase(tagsToBeFound[slot]);
      if (!matches) {
        continue;
      }
      endTagCount[slot]++;
      endTags[slot].add(tag);
    }
  }
}
/**
 * Test child filtering: a {@code HasChildFilter} wrapping a tag-name filter for "b" must select
 * exactly the link that directly contains the bold tag.
 */
public void testChild() throws ParserException {
  String guts =
      "<body>Now is the <a id=target><b>time</b></a> for all good <a href=http://bongo.com>men</a>..</body>";
  String html = "<html>" + guts + "</html>";
  createParser(html);

  NodeList matches = parser.extractAllNodesThatMatch(new HasChildFilter(new TagNameFilter("b")));
  assertEquals("only one element", 1, matches.size());
  assertType("should be LinkTag", LinkTag.class, matches.elementAt(0));

  LinkTag link = (LinkTag) matches.elementAt(0);
  assertEquals("three children", 3, link.getChildCount());
  assertSuperType("should be TagNode", Tag.class, link.getChildren().elementAt(0));

  Tag bold = (Tag) link.getChildren().elementAt(0);
  assertStringEquals("name", "B", bold.getTagName());
}
/**
 * Look up an attribute's value by index.
 *
 * <p>If the attribute value is a list of tokens (IDREFS, ENTITIES, or NMTOKENS), the tokens will
 * be concatenated into a single string with each token separated by a single space.
 *
 * <p>An attribute that exists but has no value yields the empty string.
 *
 * @param index The attribute index (zero-based).
 * @return The attribute's value as a string, or null if the index is out of range.
 * @see #getLength
 */
public String getValue(int index) {
  Vector attributes = mTag.getAttributesEx();
  // The first vector slot is not exposed as an attribute (HTMLParser keeps the tag name there),
  // so public index i lives in slot i + 1. Reject anything outside 0 .. getLength() - 1
  // instead of throwing or leaking slot 0.
  if (null == attributes || index < 0 || index >= attributes.size() - 1) {
    return null;
  }
  Attribute attribute = (Attribute) attributes.elementAt(index + 1);
  String ret = attribute.getValue();
  return (null == ret) ? "" : ret;
}
/**
 * Look up an attribute's XML qualified (prefixed) name by index.
 *
 * <p>Whitespace entries in the underlying attribute list are reported as <code>"#text"</code>.
 *
 * @param index The attribute index (zero-based).
 * @return The attribute's name as stored on the tag, <code>"#text"</code> for a whitespace
 *     entry, or null if the index is out of range.
 * @see #getLength
 */
public String getQName(int index) {
  Vector attributes = mTag.getAttributesEx();
  // The first vector slot is not exposed as an attribute (HTMLParser keeps the tag name there),
  // so public index i lives in slot i + 1. Reject anything outside 0 .. getLength() - 1
  // instead of throwing or leaking slot 0.
  if (null == attributes || index < 0 || index >= attributes.size() - 1) {
    return null;
  }
  Attribute attribute = (Attribute) attributes.elementAt(index + 1);
  if (attribute.isWhitespace()) {
    return "#text";
  }
  return attribute.getName();
}
/**
 * Mirrors a parsed tag as a DOM element: creates an element with the tag's name, remembers the
 * tag-to-element mapping, copies the attributes over and appends the element to the element
 * resolved for the tag's parent.
 *
 * <p>{@code id}, {@code style} and {@code class} attributes (matched ignoring case) are applied
 * through {@code setId}, {@code processStyle} and {@code setClassName}; any other attribute is
 * copied only when it is non-empty, not whitespace and has a value.
 *
 * @param tag the tag being visited
 */
@Override
public void visitTag(Tag tag) {
  Element e = Document.get().createElement(tag.getTagName());
  map.put(tag, e);

  // HTMLParser's getAttributesEx() puts the tag name itself in the first slot. It must not be
  // treated as an attribute, otherwise a <style> element would be handled as if it were a
  // style="..." attribute with a null value.
  boolean tagNameEntry = true;
  for (Object o : tag.getAttributesEx()) {
    if (tagNameEntry) {
      tagNameEntry = false;
      continue;
    }
    Attribute a = (Attribute) o;
    if ("id".equalsIgnoreCase(a.getName())) {
      e.setId(a.getValue());
    } else if ("style".equalsIgnoreCase(a.getName())) {
      processStyle(e, a.getValue());
    } else if ("class".equalsIgnoreCase(a.getName())) {
      e.setClassName(a.getValue());
    } else if (!a.isEmpty() && !a.isWhitespace() && a.isValued()) {
      e.setAttribute(a.getName(), a.getValue());
    }
  }

  Element parent = getParent(tag.getParent());
  parent.appendChild(e);
}
/**
 * Instantiates a tag of the given class together with a matching end tag.
 *
 * <p>The tag's name is re-set in lower case; the end tag is a second instance of the same class
 * named "/" plus the lower-cased tag name, parented to the tag and registered as its end tag. Any
 * exception is printed to standard error, and whatever was built up to that point (possibly
 * {@code null}) is returned.
 *
 * @param klass the tag class to instantiate
 * @return the new tag, or {@code null} if the first instantiation failed
 */
private Tag newTag(Class<? extends Tag> klass) {
  Tag created = null;
  try {
    created = klass.newInstance();
    String lowerName = created.getTagName().toLowerCase();
    created.setTagName(lowerName);

    Tag closing = klass.newInstance();
    // Re-read the name from the tag rather than reusing lowerName, as the original did.
    closing.setTagName("/" + created.getTagName().toLowerCase());
    closing.setParent(created);
    created.setEndTag(closing);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return created;
}
/**
 * Parses {@code <custom><custom>something</custom></custom>} with non-self-nesting custom tags and
 * checks that three top-level nodes result: an empty custom tag that is rendered with a closing
 * tag, the complete inner custom tag, and the trailing end tag.
 */
public void testCompositeTagWithSelfChildren() throws ParserException {
  String opener = "<custom>";
  String nested = "<custom>something</custom>";
  String closer = "</custom>";
  createParser(opener + nested + closer);
  Tag[] prototypes = {
    new CustomTag(false), new AnotherTag(false),
  };
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(3);

  CustomTag first = (CustomTag) node[0];
  assertEquals("child count", 0, first.getChildCount());
  assertFalse("custom tag should not be xml end tag", first.isEmptyXmlTag());
  assertStringEquals("first custom tag html", opener + "</custom>", first.toHtml());

  CustomTag second = (CustomTag) node[1];
  assertStringEquals("second custom tag html", nested, second.toHtml());

  Tag trailingEnd = (Tag) node[2];
  assertStringEquals("third custom tag html", closer, trailingEnd.toHtml());
}
/**
 * Remembers the visited tag when it is a {@code ScriptTag}; any other tag is ignored.
 *
 * <p>The tag's parent and end-tag state have no influence on the outcome.
 *
 * @param n the tag being visited
 * @see org.htmlparser.visitors.NodeVisitor
 */
public void visitTag(final Tag n) {
  if (n instanceof ScriptTag) {
    this.scriptTag = (ScriptTag) n;
  }
}
/**
 * Given a tag, check its attributes, removing those unwanted or not secure.
 *
 * <p>Entries without a name or without a value are kept as they are. Every other attribute is
 * dropped when it is not welcome (only if {@code checkIfAttributeIsWelcome} is set) or not safe;
 * the checks receive the upper-cased name and the lower-cased value. In attributes that are kept,
 * every {@code &#} sequence in the value is escaped to {@code &amp;#}.
 *
 * @param tag The tag to analyze
 * @param checkIfAttributeIsWelcome true if the attribute name should be matched against the list
 *     of welcome attributes, set in the main configuration file.
 */
private void checkAndValidateAttributes(Tag tag, boolean checkIfAttributeIsWelcome) {
  Vector<Attribute> newAttributes = new Vector<Attribute>();

  for (Iterator<?> iter = tag.getAttributesEx().iterator(); iter.hasNext(); ) {
    Attribute a = (Attribute) iter.next();
    String rawName = a.getName();

    if (rawName == null || a.getValue() == null) {
      newAttributes.add(a);
      continue;
    }

    String name = rawName.toUpperCase();
    String value = a.getValue().toLowerCase();

    if (checkIfAttributeIsWelcome && !this.isAttributeWelcome(name)) {
      continue;
    }
    if (!this.isAttributeSafe(name, value)) {
      continue;
    }

    // Neutralise numeric character references by escaping the ampersand. Replacing "&#" with
    // itself would leave the value unchanged.
    if (a.getValue().indexOf("&#") > -1) {
      a.setValue(a.getValue().replaceAll("&#", "&amp;#"));
    }

    newAttributes.add(a);
  }

  tag.setAttributesEx(newAttributes);
}
/**
 * Runs the input through the lexer node by node, validates the attributes of every tag (without
 * the welcome-attribute check) and reassembles the HTML from the resulting nodes.
 *
 * @param contents The content to verify
 * @return the content, secure.
 * @throws ForumException wrapping any exception raised while lexing or validating
 */
public String ensureAllAttributesAreSafe(String contents) {
  StringBuffer output = new StringBuffer(contents.length());

  try {
    Lexer lexer = new Lexer(contents);
    for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
      if (current instanceof Tag) {
        this.checkAndValidateAttributes((Tag) current, false);
      }
      // Tags and non-tags alike are rendered through toHtml().
      output.append(current.toHtml());
    }
  } catch (Exception e) {
    throw new ForumException("Problems while parsing HTML: " + e, e);
  }

  return output.toString();
}
/**
 * Returns the line on which the wrapped tag ends.
 *
 * <p>Without an end tag this is the start line (starting line number plus one). With an end tag
 * it is the end tag's ending line number; in addition, when the end tag's start and end positions
 * coincide, a warning marker is added on the start line.
 *
 * @return the end line as described above
 */
public int getEndLine() {
  int startLine = arg0.getStartingLineNumber() + 1;
  Tag closing = arg0.getEndTag();
  if (closing == null) {
    return startLine;
  }

  int endLine = closing.getEndingLineNumber();
  boolean zeroWidth = closing.getStartPosition() == closing.getEndPosition();
  if (zeroWidth) {
    String message =
        closing.getTagName().toLowerCase()
            + " is not correctly closed proposed line for closing is line "
            + endLine;
    fEditor.addProblemMarker(message, startLine, IMarker.SEVERITY_WARNING);
  }
  return endLine;
}
/**
 * Scan the tag. This implementation only triggers the tag's semantic action and hands the same
 * tag back; the lexer and the parse stack are not used.
 *
 * @param tag The tag to scan.
 * @param lexer Provides html page access.
 * @param stack The parse stack. May contain pending tags that enclose this tag.
 * @return The tag that was passed in.
 * @throws ParserException declared by the signature; propagated if the semantic action raises it
 */
public Tag scan(Tag tag, Lexer lexer, NodeList stack) throws ParserException {
  tag.doSemanticAction();
  return tag;
}
/**
 * Returns the start line of the wrapped tag.
 *
 * @return the tag's starting line number plus one
 */
public int getStartLine() {
  int parserLine = arg0.getStartingLineNumber();
  return parserLine + 1;
}
/**
 * Returns the identifier of the wrapped tag.
 *
 * @return the string form ({@code toString()}) of the wrapped tag
 */
public String getUniqueID() {
  String id = arg0.toString();
  return id;
}
/**
 * Asserts that a node, cast to {@code Tag}, has the expected tag name.
 *
 * @param message the assertion message
 * @param node the node to inspect; must be a {@code Tag}
 * @param expectedTagName the tag name the node is expected to report
 */
public void assertTagNameShouldBe(String message, Node node, String expectedTagName) {
  String actualTagName = ((Tag) node).getTagName();
  assertStringEquals(message, expectedTagName, actualTagName);
}
/**
 * Return the number of attributes in the list.
 *
 * <p>The count is the size of the tag's attribute vector minus one. Once you know the number of
 * attributes, you can iterate through the list.
 *
 * @return The number of attributes in the list.
 * @see #getURI(int)
 * @see #getLocalName(int)
 * @see #getQName(int)
 * @see #getType(int)
 * @see #getValue(int)
 */
public int getLength() {
  int slots = mTag.getAttributesEx().size();
  return slots - 1;
}
/**
 * Look up an attribute's value by Namespace name.
 *
 * <p>See {@link #getValue(int) getValue(int)} for a description of the possible values. The
 * lookup is delegated to the tag using the local name only; the URI is not consulted.
 *
 * @param uri The Namespace URI, or the empty String if the name has no Namespace URI.
 * @param localName The local name of the attribute.
 * @return The attribute value as a string, or null if the attribute is not in the list.
 */
public String getValue(String uri, String localName) {
  String value = mTag.getAttribute(localName);
  return value;
}