/**
 * Converts an HTML page into a list of news items.
 *
 * <p>HTML comments are stripped from {@code content}, the tag with class name
 * {@code box_news} is looked up, and every tag with class name {@code subItem} found in
 * its HTML yields one item. For each item the title comes from the {@code subItemtitle}
 * tag, the description from the node four siblings after that title tag, and the URL from
 * the first link matching {@code /index.php.*}, prefixed with {@code URL_PREFIX}.
 *
 * @param content the HTML to parse
 * @return the news items, in the order the {@code subItem} tags were returned
 * @throws Exception if any of the lookups or the parsing fails
 */
public List<Newsitem> parseContent(String content) throws Exception {
  List<Newsitem> parsedItems = new ArrayList<Newsitem>();

  String commentFreeHtml = this.stripHtmlComments(content);
  Tag newsContainer = this.extractTagByClassName(commentFreeHtml, "box_news");
  NodeList itemNodes = this.extractTagsByClassName(newsContainer.toHtml(), "subItem");

  for (int index = 0; index < itemNodes.size(); index++) {
    NewsitemImpl item = new NewsitemImpl();
    Tag itemTag = (Tag) itemNodes.elementAt(index);

    // Title: plain text of the dedicated title tag inside the item.
    Tag titleTag = this.extractTagByClassName(itemTag.toHtml(), "subItemtitle");
    item.setTitle(titleTag.toPlainTextString());

    // Description: the node reached by stepping four siblings past the title tag.
    Node descriptionNode = titleTag;
    for (int hop = 0; hop < 4; hop++) {
      descriptionNode = descriptionNode.getNextSibling();
    }
    String rawDescription = descriptionNode.toPlainTextString();
    // Characters outside U+0000..U+00FF become spaces before the trailing link text goes.
    String latinOnly = rawDescription.replaceAll("[^\\u0000-\\u00FF]", " ");
    String withoutReadMore = latinOnly.replace(" Read More...", "");
    item.setDescription(withoutReadMore.trim());

    // URL: href of the first matching link, made absolute with the prefix.
    NodeList matchingLinks = extractLinks(itemTag.toHtml(), "/index.php.*");
    Tag firstLink = (Tag) matchingLinks.elementAt(0);
    item.setUrl(URL_PREFIX + firstLink.getAttribute("href"));

    parsedItems.add(item);
  }

  return parsedItems;
}
/**
 * Verifies parent links when a custom tag appears directly inside another tag of the same
 * kind.
 *
 * <p>The input {@code <custom><custom>something</custom></custom>} is expected to parse
 * into three top-level nodes, none of which has a parent, while the text inside the second
 * tag reports that tag as its parent.
 */
public void testParentConnections() throws ParserException {
  final String openingOnly = "<custom>";
  final String completeTag = "<custom>something</custom>";
  final String closingOnly = "</custom>";

  createParser(openingOnly + completeTag + closingOnly);
  Tag[] prototypes = {
    new CustomTag(false), new AnotherTag(false),
  };
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(3);

  // First node: the unclosed opening tag, rendered with a closing tag appended.
  CustomTag first = (CustomTag) node[0];
  assertStringEquals("first custom tag html", openingOnly + "</custom>", first.toHtml());
  assertNull("first custom tag should have no parent", first.getParent());

  // Second node: the complete tag, also top-level.
  CustomTag second = (CustomTag) node[1];
  assertStringEquals("second custom tag html", completeTag, second.toHtml());
  assertNull("second custom tag should have no parent", second.getParent());

  // Its text child must point back at the very same tag instance.
  Node textChild = second.childAt(0);
  assertType("firstChild", Text.class, textChild);
  Node textParent = textChild.getParent();
  assertNotNull("first child parent should not be null", textParent);
  assertSame("parent and custom tag should be the same", second, textParent);

  // Third node: the stray end tag.
  Tag trailingEnd = (Tag) node[2];
  assertStringEquals("third custom tag html", closingOnly, trailingEnd.toHtml());
  assertNull("end tag should have no parent", trailingEnd.getParent());
}
/**
 * Regression test for an anchor tag without attributes, as seen on the freshmeat page.
 *
 * <p>Reported by Mazlan Mat as a really bad HTML tag: {@code <A>Revision<\a>}. Note from
 * Derrick: this is in fact completely legal HTML.
 *
 * <p>The anchor must parse into a single tag node that renders back to the input and holds
 * exactly one text child with the content {@code Revision}.
 */
public void testFreshMeatBug() throws ParserException {
  final String anchorHtml = "<a>Revision</a>";

  createParser(anchorHtml, "http://www.yahoo.com");
  parseAndAssertNodeCount(1);

  assertTrue("Node 0 should be a tag", node[0] instanceof Tag);
  Tag anchor = (Tag) node[0];
  assertEquals("Tag Contents", anchorHtml, anchor.toHtml());

  // The only child is the text between the opening and closing tag.
  assertEquals("Node 0 should have one child", 1, anchor.getChildren().size());
  assertTrue(
      "The child should be a string node",
      anchor.getChildren().elementAt(0) instanceof Text);
  Text anchorText = (Text) anchor.getChildren().elementAt(0);
  assertEquals("Text Contents", "Revision", anchorText.getText());
}
/**
 * Verifies that a custom composite tag does not swallow a tag of its own kind as a child.
 *
 * <p>The input {@code <custom><custom>something</custom></custom>} is expected to parse
 * into three top-level nodes: an opening tag with no children, the complete tag, and the
 * trailing end tag.
 */
public void testCompositeTagWithSelfChildren() throws ParserException {
  final String openingOnly = "<custom>";
  final String completeTag = "<custom>something</custom>";
  final String closingOnly = "</custom>";

  createParser(openingOnly + completeTag + closingOnly);
  Tag[] prototypes = {
    new CustomTag(false), new AnotherTag(false),
  };
  parser.setNodeFactory(new PrototypicalNodeFactory(prototypes));
  parseAndAssertNodeCount(3);

  // First node: childless, not an empty XML tag, rendered with a closing tag appended.
  CustomTag first = (CustomTag) node[0];
  assertEquals("child count", 0, first.getChildCount());
  assertFalse("custom tag should not be xml end tag", first.isEmptyXmlTag());
  assertStringEquals("first custom tag html", openingOnly + "</custom>", first.toHtml());

  // Second node: the complete tag renders unchanged.
  CustomTag second = (CustomTag) node[1];
  assertStringEquals("second custom tag html", completeTag, second.toHtml());

  // Third node: the stray end tag.
  Tag trailingEnd = (Tag) node[2];
  assertStringEquals("third custom tag html", closingOnly, trailingEnd.toHtml());
}
/**
 * Walks the given markup node by node and re-emits it, passing every tag through
 * {@code checkAndValidateAttributes} before it is written out.
 *
 * <p>Nodes that are not tags are appended unchanged.
 *
 * @param contents the content to verify
 * @return the re-assembled content after all tags have been checked
 * @throws ForumException if anything goes wrong while lexing or validating; the original
 *     exception is kept as the cause
 */
public String ensureAllAttributesAreSafe(String contents) {
  StringBuffer output = new StringBuffer(contents.length());

  try {
    Lexer lexer = new Lexer(contents);

    for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
      // Only tags carry attributes; they are checked before being rendered.
      if (current instanceof Tag) {
        this.checkAndValidateAttributes((Tag) current, false);
      }

      output.append(current.toHtml());
    }
  } catch (Exception e) {
    throw new ForumException("Problems while parsing HTML: " + e, e);
  }

  return output.toString();
}