public void testCharsetWithTEXT() throws Exception { String text = "<html>" + "<head>" + "</head>" + "<body>" + "<h1>This is a HTML file for testing!</h1>" + "</body>" + "</html>"; HTMLDocument document = HTMLParser.createDocument(text); assertNotNull(document); NodePath path = NodePathParser.toPath("html.body.h1"); HTMLNode node = NodePathUtil.lookFor(document.getRoot(), path); assertNotNull(node); assertEquals(node.getName(), Name.H1); assertEquals(node.getName().toString(), "H1"); System.out.println("NAME: " + node.getName()); System.out.println("VALUE: " + new String(node.getValue())); System.out.println("TEXTVALUE: " + node.getTextValue()); // assertNull(node.getChildren()); assertNotNull(node.getChildren()); assertEquals(node.getChildren().size(), 1); assertEquals(!node.getChildren().isEmpty(), true); assertEquals(node.getChildren().get(0).getName(), Name.CONTENT); assertEquals(node.getChildren().get(0).getName().toString(), "CONTENT"); HTMLNode child = node.getChildren().get(0); assertNotNull(child); assertNull(child.getChildren()); assertEquals(child.getTextValue(), "This is a HTML file for testing!"); // assertEquals(child.getValue(),"content"); System.out.println("CONTENT-VALUE: " + new String(child.getValue())); assertEquals(child.getTextValue(), new String(child.getValue())); }
/**
 * Fetches {@code http://www.24h.com.vn} twice, once through
 * {@code openConnection().getInputStream()} and once through
 * {@code openStream()}, and parses each response into a document.
 *
 * <p>The test returns without asserting anything when the host cannot be
 * resolved or the connection is refused, so it does not fail on machines
 * without network access.
 *
 * @throws Exception if fetching or parsing fails for any other reason
 */
public void testCharsetWithURL() throws Exception {
  HTMLDocument document;
  try {
    URL url_ = new URL("http://www.24h.com.vn");

    // First fetch: explicit URLConnection. The stream is closed in finally
    // so the socket is released even if parsing throws.
    java.io.InputStream connectionStream = url_.openConnection().getInputStream();
    try {
      document = HTMLParser.createDocument(connectionStream, null);
    } finally {
      connectionStream.close();
    }
    // Check this result now; it is overwritten by the second fetch below.
    assertNotNull(document);

    // Second fetch: URL.openStream() shortcut.
    java.io.InputStream urlStream = url_.openStream();
    try {
      document = HTMLParser.createDocument(urlStream, null);
    } finally {
      urlStream.close();
    }
  } catch (java.net.UnknownHostException e) {
    // Host cannot be resolved (typically no network): skip the test.
    return;
  } catch (java.net.ConnectException e) {
    // Connection could not be established: skip the test.
    return;
  }
  assertNotNull(document);
  assertNotNull(document.getRoot());
}
/**
 * Parses the fixture {@code file_} and verifies that
 * {@code HTMLParser.getCharset()} afterwards reports {@code "ASCII"}, then
 * prints details of the document and its doctype.
 *
 * @throws Exception if parsing throws
 */
public void testCharsetWithFile() throws Exception {
  // NOTE(review): the null second argument presumably means "no explicit
  // charset" - confirm against HTMLParser.createDocument.
  HTMLDocument document = HTMLParser.createDocument(this.file_, null);
  assertNotNull(document);

  assertEquals("ASCII", HTMLParser.getCharset());
  // Compare by content: assertNotSame only checks reference identity, which
  // would pass even if the returned charset text were "UTF-8".
  assertEquals(false, "UTF-8".equals(HTMLParser.getCharset()));

  System.out.println("CHARSET: " + HTMLParser.getCharset());
  System.out.println("DOCUMENT-TEXTVALUE: " + document.getTextValue());
  System.out.println("DOCUMENT-ROOT: " + document.getRoot().getName().toString());
  System.out.println("CLASS: " + document.getClass().getName() + "\n");
  System.out.println("DOCUMENT-DOCTYPE-TEXTVALUE: " + document.getDoctype().getTextValue());
  System.out.println("DOCUMENT-DOCTYPE-VALUE: " + new String(document.getDoctype().getValue()));
  System.out.println("DOCUMENT-DOCTYPE-NAME: " + document.getDoctype().getName().toString());
}
public void testNode() throws Exception { // assertNotNull(this.file_); System.out.println("FILE PATH: " + this.file_.getAbsolutePath()); // HTMLDocument. String text = "<html>" + "<head>" + "<title>My own HTML file</title>" + "</head>" + "<body>" + "<h2>This is a test exercise for me!</h2>" + "</body>" + "</html>"; HTMLDocument document = HTMLParser.createDocument(text); assertNotNull(document); String pathStr = "html.head.title"; NodePath path = NodePathParser.toPath(pathStr); assertNotNull(path); assertEquals(path.toString(), "HTML[0].HEAD[0].TITLE[0]"); System.out.println("PATH: " + path.toString()); HTMLNode node = NodePathUtil.lookFor(document.getRoot(), path); assertNotNull(node); assertEquals(node.getName(), Name.TITLE); // Add a Tag to HTMLDocument. NodeImpl impl = new NodeImpl("h2 id = \"dds\"".toCharArray(), Name.H2); node.addChild(impl); assertNotNull(node.getChildrenNode().get(1)); assertEquals(node.getChildren().get(1).getName(), Name.H2); System.out.println("THE NEW NODE-NAME: " + node.getChildrenNode().get(1).getName().toString()); System.out.println("THE NEW NODE-VALUE: " + new String(node.getChildren().get(1).getValue())); // Add a Table to HTMLDocument. HTMLDocument doc = HTMLParser.createDocument("<table border='1'><tr></tr></table>"); HTMLNode table = NodePathUtil.lookFor(doc.getRoot(), NodePathParser.toPath("html.body.table")); node.addChild(table); // Remove a Node which is text in format from HTMLDocument. 
System.out.println("\n\nRemove:"); HTMLNode contentNode = NodePathUtil.lookFor(document.getRoot(), NodePathParser.toPath("html.head.title.content")); assertNotNull(contentNode); assertEquals(Name.CONTENT, contentNode.getName()); assertEquals("CONTENT", contentNode.getName().toString()); assertEquals(new String(contentNode.getValue()), contentNode.getTextValue()); System.out.println("NODE-VALUE: " + new String(contentNode.getValue())); System.out.println("NODE-TEXTVALUE: " + contentNode.getTextValue()); assertEquals(true, node.getChildren().remove(contentNode)); // Pass the Node which has removed from HTMLDocument into the <h2> TAG. HTMLNode h2Node = NodePathUtil.lookFor(document.getRoot(), NodePathParser.toPath("html.head.title.h2")); assertNotNull(h2Node); assertEquals(Name.H2, h2Node.getName()); assertEquals("H2", h2Node.getName().toString()); h2Node.addChild(contentNode); // Show all. System.out.println("\n\nShow all the content of HTML file:"); System.out.println(node.getTextValue()); }