@Test public void testRelativeLinkWithLocationUrl() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/relative-urls.html"); // Create FetchedDatum using data String url = "http://olddomain.com/relative-urls.html"; String location = "http://newdomain.com"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); headers.add(HttpHeaderNames.CONTENT_LOCATION, location); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlink is correct. Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(2, outlinks.length); Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl()); Assert.assertEquals("link1", outlinks[0].getAnchor()); // TODO KKr - reenable this test when Tika changes are submitted: // Assert.assertEquals("nofollow", outlinks[0].getRelAttributes()); Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl()); Assert.assertEquals("link2", outlinks[1].getAnchor()); }
@Test public void testLanguageDetectionHttpEquiv() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/lang-http-equiv.html"); // Create FetchedDatum using data String url = "http://domain.com/lang-dc.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en"); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify content is correct Assert.assertEquals("SimpleHttpEquiv", parsedDatum.getTitle()); compareTermsInStrings("SimpleHttpEquiv Content", parsedDatum.getParsedText()); Assert.assertEquals("ja", parsedDatum.getLanguage()); }
@Test public void testRelativeLinkWithBaseUrl() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/base-url.html"); // Create FetchedDatum using data String url = "http://olddomain.com/base-url.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlink is correct. Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(2, outlinks.length); // TODO KKr - reenable this test when Tika parser calls my handler with // the <base> element, which is needed to correctly resolve relative links. // Assert.assertEquals("http://newdomain.com/link", outlinks[0].getToUrl()); Assert.assertEquals("link1", outlinks[0].getAnchor()); Assert.assertEquals("http://domain.com/link", outlinks[1].getToUrl()); Assert.assertEquals("link2", outlinks[1].getAnchor()); }
@Test public void testHtmlWithTags() throws Exception { final String htmlText = "<html><head><title>Title</title></head>" + "<body><p>this is a test</p></body></html>"; // Create FetchedDatum using data String url = "http://domain.com/page.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(htmlText.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(new ParserPolicy(), true); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Now take the resulting HTML, process it using Dom4J SAXReader reader = new SAXReader(new Parser()); reader.setEncoding("UTF-8"); String htmlWithMarkup = parsedDatum.getParsedText(); Document doc = reader.read(new StringInputStream(htmlWithMarkup)); // We have to do helicopter stunts since HTML has a global namespace on it, set // at the <html> element level. XPath xpath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p"); Map<String, String> namespaceUris = new HashMap<String, String>(); namespaceUris.put("xhtml", "http://www.w3.org/1999/xhtml"); xpath.setNamespaceURIs(namespaceUris); Node paragraphNode = xpath.selectSingleNode(doc); Assert.assertNotNull(paragraphNode); Assert.assertEquals("this is a test", paragraphNode.getText()); }
@Test public void testExtractingObjectTag() throws Exception { final String html = "<html><head><title>Title</title></head>" + "<body><object data=\"http://domain.com/song.mid\" /></body></html>"; // Create FetchedDatum using data String url = "http://domain.com/music.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse ParserPolicy policy = new ParserPolicy( ParserPolicy.NO_MAX_PARSE_DURATION, BaseLinkExtractor.ALL_LINK_TAGS, BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES); SimpleParser parser = new SimpleParser(new SimpleContentExtractor(), new SimpleLinkExtractor(), policy, true); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlinks are correct Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(1, outlinks.length); Assert.assertEquals("http://domain.com/song.mid", outlinks[0].getToUrl()); }
@Test public void testDefaultLinkTypes() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/all-link-types.html"); // Create FetchedDatum using data String url = "http://domain.com/all-link-types.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse SimpleParser parser = new SimpleParser(); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlinks are correct (and we only get the a href ones). Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(2, outlinks.length); Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl()); Assert.assertEquals("link1", outlinks[0].getAnchor()); Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl()); Assert.assertEquals("link2", outlinks[1].getAnchor()); }
@SuppressWarnings("serial") @Test public void testSomeLinkTypes() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/all-link-types.html"); // Create FetchedDatum using data String url = "http://domain.com/all-link-types.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse Set<String> linkTags = new HashSet<String>() { { add("a"); add("img"); add("link"); } }; Set<String> linkAttributeTypes = new HashSet<String>() { { add("href"); add("src"); } }; ParserPolicy policy = new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION, linkTags, linkAttributeTypes); SimpleParser parser = new SimpleParser(policy); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlinks are correct (and we only get the a href ones). Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(4, outlinks.length); Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl()); Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl()); Assert.assertEquals("link1", outlinks[1].getAnchor()); Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl()); Assert.assertEquals("link2", outlinks[2].getAnchor()); Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl()); }
@Test public void testLinkExtractorWithMetaTags() throws Exception { String html = readFromFile("parser-files/meta-nofollow.html"); String url = "http://domain.com/meta-nofollow.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE); SimpleParser parser = new SimpleParser(policy); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify we got no URLs Assert.assertEquals(0, parsedDatum.getOutlinks().length); }
@Test public void testAllLinkTypes() throws Exception { // Read in test data from test/resources String html = readFromFile("parser-files/all-link-types.html"); // Create FetchedDatum using data String url = "http://domain.com/all-link-types.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); // Call parser.parse ParserPolicy policy = new ParserPolicy( ParserPolicy.DEFAULT_MAX_PARSE_DURATION, BaseLinkExtractor.ALL_LINK_TAGS, BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES); SimpleParser parser = new SimpleParser(policy); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify outlinks are correct (and we only get the a href ones). Outlink[] outlinks = parsedDatum.getOutlinks(); Assert.assertEquals(7, outlinks.length); Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl()); Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl()); Assert.assertEquals("link1", outlinks[1].getAnchor()); Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl()); Assert.assertEquals("link2", outlinks[2].getAnchor()); Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl()); Assert.assertEquals( "http://en.wikipedia.org/wiki/Australia's_Big_Things", outlinks[4].getToUrl()); Assert.assertEquals("http://newdomain.com/giant-dog.jpg", outlinks[5].getToUrl()); Assert.assertEquals( "http://www.brucelawson.co.uk/index.php/2005/stupid-stock-photography/", outlinks[6].getToUrl()); }
@SuppressWarnings("serial") @Test public void testCustomContentExtractor() throws Exception { String html = readFromFile("parser-files/simple-content.html"); String url = "http://domain.com/simple-content.html"; String contentType = "text/html; charset=utf-8"; HttpHeaders headers = new HttpHeaders(); headers.add(HttpHeaderNames.CONTENT_TYPE, contentType); ContentBytes content = new ContentBytes(html.getBytes("utf-8")); FetchedDatum fetchedDatum = new FetchedDatum(url, url, System.currentTimeMillis(), headers, content, contentType, 0); SimpleParser parser = new SimpleParser( new BaseContentExtractor() { @Override public String getContent() { return "Custom"; } }, new BaseLinkExtractor() { @Override public Outlink[] getLinks() { return new Outlink[0]; } }, new ParserPolicy()); ParsedDatum parsedDatum = parser.parse(fetchedDatum); // Verify content is correct Assert.assertEquals("Simple", parsedDatum.getTitle()); compareTermsInStrings("Custom", parsedDatum.getParsedText()); }