/**
 * Extracts links from a page whose base URL is {@code http://www.example.com/blah/}
 * and which mixes directory-relative hrefs with root-relative ({@code /...}) hrefs
 * and srcs, several of them carrying whitespace before the attribute value.
 *
 * <p>Expects the two {@code branch} links to resolve under {@code /blah/} and the
 * journal, stylesheet and script references to resolve against the host root.
 *
 * @throws IOException if the extractor fails while reading the source
 */
public void testRelativeLinksWithLeadingSlash() throws IOException {
  final String baseUrl = "http://www.example.com/blah/";

  String branch1Url = "http://www.example.com/blah/branch1/index.html";
  String branch2Url = "http://www.example.com/blah/branch2/index.html";
  String journalUrl = "http://www.example.com/journals/american_imago/toc/aim60.1.html";
  String cssUrl = "http://www.example.com/css/foo.css";
  String scriptUrl = "http://www.example.com/javascript/bar.js";

  String html =
      "<html><head><title>Test</title></head><body>"
          + "<a href= branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=\" branch2/index.html\">link2</a>"
          + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
          + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
          + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
          + "Number 1, Spring 2003</a>";

  // NOTE(review): this mock is populated but never handed to the extractor;
  // kept as-is because constructor side effects are not visible here.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl);
  cachedPage.setContent(html);

  extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, cb);

  Set wanted = SetUtil.set(branch1Url, branch2Url, journalUrl, cssUrl, scriptUrl);
  assertEquals(wanted, cb.getFoundUrls());
}
/**
 * Runs the extractor over {@code source} with base URL {@code http://www.example.com},
 * using a fresh {@link MockArchivalUnit} that has a {@link RegexpCssLinkExtractor}
 * registered for {@code text/css}, and returns whatever URLs the shared callback
 * collected.
 *
 * <p>The shared callback is reset before extraction.
 *
 * @param source the document text to parse
 * @return the set of URLs reported by the callback
 * @throws IOException if the extractor fails while reading the source
 */
private Set parseSingleSource(String source) throws IOException {
  final String baseUrl = "http://www.example.com";

  // A dedicated AU (not the fixture's) carrying a CSS link extractor.
  MockArchivalUnit cssAwareAu = new MockArchivalUnit();
  LinkExtractor cssExtractor = new RegexpCssLinkExtractor();
  cssAwareAu.setLinkExtractor("text/css", cssExtractor);

  // NOTE(review): this mock is populated but not otherwise referenced here.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl, cssAwareAu);
  cachedPage.setContent(source);

  cb.reset();
  extractor.extractUrls(cssAwareAu, new StringInputStream(source), ENC, baseUrl, cb);

  Set found = cb.getFoundUrls();
  return found;
}
/**
 * Extracts links from an {@code img} tag whose real {@code src} attribute is
 * followed by an {@code alt} attribute whose value is the literal text {@code src}.
 *
 * <p>Expects exactly the image URL to be reported.
 *
 * @throws IOException if the extractor fails while reading the source
 */
public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
  String imageUrl = "http://www.example.com/link3.html";
  Set wanted = SetUtil.set(imageUrl);

  String html =
      "<html><head><title>Test</title></head><body>"
          + "<img src=" + imageUrl + " alt=src>link3</a>";

  // NOTE(review): this mock is populated but never handed to the extractor.
  MockCachedUrl cachedPage = new MockCachedUrl(startUrl);
  cachedPage.setContent(html);

  extractor.extractUrls(mau, new StringInputStream(html), ENC, startUrl, cb);

  assertEquals(wanted, cb.getFoundUrls());
}
/**
 * Extracts two relative links that share the file name {@code index.html} but
 * live in different directories ({@code branch1/} and {@code branch2/}).
 *
 * <p>Expects both to be reported, each resolved against
 * {@code http://www.example.com}.
 *
 * @throws IOException if the extractor fails while reading the source
 */
public void testRelativeLinksWithSameName() throws IOException {
  final String baseUrl = "http://www.example.com";

  String firstBranch = "http://www.example.com/branch1/index.html";
  String secondBranch = "http://www.example.com/branch2/index.html";

  String html =
      "<html><head><title>Test</title></head><body>"
          + "<a href=branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=branch2/index.html>link2</a>";

  // NOTE(review): this mock is populated but never handed to the extractor.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl);
  cachedPage.setContent(html);

  extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, cb);

  Set wanted = SetUtil.set(firstBranch, secondBranch);
  assertEquals(wanted, cb.getFoundUrls());
}
/**
 * Builds a one-tag document via {@code makeContent(url, startTag, endTag)}, runs
 * the extractor over it with base URL {@code http://www.example.com} and a fresh
 * callback, and asserts on the URLs found.
 *
 * @param url the URL embedded in the generated content
 * @param startTag opening tag text passed to {@code makeContent}
 * @param endTag closing tag text passed to {@code makeContent}
 * @param au NOTE(review): not read by this method; the fixture's {@code mau} is
 *     passed to the extractor instead. Confirm whether that is intentional.
 * @param shouldParse if true, exactly {@code url} must be found; otherwise
 *     nothing must be found
 * @throws IOException if the extractor fails while reading the content
 */
private void singleTagParse(
    String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
    throws IOException {
  final String baseUrl = "http://www.example.com";
  String html = makeContent(url, startTag, endTag);

  // NOTE(review): this mock is populated but never handed to the extractor.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl);
  cachedPage.setContent(html);

  // Use a private callback so results are independent of the shared fixture one.
  MyLinkExtractorCallback collector = new MyLinkExtractorCallback();
  extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, collector);

  Set wanted = shouldParse ? SetUtil.set(url) : SetUtil.set();
  assertEquals("Misparsed: " + html, wanted, collector.getFoundUrls());
}