/**
 * Feeds the extractor a page located at {@code http://www.example.com/blah/}
 * that mixes links without a leading slash (some with leading whitespace in
 * the attribute value) and links with a leading slash, in anchor, link and
 * script tags.
 *
 * <p>The expected set places the slash-less links under {@code /blah/} and
 * the slash-prefixed links directly under the host.
 */
public void testRelativeLinksWithLeadingSlash() throws IOException {
  final String pageUrl = "http://www.example.com/blah/";

  // Links without a leading slash: expected below the page's directory.
  String branch1Url = "http://www.example.com/blah/branch1/index.html";
  String branch2Url = "http://www.example.com/blah/branch2/index.html";
  // Links with a leading slash: expected directly below the host.
  String journalUrl =
      "http://www.example.com/journals/american_imago/toc/aim60.1.html";
  String cssUrl = "http://www.example.com/css/foo.css";
  String scriptUrl = "http://www.example.com/javascript/bar.js";

  String html =
      "<html><head><title>Test</title></head><body>"
          + "<a href= branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=\" branch2/index.html\">link2</a>"
          + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
          + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
          + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
          + "Number 1, Spring 2003</a>";

  // This fixture is populated but is not passed to the extractor call below.
  MockCachedUrl cachedPage = new MockCachedUrl(pageUrl);
  cachedPage.setContent(html);

  extractor.extractUrls(mau, new StringInputStream(html), ENC, pageUrl, cb);

  Set expected =
      SetUtil.set(branch1Url, branch2Url, journalUrl, cssUrl, scriptUrl);
  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Verifies that {@code extractUrls} rejects a {@code null} callback with an
 * {@link IllegalArgumentException}; the test fails if the call returns
 * normally.
 */
public void testThrowsOnNullCallback() throws IOException {
  try {
    StringInputStream input = new StringInputStream("blah");
    extractor.extractUrls(mau, input, ENC, "http://www.example.com/", null);
    fail("Calling extractUrls with a null callback should have thrown");
  } catch (IllegalArgumentException expected) {
    // Expected: this exception is the outcome under test.
  }
}
/**
 * Verifies that {@code extractUrls} rejects a {@code null} source URL with an
 * {@link IllegalArgumentException}; the test fails if the call returns
 * normally.
 */
public void testThrowsOnNullSourceUrl() throws IOException {
  try {
    extractor.extractUrls(
        mau, new StringInputStream("Blah"), ENC, null, new MyLinkExtractorCallback());
    // The null argument here is the source URL string, so the failure
    // message names it rather than a CachedUrl.
    fail("Calling extractUrls with a null source URL should have thrown");
  } catch (IllegalArgumentException expected) {
    // Expected: this exception is the outcome under test.
  }
}
/**
 * Runs the extractor over {@code source} as if it had been fetched from
 * {@code http://www.example.com} and returns whatever the shared callback
 * collected.
 *
 * <p>A fresh {@code MockArchivalUnit} with a {@code RegexpCssLinkExtractor}
 * registered for {@code text/css} is used for the call, and the shared
 * callback is reset before extraction.
 *
 * @param source the content to parse
 * @return the URLs reported by the callback for this source
 * @throws IOException if the extractor throws it
 */
private Set parseSingleSource(String source) throws IOException {
  final String baseUrl = "http://www.example.com";

  // Local archival unit, distinct from any field of the enclosing class.
  MockArchivalUnit au = new MockArchivalUnit();
  LinkExtractor cssExtractor = new RegexpCssLinkExtractor();
  au.setLinkExtractor("text/css", cssExtractor);

  // This fixture is populated but is not passed to the extractor call below.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl, au);
  cachedPage.setContent(source);

  cb.reset();
  StringInputStream input = new StringInputStream(source);
  extractor.extractUrls(au, input, ENC, baseUrl, cb);
  return cb.getFoundUrls();
}
private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } }
/**
 * Parses an {@code img} tag whose real {@code src} attribute is followed by
 * an {@code alt} attribute whose value is the literal text {@code src}, and
 * expects the image URL to be the only URL reported.
 */
public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
  String imageUrl = "http://www.example.com/link3.html";
  String html =
      "<html><head><title>Test</title></head><body>"
          + "<img src=" + imageUrl + " alt=src>link3</a>";

  // This fixture is populated but is not passed to the extractor call below.
  MockCachedUrl cachedPage = new MockCachedUrl(startUrl);
  cachedPage.setContent(html);

  StringInputStream input = new StringInputStream(html);
  extractor.extractUrls(mau, input, ENC, startUrl, cb);

  Set expected = SetUtil.set(imageUrl);
  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Parses two relative links that share the file name {@code index.html} but
 * sit in different directories, and expects both to be reported as distinct
 * absolute URLs under {@code http://www.example.com}.
 */
public void testRelativeLinksWithSameName() throws IOException {
  final String baseUrl = "http://www.example.com";
  String firstBranch = "http://www.example.com/branch1/index.html";
  String secondBranch = "http://www.example.com/branch2/index.html";

  String html =
      "<html><head><title>Test</title></head><body>"
          + "<a href=branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=branch2/index.html>link2</a>";

  // This fixture is populated but is not passed to the extractor call below.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl);
  cachedPage.setContent(html);

  extractor.extractUrls(mau, new StringInputStream(html), ENC, baseUrl, cb);

  Set expected = SetUtil.set(firstBranch, secondBranch);
  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Builds content from {@code url}, {@code startTag} and {@code endTag} via
 * {@code makeContent}, runs the extractor over it with a fresh callback, and
 * asserts on the URLs that were found.
 *
 * <p>NOTE(review): the {@code au} parameter is not read here; the extractor
 * is invoked with the {@code mau} field instead. Confirm whether that is
 * intentional.
 *
 * @param url the URL embedded in the generated content
 * @param startTag opening markup handed to {@code makeContent}
 * @param endTag closing markup handed to {@code makeContent}
 * @param au not used by this method
 * @param shouldParse {@code true} to expect exactly {@code url} to be found,
 *     {@code false} to expect no URLs at all
 * @throws IOException if the extractor throws it
 */
private void singleTagParse(
    String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
    throws IOException {
  final String baseUrl = "http://www.example.com";
  String content = makeContent(url, startTag, endTag);

  // This fixture is populated but is not passed to the extractor call below.
  MockCachedUrl cachedPage = new MockCachedUrl(baseUrl);
  cachedPage.setContent(content);

  // A private callback, so results from other tests cannot leak in.
  MyLinkExtractorCallback callback = new MyLinkExtractorCallback();
  extractor.extractUrls(mau, new StringInputStream(content), ENC, baseUrl, callback);

  Set expected = shouldParse ? SetUtil.set(url) : SetUtil.set();
  assertEquals("Misparsed: " + content, expected, callback.getFoundUrls());
}