private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } }
/**
 * Relative links, with and without a leading slash and with leading
 * whitespace inside the attribute value, are expected to be reported as the
 * five absolute URLs below when extracted against the base URL
 * {@code http://www.example.com/blah/}.
 */
public void testRelativeLinksWithLeadingSlash() throws IOException {
  final String baseUrl = "http://www.example.com/blah/";

  Set expected =
      SetUtil.set(
          "http://www.example.com/blah/branch1/index.html",
          "http://www.example.com/blah/branch2/index.html",
          "http://www.example.com/journals/american_imago/toc/aim60.1.html",
          "http://www.example.com/css/foo.css",
          "http://www.example.com/javascript/bar.js");

  StringBuffer html = new StringBuffer();
  html.append("<html><head><title>Test</title></head><body>");
  html.append("<a href= branch1/index.html>link1</a>");
  html.append("Filler, with <b>bold</b> tags and<i>others</i>");
  html.append("<a href=\" branch2/index.html\">link2</a>");
  html.append("<a href =\" /journals/american_imago/toc/aim60.1.html\">");
  html.append("<link rel=\"stylesheet\" href=\"/css/foo.css\" >");
  html.append("<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>");
  html.append("Number 1, Spring 2003</a>");
  String source = html.toString();

  // NOTE(review): this mock is never handed to the extractor; it looks like
  // a leftover from an older API -- confirm before removing.
  MockCachedUrl unusedCu = new MockCachedUrl("http://www.example.com/blah/");
  unusedCu.setContent(source);

  extractor.extractUrls(mau, new StringInputStream(source), ENC, baseUrl, cb);

  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Passing {@code null} as the callback to {@code extractUrls} is expected to
 * raise {@link IllegalArgumentException}.
 */
public void testThrowsOnNullCallback() throws IOException {
  StringInputStream in = new StringInputStream("blah");
  boolean threw = false;
  try {
    extractor.extractUrls(mau, in, ENC, "http://www.example.com/", null);
  } catch (IllegalArgumentException expected) {
    // This is the outcome under test.
    threw = true;
  }
  if (!threw) {
    fail("Calling extractUrls with a null callback should have thrown");
  }
}
/**
 * Passing {@code null} as the source URL to {@code extractUrls} is expected
 * to raise {@link IllegalArgumentException}.
 */
public void testThrowsOnNullSourceUrl() throws IOException {
  try {
    extractor.extractUrls(
        mau, new StringInputStream("Blah"), ENC, null, new MyLinkExtractorCallback());
    // The null argument is the source URL, so the message names it.
    fail("Calling extractUrls with a null source URL should have thrown");
  } catch (IllegalArgumentException expected) {
    // This is the outcome under test.
  }
}
/**
 * Runs the extractor over {@code source} against the base URL
 * {@code http://www.example.com}, using a fresh {@link MockArchivalUnit}
 * that has a {@link RegexpCssLinkExtractor} registered for
 * {@code text/css}, and returns whatever URLs the shared callback collected.
 *
 * <p>The shared callback {@code cb} is reset before extraction.
 *
 * @param source the document text to parse
 * @return the URLs reported to {@code cb} during this call
 */
private Set parseSingleSource(String source) throws IOException {
  final String baseUrl = "http://www.example.com";

  // Local unit on purpose: it carries the CSS extractor registration and is
  // the one passed to extractUrls below.
  MockArchivalUnit mau = new MockArchivalUnit();
  LinkExtractor cssExtractor = new RegexpCssLinkExtractor();
  mau.setLinkExtractor("text/css", cssExtractor);

  // NOTE(review): the cached URL is not passed to the extractor -- confirm
  // whether constructing it against mau is still needed.
  MockCachedUrl cu = new MockCachedUrl("http://www.example.com", mau);
  cu.setContent(source);

  cb.reset();
  StringInputStream in = new StringInputStream(source);
  extractor.extractUrls(mau, in, ENC, baseUrl, cb);
  return cb.getFoundUrls();
}
/**
 * An {@code img} tag whose real {@code src} attribute is followed by
 * {@code alt=src} is expected to yield only the real {@code src} URL.
 */
public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
  final String imageUrl = "http://www.example.com/link3.html";
  final String prefix = "<html><head><title>Test</title></head><body>";
  final String imgTag = "<img src=" + imageUrl + " alt=src>link3</a>";
  String source = prefix + imgTag;

  // NOTE(review): this mock is never handed to the extractor -- confirm
  // before removing.
  MockCachedUrl unusedCu = new MockCachedUrl(startUrl);
  unusedCu.setContent(source);

  StringInputStream in = new StringInputStream(source);
  extractor.extractUrls(mau, in, ENC, startUrl, cb);

  assertEquals(SetUtil.set(imageUrl), cb.getFoundUrls());
}
/**
 * Two relative links that share a file name but sit in different
 * directories are expected to be reported as two distinct absolute URLs.
 */
public void testRelativeLinksWithSameName() throws IOException {
  final String baseUrl = "http://www.example.com";

  Set expected =
      SetUtil.set(
          "http://www.example.com/branch1/index.html",
          "http://www.example.com/branch2/index.html");

  StringBuffer html = new StringBuffer();
  html.append("<html><head><title>Test</title></head><body>");
  html.append("<a href=branch1/index.html>link1</a>");
  html.append("Filler, with <b>bold</b> tags and<i>others</i>");
  html.append("<a href=branch2/index.html>link2</a>");
  String source = html.toString();

  // NOTE(review): this mock is never handed to the extractor -- confirm
  // before removing.
  MockCachedUrl unusedCu = new MockCachedUrl("http://www.example.com");
  unusedCu.setContent(source);

  extractor.extractUrls(mau, new StringInputStream(source), ENC, baseUrl, cb);

  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Builds a document via {@code makeContent(url, startTag, endTag)}, runs the
 * extractor over it against {@code http://www.example.com}, and asserts on
 * the set of URLs found.
 *
 * @param url the URL embedded in the generated content
 * @param startTag passed through to {@code makeContent}
 * @param endTag passed through to {@code makeContent}
 * @param au not read by this method; extraction uses the {@code mau} field.
 *     NOTE(review): possibly intended to be passed to extractUrls -- check
 *     callers before changing.
 * @param shouldParse when true exactly {@code url} must be found; when false
 *     nothing must be found
 */
private void singleTagParse(
    String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
    throws IOException {
  final String baseUrl = "http://www.example.com";
  String content = makeContent(url, startTag, endTag);

  // NOTE(review): this mock is never handed to the extractor -- confirm
  // before removing.
  MockCachedUrl unusedCu = new MockCachedUrl("http://www.example.com");
  unusedCu.setContent(content);

  // A private callback so results from other tests cannot leak in.
  MyLinkExtractorCallback localCb = new MyLinkExtractorCallback();
  extractor.extractUrls(mau, new StringInputStream(content), ENC, baseUrl, localCb);

  Set expected = shouldParse ? SetUtil.set(url) : SetUtil.set();
  assertEquals("Misparsed: " + content, expected, localCb.getFoundUrls());
}
public void testGetAttribute() throws IOException { // no value found assertEquals(null, extractor.getAttributeValue("href", "a bar=foo")); assertEquals(null, extractor.getAttributeValue("href", "a href")); assertEquals(null, extractor.getAttributeValue("href", "a href=")); assertEquals(null, extractor.getAttributeValue("href", "a href= ")); // find proper attribute assertEquals("foo", extractor.getAttributeValue("tag", "a tag=foo tag=bar")); assertEquals("bar", extractor.getAttributeValue("tag", "a ta=foo tag=bar")); assertEquals("bar", extractor.getAttributeValue("tag", "a xy=foo\n tag=bar")); // whitespace assertEquals("foo", extractor.getAttributeValue("href", "a href=foo")); assertEquals("foo", extractor.getAttributeValue("href", "a href =foo")); assertEquals("foo", extractor.getAttributeValue("href", "a href = foo")); assertEquals("foo", extractor.getAttributeValue("href", "a href = foo\n")); assertEquals("foo", extractor.getAttributeValue("href", "a href= foo")); assertEquals("foo", extractor.getAttributeValue("href", "a href\t = \n foo")); // quoted strings & whitespace assertEquals("foo", extractor.getAttributeValue("href", "a href=\"foo\"")); assertEquals("foo", extractor.getAttributeValue("href", "a href=\"foo\"")); assertEquals("fo o", extractor.getAttributeValue("href", "a href =\"fo o\"")); assertEquals("fo'o", extractor.getAttributeValue("href", "a href= \"fo'o\"")); assertEquals("foo", extractor.getAttributeValue("href", "a href =\"foo\"")); assertEquals("foo", extractor.getAttributeValue("href", "a href='foo'")); assertEquals("foo", extractor.getAttributeValue("href", "a href='foo'")); assertEquals("fo o", extractor.getAttributeValue("href", "a href ='fo o'")); assertEquals("fo\"o", extractor.getAttributeValue("href", "a href= 'fo\"o'")); assertEquals("foo", extractor.getAttributeValue("href", "a href ='foo'")); // empty quoted strings assertEquals("", extractor.getAttributeValue("href", "a href=\"\"")); assertEquals("", 
extractor.getAttributeValue("href", "a href=''")); // dangling quoted strings assertEquals("", extractor.getAttributeValue("href", "a href=\"")); assertEquals("xy", extractor.getAttributeValue("href", "a href=\"xy")); assertEquals( "/cgi/reprint/21/1/2.pdf", extractor.getAttributeValue( "href", "a target=\"_self\" href=\"/cgi/reprint/21/1/2.pdf\" onclick=\"cancelLoadPDF()\"")); }