/**
 * Three participants vote on http://test.com/foo1. participant1 and
 * participant2 each add "content 1" then "content 2"; participant3 adds
 * "content 3" then "content 2". The test then checks which participants
 * {@code getRepairCandidates} returns for the arguments 0, 1, 2 and 3.
 */
public void testHeadNotPopular() throws Exception {
  VersionCounts versionCounts = VersionCounts.make();

  // The hashes returned by addVersion are not needed here, so they are not
  // captured in locals.
  VoteBlock vb1 = makeVoteBlock("http://test.com/foo1");
  addVersion(vb1, "content 1 for foo1");
  addVersion(vb1, "content 2 for foo1");

  VoteBlock vb2 = makeVoteBlock("http://test.com/foo1");
  addVersion(vb2, "content 1 for foo1");
  addVersion(vb2, "content 2 for foo1");

  VoteBlock vb3 = makeVoteBlock("http://test.com/foo1");
  addVersion(vb3, "content 3 for foo1");
  addVersion(vb3, "content 2 for foo1");

  versionCounts.vote(vb1, participant1);
  versionCounts.vote(vb2, participant2);
  versionCounts.vote(vb3, participant3);

  Map<ParticipantUserData, HashResult> repairCandidates;

  // Arguments 0 and 1: every voter is a candidate.
  repairCandidates = versionCounts.getRepairCandidates(0);
  assertSameElements(
      SetUtil.set(participant1, participant2, participant3),
      repairCandidates.keySet());

  repairCandidates = versionCounts.getRepairCandidates(1);
  assertSameElements(
      SetUtil.set(participant1, participant2, participant3),
      repairCandidates.keySet());

  // Argument 2: only the two voters whose first version is "content 1".
  repairCandidates = versionCounts.getRepairCandidates(2);
  assertSameElements(
      SetUtil.set(participant1, participant2),
      repairCandidates.keySet());

  // Argument 3: nobody.
  repairCandidates = versionCounts.getRepairCandidates(3);
  assertEmpty(repairCandidates.keySet());
}
/**
 * Builds a key store from the properties supplied by {@code initProps()} and
 * checks its aliases, two certificate lookups and its type.
 */
public void testDefaults() throws Exception {
  Properties props = initProps();
  KeyStore keyStore = KeyStoreUtil.createKeyStore(props);

  // Collect the alias enumeration into a list so it can be compared as a set.
  List aliasList =
      ListUtil.fromIterator(new EnumerationIterator(keyStore.aliases()));
  assertIsomorphic(SetUtil.set("mykey", "mycert"), SetUtil.theSet(aliasList));

  assertNotNull(keyStore.getCertificate("mycert"));
  assertNull(keyStore.getCertificate("foocert"));
  assertEquals("JCEKS", keyStore.getType());
}
// ensure that scanning continues after a nested parser throws an error // XXX Need to cause an IOException while reading CSS from the stream public void xxxtestDoCrawlStyleError() throws IOException { String url1 = "http://example.com/blah1.html"; String url2 = "http://example.com/blah2.html"; String url3 = "http://example.com/blah3.html"; String source = "<html><head>" + "<style type=\"text/css\">\n" + "<!--\n" + "@import url(\'" + url1 + "\');\n" + // ensure css parser got invoked "foo {bgcolor: #FFFF};" + // and that this causes an error "@import url(\'" + url2 + "\');\n" + // so that this one isn't found "-->\n" + " </style>\n" + "<a href=" + url3 + "></a>" + // and this one is "</head></html>"; assertEquals(SetUtil.set(url1, url3), parseSingleSource(source)); }
/**
 * A URL containing single quotes, placed in a double-quoted href, must be
 * extracted with the single quotes intact.
 */
public void testKeepsSingleQuoteInUrl() throws IOException {
  String quotedUrl = "http://www.example.com/link'with'quotes.html";
  String anchor = "<a href=\"" + quotedUrl + "\">Link</a>";
  String page = "<html><head><title>Test</title></head><body>" + anchor;

  assertEquals(SetUtil.set(quotedUrl), parseSingleSource(page));
}
/**
 * Extracts links from a page whose base URL is http://www.example.com/blah/.
 * Hrefs without a leading slash are expected under /blah/, hrefs and srcs
 * with a leading slash are expected directly under the host.
 */
public void testRelativeLinksWithLeadingSlash() throws IOException {
  String baseUrl = "http://www.example.com/blah/";

  String branch1 = "http://www.example.com/blah/branch1/index.html";
  String branch2 = "http://www.example.com/blah/branch2/index.html";
  String journalToc =
      "http://www.example.com/journals/american_imago/toc/aim60.1.html";
  String stylesheet = "http://www.example.com/css/foo.css";
  String script = "http://www.example.com/javascript/bar.js";

  String page =
      "<html><head><title>Test</title></head><body>"
          + "<a href= branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=\" branch2/index.html\">link2</a>"
          + "<a href =\" /journals/american_imago/toc/aim60.1.html\">"
          + "<link rel=\"stylesheet\" href=\"/css/foo.css\" >"
          + "<script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>"
          + "Number 1, Spring 2003</a>";

  // NOTE(review): this mock is populated but never handed to the extractor
  // below -- confirm whether it is still needed.
  MockCachedUrl mcu = new MockCachedUrl(baseUrl);
  mcu.setContent(page);

  extractor.extractUrls(mau, new StringInputStream(page), ENC, baseUrl, cb);

  Set expected = SetUtil.set(branch1, branch2, journalToc, stylesheet, script);
  assertEquals(expected, cb.getFoundUrls());
}
/**
 * Sets {@code PROP_KEYSTORE_FILE} to a file in a temp dir, checks that
 * {@code createKeyStore} brings that file into existence, then reloads the
 * store from the file and checks aliases, certificate lookups and type.
 */
public void testStore() throws Exception {
  File keyStoreFile = new File(getTempDir(), "test.ks");

  Properties props = initProps();
  props.put(KeyStoreUtil.PROP_KEYSTORE_FILE, keyStoreFile.toString());

  // The file must not exist before creation and must exist afterwards.
  assertFalse(keyStoreFile.exists());
  KeyStore created = KeyStoreUtil.createKeyStore(props);
  assertTrue(keyStoreFile.exists());

  KeyStore reloaded = loadKeyStore(created.getType(), keyStoreFile, PASSWD);
  List aliasList =
      ListUtil.fromIterator(new EnumerationIterator(reloaded.aliases()));
  assertIsomorphic(SetUtil.set("mykey", "mycert"), SetUtil.theSet(aliasList));

  assertNotNull(reloaded.getCertificate("mycert"));
  assertNull(reloaded.getCertificate("foocert"));
  assertEquals("JCEKS", reloaded.getType());
}
public void testMultipleIdenticalVersions() throws Exception { VersionCounts versionCounts = VersionCounts.make(); VoteBlock vb1 = makeVoteBlock("http://test.com/foo1"); byte[] hash1 = addVersion(vb1, "content 1 for foo1"); byte[] hash2 = addVersion(vb1, "content 2 for foo1"); VoteBlock vb2 = makeVoteBlock("http://test.com/foo1"); addVersion(vb2, "content 1 for foo1"); addVersion(vb2, "content 1 for foo1"); addVersion(vb2, "content 1 for foo1"); addVersion(vb2, "content 1 for foo1"); addVersion(vb2, "content 2 for foo1"); VoteBlock vb3 = makeVoteBlock("http://test.com/foo1"); addVersion(vb3, "content 1 for foo1"); addVersion(vb3, "content 2 for foo1"); addVersion(vb3, "content 2 for foo1"); addVersion(vb3, "content 2 for foo1"); addVersion(vb3, "content 2 for foo1"); versionCounts.vote(vb1, participant1); versionCounts.vote(vb2, participant2); versionCounts.vote(vb3, participant3); Map<ParticipantUserData, HashResult> repairCandidates; repairCandidates = versionCounts.getRepairCandidates(2); assertSameElements( SetUtil.set(participant1, participant2, participant3), repairCandidates.keySet()); // With only three candidates, no version should reach a threshold // of 4, unless counting multiples is wrong. repairCandidates = versionCounts.getRepairCandidates(4); assertEmpty(repairCandidates.keySet()); }
public void testResolvesHtmlEntities() throws IOException { String url1 = "http://www.example.com/bioone/?" + "request=get-toc&issn=0044-7447&volume=32&issue=1"; String source = "<html><head><title>Test</title></head><body>" + "<a href=http://www.example.com/bioone/?" + "request=get-toc&issn=0044-7447&volume=32&issue=1>link1</a>"; assertEquals(SetUtil.set(url1), parseSingleSource(source)); // ensure character entities processed before rel url resolution source = "<html><head><title>Test</title></head><body>" + "<base href=http://www.example.com/foo/bar>" + "<a href=../xxx>link1</a>"; assertEquals(SetUtil.set("http://www.example.com/xxx"), parseSingleSource(source)); }
/**
 * An href value with embedded carriage returns must be extracted with the
 * carriage returns removed.
 */
public void testIgnoresCRInUrl() throws IOException {
  String expectedUrl = "http://www.example.com/linkwithspace.html";
  String anchor =
      "<a href=\"http://www.example.com/link\rwith\rspace.html\">Link</a>";
  String page = "<html><head><title>Test</title></head><body>" + anchor;

  assertEquals(SetUtil.set(expectedUrl), parseSingleSource(page));
}
/**
 * A newline between the tag name and the src attribute of an img tag must
 * not prevent the URL from being extracted.
 */
public void testIgnoresNewLineInField() throws IOException {
  String expectedUrl = "http://www.example.com/link.html";
  String image = "<img\nsrc=\"http://www.example.com/link.html\">Link</a>";
  String page = "<html><head><title>Test</title></head><body>" + image;

  assertEquals(SetUtil.set(expectedUrl), parseSingleSource(page));
}
/** A double-quoted href value must be extracted without the quotes. */
public void testParsesFileWithQuotedUrls() throws IOException {
  String expectedUrl = "http://www.example.com/link3.html";
  String anchor = "<a href=\"http://www.example.com/link3.html\">link3</a>";
  String page = "<html><head><title>Test</title></head><body>" + anchor;

  assertEquals(SetUtil.set(expectedUrl), parseSingleSource(page));
}
/** Test that a single link is found. */
public void testOneUrl() throws Exception {
  try {
    Set expectedUrls = SetUtil.set("http://www.foo.com/blah.jpg");
    assertEquals(expectedUrls, extractUrls(constructValidRDF(expectedUrls)));
  } catch (Throwable thrown) {
    // Anything thrown above, assertion failures included, is routed through
    // fail() together with the throwable.
    fail("", thrown);
  }
}
/**
 * Builds content from {@code url}, {@code startTag} and {@code endTag} via
 * {@code makeContent}, runs the extractor over it with base
 * http://www.example.com, and asserts on the URLs reported to a fresh
 * callback.
 *
 * @param url the URL embedded in the generated content
 * @param startTag passed through to {@code makeContent}
 * @param endTag passed through to {@code makeContent}
 * @param au NOTE(review): not read by this method; the extractor is always
 *     called with the {@code mau} field -- confirm this is intended
 * @param shouldParse if true exactly {@code url} must be found, otherwise
 *     nothing must be found
 */
private void singleTagParse(
    String url, String startTag, String endTag, ArchivalUnit au, boolean shouldParse)
    throws IOException {
  String baseUrl = "http://www.example.com";

  MockCachedUrl mcu = new MockCachedUrl(baseUrl);
  String content = makeContent(url, startTag, endTag);
  mcu.setContent(content);

  // A local callback, so results are not mixed with any other test's.
  MyLinkExtractorCallback cb = new MyLinkExtractorCallback();
  extractor.extractUrls(mau, new StringInputStream(content), ENC, baseUrl, cb);

  Set expected = shouldParse ? SetUtil.set(url) : SetUtil.set();
  assertEquals("Misparsed: " + content, expected, cb.getFoundUrls());
}
/**
 * A meta refresh tag must yield its target URL, whether or not a space
 * follows the semicolon in the content attribute.
 */
public void testHttpEquiv() throws IOException {
  String refreshTarget = "http://example.com/blah.html";

  // The two content-attribute variants under test, in the original order.
  String[] contentPrefixes = {"0; url=", "0;url="};

  for (String prefix : contentPrefixes) {
    String page =
        "<html><head>"
            + "<meta http-equiv=\"refresh\" "
            + "content=\"" + prefix + "http://example.com/blah.html\">"
            + "</head></html>";
    assertEquals(SetUtil.set(refreshTarget), parseSingleSource(page));
  }
}
/**
 * Links inside a comment that is closed with the malformed terminator
 * {@code --!>} must be skipped; the link after the comment must be found.
 */
public void testSkipsMalformedComments() throws IOException {
  String expectedUrl = "http://www.example.com/link3.html";

  String commentedOut =
      "<!--<a href=http://www.example.com/link1.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=http://www.example.com/link2.html>link2</a>--!>";
  String liveAnchor = "<a href=http://www.example.com/link3.html>link3</a>";

  String page =
      "<html><head><title>Test</title></head><body>" + commentedOut + liveAnchor;

  assertEquals(SetUtil.set(expectedUrl), parseSingleSource(page));
}
public void testHeadNotAllowed() throws Exception { VersionCounts versionCounts = VersionCounts.make(); VoteBlock vb1 = makeVoteBlock("http://test.com/foo1"); byte[] hash1 = addVersion(vb1, "content 1 for foo1"); byte[] hash2 = addVersion(vb1, "content 2 for foo1"); versionCounts.vote(vb1, participant1); Map<ParticipantUserData, HashResult> repairCandidates; repairCandidates = versionCounts.getRepairCandidates(0); assertSameElements(SetUtil.set(participant1), repairCandidates.keySet()); // Same, but with an excluded version that doesn't matter. repairCandidates = versionCounts.getRepairCandidates(0, SetUtil.set(HashResult.make(hash2))); assertSameElements(SetUtil.set(participant1), repairCandidates.keySet()); // Same, but with an excluded version that does matter repairCandidates = versionCounts.getRepairCandidates(0, SetUtil.set(HashResult.make(hash1))); assertEmpty(repairCandidates); }
/**
 * With the extractor as set up for this test class, URLs that appear only
 * inside javascript: href/src values must not be reported.
 */
public void testDontParseJSByDefault() throws IOException {
  String imageTarget = "http://www.example.com/link1.html";

  String newWindowAnchor =
      "<a href = javascript:newWindow('http://www.example.com/link3.html')</a>";
  String popupAnchor =
      "<a href = javascript:popup('http://www.example.com/link2.html')</a>";
  String popupImage = "<img src = javascript:popup('" + imageTarget + "') </img>";

  String page =
      "<html><head><title>Test</title></head><body>"
          + newWindowAnchor
          + popupAnchor
          + popupImage;

  assertEquals(SetUtil.set(), parseSingleSource(page));
}
/**
 * Wraps two anchors between {@code openScript} and {@code closeScript},
 * follows them with a third anchor, and asserts that only the third
 * anchor's URL is extracted.
 *
 * @param openScript text placed before the two wrapped anchors
 * @param closeScript text placed after the two wrapped anchors
 * @param failMsg message passed to the assertion
 */
private void doScriptSkipTest(String openScript, String closeScript, String failMsg)
    throws IOException {
  String expectedUrl = "http://www.example.com/link3.html";

  String wrapped =
      "<a href=http://www.example.com/link1.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=http://www.example.com/link2.html>link2</a>";
  String trailing = "<a href=http://www.example.com/link3.html>link3</a>";

  String page =
      "<html><head><title>Test</title></head><body>"
          + openScript
          + wrapped
          + closeScript
          + trailing;

  assertEquals(failMsg, SetUtil.set(expectedUrl), parseSingleSource(page));
}
/**
 * participant1 votes "content 1", participant2 votes "content 2", and
 * participant3 votes "content 3" then "content 2". Checks the map and list
 * returned by {@code getSortedRepairCandidatesMap} and
 * {@code getSortedRepairCandidates} for the arguments 2, 1 and 4.
 */
public void testSortedRepairCandidates() throws Exception {
  VersionCounts counts = VersionCounts.make();

  VoteBlock block1 = makeVoteBlock("http://test.com/foo1");
  addVersion(block1, "content 1 for foo1");

  VoteBlock block2 = makeVoteBlock("http://test.com/foo1");
  addVersion(block2, "content 2 for foo1");

  VoteBlock block3 = makeVoteBlock("http://test.com/foo1");
  addVersion(block3, "content 3 for foo1");
  addVersion(block3, "content 2 for foo1");

  counts.vote(block1, participant1);
  counts.vote(block2, participant2);
  counts.vote(block3, participant3);

  Map<Integer, Collection<ParticipantUserData>> byCount;

  // Argument 2: a single key, 2, holding participant2 only.
  byCount = counts.getSortedRepairCandidatesMap(2);
  assertEquals(SetUtil.set(2), byCount.keySet());
  assertSameElements(SetUtil.set(participant2), byCount.get(2));
  assertEquals(ListUtil.list(participant2), counts.getSortedRepairCandidates(2));

  // Argument 1: keys 2 then 1; participant2 under 2, the others under 1.
  byCount = counts.getSortedRepairCandidatesMap(1);
  assertIsomorphic(ListUtil.list(2, 1), byCount.keySet());
  assertSameElements(SetUtil.set(participant2), byCount.get(2));
  assertSameElements(SetUtil.set(participant1, participant3), byCount.get(1));

  // participant2 must come first; the other two may follow in either order.
  List<ParticipantUserData> ordered = counts.getSortedRepairCandidates(1);
  assertTrue(
      "" + ordered,
      (ordered.equals(ListUtil.list(participant2, participant1, participant3))
          || ordered.equals(ListUtil.list(participant2, participant3, participant1))));

  // Argument 4: nothing at all.
  assertEmpty(counts.getSortedRepairCandidatesMap(4));
  assertEmpty(counts.getSortedRepairCandidates(4));
}
/**
 * An img tag whose real src attribute is followed by {@code alt=src} must
 * yield the src URL and nothing else.
 */
public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws IOException {
  String imageUrl = "http://www.example.com/link3.html";
  String image = "<img src=" + imageUrl + " alt=src>link3</a>";
  String page = "<html><head><title>Test</title></head><body>" + image;

  // NOTE(review): this mock is populated but never handed to the extractor
  // below -- confirm whether it is still needed.
  MockCachedUrl mcu = new MockCachedUrl(startUrl);
  mcu.setContent(page);

  extractor.extractUrls(mau, new StringInputStream(page), ENC, startUrl, cb);

  assertEquals(SetUtil.set(imageUrl), cb.getFoundUrls());
}
/**
 * Relative hrefs must be resolved, a fragment ({@code #ref}) must be kept,
 * and an href surrounded by other attributes must still be found.
 */
public void testRelativeLinksLocationTagsAndMultipleKeys() throws IOException {
  String plainLink = "http://www.example.com/link1.html";
  String fragmentLink = "http://www.example.com/link2.html#ref";
  String subdirLink = "http://www.example.com/dir/link3.html";

  String page =
      "<html><head><title>Test</title></head><body>"
          + "<a href=link1.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a blah1=blah href=link2.html#ref blah2=blah>link2</a>"
          + "<a href=dir/link3.html>link3</a>";

  Set expected = SetUtil.set(plainLink, fragmentLink, subdirLink);
  assertEquals(expected, parseSingleSource(page));
}
/**
 * A base tag with an empty href must be ignored: the relative links after
 * it are expected under the same host as the link before it.
 */
public void testIgnoresEmptyHrefInBaseTag() throws IOException {
  String beforeBase = "http://www.example.com/link1.html";
  String afterBaseA = "http://www.example.com/link2.html";
  String afterBaseB = "http://www.example.com/link3.html";

  String page =
      "<html><head><title>Test</title></head><body>"
          + "<a href=link1.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<base href=\"\" blah=blah>"
          + "<a href=link2.html>link2</a>"
          + "<a href=link3.html>link3</a>";

  Set expected = SetUtil.set(beforeBase, afterBaseA, afterBaseB);
  assertEquals(expected, parseSingleSource(page));
}
/** Included to test a chunk of HighWire HTML that we're not parsing correctly */ public void testParseHWPDF() throws IOException { // Properties p = new Properties(); // p.setProperty(GoslingHtmlLinkExtractor.PARAM_PARSE_JS, "true"); // ConfigurationUtil.setCurrentConfigFromProps(p); // extractor = new GoslingHtmlLinkExtractor(); String url = "http://www.example.com/cgi/reprint/21/1/2.pdf"; String source = "<table cellspacing=\"0\" cellpadding=\"10\" width=\"250\" border=\"0\">" + "<tr><td align=center bgcolor=\"#DBDBDB\">\n\n " + "<font face=\"verdana,arial,helvetica,sans-serif\">" + "<strong><font size=+1>Automatic download</font><br>\n " + "<font size=\"-1\">[<a target=\"_self\" href=\"/cgi/reprint/21/1/2.pdf\" " + "onclick=\"cancelLoadPDF()\">Begin manual download</a>]</strong></font>\n"; assertEquals(SetUtil.set(url), parseSingleSource(source)); }
/**
 * Two relative links that share a file name but sit in different
 * directories must both be extracted as distinct URLs.
 */
public void testRelativeLinksWithSameName() throws IOException {
  String baseUrl = "http://www.example.com";

  String firstBranch = "http://www.example.com/branch1/index.html";
  String secondBranch = "http://www.example.com/branch2/index.html";

  String page =
      "<html><head><title>Test</title></head><body>"
          + "<a href=branch1/index.html>link1</a>"
          + "Filler, with <b>bold</b> tags and<i>others</i>"
          + "<a href=branch2/index.html>link2</a>";

  // NOTE(review): this mock is populated but never handed to the extractor
  // below -- confirm whether it is still needed.
  MockCachedUrl mcu = new MockCachedUrl(baseUrl);
  mcu.setContent(page);

  extractor.extractUrls(mau, new StringInputStream(page), ENC, baseUrl, cb);

  assertEquals(SetUtil.set(firstBranch, secondBranch), cb.getFoundUrls());
}
// Relative URLs before a malforned base tag should be extracted, as well // as any absolute URLs after the malformed base tag public void testInterpretsMalformedBaseTag() throws IOException { String url1 = "http://www.example.com/link1.html"; String url2 = "http://www.example2.com/link2.html"; String url3 = "http://www.example2.com/link3.html"; String url4 = "http://www.example3.com/link3.html"; String source = "<html><head><title>Test</title></head><body>" + "<base href=http://www.example.com>" + "<a href=link1.html>link1</a>" + "Filler, with <b>bold</b> tags and<i>others</i>" + "<base href=javascript:www.example2.com>" + "<a href=link2.html>link2</a>" + "<base href=www.example.com>" + "<a href=http://www.example2.com/link3.html>link3</a>" + "<base href=http://www.example3.com>" + "<a href=link3.html>link4</a>"; assertEquals(SetUtil.set(url1, url3, url4), parseSingleSource(source)); }
/**
 * Sets {@code PARAM_PARSE_JS} to "true", installs that configuration,
 * replaces the extractor with a new one, and expects the URLs inside all
 * three javascript: href/src values to be found.
 */
public void testParseJSIfConf() throws IOException {
  Properties props = new Properties();
  props.setProperty(GoslingHtmlLinkExtractor.PARAM_PARSE_JS, "true");
  ConfigurationUtil.setCurrentConfigFromProps(props);
  // Created after the configuration change.
  extractor = new GoslingHtmlLinkExtractor();

  String newWindowTarget = "http://www.example.com/link3.html";
  String popupTarget = "http://www.example.com/link2.html";
  String imageTarget = "http://www.example.com/link1.html";

  String newWindowAnchor =
      "<a href = javascript:newWindow('http://www.example.com/link3.html')</a>";
  String popupAnchor =
      "<a href = javascript:popup('http://www.example.com/link2.html')</a>";
  String popupImage = "<img src = javascript:popup('" + imageTarget + "') </img>";

  String page =
      "<html><head><title>Test</title></head><body>"
          + newWindowAnchor
          + popupAnchor
          + popupImage;

  assertEquals(
      SetUtil.set(newWindowTarget, popupTarget, imageTarget),
      parseSingleSource(page));
}
/**
 * Three unquoted absolute hrefs, separated by filler markup, must all be
 * extracted.
 */
public void testMultipleLinks() throws IOException {
  String first = "http://www.example.com/link1.html";
  String second = "http://www.example.com/link2.html";
  String third = "http://www.example.com/link3.html";

  String firstAnchor = "<a href=" + first + ">link1</a>";
  String filler = "Filler, with <b>bold</b> tags and<i>others</i>";
  String secondAnchor = "<a href=" + second + ">link2</a>";
  String thirdAnchor = "<a href=" + third + ">link3</a>";

  String page =
      "<html><head><title>Test</title></head><body>"
          + firstAnchor
          + filler
          + secondAnchor
          + thirdAnchor;

  Set expected = SetUtil.set(first, second, third);
  assertEquals(expected, parseSingleSource(page));
}
/** A mailto: href must not be reported as a found URL. */
public void testDontParseMailto() throws IOException {
  // The address is a well-formed placeholder so that the href is a real
  // mailto: target; the anchor's start tag is left unterminated, as before.
  String source =
      "<html><head><title>Test</title></head><body>"
          + "<a href = mailto:user@example.com</a>";

  assertEquals(SetUtil.set(), parseSingleSource(source));
}
/**
 * Builds a page whose head contains a style element with four
 * {@code @import} rules and two {@code url(...)} property values, then
 * asserts that all six references are extracted.
 *
 * @param openingStyleTag the opening tag text placed before the CSS
 * @param givenPrefix prepended to each file name inside the CSS
 * @param expectedPrefix prepended to each file name in the expected URLs
 */
protected void performDoCrawlStyle(
    String openingStyleTag, String givenPrefix, String expectedPrefix)
    throws IOException {
  String css1 = "foo1.css";
  String css2 = "foo2.css";
  String css3 = "foo3.css";
  String css4 = "foo4.css";
  String img5 = "img5.gif";
  String img6 = "img6.gif";

  StringBuilder page = new StringBuilder();
  page.append("<html>\n");
  page.append(" <head>\n");
  page.append(" <title>Test</title>\n");
  page.append(" " + openingStyleTag + "\n");
  page.append("<!--\n");
  // The four @import spellings: url() with single and double quotes, then
  // bare strings with single and double quotes.
  page.append("@import url(\'" + givenPrefix + css1 + "\');\n");
  page.append("@import url(\"" + givenPrefix + css2 + "\");\n");
  page.append("@import \'" + givenPrefix + css3 + "\';\n");
  page.append("@import \"" + givenPrefix + css4 + "\";\n");
  // Two url() property values inside a rule body.
  page.append("foo {\n");
  page.append(" bar: url(\'" + givenPrefix + img5 + "\');\n");
  page.append(" baz: url(\"" + givenPrefix + img6 + "\");\n");
  page.append("}\n");
  page.append("/* Comment */");
  page.append("-->\n");
  page.append(" </style>\n");
  page.append(" </head>\n");
  page.append(" <body>\n");
  page.append(" <p>Fake content</p>\n");
  page.append(" </body>\n");
  page.append("</html>\n");

  assertEquals(
      SetUtil.set(
          expectedPrefix + css1,
          expectedPrefix + css2,
          expectedPrefix + css3,
          expectedPrefix + css4,
          expectedPrefix + img5,
          expectedPrefix + img6),
      parseSingleSource(page.toString()));
}
/** An href attribute with no value must not yield any URL. */
public void testEmptyAttribute() throws IOException {
  String emptyHrefAnchor = "<a href=>link3</a>";
  String page = "<html><head><title>Test</title></head><body>" + emptyHrefAnchor;

  assertEquals(SetUtil.set(), parseSingleSource(page));
}