public class TestElsevierXmlLinkExtractorFactory extends LinkExtractorTestCase {

  private static Logger logger =
      Logger.getLogger("TestElsevierXmlLinkExtractorFactory");

  String srcUrl = "http://www.example.com/";

  private static final String withLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
      + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
      + " status=\"Announcement\""
      + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
      + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
      + "<file name=\"01407007.tar\" size=\"21780480\""
      + " md5=\"6c7266e0e246bf3e8cf1cd8b659a7a73\"/>\n"
      + "<file name=\"03064530.tar\" size=\"12748800\""
      + " md5=\"df9519d3075e164d22f5dd4988a693c3\"/>\n"
      + "<file name=\"dataset.toc\" size=\"2216587\""
      + " md5=\"cd21741eb91fa0fdfef2fa36485e21a0\"/>\n"
      + "</dataset>\n";

  private static final String withoutLinks =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<!DOCTYPE dataset SYSTEM \"http://support.sciencedirect.com/xml/sdosftp10.dtd\">\n"
      + "<dataset identifier=\"OXM10160\" customer=\"OHL\""
      + " status=\"Announcement\""
      + " version=\"Network Dataset Announcement/Confirmation v1.0\">"
      + " <date year=\"2007\" month=\"May\" day=\"1\"/>\n"
      + "</dataset>\n";

  private static final String[] links = {
    "01407007.tar",
    "03064530.tar",
    "dataset.toc",
  };

  public String getMimeType() {
    return "text/xml";
  }

  public LinkExtractorFactory getFactory() {
    return new ElsevierXmlLinkExtractorFactory();
  }

  public void testFindCorrectEntries() throws Exception {
    Set<String> expected = new HashSet<String>();
    for (String link : links) {
      expected.add(srcUrl + link);
    }
    assertEquals(expected, extractUrls(withLinks));
  }

  public void testFindNoEntries() throws Exception {
    assertEmpty(extractUrls(withoutLinks));
  }
}
public class TestNatureArticleIteratorFactory extends LockssTestCase {
  static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory");

  private SimulatedArchivalUnit sau;  // Simulated AU to generate content
  private ArchivalUnit nau;           // Nature AU
  private MockLockssDaemon theDaemon;

  private static final int DEFAULT_FILESIZE = 3000;
  private static int fileSize = DEFAULT_FILESIZE;

  private static String PLUGIN_NAME =
      "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin";
  private static String BASE_URL = "http://www.nature.com/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = getTempDir().getAbsolutePath() + File.separator;
    ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION,
                                  tempDirPath);
    theDaemon = getMockLockssDaemon();
    theDaemon.getAlertManager();
    theDaemon.getPluginManager().setLoadablePluginsReady(true);
    theDaemon.setDaemonInited(true);
    theDaemon.getPluginManager().startService();
    theDaemon.getCrawlManager();

    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
    nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig());
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", BASE_URL);
    conf.put("depth", "1");
    conf.put("branch", "4");
    conf.put("numFiles", "7");
    conf.put("fileTypes",
             "" + (SimulatedContentGenerator.FILE_TYPE_HTML
                   | SimulatedContentGenerator.FILE_TYPE_PDF));
    conf.put("binFileSize", "" + fileSize);
    return conf;
  }

  Configuration natureAuConfig() {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("base_url", BASE_URL);
    conf.put("journal_id", "aps");
    conf.put("volume_name", "123");
    conf.put("year", "2008");
    return conf;
  }

  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);

    // Map simulated HTML files into the Nature full-text URL space
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);

    // Map simulated PDF files into the Nature PDF URL space
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs so that some articles have only HTML or only PDF content
    int deleted = 0;
    for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator();
         it.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      assertNotNull(cu);
      String url = cu.getUrl();
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
        ++countHtmlOnly;
      }
      if (url.equals(af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF))) {
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }

//   public void testArticleCountAndDefaultType() throws Exception {
//     testArticleCountAndType("text/html", true, 24);
//   }
//
//   public void testArticleCountAndPdf() throws Exception {
//     testArticleCountAndType("application/pdf", false, 0);
//   }

  private void deleteBlock(CachedUrl cu) throws IOException {
    log.info("deleting " + cu.getUrl());
    CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
    ArchivalUnit au = cu.getArchivalUnit();
    CachedUrlSet cus = au.makeCachedUrlSet(cuss);
    NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
    nm.deleteNode(cus);
  }
}
public class TestHighWireArticleIteratorFactory extends ArticleIteratorTestCase {
  static Logger log = Logger.getLogger(TestHighWireArticleIteratorFactory.class);

  private SimulatedArchivalUnit sau;  // Simulated AU to generate content

  private static String PLUGIN_NAME =
      "org.lockss.plugin.highwire.HighWirePressPlugin";
  private static String BASE_URL = "http://pediatrics.aappublications.org/";
  private static String SIM_ROOT = BASE_URL + "cgi/reprint/";

  public void setUp() throws Exception {
    super.setUp();
    String tempDirPath = setUpDiskSpace();
    au = createAu();
    sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath));
  }

  public void tearDown() throws Exception {
    sau.deleteContentTree();
    // theDaemon.stopDaemon();
    super.tearDown();
  }

  Configuration simAuConfig(String rootPath) {
    Configuration conf = ConfigManager.newConfiguration();
    conf.put("root", rootPath);
    conf.put("base_url", SIM_ROOT);
    conf.put("depth", "0");
    conf.put("branch", "0");
    conf.put("numFiles", "2");
    conf.put("fileTypes",
             "" + (SimulatedContentGenerator.FILE_TYPE_PDF
                   | SimulatedContentGenerator.FILE_TYPE_HTML));
    conf.put("binFileSize", "7");
    return conf;
  }

  protected ArchivalUnit createAu()
      throws ArchivalUnit.ConfigurationException {
    return PluginTestUtil.createAndStartAu(
        PLUGIN_NAME,
        ConfigurationUtil.fromArgs(
            "base_url", "http://pediatrics.aappublications.org/",
            "volume_name", "52",
            "journal_issn", "1098-4275"));
  }

  public void testRoots() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    System.out.println("Root Urls::" + getRootUrls(artIter));
    assertEquals(
        ListUtil.list("http://pediatrics.aappublications.org/cgi/content/full/52/",
                      "http://pediatrics.aappublications.org/cgi/reprint/52/"),
        getRootUrls(artIter));
  }

  public void testUrlsWithPrefixes() throws Exception {
    SubTreeArticleIterator artIter = createSubTreeIter();
    Pattern pat = getPattern(artIter);
    assertMatchesRE(pat,
        "http://pediatrics.aappublications.org/cgi/reprint/foo;52/Supplement_3/S69.pdf");
    assertMatchesRE(pat,
        "http://pediatrics.aappublications.org/cgi/reprint/52/supplement_3/S69.pdf");
    assertNotMatchesRE(pat,
        "http://pediatrics.aappublications.org/cgi/reprin/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(pat,
        "http://pediatrics.aappublications.org/cgi/reprintt/1014174823t49006/j0143.pdfwrong");
    assertNotMatchesRE(pat, "http://www.example.com/content/");
    assertNotMatchesRE(pat, "http://www.example.com/content/j");
    assertNotMatchesRE(pat, "http://www.example.com/content/j0123/j383.pdfwrong");
  }

  public void testCreateArticleFiles() throws Exception {
    PluginTestUtil.crawlSimAu(sau);

    // Map simulated files onto HighWire landing-page and PDF URLs
    String pat0 = "001file[.]html";
    String rep0 = "52/1/S1";
    PluginTestUtil.copyAu(sau, au, ".*[.]html$", pat0, rep0);
    String pat1 = "001file[.]pdf";
    String rep1 = "52/1/S1.pdf";
    PluginTestUtil.copyAu(sau, au, ".*[.]pdf$", pat1, rep1);

    String pdfurl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1.pdf";
    String url = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1";
    au.makeCachedUrl(url);
    CachedUrl cu = au.makeCachedUrl(pdfurl);
    assertNotNull(cu);

    SubTreeArticleIterator artIter = createSubTreeIter();
    assertNotNull(artIter);
    ArticleFiles af = artIter.next();
    assertNotNull(af);
    System.out.println("article files::" + af);
    assertEquals(url,
                 af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE).getUrl());
    assertEquals(pdfurl, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF).getUrl());
  }
}