public void process(ArchiveEntry ae) { // By default the files have to go in the crawler's AU ArchivalUnit au = crawlFacade.getAu(); // By default the path should start at the AU's base url. Configuration config = au.getConfiguration(); String url = config.get(ConfigParamDescr.BASE_URL.getKey()); ae.setBaseUrl(url); ae.setRestOfUrl(ae.getName()); CIProperties cip = new CIProperties(); ae.setHeaderFields(cip); }
public void loadAuConfigDescrs(Configuration config) throws ConfigurationException { super.loadAuConfigDescrs(config); this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey()); // Now we can construct a valid CC permission checker. m_permissionCheckers = // ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl)); ListUtil.list(new CreativeCommonsPermissionChecker()); paramMap.putLong( KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, CurrentConfig.getTimeIntervalParam( PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL)); if (log.isDebug2()) { log.debug2( "Setting Registry AU recrawl interval to " + StringUtil.timeIntervalToString( paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL))); } }
public class TestBaseAtyponMetadataExtractor extends LockssTestCase { static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor"); private MockLockssDaemon theDaemon; private ArchivalUnit bau; private ArchivalUnit bau1; private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin"; static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); private static String BASE_URL = "http://www.baseatypon.org/"; // the metadata that should be extracted static String goodDate = "2012-07-05"; static String[] goodAuthors = new String[] {"D. Author", "S. Author2"}; static String goodFormat = "text/HTML"; static String goodTitle = "Title of Article"; static String goodType = "research-article"; static String goodPublisher = "Base Atypon"; static String goodPublishingPlatform = "Atypon"; static String goodDOI = "10.1137/10081839X"; static String goodJID = "xxx"; static String goodJournal = "Journal Name"; static String goodStartPage = "22"; static String goodEndPage = "44"; static String goodVolume = "13"; static String goodIssue = "3"; static String goodIssn = "1540-3459"; static String doiURL = "http://dx.doi.org/" + goodDOI; private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1"; private static final String RIS_URL = BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit"; public void setUp() throws Exception { super.setUp(); setUpDiskSpace(); // you need this to have startService work properly... theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.setDaemonInited(true); theDaemon.getPluginManager().startService(); theDaemon.getCrawlManager(); // in this directory this is file "test_baseatypon.tdb" but it becomes xml ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml")); Tdb tdb = ConfigManager.getCurrentConfig().getTdb(); TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0); assertNotNull("Didn't find named TdbAu", tdbau1); bau1 = PluginTestUtil.createAndStartAu(tdbau1); assertNotNull(bau1); TypedEntryMap auConfig = bau1.getProperties(); assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY)); } public void tearDown() throws Exception { theDaemon.stopDaemon(); super.tearDown(); } /* * Test the functionality of the MetadataUtilities * */ public void testNormalizeTitleValue() throws Exception { assertEquals( BaseAtyponMetadataUtil.normalizeTitle("The title goes here"), BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"), BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"), BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title and title"), BaseAtyponMetadataUtil.normalizeTitle("Title & title")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle(" leading spaces"), BaseAtyponMetadataUtil.normalizeTitle("leading spaces")); // now checking the fall-back last ditch attempt assertEquals( BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"), BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon")); assertEquals( BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"), BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting")); assertEquals( BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"), BaseAtyponMetadataUtil.generateRawTitle("foo-blah")); } /** * Configuration method. * * @return */ /* "<meta name="dc.Title" content="Title of Article"></meta> "<meta name="dc.Creator" content="D. Author"></meta> "<meta name="dc.Creator" content="S. Author2"></meta> "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta> "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the adstract..."></meta> "<meta name="dc.Publisher" content="Name of Publisher"></meta> "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta> "<meta name="dc.Type" content="research-article"></meta> "<meta name="dc.Format" content="text/HTML"></meta> "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta> "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta> "<meta name="dc.Source" content="http://dx.doi.org/10.1137/10081839X"></meta> "<meta name="dc.Language" content="en"></meta> "<meta name="dc.Coverage" content="world"></meta> "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta> */ // a chunk of html source code from the publisher's site from where the // metadata should be extracted String goodHtmlContent = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>" + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>" + "<meta name=\"dc.Source\" content=\"http://dx.doi.org/10.1137/10081839X\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testExtractGoodHtmlContent() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT)); assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE)); assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR)); assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR)); } String goodHtmlContentNoDOIorPublisher = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the adstract...\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testDOIExtraction() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); // gets pulled from the URL if not set in the metadata assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI)); // gets set manually if not in the metadata // first it would try the TDB assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } private String createGoodRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nA1 - "); sb.append(auth); } sb.append("\nDA - "); sb.append(goodDate); sb.append("\nJF - "); sb.append(goodJournal); sb.append("\nSP - "); sb.append(goodStartPage); sb.append("\nEP - "); sb.append(goodEndPage); sb.append("\nVL - "); sb.append(goodVolume); sb.append("\nIS - "); sb.append(goodIssue); sb.append("\nSN - "); sb.append(goodIssn); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nDO - "); sb.append(goodDOI); sb.append("\nUR - "); sb.append(doiURL); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. It will default later to fuill_text_cu assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); } /* the extractor checks if data is missing it uses possible alternate RIS tags */ private String createAlternateRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nAU - "); sb.append(auth); } sb.append("\nY1 - "); sb.append(goodDate); sb.append("\nT2 - "); sb.append(goodJournal); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractAlternateRisContent() throws Exception { String goodContent = createAlternateRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } /* private support methods */ private List<ArticleMetadata> setupContentForAU( ArchivalUnit au, String url, String content, boolean isHtmlExtractor) throws IOException, PluginException { FileMetadataExtractor me; InputStream input = null; CIProperties props = null; if (isHtmlExtractor) { input = IOUtils.toInputStream(content, "utf-8"); props = getContentHtmlProperties(); me = new BaseAtyponHtmlMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/html"); } else { input = IOUtils.toInputStream(content, "utf-8"); props = getContentRisProperties(); me = new BaseAtyponRisMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain"); } UrlData ud = new UrlData(input, props, url); UrlCacher uc = au.makeUrlCacher(ud); uc.storeContent(); CachedUrl cu = uc.getCachedUrl(); FileMetadataListExtractor mle = new FileMetadataListExtractor(me); return mle.extract(MetadataTarget.Any(), cu); } private CIProperties getContentHtmlProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8"); cProps.put("Content-type", "text/html; charset=UTF-8"); return cProps; } private CIProperties getContentRisProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8"); cProps.put("Content-type", "text/plain; charset=UTF-8"); return cProps; } }
public class TestPalgraveBookArticleIteratorFactory extends ArticleIteratorTestCase { // private SimulatedArchivalUnit sau; // Simulated AU to generate content private static final String PLUGIN_NAME = "org.lockss.plugin.palgrave.ClockssPalgraveBookPlugin"; private static final String BASE_URL = "http://www.palgraveconnect.com/"; private static final String BOOK_ISBN = "9781137024497"; private static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); private static final String BOOK_ISBN_KEY = "book_isbn"; private static final int DEFAULT_FILESIZE = 3000; private final String EXPECTED_PDF_LANDING_PAGE = "http://www.palgraveconnect.com/pc/doifinder/10.1057/9781137024497"; private final String EXPECTED_PDF_URL = "http://www.palgraveconnect.com/pc/busman2013/browse/inside/download/9781137024497.pdf"; private final String EXPECTED_FULL_TEXT_URL = EXPECTED_PDF_URL; private CIProperties pdfHeader = new CIProperties(); private CIProperties textHeader = new CIProperties(); private CIProperties epubHeader = new CIProperties(); private static final String ContentString = "foo blah"; InputStream random_content_stream; @Override public void setUp() throws Exception { super.setUp(); String tempDirPath = setUpDiskSpace(); au = createAu(); // set up headers for creating mock CU's of the appropriate type pdfHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/pdf"); textHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html"); epubHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/epub"); // the content in the urls doesn't really matter for the test random_content_stream = new ByteArrayInputStream(ContentString.getBytes(Constants.ENCODING_UTF_8)); } @Override public void tearDown() throws Exception { super.tearDown(); } // Set configuration attributes to create plugin AU (archival unit) Configuration palgraveBookAuConfig() { Configuration conf = ConfigManager.newConfiguration(); conf.put(BASE_URL_KEY, BASE_URL); conf.put(BOOK_ISBN_KEY, BOOK_ISBN); return conf; } protected ArchivalUnit createAu() throws ArchivalUnit.ConfigurationException { return PluginTestUtil.createAndStartAu(PLUGIN_NAME, palgraveBookAuConfig()); } public void testRoots() throws Exception { SubTreeArticleIterator artIter = createSubTreeIter(); assertEquals(ListUtil.list(BASE_URL + "pc/"), getRootUrls(artIter)); } public void testUrlsWithPrefixes() throws Exception { SubTreeArticleIterator artIter = createSubTreeIter(); Pattern pat = getPattern(artIter); // PATTERN_TEMPLATE = "\"%spc/.+/browse/inside/(download|epub)?/[0-9]+\\.(html|pdf|epub)$\", // base_url"; // NEW PATTERN_TEMPLATE = "\"%spc/doifinder/download/10.1057/([0-9]+)(\\.epub)?$\", base_url"; assertNotMatchesRE( pat, "http://www.palgraveconnect.com/pc/busman2013/browsee/inside/download/9781137024497.pdfbad"); assertNotMatchesRE( pat, "http://www.palgraveconnect.com/pc/doifinder/download-this/10.1057/9781137289520"); assertNotMatchesRE( pat, "http://www.palgraveconnect.com/pc/busman2013/browse/inside/download/9781137024497.pdf"); assertNotMatchesRE( pat, "http://www.palgraveconnect.com/pc/busman2013/browse/inside/epub/9781137024497.epub"); // assertMatchesRE( pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137024497"); assertMatchesRE( pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137024497.epub"); assertMatchesRE( pat, "http://www.palgraveconnect.com/pc/doifinder/download/10.1057/9781137289520"); } public void testCreateArticleFiles() throws Exception { // create urls to store in UrlCacher String[] au_urls = { BASE_URL + "pc/doifinder/10.1057/9780123456789", BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789", BASE_URL + "pc/doifinder/download/10.1057/9780123456789", BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub", BASE_URL + "pc/doifinder/10.1057/9781234567890", BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890", BASE_URL + "pc/doifinder/download/10.1057/9781234567890", BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub", BASE_URL + "pc/doifinder/10.1057/9782345678901", BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901", BASE_URL + "pc/doifinder/download/10.1057/9782345678901", BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub" }; /* // get cached url content type and properties from simulated contents // for UrclCacher.storeContent() CachedUrl cuPdf = null; CachedUrl cuHtml = null; CachedUrl cuEpub = null; for (CachedUrl cu : AuUtil.getCuIterable(sau)) { if (cuPdf == null && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_PDF)) { log.info("pdf contenttype: " + cu.getContentType()); cuPdf = cu; } else if (cuHtml == null && cu.getContentType().toLowerCase().startsWith(Constants.MIME_TYPE_HTML)) { log.info("html contenttype: " + cu.getContentType()); cuHtml = cu; } else if (cuEpub == null && cu.getContentType().toLowerCase().startsWith("application/epub")) { log.info("epub contenttype: " + cu.getContentType()); cuEpub = cu; } if (cuPdf != null && cuHtml != null && cuEpub != null) { break; } } */ CachedUrl cu; // store content using cached url content type and properties for (String url : au_urls) { if (url.contains("download") && !url.endsWith(".epub")) { storeContent(random_content_stream, pdfHeader, url); } else if (url.contains("download")) { // epub storeContent(random_content_stream, epubHeader, url); } else { storeContent(random_content_stream, textHeader, url); } } // book 9780123456789 ArticleFiles af1 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9780123456789"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9780123456789"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9780123456789.epub"); // book 9780123456789 ArticleFiles af2 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9781234567890"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9781234567890"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9781234567890.epub"); // book 9780123456789 ArticleFiles af3 = new ArticleFiles(); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_PDF, BASE_URL + "pc/doifinder/download/10.1057/9782345678901"); af1.setRoleString( ArticleFiles.ROLE_ARTICLE_METADATA, BASE_URL + "pc/browse/citationExport?doi=10.1057/9782345678901"); af1.setRoleString( ArticleFiles.ROLE_FULL_TEXT_EPUB, BASE_URL + "pc/doifinder/download/10.1057/9782345678901.epub"); // key the expected content to the fullTextUrl for the ArticleFiles HashMap<String, ArticleFiles> fullUrlToAF = new HashMap<String, ArticleFiles>(); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9780123456789", af1); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9781234567890", af2); fullUrlToAF.put(BASE_URL + "pc/doifinder/download/10.1057/9782345678901", af3); // get article iterator, get article files and the appropriate urls according // to their roles. String[] expectedUrls = { EXPECTED_FULL_TEXT_URL, EXPECTED_PDF_URL, }; for (SubTreeArticleIterator artIter = createSubTreeIter(); artIter.hasNext(); ) { ArticleFiles af = artIter.next(); String[] actualUrls = { af.getFullTextUrl(), af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF), // af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE) }; log.info("actualUrls: " + actualUrls.length); for (int i = 0; i < actualUrls.length; i++) { log.info("e_url: " + expectedUrls[i]); log.info("url: " + actualUrls[i]); // assertEquals(expectedUrls[i], actualUrls[i]); } } } }
public class TestEmlsPlugin extends LockssTestCase { private DefinablePlugin plugin; static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); static final String VOL_KEY = ConfigParamDescr.VOLUME_NUMBER.getKey(); public void setUp() throws Exception { super.setUp(); plugin = new DefinablePlugin(); plugin.initPlugin(getMockLockssDaemon(), "org.lockss.plugin.emls.EmlsPlugin"); } public void testGetAuNullConfig() throws ArchivalUnit.ConfigurationException { try { plugin.configureAu(null, null); fail("Didn't throw ArchivalUnit.ConfigurationException"); } catch (ArchivalUnit.ConfigurationException e) { } } private DefinableArchivalUnit makeAuFromProps(Properties props) throws ArchivalUnit.ConfigurationException { Configuration config = ConfigurationUtil.fromProps(props); return (DefinableArchivalUnit) plugin.configureAu(config, null); } public void testGetAuHandlesBadUrl() throws ArchivalUnit.ConfigurationException, MalformedURLException { Properties props = new Properties(); props.setProperty(BASE_URL_KEY, "blah"); props.setProperty(VOL_KEY, "3"); try { DefinableArchivalUnit au = makeAuFromProps(props); fail("Didn't throw InstantiationException when given a bad url"); } catch (ArchivalUnit.ConfigurationException auie) { ConfigParamDescr.InvalidFormatException murle = (ConfigParamDescr.InvalidFormatException) auie.getCause(); assertNotNull(auie.getCause()); } } public void testGetAuConstructsProperAu() throws ArchivalUnit.ConfigurationException, MalformedURLException { Properties props = new Properties(); props.setProperty(BASE_URL_KEY, "http://extra.shu.ac.uk/emls/"); props.setProperty(VOL_KEY, "3"); DefinableArchivalUnit au = makeAuFromProps(props); assertEquals( "Early Modern Literary Studies Plugin, Base URL http://extra.shu.ac.uk/emls/, Volume 3", au.getName()); } public void testGetPluginId() { assertEquals("org.lockss.plugin.emls.EmlsPlugin", plugin.getPluginId()); } public void testGetAuConfigProperties() { for (Iterator iter = plugin.getLocalAuConfigDescrs().iterator(); iter.hasNext(); ) { ConfigParamDescr desc = (ConfigParamDescr) iter.next(); if (desc.equals(ConfigParamDescr.BASE_URL)) { continue; } if (desc.equals(ConfigParamDescr.VOLUME_NUMBER)) { continue; } if ("issues".equals(desc.getKey())) { assertEquals(ConfigParamDescr.TYPE_SET, desc.getType()); assertFalse(desc.isDefinitional()); continue; } fail("Unexpected config param: " + desc.getKey()); } } }
public class TestHighWireDrupalPlugin extends LockssTestCase { static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); static final String VOL_KEY = ConfigParamDescr.VOLUME_NAME.getKey(); private MockLockssDaemon theDaemon; private DefinablePlugin plugin; public TestHighWireDrupalPlugin(String msg) { super(msg); } @Override public void setUp() throws Exception { super.setUp(); setUpDiskSpace(); theDaemon = getMockLockssDaemon(); plugin = new DefinablePlugin(); plugin.initPlugin(getMockLockssDaemon(), "org.lockss.plugin.highwire.HighWireDrupalPlugin"); } public void testGetAuNullConfig() throws ArchivalUnit.ConfigurationException { try { plugin.configureAu(null, null); fail("Didn't throw ArchivalUnit.ConfigurationException"); } catch (ArchivalUnit.ConfigurationException e) { } } public void testCreateAu() throws ConfigurationException { Properties props = new Properties(); props.setProperty(BASE_URL_KEY, "http://www.example.com/"); props.setProperty(VOL_KEY, "32"); makeAuFromProps(props); } private DefinableArchivalUnit makeAuFromProps(Properties props) throws ArchivalUnit.ConfigurationException { Configuration config = ConfigurationUtil.fromProps(props); return (DefinableArchivalUnit) plugin.configureAu(config, null); } public void testGetAuConstructsProperAu() throws ArchivalUnit.ConfigurationException, MalformedURLException { Properties props = new Properties(); props.setProperty(VOL_KEY, "303"); props.setProperty(BASE_URL_KEY, "http://www.example.com/"); String starturl = "http://www.example.com/lockss-manifest/vol_303_manifest.html"; DefinableArchivalUnit au = makeAuFromProps(props); assertEquals( "HighWire Drupal Plugin, Base URL http://www.example.com/, Volume 303", au.getName()); assertEquals(ListUtil.list(starturl), au.getStartUrls()); } public void testGetPluginId() { assertEquals("org.lockss.plugin.highwire.HighWireDrupalPlugin", plugin.getPluginId()); } public void testGetAuConfigProperties() { assertEquals( ListUtil.list(ConfigParamDescr.BASE_URL, ConfigParamDescr.VOLUME_NAME), plugin.getLocalAuConfigDescrs()); } public void testHandles500Result() throws Exception { Properties props = new Properties(); props.setProperty(VOL_KEY, "322"); props.setProperty(BASE_URL_KEY, "http://www.example.com/"); String starturl = "http://www.example.com/lockss-manifest/vol_322_manifest.html"; DefinableArchivalUnit au = makeAuFromProps(props); MockLockssUrlConnection conn = new MockLockssUrlConnection(); conn.setURL("http://uuu17/"); CacheException exc = ((HttpResultMap) plugin.getCacheResultMap()).mapException(au, conn, 500, "foo"); assertClass(CacheException.RetryDeadLinkException.class, exc); conn.setURL(starturl); exc = ((HttpResultMap) plugin.getCacheResultMap()).mapException(au, conn, 500, "foo"); assertClass(CacheException.RetrySameUrlException.class, exc); } // Test the crawl rules for eLife public void testShouldCacheProperPages() throws Exception { String ROOT_URL = "http://highwire.org/"; Properties props = new Properties(); props.setProperty(BASE_URL_KEY, ROOT_URL); props.setProperty(VOL_KEY, "2015"); DefinableArchivalUnit au = null; try { au = makeAuFromProps(props); } catch (ConfigurationException ex) { } theDaemon.getLockssRepository(au); // Test for pages that should get crawled or not // permission page/start url shouldCacheTest(ROOT_URL + "lockss-manifest/vol_2015_manifest.html", true, au); shouldCacheTest(ROOT_URL + "clockss-manifest/vol_2015_manifest.html", false, au); shouldCacheTest(ROOT_URL + "manifest/year=2015", false, au); // toc page for a volume, issue shouldCacheTest(ROOT_URL + "content/2015", false, au); shouldCacheTest(ROOT_URL + "content/2015/1", true, au); shouldCacheTest(ROOT_URL + "content/2015/2.toc", true, au); // article files shouldCacheTest(ROOT_URL + "content/2015/1/2", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.abstract", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.extract", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.full", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.full.pdf", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.full.pdf+html", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.full-text.pdf+html", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2/DC1", true, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.print", false, au); shouldCacheTest(ROOT_URL + "content/2015/1/2.explore", false, au); shouldCacheTest(ROOT_URL + "content/2015/1/2/article-info", false, au); shouldCacheTest(ROOT_URL + "content/2015/1/2/submit?param=12", false, au); shouldCacheTest(ROOT_URL + "panels_ajax_tab/hw_tab_data/node:80746/1", true, au); shouldCacheTest(ROOT_URL + "panels_ajax_tab/hw_tab_art/node:80746/1", false, au); shouldCacheTest(ROOT_URL + "highwire/citation/12/ris", true, au); shouldCacheTest(ROOT_URL + "highwire/citation/9/1/ris", false, au); shouldCacheTest(ROOT_URL + "highwire/markup/113/expansion", true, au); shouldCacheTest(ROOT_URL + "sites/all/libraries/modernizr/modernizr.min.js", true, au); shouldCacheTest(ROOT_URL + "sites/default/files/js/js_0j8_f76rvZ212f4rg.js", true, au); shouldCacheTest(ROOT_URL + "sites/default/themes/hw/font/fontawesome-webfont.eot", true, au); shouldCacheTest(ROOT_URL + "sites/default/themes/font/fontawesome-webfont.eot", true, au); shouldCacheTest( ROOT_URL + "content/hw/suppl/2014/04/23/hw.02130.DC1/hw02130_Supplemental_files.zip", true, au); shouldCacheTest("http://cdn.cloudfront.net/content/2015/1/3/F1.medium.gif", true, au); shouldCacheTest("http://cdn.mathjax.org/mathjax/latest/MathJax.js", true, au); shouldCacheTest("https://ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js", true, au); shouldCacheTest("", false, au); // should not get crawled - LOCKSS shouldCacheTest("http://lockss.stanford.edu", false, au); } private void shouldCacheTest(String url, boolean shouldCache, ArchivalUnit au) { log.info("shouldCacheTest url: " + url); assertEquals(shouldCache, au.shouldBeCached(url)); } }