public class TestBaseAtyponMetadataExtractor extends LockssTestCase { static Logger log = Logger.getLogger("TestBaseAtyponMetadataExtractor"); private MockLockssDaemon theDaemon; private ArchivalUnit bau; private ArchivalUnit bau1; private static String PLUGIN_NAME = "org.lockss.plugin.atypon.BaseAtyponPlugin"; static final String BASE_URL_KEY = ConfigParamDescr.BASE_URL.getKey(); private static String BASE_URL = "http://www.baseatypon.org/"; // the metadata that should be extracted static String goodDate = "2012-07-05"; static String[] goodAuthors = new String[] {"D. Author", "S. Author2"}; static String goodFormat = "text/HTML"; static String goodTitle = "Title of Article"; static String goodType = "research-article"; static String goodPublisher = "Base Atypon"; static String goodPublishingPlatform = "Atypon"; static String goodDOI = "10.1137/10081839X"; static String goodJID = "xxx"; static String goodJournal = "Journal Name"; static String goodStartPage = "22"; static String goodEndPage = "44"; static String goodVolume = "13"; static String goodIssue = "3"; static String goodIssn = "1540-3459"; static String doiURL = "http://dx.doi.org/" + goodDOI; private static final String ABS_URL = BASE_URL + "doi/abs/10.1175/2010WCAS1063.1"; private static final String RIS_URL = BASE_URL + "action/downloadCitation?doi=" + goodDOI + "&format=ris&include=cit"; public void setUp() throws Exception { super.setUp(); setUpDiskSpace(); // you need this to have startService work properly... theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.setDaemonInited(true); theDaemon.getPluginManager().startService(); theDaemon.getCrawlManager(); // in this directory the source file is "test_baseatypon.tdb", but it is loaded as xml ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml")); Tdb tdb = ConfigManager.getCurrentConfig().getTdb(); TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0); assertNotNull("Didn't find named TdbAu", tdbau1); bau1 = PluginTestUtil.createAndStartAu(tdbau1); assertNotNull(bau1); TypedEntryMap auConfig = bau1.getProperties(); assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY)); } public void tearDown() throws Exception { theDaemon.stopDaemon(); super.tearDown(); } /* * Test the functionality of the MetadataUtilities * */ public void testNormalizeTitleValue() throws Exception { assertEquals( BaseAtyponMetadataUtil.normalizeTitle("The title goes here"), BaseAtyponMetadataUtil.normalizeTitle("Title Goes Here")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces"), BaseAtyponMetadataUtil.normalizeTitle("Title with random spaces")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Normalize -- hyphen"), BaseAtyponMetadataUtil.normalizeTitle("normalize \u2013\u2013 hyphen")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle("Title and title"), BaseAtyponMetadataUtil.normalizeTitle("Title & title")); assertEquals( BaseAtyponMetadataUtil.normalizeTitle(" leading spaces"), BaseAtyponMetadataUtil.normalizeTitle("leading spaces")); // now checking the fall-back, last-ditch attempt assertEquals( BaseAtyponMetadataUtil.generateRawTitle("leading spaces:colon?"), BaseAtyponMetadataUtil.generateRawTitle("leadingspacescolon")); assertEquals( BaseAtyponMetadataUtil.generateRawTitle("relapsing-remitting"), BaseAtyponMetadataUtil.generateRawTitle("relapsing?remitting")); assertEquals( BaseAtyponMetadataUtil.generateRawTitle("foo\"blah"),
BaseAtyponMetadataUtil.generateRawTitle("foo-blah")); } /* "<meta name="dc.Title" content="Title of Article"></meta> "<meta name="dc.Creator" content="D. Author"></meta> "<meta name="dc.Creator" content="S. Author2"></meta> "<meta name="dc.Subject" content="weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57"></meta> "<meta name="dc.Description" content="Long test summary of article, probably taken directly from the abstract..."></meta> "<meta name="dc.Publisher" content="Base Atypon"></meta> "<meta name="dc.Date" scheme="WTN8601" content="2012-07-05"></meta> "<meta name="dc.Type" content="research-article"></meta> "<meta name="dc.Format" content="text/HTML"></meta> "<meta name="dc.Identifier" scheme="publisher" content="81839"></meta> "<meta name="dc.Identifier" scheme="doi" content="10.1137/10081839X"></meta> "<meta name="dc.Source" content="http://dx.doi.org/10.1137/10081839X"></meta> "<meta name="dc.Language" content="en"></meta> "<meta name="dc.Coverage" content="world"></meta> "<meta name="keywords" content="weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57"></meta> */ // a chunk of html source code from the publisher's site from which the // metadata should be extracted String goodHtmlContent = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the abstract...\"></meta>" + "<meta name=\"dc.Publisher\" content=\"Base Atypon\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"doi\" content=\"10.1137/10081839X\"></meta>" + "<meta name=\"dc.Source\" content=\"http://dx.doi.org/10.1137/10081839X\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testExtractGoodHtmlContent() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContent, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodTitle, md.get(MetadataField.DC_FIELD_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodFormat, md.get(MetadataField.DC_FIELD_FORMAT)); assertEquals(goodType, md.get(MetadataField.DC_FIELD_TYPE)); assertEquals(Arrays.asList(goodAuthors), md.getList(MetadataField.FIELD_AUTHOR)); assertEquals(goodAuthors[0], md.get(MetadataField.DC_FIELD_CREATOR)); } String goodHtmlContentNoDOIorPublisher = "<meta name=\"dc.Title\" content=\"Title of Article\"></meta>" + "<meta name=\"dc.Creator\" content=\"D. Author\"></meta>" + "<meta name=\"dc.Creator\" content=\"S. Author2\"></meta>" + "<meta name=\"dc.Subject\" content=\"weighted regularity; elliptic problem; oscillatory diffusion; $hp$ finite elements; 65N30; 35B65; 35J57\"></meta>" + "<meta name=\"dc.Description\" content=\"Long test summary of article, probably taken directly from the abstract...\"></meta>" + "<meta name=\"dc.Date\" scheme=\"WTN8601\" content=\"2012-07-05\"></meta>" + "<meta name=\"dc.Type\" content=\"research-article\"></meta>" + "<meta name=\"dc.Format\" content=\"text/HTML\"></meta>" + "<meta name=\"dc.Identifier\" scheme=\"publisher\" content=\"81839\"></meta>" + "<meta name=\"dc.Language\" content=\"en\"></meta>" + "<meta name=\"dc.Coverage\" content=\"world\"></meta>" + "<meta name=\"keywords\" content=\"weighted regularity, elliptic problem, oscillatory diffusion, $hp$ finite elements, 65N30, 35B65, 35J57\"></meta>"; public void testDOIExtraction() throws Exception { List<ArticleMetadata> mdlist = setupContentForAU(bau1, ABS_URL, goodHtmlContentNoDOIorPublisher, true); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); // gets pulled from the URL if not set in the metadata assertEquals("10.1175/2010WCAS1063.1", md.get(MetadataField.FIELD_DOI)); // gets set manually if not in the metadata; first it would try the TDB assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } private String createGoodRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nA1 - "); sb.append(auth); } sb.append("\nDA - "); sb.append(goodDate); sb.append("\nJF - "); sb.append(goodJournal); sb.append("\nSP - "); sb.append(goodStartPage); sb.append("\nEP - "); sb.append(goodEndPage); sb.append("\nVL - "); sb.append(goodVolume); sb.append("\nIS - "); sb.append(goodIssue); sb.append("\nSN - "); sb.append(goodIssn); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nDO - "); sb.append(goodDOI); sb.append("\nUR - "); sb.append(doiURL); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * BaseAtyponRisMetadataExtractorFactory, matches the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. It will default later to full_text_cu
assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); } /* if expected data is missing, the extractor checks possible alternate RIS tags */ private String createAlternateRisContent() { StringBuilder sb = new StringBuilder(); sb.append("TY - JOUR"); for (String auth : goodAuthors) { sb.append("\nAU - "); sb.append(auth); } sb.append("\nY1 - "); sb.append(goodDate); sb.append("\nT2 - "); sb.append(goodJournal); sb.append("\nT1 - "); sb.append(goodTitle); sb.append("\nPB - "); sb.append(goodPublisher); sb.append("\nER -"); return sb.toString(); } /** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * BaseAtyponRisMetadataExtractorFactory, matches the metadata in the source code. * * @throws Exception */ public void testExtractAlternateRisContent() throws Exception { String goodContent = createAlternateRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); } /* private support methods */ private List<ArticleMetadata> setupContentForAU( ArchivalUnit au, String url, String content, boolean isHtmlExtractor) throws IOException, PluginException { FileMetadataExtractor me; InputStream input = null; CIProperties props = null; if (isHtmlExtractor) { input = IOUtils.toInputStream(content, "utf-8"); props = getContentHtmlProperties(); me = new BaseAtyponHtmlMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/html"); } else { input = IOUtils.toInputStream(content, "utf-8"); props = getContentRisProperties(); me = new BaseAtyponRisMetadataExtractorFactory() .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain"); } UrlData ud = new UrlData(input, props, url); UrlCacher uc = au.makeUrlCacher(ud); uc.storeContent(); CachedUrl cu = uc.getCachedUrl(); FileMetadataListExtractor mle = new FileMetadataListExtractor(me); return mle.extract(MetadataTarget.Any(), cu); } private CIProperties getContentHtmlProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/html; charset=UTF-8"); cProps.put("Content-type", "text/html; charset=UTF-8"); return cProps; } private CIProperties getContentRisProperties() { CIProperties cProps = new CIProperties(); // the CU checks the X-Lockss-content-type, not the content-type to determine encoding cProps.put(CachedUrl.PROPERTY_CONTENT_TYPE, "text/plain; charset=UTF-8"); cProps.put("Content-type", "text/plain; charset=UTF-8"); return cProps; } }
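// For reference, a sketch of the RIS record that createGoodRisContent() above
// assembles from the good* constants. Tag spacing matches the "TAG - value"
// form this test writes (the RIS specification nominally uses two spaces
// before the hyphen):
//
//   TY - JOUR
//   A1 - D. Author
//   A1 - S. Author2
//   DA - 2012-07-05
//   JF - Journal Name
//   SP - 22
//   EP - 44
//   VL - 13
//   IS - 3
//   SN - 1540-3459
//   T1 - Title of Article
//   PB - Base Atypon
//   DO - 10.1137/10081839X
//   UR - http://dx.doi.org/10.1137/10081839X
//   ER -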
public class TestNatureArticleIteratorFactory extends LockssTestCase { static Logger log = Logger.getLogger("TestNatureArticleIteratorFactory"); private SimulatedArchivalUnit sau; // Simulated AU to generate content private ArchivalUnit nau; // Nature AU private MockLockssDaemon theDaemon; private static final int DEFAULT_FILESIZE = 3000; private static int fileSize = DEFAULT_FILESIZE; private static String PLUGIN_NAME = "org.lockss.plugin.nature.ClockssNaturePublishingGroupPlugin"; private static String BASE_URL = "http://www.nature.com/"; public void setUp() throws Exception { super.setUp(); String tempDirPath = getTempDir().getAbsolutePath() + File.separator; ConfigurationUtil.setFromArgs(LockssRepositoryImpl.PARAM_CACHE_LOCATION, tempDirPath); theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.setDaemonInited(true); theDaemon.getPluginManager().startService(); theDaemon.getCrawlManager(); sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath)); nau = PluginTestUtil.createAndStartAu(PLUGIN_NAME, natureAuConfig()); } public void tearDown() throws Exception { sau.deleteContentTree(); theDaemon.stopDaemon(); super.tearDown(); } Configuration simAuConfig(String rootPath) { Configuration conf = ConfigManager.newConfiguration(); conf.put("root", rootPath); conf.put("base_url", BASE_URL); conf.put("depth", "1"); conf.put("branch", "4"); conf.put("numFiles", "7"); conf.put( "fileTypes", "" + (SimulatedContentGenerator.FILE_TYPE_HTML | SimulatedContentGenerator.FILE_TYPE_PDF)); conf.put("binFileSize", "" + fileSize); return conf; } Configuration natureAuConfig() { Configuration conf = ConfigManager.newConfiguration(); conf.put("base_url", BASE_URL); conf.put("journal_id", "aps"); conf.put("volume_name", "123"); conf.put("year", "2008"); return conf; } public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = "branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) it.next(); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af = it.next(); log.info(af.toString()); CachedUrl cu = af.getFullTextCu(); assertNotNull(cu); String url = cu.getUrl(); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (url.equals(af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF))) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); } // public void testArticleCountAndDefaultType() throws Exception { // testArticleCountAndType("text/html", true, 24); // } // // public void 
testArticleCountAndPdf() throws Exception { // testArticleCountAndType("application/pdf", false, 0); // } private void deleteBlock(CachedUrl cu) throws IOException { log.info("deleting " + cu.getUrl()); CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl()); ArchivalUnit au = cu.getArchivalUnit(); CachedUrlSet cus = au.makeCachedUrlSet(cuss); NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au); nm.deleteNode(cus); } }
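// Worked example of the URL mapping performed by testArticleCountAndType()
// above (hypothetical simulated file names, derived from pat1/rep1 and
// pat2/rep2): a simulated file
//   http://www.nature.com/branch2/003file.html
// is copied into the Nature AU as
//   http://www.nature.com/aps/journal/v123/n2/full/003file.html
// and the matching PDF
//   http://www.nature.com/branch2/003file.pdf
// becomes
//   http://www.nature.com/aps/journal/v123/n2/pdf/003file.pdf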
public class CrawlRuleTester extends Thread { protected static Logger log = Logger.getLogger(CrawlRuleTester.class); /** Proxy host */ public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host"; /** Proxy port */ public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port"; public static final int DEFAULT_PROXY_PORT = -1; /** User-Agent */ public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent"; /* Message Types */ public static final int ERROR_MESSAGE = 0; public static final int WARNING_MESSAGE = 1; public static final int PLAIN_MESSAGE = 2; public static final int URL_SUMMARY_MESSAGE = 3; public static final int TEST_SUMMARY_MESSAGE = 4; private String m_baseUrl; private int m_crawlDepth; private long m_crawlDelay; private int m_curDepth; private ArchivalUnit m_au; private String m_outputFile = null; private BufferedWriter m_outWriter = null; private Deadline fetchDeadline = Deadline.in(0); private boolean useLocalWriter = true; private MessageHandler m_msgHandler; private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool(); private String proxyHost; private String userAgent; private int proxyPort; // our storage for extracted urls private TreeSet m_extracted = new TreeSet(); private TreeSet m_incls = new TreeSet(); private TreeSet m_excls = new TreeSet(); private TreeSet m_reported = new TreeSet(); public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { super("crawlrule tester"); m_crawlDepth = crawlDepth; long minFetchDelay = CurrentConfig.getLongParam( BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY); m_crawlDelay = Math.max(crawlDelay, minFetchDelay); m_baseUrl = baseUrl; m_au = au; } /** * CrawlRuleTester * * @param outFile String * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param au ArchivalUnit */ public CrawlRuleTester( String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outputFile = outFile; } /** * CrawlRuleTester * * @param outWriter BufferedWriter * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param au ArchivalUnit */ public CrawlRuleTester( BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outWriter = outWriter; } /** * CrawlRuleTester * * @param msgHandler MessageHandler to take all output * @param crawlDepth the crawl depth to use * @param crawlDelay the time to wait between fetches * @param baseUrl the url to start from * @param au the ArchivalUnit whose crawl rules are used for url checking.
*/ public CrawlRuleTester( MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_msgHandler = msgHandler; } public void run() { try { setConfig(ConfigManager.getCurrentConfig()); if (m_outWriter == null && m_msgHandler == null) { useLocalWriter = true; } else { useLocalWriter = false; } if (useLocalWriter) { openOutputFile(); } checkRules(); if (useLocalWriter) { closeOutputFile(); } } finally { if (m_msgHandler != null) { m_msgHandler.close(); } } } void setConfig(Configuration config) { log.debug("config: " + config); proxyHost = config.get(PARAM_PROXY_HOST); proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT); if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { String http_proxy = System.getenv("http_proxy"); if (!StringUtil.isNullString(http_proxy)) { try { HostPortParser hpp = new HostPortParser(http_proxy); proxyHost = hpp.getHost(); proxyPort = hpp.getPort(); } catch (HostPortParser.InvalidSpec e) { log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e); } } } if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { proxyHost = null; } else { log.info("Proxying through " + proxyHost + ":" + proxyPort); } userAgent = config.get(PARAM_USER_AGENT); if (StringUtil.isNullString(userAgent)) { userAgent = null; } else { log.debug("Setting User-Agent to " + userAgent); } } private void openOutputFile() { if (m_outputFile != null) { try { m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false)); return; } catch (Exception ex) { System.err.println("Error opening output file, writing to stdout: " + ex); } } m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out)); } private void closeOutputFile() { try { if (m_outWriter != null) { m_outWriter.close(); } } catch (IOException ex) { System.err.println("Error closing output file."); } } int[] depth_incl; int[] depth_fetched; int[] depth_parsed; private void checkRules() { outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE); outputMessage( "crawl depth: " + m_crawlDepth + " crawl delay: " + m_crawlDelay + " ms.", PLAIN_MESSAGE); TreeSet crawlList = new TreeSet(); TreeSet fetched = new TreeSet(); // initialize with the baseUrl crawlList.add(m_baseUrl); depth_incl = new int[m_crawlDepth]; depth_fetched = new int[m_crawlDepth]; depth_parsed = new int[m_crawlDepth]; long start_time = TimeBase.nowMs(); for (int depth = 1; depth <= m_crawlDepth; depth++) { if (isInterrupted()) { return; } m_curDepth = depth; if (crawlList.isEmpty() && depth <= m_crawlDepth) { outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE); break; } String[] urls = (String[]) crawlList.toArray(new String[0]); crawlList.clear(); outputMessage("\nDepth " + depth, PLAIN_MESSAGE); for (int ix = 0; ix < urls.length; ix++) { if (isInterrupted()) { return; } pauseBeforeFetch(); String urlstr = urls[ix]; m_incls.clear(); m_excls.clear(); // crawl the page buildUrlSets(urlstr); fetched.add(urlstr); // output incl/excl results, // add the new_incls to the crawlList for next crawl depth loop crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls)); } } long elapsed_time = TimeBase.nowMs() - start_time; outputSummary(m_baseUrl, fetched, crawlList, elapsed_time); } private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL: " + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } } private void pauseBeforeFetch() { if (!fetchDeadline.expired()) { try { fetchDeadline.sleep(); } catch (InterruptedException ie) { // no action } } fetchDeadline.expireIn(m_crawlDelay); } private void outputMessage(String msg, int msgType) { if (isInterrupted()) { return; } if (m_msgHandler != null) { m_msgHandler.outputMessage(msg + "\n", msgType); } else { try { m_outWriter.write(msg); m_outWriter.newLine(); } catch (Exception ex) { System.err.println(msg); } } } private void outputErrResults(String url, String errMsg) { outputMessage("Error: " + errMsg + " occurred while processing " + url, ERROR_MESSAGE); } private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) { Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported)); Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported)); if (!m_inclset.isEmpty()) { outputMessage( "\nIncluded Urls: (" + new_incls.size() + " new, " + (m_inclset.size() - new_incls.size()) + " old)", URL_SUMMARY_MESSAGE); depth_incl[m_curDepth - 1] += new_incls.size(); } for (Iterator it = new_incls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } if (!m_exclset.isEmpty()) { outputMessage( "\nExcluded Urls: (" + new_excls.size() + " new, " + (m_exclset.size() - new_excls.size()) + " old)", URL_SUMMARY_MESSAGE); } for (Iterator it = new_excls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } m_reported.addAll(new_incls); m_reported.addAll(new_excls); if (m_outWriter != null) { try { m_outWriter.flush(); } catch (IOException ex) { } } return new_incls; } private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) { int fetchCount = fetched.size(); outputMessage( "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth, TEST_SUMMARY_MESSAGE); outputMessage( "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(),
PLAIN_MESSAGE); outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE); for (int depth = 1; depth <= m_crawlDepth; depth++) { PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d"); Integer[] args = new Integer[] { new Integer(depth), new Integer(depth_fetched[depth - 1]), new Integer(depth_parsed[depth - 1]), new Integer(depth_incl[depth - 1]), }; String s = pf.sprintf(args); outputMessage(s, PLAIN_MESSAGE); } outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE); if (false) { for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) { String url = (String) iter.next(); outputMessage(url, PLAIN_MESSAGE); } } long secs = elapsedTime / Constants.SECOND; long fetchRate = 0; if (secs > 0) { fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime; } outputMessage( "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m", PLAIN_MESSAGE); } public interface MessageHandler { void outputMessage(String message, int messageType); void close(); } private class MyLinkExtractorCallback implements LinkExtractor.Callback { MyLinkExtractorCallback() {} public void foundLink(String url) { m_extracted.add(url); try { String normUrl = UrlUtil.normalizeUrl(url); if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) { m_incls.add(normUrl); } else { m_excls.add(normUrl); } } catch (MalformedURLException e) { m_excls.add(url); } } } class MyMockCachedUrl implements CachedUrl { private String url; private boolean doesExist = false; private Reader reader = null; public MyMockCachedUrl(String url, Reader reader) { this.url = url; this.reader = reader; } public ArchivalUnit getArchivalUnit() { throw new UnsupportedOperationException("Not implemented"); } public String getUrl() { return url; } public CachedUrl getCuVersion(int version) { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions() { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions(int maxVersions) { throw new UnsupportedOperationException("Not implemented"); } public int getVersion() { return 1; } public Reader openForReading() { return reader; } public LinkRewriterFactory getLinkRewriterFactory() { throw new UnsupportedOperationException("Not implemented"); } public String getEncoding() { return Constants.DEFAULT_ENCODING; } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream() { throw new UnsupportedOperationException("Not implemented"); } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream() { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @return InputStream */ public InputStream openForHashing() { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @param hasher HashedInputStream.Hasher for unfiltered content * @return InputStream */ public InputStream openForHashing(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * getContentSize * * @return long */ public long getContentSize() { throw new UnsupportedOperationException("Not implemented"); } public String getContentType() { throw new 
UnsupportedOperationException("Not implemented"); } public void setOption(String option, String val) {} public boolean hasContent() { return doesExist; } public boolean isLeaf() { return true; } public int getType() { return CachedUrlSetNode.TYPE_CACHED_URL; } public CIProperties getProperties() { return null; } public void addProperty(String key, String value) {} public void release() {} public String toString() { StringBuffer sb = new StringBuffer(url.length() + 17); sb.append("[MyMockCachedUrl: "); sb.append(url); sb.append("]"); return sb.toString(); } @Override public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) { return null; } public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) { throw new UnsupportedOperationException("Not implemented"); } @Override public boolean isArchiveMember() { return false; } } }
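// Minimal usage sketch for CrawlRuleTester (not part of the original source;
// the ArchivalUnit and start URL are assumed to come from the caller). Output
// is routed through a MessageHandler that echoes everything to stdout.
class CrawlRuleTesterExample {
  static void checkCrawlRules(ArchivalUnit au, String startUrl) throws InterruptedException {
    CrawlRuleTester tester = new CrawlRuleTester(
        new CrawlRuleTester.MessageHandler() {
          public void outputMessage(String message, int messageType) {
            System.out.print(message);
          }
          public void close() {}
        },
        2,     // crawl depth
        6000,  // requested crawl delay in ms; raised to the configured minimum if lower
        startUrl,
        au);
    tester.start();  // CrawlRuleTester extends Thread; run() performs the test crawl
    tester.join();   // wait for the crawl to finish
  }
}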
// SingleThreadModel causes servlet instances to be assigned to only a // single thread (request) at a time. public abstract class LockssServlet extends HttpServlet implements SingleThreadModel { protected static Logger log = Logger.getLogger("LockssServlet"); // Constants static final String PARAM_LOCAL_IP = Configuration.PREFIX + "localIPAddress"; static final String PARAM_PLATFORM_VERSION = Configuration.PREFIX + "platform.version"; /** Inactive HTTP session (cookie) timeout */ static final String PARAM_UI_SESSION_TIMEOUT = Configuration.PREFIX + "ui.sessionTimeout"; static final long DEFAULT_UI_SESSION_TIMEOUT = 2 * Constants.DAY; /** Maximum size of uploaded file accepted */ static final String PARAM_MAX_UPLOAD_FILE_SIZE = Configuration.PREFIX + "ui.maxUploadFileSize"; static final int DEFAULT_MAX_UPLOAD_FILE_SIZE = 500000; /** The warning string to display when the UI is disabled. */ static final String PARAM_UI_WARNING = Configuration.PREFIX + "ui.warning"; // session keys static final String SESSION_KEY_OBJECT_ID = "obj_id"; static final String SESSION_KEY_OBJ_MAP = "obj_map"; public static final String SESSION_KEY_RUNNING_SERVLET = "running_servlet"; public static final String SESSION_KEY_REQUEST_HOST = "request_host"; // Name given to form element whose value is the action that should be // performed when the form is submitted. (Not always the submit button.) public static final String ACTION_TAG = "lockssAction"; public static final String JAVASCRIPT_RESOURCE = "org/lockss/htdocs/admin.js"; public static final String ATTR_INCLUDE_SCRIPT = "IncludeScript"; public static final String ATTR_ALLOW_ROLES = "AllowRoles"; /** User may configure admin access (add/delete/modify users, set admin access list) */ public static final String ROLE_USER_ADMIN = "userAdminRole"; /** User may configure content access (set content access list) */ public static final String ROLE_CONTENT_ADMIN = "contentAdminRole"; /** User may change AU configuration (add/delete content) */ public static final String ROLE_AU_ADMIN = "auAdminRole"; public static final String ROLE_DEBUG = "debugRole"; protected ServletContext context; private LockssApp theApp = null; private ServletManager servletMgr; private AccountManager acctMgr; // Request-local storage. Convenient, but requires servlet instances // to be single-threaded; these fields must be reset to avoid carrying // state over between requests. protected HttpServletRequest req; protected HttpServletResponse resp; protected URL reqURL; protected HttpSession session; private String adminDir = null; protected String clientAddr; // client addr, even if no param protected String localAddr; protected MultiPartRequest multiReq; private Vector footnotes; private int footNumber; private int tabindex; ServletDescr _myServletDescr = null; private String myName = null; // number submit buttons sequentially so unit tests can find them protected int submitButtonNumber = 0; /** Run once when the servlet is loaded.
*/ public void init(ServletConfig config) throws ServletException { super.init(config); context = config.getServletContext(); theApp = (LockssApp) context.getAttribute(ServletManager.CONTEXT_ATTR_LOCKSS_APP); servletMgr = (ServletManager) context.getAttribute(ServletManager.CONTEXT_ATTR_SERVLET_MGR); if (theApp instanceof LockssDaemon) { acctMgr = getLockssDaemon().getAccountManager(); } } public ServletManager getServletManager() { return servletMgr; } protected ServletDescr[] getServletDescrs() { return servletMgr.getServletDescrs(); } /** Servlets must implement this method. */ protected abstract void lockssHandleRequest() throws ServletException, IOException; /** Common request handling. */ public void service(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { resetState(); boolean success = false; HttpSession session = req.getSession(false); try { this.req = req; this.resp = resp; if (log.isDebug()) { logParams(); } resp.setContentType("text/html"); if (!mayPageBeCached()) { resp.setHeader("pragma", "no-cache"); resp.setHeader("Cache-control", "no-cache"); } reqURL = new URL(UrlUtil.getRequestURL(req)); clientAddr = getLocalIPAddr(); // check that current user has permission to run this servlet if (!isServletAllowed(myServletDescr())) { displayWarningInLieuOfPage("You are not authorized to use " + myServletDescr().heading); return; } // check whether servlet is disabled String reason = ServletUtil.servletDisabledReason(myServletDescr().getServletName()); if (reason != null) { displayWarningInLieuOfPage("This function is disabled. " + reason); return; } if (session != null) { session.setAttribute(SESSION_KEY_RUNNING_SERVLET, getHeading()); String reqHost = req.getRemoteHost(); String forw = req.getHeader(HttpFields.__XForwardedFor); if (!StringUtil.isNullString(forw)) { reqHost += " (proxies for " + forw + ")"; } session.setAttribute(SESSION_KEY_REQUEST_HOST, reqHost); } lockssHandleRequest(); success = (errMsg == null); } catch (ServletException e) { log.error("Servlet threw", e); throw e; } catch (IOException e) { log.error("Servlet threw", e); throw e; } catch (RuntimeException e) { log.error("Servlet threw", e); throw e; } finally { if (session != null) { session.setAttribute(SESSION_KEY_RUNNING_SERVLET, null); session.setAttribute(LockssFormAuthenticator.__J_AUTH_ACTIVITY, TimeBase.nowMs()); } if ("please".equalsIgnoreCase(req.getHeader("X-Lockss-Result"))) { log.debug3("X-Lockss-Result: " + (success ? "Ok" : "Fail")); resp.setHeader("X-Lockss-Result", success ? "Ok" : "Fail"); } resetMyLocals(); resetLocals(); } } protected void resetState() { multiReq = null; footNumber = 0; submitButtonNumber = 0; tabindex = 1; statusMsg = null; errMsg = null; isFramed = false; } protected void resetLocals() {} protected void resetMyLocals() { // Don't hold on to stuff forever req = null; resp = null; session = null; reqURL = null; adminDir = null; localAddr = null; footnotes = null; _myServletDescr = null; myName = null; multiReq = null; } /** * Return true if generated page may be cached (e.g., by browser). 
Default is false as most * servlets generate dynamic results */ protected boolean mayPageBeCached() { return false; } /** Set the session timeout to the configured value */ protected void setSessionTimeout(HttpSession session) { Configuration config = CurrentConfig.getCurrentConfig(); setSessionTimeout( session, config.getTimeInterval(PARAM_UI_SESSION_TIMEOUT, DEFAULT_UI_SESSION_TIMEOUT)); } /** Set the session timeout */ protected void setSessionTimeout(HttpSession session, long time) { session.setMaxInactiveInterval((int) (time / Constants.SECOND)); } /** Get the current session, creating it if necessary (and set the timeout if so) */ protected HttpSession getSession() { if (session == null) { session = req.getSession(true); if (session.isNew()) { setSessionTimeout(session); } } return session; } /** Return true iff a session has already been established */ protected boolean hasSession() { return req.getSession(false) != null; } /** Get an unused ID string for storing an object in the session */ protected String getNewSessionObjectId() { HttpSession session = getSession(); synchronized (session) { Integer id = (Integer) getSession().getAttribute(SESSION_KEY_OBJECT_ID); if (id == null) { id = new Integer(1); } session.setAttribute(SESSION_KEY_OBJECT_ID, new Integer(id.intValue() + 1)); return id.toString(); } } /** Get the object associated with the ID in the session */ protected Object getSessionIdObject(String id) { HttpSession session = getSession(); synchronized (session) { BidiMap map = (BidiMap) session.getAttribute(SESSION_KEY_OBJ_MAP); if (map == null) { return null; } return map.getKey(id); } } /** Get the String associated with the ID in the session */ protected String getSessionIdString(String id) { return (String) getSessionIdObject(id); } /** Get the ID with which the object is associated in the session, if any */ protected String getSessionObjectId(Object obj) { HttpSession session = getSession(); BidiMap map; synchronized (session) { map = (BidiMap) session.getAttribute(SESSION_KEY_OBJ_MAP); if (map == null) { map = new DualHashBidiMap(); session.setAttribute(SESSION_KEY_OBJ_MAP, map); } } synchronized (map) { String id = (String) map.get(obj); if (id == null) { id = getNewSessionObjectId(); map.put(obj, id); } return id; } } // Return descriptor of running servlet protected ServletDescr myServletDescr() { if (_myServletDescr == null) { _myServletDescr = servletMgr.findServletDescr(this); } return _myServletDescr; } // By default, servlet heading is in descr. Override method to // compute other heading protected String getHeading(ServletDescr d) { if (d == null) return "Unknown Servlet"; return d.heading; } protected String getHeading() { return getHeading(myServletDescr()); } String getLocalIPAddr() { if (localAddr == null) { try { IPAddr localHost = IPAddr.getLocalHost(); localAddr = localHost.getHostAddress(); } catch (UnknownHostException e) { // shouldn't happen log.error("LockssServlet: getLocalHost: " + e.toString()); return "???"; } } return localAddr; } // Return IP addr used by LCAP. If specified by (misleadingly named) // localIPAddress prop, might not really be our address (if we are // behind NAT).
String getLcapIPAddr() { String ip = CurrentConfig.getParam(PARAM_LOCAL_IP); if (ip == null || ip.length() <= 0) { return getLocalIPAddr(); } return ip; } String getRequestHost() { return reqURL.getHost(); } String getMachineName() { return PlatformUtil.getLocalHostname(); } // String getMachineName0() { // if (myName == null) { // // Return the canonical name of the interface the request was aimed // // at. (localIPAddress prop isn't necessarily right here, as it // // might be the address of a NAT that we're behind.) // String host = reqURL.getHost(); // try { // IPAddr localHost = IPAddr.getByName(host); // String ip = localHost.getHostAddress(); // myName = getMachineName(ip); // } catch (UnknownHostException e) { // // shouldn't happen // log.error("getMachineName", e); // return host; // } // } // return myName; // } // String getMachineName(String ip) { // try { // IPAddr inet = IPAddr.getByName(ip); // return inet.getHostName(); // } catch (UnknownHostException e) { // log.warning("getMachineName", e); // } // return ip; // } // return IP given name or IP String getMachineIP(String name) { try { IPAddr inet = IPAddr.getByName(name); return inet.getHostAddress(); } catch (UnknownHostException e) { return null; } } boolean isServletLinkInNav(ServletDescr d) { return !isThisServlet(d) || linkMeInNav(); } boolean isThisServlet(ServletDescr d) { return d == myServletDescr(); } /** servlets may override this to determine whether they should be a link in nav table */ protected boolean linkMeInNav() { return false; } boolean isLargeLogo() { return myServletDescr().isLargeLogo(); } // user predicates String getUsername() { Principal user = req.getUserPrincipal(); return user != null ? user.toString() : null; } protected UserAccount getUserAccount() { if (acctMgr != null) { return acctMgr.getUser(getUsername()); } return AccountManager.NOBODY_ACCOUNT; } protected boolean isDebugUser() { return doesUserHaveRole(ROLE_DEBUG); } protected boolean doesUserHaveRole(String role) { if ((req.isUserInRole(role) || req.isUserInRole(ROLE_USER_ADMIN)) && !hasNoRoleParsm(role)) { return true; } return hasTestRole(role); } static Map<String, String> noRoleParams = new HashMap<String, String>(); static { noRoleParams.put(ROLE_USER_ADMIN, "noadmin"); noRoleParams.put(ROLE_CONTENT_ADMIN, "nocontent"); noRoleParams.put(ROLE_AU_ADMIN, "noau"); noRoleParams.put(ROLE_DEBUG, "nodebug"); } protected boolean hasNoRoleParsm(String roleName) { String noRoleParam = noRoleParams.get(roleName); return (noRoleParam != null && !StringUtil.isNullString(req.getParameter(noRoleParam))); } protected boolean hasTestRole(String role) { // Servlet test harness puts roles in context List roles = (List) context.getAttribute(ATTR_ALLOW_ROLES); return roles != null && (roles.contains(role) || roles.contains(ROLE_USER_ADMIN)); } protected boolean isServletAllowed(ServletDescr d) { if (d.needsUserAdminRole() && !doesUserHaveRole(ROLE_USER_ADMIN)) return false; if (d.needsContentAdminRole() && !doesUserHaveRole(ROLE_CONTENT_ADMIN)) return false; if (d.needsAuAdminRole() && !doesUserHaveRole(ROLE_AU_ADMIN)) return false; return d.isEnabled(getLockssDaemon()); } protected boolean isServletDisplayed(ServletDescr d) { if (!isServletAllowed(d)) return false; if (d.needsDebugRole() && !doesUserHaveRole(ROLE_DEBUG)) return false; return true; } protected boolean isServletInNav(ServletDescr d) { if (d.cls == ServletDescr.UNAVAILABLE_SERVLET_MARKER) return false; return d.isInNav(this) && isServletDisplayed(d); } // Called when a servlet doesn't 
get the parameters it expects/needs protected void paramError() throws IOException { // FIXME: As of 2006-03-15 this method and its only caller checkParam() are not called from // anywhere PrintWriter wrtr = resp.getWriter(); Page page = new Page(); // add referer, params, msg to contact lockss unless from old bookmark // or manually entered url page.add("Parameter error"); page.write(wrtr); } // return true iff error protected boolean checkParam(boolean ok, String msg) throws IOException { if (ok) return false; log.error(myServletDescr().getPath() + ": " + msg); paramError(); return true; } /** Construct servlet URL */ String srvURL(ServletDescr d) { return srvURL((String) null, d, null); } /** Construct servlet URL with params */ String srvURL(ServletDescr d, String params) { return srvURL((String) null, d, params); } /** Construct servlet URL with params */ String srvURL(ServletDescr d, Properties params) { return srvURL(d, concatParams(params)); } /** Construct servlet absolute URL, with params as necessary. */ String srvAbsURL(ServletDescr d, String params) { return srvURL(getRequestHost(), d, params); } /** * Construct servlet URL, with params as necessary. Avoid generating a hostname different from * that used in the original request, or browsers will prompt again for login */ String srvURL(String host, ServletDescr d, String params) { return srvURLFromStem(srvUrlStem(host), d, params); } String srvURL(PeerIdentity peer, ServletDescr d, String params) { return srvURLFromStem(peer.getUiUrlStem(reqURL.getPort()), d, params); } /** * Construct servlet URL, with params as necessary. Avoid generating a hostname different from * that used in the original request, or browsers will prompt again for login */ String srvURLFromStem(String stem, ServletDescr d, String params) { if (d.isPathIsUrl()) { return d.getPath(); } StringBuilder sb = new StringBuilder(80); if (stem != null) { sb.append(stem); if (stem.charAt(stem.length() - 1) != '/') { sb.append('/'); } } else { // ensure absolute path even if no scheme/host/port sb.append('/'); } sb.append(d.getPath()); if (params != null) { sb.append('?'); sb.append(params); } return sb.toString(); } String srvUrlStem(String host) { if (host == null) { return null; } StringBuilder sb = new StringBuilder(); sb.append(reqURL.getProtocol()); sb.append("://"); sb.append(host); sb.append(':'); sb.append(reqURL.getPort()); return sb.toString(); } /** Return a link to a servlet */ String srvLink(ServletDescr d, String text) { return srvLink(d, text, (String) null); } /** Return a link to a servlet with params */ String srvLink(ServletDescr d, String text, String params) { return new Link(srvURL(d, params), (text != null ? text : d.heading)).toString(); } /** Return a link to a servlet with params */ String srvLink(ServletDescr d, String text, Properties params) { return new Link(srvURL(d, params), text).toString(); } /** Return an absolute link to a servlet with params */ String srvAbsLink(ServletDescr d, String text, Properties params) { return srvAbsLink(d, text, concatParams(params)); } /** Return an absolute link to a servlet with params */ String srvAbsLink(ServletDescr d, String text, String params) { return new Link(srvAbsURL(d, params), (text != null ? text : d.heading)).toString(); } /** Return an absolute link to a servlet with params */ String srvAbsLink(String host, ServletDescr d, String text, String params) { return new Link(srvURL(host, d, params), (text != null ? 
text : d.heading)).toString(); } /** Return an absolute link to a servlet with params */ String srvAbsLink(PeerIdentity peer, ServletDescr d, String text, String params) { return new Link(srvURL(peer, d, params), (text != null ? text : d.heading)).toString(); } /** Return text as a link iff isLink */ String conditionalSrvLink(ServletDescr d, String text, String params, boolean isLink) { if (isLink) { return srvLink(d, text, params); } else { return text; } } /** Return text as a link iff isLink */ String conditionalSrvLink(ServletDescr d, String text, boolean isLink) { return conditionalSrvLink(d, text, null, isLink); } /** Concatenate params for URL string */ static String concatParams(String p1, String p2) { if (StringUtil.isNullString(p1)) { return p2; } if (StringUtil.isNullString(p2)) { return p1; } return p1 + "&" + p2; } /** Concatenate params for URL string */ String concatParams(Properties props) { if (props == null) { return null; } java.util.List list = new ArrayList(); for (Iterator iter = props.keySet().iterator(); iter.hasNext(); ) { String key = (String) iter.next(); String val = props.getProperty(key); if (!StringUtil.isNullString(val)) { list.add(key + "=" + urlEncode(val)); } } return StringUtil.separatedString(list, "&"); } String modifyParams(String key, String val) { Properties props = getParamsAsProps(); props.setProperty(key, val); return concatParams(props); } /** * Return the request parameters as a Properties. Only the first value of multivalued parameters * is included. */ Properties getParamsAsProps() { Properties props = new Properties(); for (Enumeration en = req.getParameterNames(); en.hasMoreElements(); ) { String name = (String) en.nextElement(); props.setProperty(name, req.getParameter(name)); } return props; } /** * Return the request parameters as a Map<String,String>. Only the first value of multivalued * parameters is included. */ Map<String, String> getParamsAsMap() { Map<String, String> map = new HashMap<String, String>(); for (Enumeration en = req.getParameterNames(); en.hasMoreElements(); ) { String name = (String) en.nextElement(); map.put(name, req.getParameter(name)); } return map; } protected String urlEncode(String param) { return UrlUtil.encodeUrl(param); } protected String getRequestKey() { String key = req.getPathInfo(); if (key != null && key.startsWith("/")) { return key.substring(1); } return key; } /** Common page setup. */ protected Page newPage() { // Compute heading String heading = getHeading(); if (heading == null) { heading = "Box Administration"; } // Create page and layout header Page page = ServletUtil.doNewPage(getPageTitle(), isFramed()); Iterator inNavIterator; if (myServletDescr().hasNoNavTable()) { inNavIterator = CollectionUtil.EMPTY_ITERATOR; } else { inNavIterator = new FilterIterator( new ObjectArrayIterator(getServletDescrs()), new Predicate() { public boolean evaluate(Object obj) { return isServletInNav((ServletDescr) obj); } }); } ServletUtil.layoutHeader( this, page, heading, isLargeLogo(), getMachineName(), getLockssApp().getStartDate(), inNavIterator); String warnMsg = CurrentConfig.getParam(PARAM_UI_WARNING); if (warnMsg != null) { Composite warning = new Composite(); warning.add("<center><font color=red size=+1>"); warning.add(warnMsg); warning.add("</font></center><br>"); page.add(warning); } return page; } protected Page addBarePageHeading(Page page) { // FIXME: Move the following fragment elsewhere // It causes the doctype statement to appear in the middle, // after the <body> tag. 
page.add("<!doctype html public \"-//w3c//dtd html 4.0 transitional//en\">"); page.addHeader("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">"); page.addHeader("<meta http-equiv=\"content-type\" content=\"text/html;charset=ISO-8859-1\">"); page.addHeader("<link rel=\"shortcut icon\" href=\"/favicon.ico\" type=\"image/x-icon\" />"); return page; } private boolean isFramed = false; protected String errMsg; protected String statusMsg; protected boolean isFramed() { return isFramed; } protected void setFramed(boolean v) { isFramed = v; } protected String getPageTitle() { String heading = getHeading(); if (heading != null) { return "LOCKSS: " + heading; } else { return "LOCKSS"; } } /** Return a button that invokes the javascript submit routine with the specified action */ protected Element submitButton(String label, String action) { return submitButton(label, action, null, null); } /** Return a button that invokes javascript when clicked. */ Input jsButton(String label, String js) { Input btn = new Input("button", null); btn.attribute("value", label); setTabOrder(btn); btn.attribute("onClick", js); return btn; } /** * Return a button that invokes the javascript submit routine with the specified action, first * storing the value in the specified form prop. */ protected Element submitButton(String label, String action, String prop, String value) { StringBuilder sb = new StringBuilder(40); sb.append("lockssButton(this, '"); sb.append(action); sb.append("'"); if (prop != null && value != null) { sb.append(", '"); sb.append(prop); sb.append("', '"); sb.append(value); sb.append("'"); } sb.append(")"); Input btn = jsButton(label, sb.toString()); btn.attribute("id", "lsb." + (++submitButtonNumber)); return btn; } /** * Return a (possibly labelled) checkbox. * * @param label appears to right of checkbox if non null * @param value value included in result set if box checked * @param key form key to which result set is assigned * @param checked if true, box is initially checked * @return a checkbox Element */ Element checkBox(String label, String value, String key, boolean checked) { Input in = new Input(Input.Checkbox, key, value); if (checked) { in.check(); } setTabOrder(in); if (StringUtil.isNullString(label)) { return in; } else { Composite c = new Composite(); c.add(in); c.add(" "); c.add(label); return c; } } /** * Return a labelled rasio button * * @param label label to right of circle, and form value if checked * @param key form key to which value is assigned * @param checked if true, is initially checked * @return a readio button Element */ protected Element radioButton(String label, String key, boolean checked) { return radioButton(label, label, key, checked); } /** * Return a labelled rasio button * * @param label appears to right of circle if non null * @param value value assigned to key if box checked * @param key form key to which value is assigned * @param checked if true, is initially checked * @return a readio button Element */ protected Element radioButton(String label, String value, String key, boolean checked) { Composite c = new Composite(); Input in = new Input(Input.Radio, key, value); if (checked) { in.check(); } setTabOrder(in); c.add(in); c.add(label); return c; } /** Add html tags to grey the text if isGrey is true */ protected String greyText(String txt, boolean isGrey) { if (!isGrey) { return txt; } return "<font color=gray>" + txt + "</font>"; } /** * Set this element next in the tab order. Returns the element for easier nesting in expressions. 
*/ protected Element setTabOrder(Element ele) { ele.attribute("tabindex", tabindex++); return ele; } /** * Store a footnote, assign it a number, return html for footnote reference. If footnote in null * or empty, no footnote is added and an empty string is returned. Footnote numbers get turned * into links; <b>Do not put the result of addFootnote inside a link!</b>. */ protected String addFootnote(String s) { if (s == null || s.length() == 0) { return ""; } if (footNumber == 0) { if (footnotes == null) { footnotes = new Vector(10, 10); } else { footnotes.removeAllElements(); } } int n = footnotes.indexOf(s); if (n < 0) { n = footNumber++; footnotes.addElement(s); } return "<sup><font size=-1><a href=#foottag" + (n + 1) + ">" + (n + 1) + "</a></font></sup>"; } /** * Add javascript to page. Normally adds a link to the script file, but can be told to include the * script directly in the page, to accomodate unit testing of individual servlets, when other * fetches won't work. */ protected void addJavaScript(Composite comp) { String include = (String) context.getAttribute(ATTR_INCLUDE_SCRIPT); if (StringUtil.isNullString(include)) { linkToJavaScript(comp); } else { includeJavaScript0(comp); } } private void includeJavaScript0(Composite comp) { Script script = new Script(getJavascript()); comp.add(script); } private void linkToJavaScript(Composite comp) { Script script = new Script(""); script.attribute("src", "admin.js"); comp.add(script); } private static String jstext = null; private static synchronized String getJavascript() { if (jstext == null) { InputStream istr = null; try { ClassLoader loader = Thread.currentThread().getContextClassLoader(); istr = loader.getResourceAsStream(JAVASCRIPT_RESOURCE); jstext = StringUtil.fromInputStream(istr); istr.close(); } catch (Exception e) { log.error("Can't load javascript", e); } finally { IOUtil.safeClose(istr); } } return jstext; } /** Display a message in lieu of the normal page */ protected void displayMsgInLieuOfPage(String msg) throws IOException { // TODO: Look at HTML Page page = newPage(); Composite warning = new Composite(); warning.add(msg); warning.add("<br>"); page.add(warning); layoutFooter(page); page.write(resp.getWriter()); } /** Display a warning in red, in lieu of the normal page */ protected void displayWarningInLieuOfPage(String msg) throws IOException { displayMsgInLieuOfPage("<center><font color=red size=+1>" + msg + "</font></center>"); } /** Display "The cache isn't ready yet, come back later" */ protected void displayNotStarted() throws IOException { displayWarningInLieuOfPage( "This LOCKSS box is still starting. 
Please " + srvLink(myServletDescr(), "try again", getParamsAsProps()) + " in a moment."); } public MultiPartRequest getMultiPartRequest() throws FormDataTooLongException, IOException { int maxUpload = CurrentConfig.getIntParam(PARAM_MAX_UPLOAD_FILE_SIZE, DEFAULT_MAX_UPLOAD_FILE_SIZE); return getMultiPartRequest(maxUpload); } public MultiPartRequest getMultiPartRequest(int maxLen) throws FormDataTooLongException, IOException { if (req.getContentType() == null || !req.getContentType().startsWith("multipart/form-data")) { return null; } if (req.getContentLength() > maxLen) { throw new FormDataTooLongException(req.getContentLength() + " bytes, " + maxLen + " allowed"); } MultiPartRequest multi = new MultiPartRequest(req); if (log.isDebug2()) { String[] parts = multi.getPartNames(); log.debug3("Multipart request, " + parts.length + " parts"); if (log.isDebug3()) { for (int p = 0; p < parts.length; p++) { String name = parts[p]; String cont = multi.getString(parts[p]); log.debug3(name + ": " + cont); } } } multiReq = multi; return multi; } public String getParameter(String name) { String val = req.getParameter(name); if (val == null && multiReq != null) { val = multiReq.getString(name); } if (val == null) { return null; } val = StringUtils.strip(val, " \t"); // if (StringUtil.isNullString(val)) { if ("".equals(val)) { return null; } return val; } protected void layoutFooter(Page page) { ServletUtil.doLayoutFooter( page, (footnotes == null ? null : footnotes.iterator()), getLockssApp().getVersionInfo()); if (footnotes != null) { footnotes.removeAllElements(); } } /** Return the app instance. */ protected LockssApp getLockssApp() { return theApp; } /** * Return the daemon instance, assumes that the servlet is running in the daemon. * * @throws ClassCastException if the servlet is running in an app other than the daemon */ protected LockssDaemon getLockssDaemon() { return (LockssDaemon) theApp; } protected void logParams() { Enumeration en = req.getParameterNames(); while (en.hasMoreElements()) { String name = (String) en.nextElement(); String vals[]; String dispval; if (StringUtil.indexOfIgnoreCase(name, "passw") >= 0) { dispval = req.getParameter(name).length() == 0 ? "" : "********"; } else if (log.isDebug2() && (vals = req.getParameterValues(name)).length > 1) { dispval = StringUtil.separatedString(vals, ", "); } else { dispval = req.getParameter(name); } log.debug(name + " = " + dispval); } } /** Convenience method */ protected String encodeText(String s) { return HtmlUtil.encode(s, HtmlUtil.ENCODE_TEXT); } /** Convenience method */ protected String encodeTextArea(String s) { return HtmlUtil.encode(s, HtmlUtil.ENCODE_TEXTAREA); } /** Convenience method */ protected String encodeAttr(String s) { return HtmlUtil.encode(s, HtmlUtil.ENCODE_ATTR); } /** * Create message and error message block * * @param composite TODO */ protected void layoutErrorBlock(Composite composite) { if (errMsg != null || statusMsg != null) { ServletUtil.layoutErrorBlock(composite, errMsg, statusMsg); } } /** Exception thrown if multipart form data is longer than the caller-supplied max */ public static class FormDataTooLongException extends Exception { public FormDataTooLongException(String message) { super(message); } } }
/** Central repository of loaded keystores */ public class LockssKeyStoreManager extends BaseLockssDaemonManager implements ConfigurableManager { protected static Logger log = Logger.getLogger("LockssKeyStoreManager"); static final String PREFIX = Configuration.PREFIX + "keyMgr."; /** Default type for newly created keystores. */ public static final String PARAM_DEFAULT_KEYSTORE_TYPE = PREFIX + "defaultKeyStoreType"; public static final String DEFAULT_DEFAULT_KEYSTORE_TYPE = "JCEKS"; /** Default keystore provider. */ public static final String PARAM_DEFAULT_KEYSTORE_PROVIDER = PREFIX + "defaultKeyStoreProvider"; public static final String DEFAULT_DEFAULT_KEYSTORE_PROVIDER = null; /** * Root of keystore definitions. For each keystore, pick a unique identifier and use it in place * of <id> in the following */ public static final String PARAM_KEYSTORE = PREFIX + "keystore"; /** If true the daemon will exit if a critical keystore is missing. */ public static final String PARAM_EXIT_IF_MISSING_KEYSTORE = PREFIX + "exitIfMissingKeystore"; public static final boolean DEFAULT_EXIT_IF_MISSING_KEYSTORE = true; /** keystore name, used by clients to refer to it */ public static final String KEYSTORE_PARAM_NAME = "name"; /** keystore file. Only one of file, resource or url should be set */ public static final String KEYSTORE_PARAM_FILE = "file"; /** keystore resource. Only one of file, resource or url should be set */ public static final String KEYSTORE_PARAM_RESOURCE = "resource"; /** keystore url. Only one of file, resource or url should be set */ public static final String KEYSTORE_PARAM_URL = "url"; /** keystore type */ public static final String KEYSTORE_PARAM_TYPE = "type"; /** keystore provider */ public static final String KEYSTORE_PARAM_PROVIDER = "provider"; /** keystore password */ public static final String KEYSTORE_PARAM_PASSWORD = "password"; /** private key password */ public static final String KEYSTORE_PARAM_KEY_PASSWORD = "keyPassword"; /** private key password file */ public static final String KEYSTORE_PARAM_KEY_PASSWORD_FILE = "keyPasswordFile"; /** * If true, and the keystore doesn't exist, a keystore with a self-signed certificate will be * created. */ public static final String KEYSTORE_PARAM_CREATE = "create"; protected String defaultKeyStoreType = DEFAULT_DEFAULT_KEYSTORE_TYPE; protected String defaultKeyStoreProvider = DEFAULT_DEFAULT_KEYSTORE_PROVIDER; protected boolean paramExitIfMissingKeyStore = DEFAULT_EXIT_IF_MISSING_KEYSTORE; // Pseudo params for param doc public static final String DOC_PREFIX = PARAM_KEYSTORE + ".<id>."; /** Name by which daemon component(s) refer to this keystore */ public static final String PARAM_KEYSTORE_NAME = DOC_PREFIX + KEYSTORE_PARAM_NAME; /** Keystore filename. Only one of file, resource or url should be set */ public static final String PARAM_KEYSTORE_FILE = DOC_PREFIX + KEYSTORE_PARAM_FILE; /** Keystore resource. Only one of file, resource or url should be set */ public static final String PARAM_KEYSTORE_RESOURCE = DOC_PREFIX + KEYSTORE_PARAM_RESOURCE; /** Keystore url. Only one of file, resource or url should be set */ public static final String PARAM_KEYSTORE_URL = DOC_PREFIX + KEYSTORE_PARAM_URL; /** Keystore type (JKS, JCEKS, etc.) */ public static final String PARAM_KEYSTORE_TYPE = DOC_PREFIX + KEYSTORE_PARAM_TYPE; /** Keystore provider (SunJCE, etc.) */ public static final String PARAM_KEYSTORE_PROVIDER = DOC_PREFIX + KEYSTORE_PARAM_PROVIDER; /** Keystore password. 
Default is machine's fqdn */ public static final String PARAM_KEYSTORE_PASSWORD = DOC_PREFIX + KEYSTORE_PARAM_PASSWORD; /** Private key password */ public static final String PARAM_KEYSTORE_KEY_PASSWORD = DOC_PREFIX + KEYSTORE_PARAM_KEY_PASSWORD; /** private key password file */ public static final String PARAM_KEYSTORE_KEY_PASSWORD_FILE = DOC_PREFIX + KEYSTORE_PARAM_KEY_PASSWORD_FILE; /** * If true, and the keystore doesn't exist, a keystore with a self-signed certificate will be * created. */ public static final String PARAM_KEYSTORE_CREATE = DOC_PREFIX + KEYSTORE_PARAM_CREATE; public static boolean DEFAULT_CREATE = false; protected Map<String, LockssKeyStore> keystoreMap = new HashMap<String, LockssKeyStore>(); public void startService() { super.startService(); loadKeyStores(); } public synchronized void setConfig( Configuration config, Configuration prevConfig, Configuration.Differences changedKeys) { if (changedKeys.contains(PREFIX)) { defaultKeyStoreType = config.get(PARAM_DEFAULT_KEYSTORE_TYPE, DEFAULT_DEFAULT_KEYSTORE_TYPE); defaultKeyStoreProvider = config.get(PARAM_DEFAULT_KEYSTORE_PROVIDER, DEFAULT_DEFAULT_KEYSTORE_PROVIDER); paramExitIfMissingKeyStore = config.getBoolean(PARAM_EXIT_IF_MISSING_KEYSTORE, DEFAULT_EXIT_IF_MISSING_KEYSTORE); if (changedKeys.contains(PARAM_KEYSTORE)) { configureKeyStores(config); // defer initial set of keystore loading until startService if (isInited()) { // load any newly added keystores loadKeyStores(); } } } } /** * Return the named LockssKeyStore or null * * @param name the keystore name */ public LockssKeyStore getLockssKeyStore(String name) { return getLockssKeyStore(name, null); } /** * Return the named LockssKeyStore * * @param name the keystore name * @param criticalServiceName if non-null, this is a critical keystore whose unavailability * should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true) */ public LockssKeyStore getLockssKeyStore(String name, String criticalServiceName) { LockssKeyStore res = keystoreMap.get(name); checkFact(res, name, criticalServiceName, null); return res; } /** * Convenience method to return the KeyManagerFactory from the named LockssKeyStore, or null * * @param name the keystore name */ public KeyManagerFactory getKeyManagerFactory(String name) { return getKeyManagerFactory(name, null); } /** * Convenience method to return the KeyManagerFactory from the named LockssKeyStore. * * @param name the keystore name * @param criticalServiceName if non-null, this is a critical keystore whose unavailability * should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true) */ public KeyManagerFactory getKeyManagerFactory(String name, String criticalServiceName) { LockssKeyStore lk = getLockssKeyStore(name, criticalServiceName); if (lk != null) { KeyManagerFactory fact = lk.getKeyManagerFactory(); checkFact(fact, name, criticalServiceName, "found but contains no private keys"); return fact; } return null; } /** Convenience method to return the TrustManagerFactory from the named LockssKeyStore, or null */ public TrustManagerFactory getTrustManagerFactory(String name) { return getTrustManagerFactory(name, null); } /** * Convenience method to return the TrustManagerFactory from the named LockssKeyStore. 
* * @param name the keystore name * @param criticalServiceName if non-null, this is a critical keystore whose unavailability * should cause the daemon to exit (if org.lockss.keyMgr.exitIfMissingKeystore is true) */ public TrustManagerFactory getTrustManagerFactory(String name, String criticalServiceName) { LockssKeyStore lk = getLockssKeyStore(name, criticalServiceName); if (lk != null) { TrustManagerFactory fact = lk.getTrustManagerFactory(); checkFact(fact, name, criticalServiceName, "found but contains no trusted certificates"); return fact; } return null; } private void checkFact(Object fact, String name, String criticalServiceName, String message) { if (fact == null && criticalServiceName != null) { String msg = StringUtil.isNullString(name) ? ("No keystore name given for critical keystore" + " needed for service " + criticalServiceName + ", daemon exiting") : ("Critical keystore " + name + " " + ((message != null) ? message : "not found or not loadable") + " for service " + criticalServiceName + ", daemon exiting"); log.critical(msg); if (paramExitIfMissingKeyStore) { System.exit(Constants.EXIT_CODE_KEYSTORE_MISSING); } else { throw new IllegalArgumentException(msg); } } } /** Create LockssKeyStores from config subtree below {@link #PARAM_KEYSTORE} */ void configureKeyStores(Configuration config) { Configuration allKs = config.getConfigTree(PARAM_KEYSTORE); for (Iterator iter = allKs.nodeIterator(); iter.hasNext(); ) { String id = (String) iter.next(); Configuration oneKs = allKs.getConfigTree(id); try { LockssKeyStore lk = createLockssKeyStore(oneKs); String name = lk.getName(); if (name == null) { log.error("KeyStore definition missing name: " + oneKs); continue; } LockssKeyStore old = keystoreMap.get(name); if (old != null && !lk.equals(old)) { log.warning( "Keystore " + name + " redefined. " + "New definition may not take effect until daemon restart"); } log.debug("Adding keystore " + name); keystoreMap.put(name, lk); } catch (Exception e) { log.error("Couldn't create keystore: " + oneKs, e); } } } /** Create a LockssKeyStore from a config subtree */ LockssKeyStore createLockssKeyStore(Configuration config) { log.debug2("Creating LockssKeyStore from config: " + config); String name = config.get(KEYSTORE_PARAM_NAME); LockssKeyStore lk = new LockssKeyStore(name); String file = config.get(KEYSTORE_PARAM_FILE); String resource = config.get(KEYSTORE_PARAM_RESOURCE); String url = config.get(KEYSTORE_PARAM_URL); if (!StringUtil.isNullString(file)) { lk.setLocation(file, LocationType.File); } else if (!StringUtil.isNullString(resource)) { lk.setLocation(resource, LocationType.Resource); } else if (!StringUtil.isNullString(url)) { lk.setLocation(url, LocationType.Url); } lk.setType(config.get(KEYSTORE_PARAM_TYPE, defaultKeyStoreType)); lk.setProvider(config.get(KEYSTORE_PARAM_PROVIDER, defaultKeyStoreProvider)); lk.setPassword(config.get(KEYSTORE_PARAM_PASSWORD)); lk.setKeyPassword(config.get(KEYSTORE_PARAM_KEY_PASSWORD)); lk.setKeyPasswordFile(config.get(KEYSTORE_PARAM_KEY_PASSWORD_FILE)); lk.setMayCreate(config.getBoolean(KEYSTORE_PARAM_CREATE, DEFAULT_CREATE)); return lk; } void loadKeyStores() { List<LockssKeyStore> lst = new ArrayList<LockssKeyStore>(keystoreMap.values()); for (LockssKeyStore lk : lst) { try { lk.load(); } catch (Exception e) { log.error("Can't load keystore " + lk.getName(), e); keystoreMap.remove(lk.getName()); } } } }
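// A hedged sketch of how the pieces above fit together. The keystore id
// "mykeys", its file path, and the service name "contentServer" are invented
// for illustration; the param keys follow from PARAM_KEYSTORE and the
// KEYSTORE_PARAM_* values, and ConfigurationUtil.addFromProps() is the
// test-support helper used elsewhere in the test suite.
// Properties p = new Properties();
// p.setProperty("org.lockss.keyMgr.keystore.mykeys.name", "mykeys");
// p.setProperty("org.lockss.keyMgr.keystore.mykeys.file", "/var/lockss/keys.jceks");
// p.setProperty("org.lockss.keyMgr.keystore.mykeys.type", "JCEKS");
// p.setProperty("org.lockss.keyMgr.keystore.mykeys.password", "changeit");
// ConfigurationUtil.addFromProps(p);
//
// LockssKeyStoreManager keyMgr = theDaemon.getKeystoreManager(); // assumed accessor
// // Passing a critical service name makes a missing keystore fatal
// // (System.exit) when exitIfMissingKeystore is true, per checkFact():
// KeyManagerFactory kmf = keyMgr.getKeyManagerFactory("mykeys", "contentServer");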
/** * LockssRepository is used to organize the urls being cached. It keeps a memory cache of the most * recently used nodes as a least-recently-used map, and also caches weak references to the * instances as they're doled out. This ensures that two instances of the same node are never * created, as the weak references only disappear when the object is finalized (they go to null when * the last hard reference is gone, then are removed from the cache on finalize()). */ public class LockssRepositoryImpl extends BaseLockssDaemonManager implements LockssRepository { private static Logger logger = Logger.getLogger("LockssRepository"); /** Configuration parameter name for Lockss cache location. */ public static final String PARAM_CACHE_LOCATION = Configuration.PREFIX + "cache.location"; /** Restores the pre-bugfix repo rescanning behavior (see bug 4050) */ public static final String PARAM_CLEAR_DIR_MAP = RepositoryManager.PREFIX + "clearDirMapOnAuStop"; public static final boolean DEFAULT_CLEAR_DIR_MAP = false; /** Name of top directory in which the urls are cached. */ public static final String CACHE_ROOT_NAME = "cache"; // XXX This is a remnant from the single-disk days, and should go away. // It is used only by unit tests, which want it set to the (last) // individual repository dir created. private static String staticCacheLocation = null; // Maps local repository name (disk path) to LocalRepository instance static Map localRepositories = new HashMap(); // starts with a '#' so no possibility of clashing with a URL public static final String AU_ID_FILE = "#au_id_file"; static final String AU_ID_PROP = "au.id"; static final String PLUGIN_ID_PROP = "plugin.id"; static final char ESCAPE_CHAR = '#'; static final String ESCAPE_STR = "#"; static final char ENCODED_SEPARATOR_CHAR = 's'; static final String INITIAL_PLUGIN_DIR = String.valueOf((char) ('a' - 1)); static String lastPluginDir = INITIAL_PLUGIN_DIR; // PJG: Windows prohibits use of ':' in file name -- replace with '%' for development static final String PORT_SEPARATOR = SystemUtils.IS_OS_WINDOWS ? "%" : ":"; // this contains a '#' so that it's not defeatable by strings which // match the prefix in a url (like '../tmp/') private static final String TEST_PREFIX = "/#tmp"; private RepositoryManager repoMgr; private String rootLocation; UniqueRefLruCache nodeCache; private boolean isGlobalNodeCache = RepositoryManager.DEFAULT_GLOBAL_CACHE_ENABLED; LockssRepositoryImpl(String rootPath) { if (rootPath.endsWith(File.separator)) { rootLocation = rootPath; } else { // shouldn't happen StringBuilder sb = new StringBuilder(rootPath.length() + File.separator.length()); sb.append(rootPath); sb.append(File.separator); rootLocation = sb.toString(); } // Test code still needs this. 
nodeCache = new UniqueRefLruCache(RepositoryManager.DEFAULT_MAX_PER_AU_CACHE_SIZE); } public void startService() { super.startService(); repoMgr = getDaemon().getRepositoryManager(); isGlobalNodeCache = repoMgr.isGlobalNodeCache(); if (isGlobalNodeCache) { nodeCache = repoMgr.getGlobalNodeCache(); } else { // nodeCache = // new UniqueRefLruCache(repoMgr.paramNodeCacheSize); setNodeCacheSize(repoMgr.paramNodeCacheSize); } } public void stopService() { // mainly important in testing to blank this lastPluginDir = INITIAL_PLUGIN_DIR; if (CurrentConfig.getBooleanParam(PARAM_CLEAR_DIR_MAP, DEFAULT_CLEAR_DIR_MAP)) { // This should be in RepositoryManager.stopService() localRepositories = new HashMap(); } super.stopService(); } public void setNodeCacheSize(int size) { if (nodeCache != null && !isGlobalNodeCache && nodeCache.getMaxSize() != size) { nodeCache.setMaxSize(size); } } /** * Called between initService() and startService(), then whenever the AU's config changes. * * @param auConfig the new configuration */ public void setAuConfig(Configuration auConfig) {} void queueSizeCalc(RepositoryNode node) { repoMgr.queueSizeCalc(node); } public RepositoryNode getNode(String url) throws MalformedURLException { return getNode(url, false); } public RepositoryNode createNewNode(String url) throws MalformedURLException { return getNode(url, true); } public void deleteNode(String url) throws MalformedURLException { RepositoryNode node = getNode(url, false); if (node != null) { node.markAsDeleted(); } } public void deactivateNode(String url) throws MalformedURLException { RepositoryNode node = getNode(url, false); if (node != null) { node.deactivateContent(); } } /** * This function returns a RepositoryNode with a canonicalized path. 
* * @param url the url in String form * @param create true iff the node should be created if absent * @return RepositoryNode the node * @throws MalformedURLException */ private synchronized RepositoryNode getNode(String url, boolean create) throws MalformedURLException { String canonUrl; boolean isAuUrl = false; if (AuUrl.isAuUrl(url)) { // path information is lost here, but is unimportant if it's an AuUrl canonUrl = AuUrl.PROTOCOL; isAuUrl = true; } else { // create a canonical path, handling all illegal path traversal canonUrl = canonicalizePath(url); } // check LRUMap cache for node RepositoryNode node = (RepositoryNode) nodeCache.get(nodeCacheKey(canonUrl)); if (node != null) { return node; } String nodeLocation; if (isAuUrl) { // base directory of ArchivalUnit nodeLocation = rootLocation; node = new AuNodeImpl(canonUrl, nodeLocation, this); } else { // determine proper node location nodeLocation = LockssRepositoryImpl.mapUrlToFileLocation(rootLocation, canonUrl); node = new RepositoryNodeImpl(canonUrl, nodeLocation, this); } if (!create) { // if not creating, check for existence File nodeDir = new File(nodeLocation); if (!nodeDir.exists()) { // return null if the node doesn't exist and shouldn't be created return null; } if (!nodeDir.isDirectory()) { logger.error("Cache file not a directory: " + nodeLocation); throw new LockssRepository.RepositoryStateException("Invalid cache file."); } } // add to node cache nodeCache.put(nodeCacheKey(canonUrl), node); return node; } Object nodeCacheKey(String canonUrl) { if (isGlobalNodeCache) { return new KeyPair(this, canonUrl); } return canonUrl; } // functions for testing int getCacheHits() { return nodeCache.getCacheHits(); } int getCacheMisses() { return nodeCache.getCacheMisses(); } int getRefHits() { return nodeCache.getRefHits(); } int getRefMisses() { return nodeCache.getRefMisses(); } public void nodeConsistencyCheck() { // traverse the node tree from the top RepositoryNode topNode; try { topNode = getNode(AuUrl.PROTOCOL_COLON); recurseConsistencyCheck((RepositoryNodeImpl) topNode); } catch (MalformedURLException ignore) { } } /** * Checks the consistency of the node, and continues with its children if it's consistent. * * @param node RepositoryNodeImpl the node to check */ private void recurseConsistencyCheck(RepositoryNodeImpl node) { logger.debug2("Checking node '" + node.getNodeUrl() + "'..."); // check consistency at each node // correct/deactivate as necessary // 'checkNodeConsistency()' will repair if possible if (node.checkNodeConsistency()) { logger.debug3("Node consistent; recursing on children..."); List children = node.getNodeList(null, false); Iterator iter = children.iterator(); while (iter.hasNext()) { RepositoryNodeImpl child = (RepositoryNodeImpl) iter.next(); recurseConsistencyCheck(child); } } else { logger.debug3("Node inconsistent; deactivating..."); deactivateInconsistentNode(node); } } /** * This is called when a node is in an inconsistent state. It simply creates some necessary * directories and deactivates the node. Future polls should restore it properly. * * @param node the inconsistent node */ void deactivateInconsistentNode(RepositoryNodeImpl node) { logger.warning("Deactivating inconsistent node."); FileUtil.ensureDirExists(node.contentDir); // manually deactivate node.deactivateContent(); } /** * A method to remove any non-canonical '..' or '.' 
elements in the path, as well as protecting * against illegal path traversal. * * @param url the raw url * @return String the canonicalized url * @throws MalformedURLException */ public String canonicalizePath(String url) throws MalformedURLException { String canonUrl = UrlUtil.normalizeUrl(url, UrlUtil.PATH_TRAVERSAL_ACTION_THROW); // canonicalize "dir" and "dir/" // XXX if these are ever two separate nodes, this is wrong if (canonUrl.endsWith(UrlUtil.URL_PATH_SEPARATOR)) { canonUrl = canonUrl.substring(0, canonUrl.length() - 1); } return canonUrl; } // static calls /** * Factory method to create new LockssRepository instances. * * @param au the {@link ArchivalUnit} * @return the new LockssRepository instance */ public static LockssRepository createNewLockssRepository(ArchivalUnit au) { String root = getRepositoryRoot(au); if (root == null || root.equals("null")) { logger.error("No repository dir set in config"); throw new LockssRepository.RepositoryStateException("No repository dir set in config"); } String auDir = LockssRepositoryImpl.mapAuToFileLocation(root, au); if (logger.isDebug2()) { logger.debug2("repo: " + auDir + ", au: " + au.getName()); } staticCacheLocation = extendCacheLocation(root); LockssRepositoryImpl repo = new LockssRepositoryImpl(auDir); Plugin plugin = au.getPlugin(); if (plugin != null) { LockssDaemon daemon = plugin.getDaemon(); if (daemon != null) { RepositoryManager mgr = daemon.getRepositoryManager(); if (mgr != null) { mgr.setRepositoryForPath(auDir, repo); } } } return repo; } public static String getRepositorySpec(ArchivalUnit au) { Configuration auConfig = au.getConfiguration(); if (auConfig != null) { // can be null in unit tests String repoSpec = auConfig.get(PluginManager.AU_PARAM_REPOSITORY); if (repoSpec != null && repoSpec.startsWith("local:")) { return repoSpec; } } return "local:" + CurrentConfig.getParam(PARAM_CACHE_LOCATION); } public static String getRepositoryRoot(ArchivalUnit au) { return getLocalRepositoryPath(getRepositorySpec(au)); } public static String getLocalRepositoryPath(String repoSpec) { if (repoSpec != null) { if (repoSpec.startsWith("local:")) { return repoSpec.substring(6); } } return null; } // The OpenBSD platform has renamed the first disk from /cache to // /cache.wd0, leaving behind a symbolic link in /cache . This is // transparent everywhere except the repository status table, which needs // to match AU configs with AUs it finds when enumerating the repository. // Existing AU configs have repository=local:/cache, so the relative link // needs to be resolved to detect that that's the same as // local:/cache.wd0 private static Map canonicalRoots = new HashMap(); public static boolean isDirInRepository(String dir, String repoRoot) { if (dir.startsWith(repoRoot)) { return true; } return canonRoot(dir).startsWith(canonRoot(repoRoot)); } static String canonRoot(String root) { synchronized (canonicalRoots) { String canon = (String) canonicalRoots.get(root); if (canon == null) { try { canon = new File(root).getCanonicalPath(); canonicalRoots.put(root, canon); } catch (IOException e) { logger.warning("Can't canonicalize: " + root, e); return root; } } return canon; } } static String getCacheLocation() { return staticCacheLocation; } /** * Adds the 'cache' directory to the HD location. * * @param cacheDir the root location. 
* @return String the extended location */ static String extendCacheLocation(String cacheDir) { StringBuilder buffer = new StringBuilder(cacheDir); if (!cacheDir.endsWith(File.separator)) { buffer.append(File.separator); } buffer.append(CACHE_ROOT_NAME); buffer.append(File.separator); return buffer.toString(); } /** * mapAuToFileLocation() is the method used to resolve {@link ArchivalUnit}s into directory names. * This maps a given au to directories, using the cache root as the base. Given an au with * PluginId of 'plugin' and AuId of 'au', it would return the string '<rootLocation>/plugin/au/'. * * @param repoRoot the root of a LOCKSS repository * @param au the ArchivalUnit to resolve * @return the directory location */ public static String mapAuToFileLocation(String repoRoot, ArchivalUnit au) { return getAuDir(au, repoRoot, true); } /** * mapUrlToFileLocation() is the method used to resolve urls into file names. This maps a given * url to a file location, using the au top directory as the base. It creates directories which * mirror the html string, so 'http://www.journal.org/issue1/index.html' would be cached in the * file: <rootLocation>/www.journal.org/http/issue1/index.html * * @param rootLocation the top directory for ArchivalUnit this URL is in * @param urlStr the url to translate * @return the url file location * @throws java.net.MalformedURLException */ public static String mapUrlToFileLocation(String rootLocation, String urlStr) throws MalformedURLException { int totalLength = rootLocation.length() + urlStr.length(); URL url = new URL(urlStr); StringBuilder buffer = new StringBuilder(totalLength); buffer.append(rootLocation); if (!rootLocation.endsWith(File.separator)) { buffer.append(File.separator); } buffer.append(url.getHost().toLowerCase()); int port = url.getPort(); if (port != -1) { buffer.append(PORT_SEPARATOR); buffer.append(port); } buffer.append(File.separator); buffer.append(url.getProtocol()); if (RepositoryManager.isEnableLongComponents()) { String escapedPath = escapePath( StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator)); String query = url.getQuery(); if (query != null) { escapedPath = escapedPath + "?" + escapeQuery(query); } String encodedPath = RepositoryNodeImpl.encodeUrl(escapedPath); // encodeUrl strips leading / from path buffer.append(File.separator); buffer.append(encodedPath); } else { buffer.append( escapePath( StringUtil.replaceString(url.getPath(), UrlUtil.URL_PATH_SEPARATOR, File.separator))); String query = url.getQuery(); if (query != null) { buffer.append("?"); buffer.append(escapeQuery(query)); } } return buffer.toString(); } // name mapping functions /** * Return true iff a repository for the auid exists under the root * * @param auid * @param repoRoot the repository root * @return true iff a repository for the auid exists */ static boolean doesAuDirExist(String auid, String repoRoot) { return null != getAuDir(auid, repoRoot, false); } /** * Finds the directory for this AU. If none found in the map, designates a new dir for it. * * @param au the AU * @param repoRoot root of the repository * @return the dir {@link String} */ static String getAuDir(ArchivalUnit au, String repoRoot, boolean create) { return getAuDir(au.getAuId(), repoRoot, create); } /** * Finds the directory for this AU. If none found in the map, designates a new dir for it. 
* * @param auid AU id representing the au * @param repoRoot path to the root of the repository * @return the dir String */ static String getAuDir(String auid, String repoRoot, boolean create) { String repoCachePath = extendCacheLocation(repoRoot); LocalRepository localRepo = getLocalRepository(repoRoot); synchronized (localRepo) { Map aumap = localRepo.getAuMap(); String auPathSlash = (String) aumap.get(auid); if (auPathSlash != null) { return auPathSlash; } if (!create) { return null; } logger.debug3("Creating new au directory for '" + auid + "'."); String auDir = localRepo.getPrevAuDir(); for (int cnt = RepositoryManager.getMaxUnusedDirSearch(); cnt > 0; cnt--) { // loop through looking for an available dir auDir = getNextDirName(auDir); File testDir = new File(repoCachePath, auDir); if (logger.isDebug3()) logger.debug3("Probe for unused: " + testDir); if (!testDir.exists()) { if (RepositoryManager.isStatefulUnusedDirSearch()) { localRepo.setPrevAuDir(auDir); } String auPath = testDir.toString(); logger.debug3("New au directory: " + auPath); auPathSlash = auPath + File.separator; // write the new au property file to the new dir // XXX this data should be backed up elsewhere to avoid single-point // corruption Properties idProps = new Properties(); idProps.setProperty(AU_ID_PROP, auid); saveAuIdProperties(auPath, idProps); aumap.put(auid, auPathSlash); return auPathSlash; } else { if (logger.isDebug3()) { logger.debug3("Existing directory found at '" + auDir + "'. Checking next..."); } } } } throw new RuntimeException( "Can't find unused repository dir after " + RepositoryManager.getMaxUnusedDirSearch() + " tries in " + repoCachePath); } static LocalRepository getLocalRepository(ArchivalUnit au) { return getLocalRepository(getRepositoryRoot(au)); } static LocalRepository getLocalRepository(String repoRoot) { synchronized (localRepositories) { LocalRepository localRepo = (LocalRepository) localRepositories.get(repoRoot); if (localRepo == null) { logger.debug2("Creating LocalRepository(" + repoRoot + ")"); localRepo = new LocalRepository(repoRoot); localRepositories.put(repoRoot, localRepo); } return localRepo; } } /** Return next string in the sequence "a", "b", ... "z", "aa", "ab", ... 
*/ static String getNextDirName(String old) { StringBuilder sb = new StringBuilder(old); // go through and increment the first non-'z' char // counts back from the last char, so 'aa'->'ab', not 'ba' for (int ii = sb.length() - 1; ii >= 0; ii--) { char curChar = sb.charAt(ii); if (curChar < 'z') { sb.setCharAt(ii, (char) (curChar + 1)); return sb.toString(); } sb.setCharAt(ii, 'a'); } sb.insert(0, 'a'); return sb.toString(); } public static File getAuIdFile(String location) { return new File(location + File.separator + AU_ID_FILE); } static Properties getAuIdProperties(String location) { File propFile = new File(location + File.separator + AU_ID_FILE); try { InputStream is = new BufferedInputStream(new FileInputStream(propFile)); Properties idProps = new Properties(); idProps.load(is); is.close(); return idProps; } catch (Exception e) { logger.warning("Error loading au id from " + propFile.getPath() + "."); return null; } } static void saveAuIdProperties(String location, Properties props) { // XXX these AU_ID_FILE entries need to be backed up elsewhere to avoid // single-point corruption File propDir = new File(location); if (!propDir.exists()) { logger.debug("Creating directory '" + propDir.getAbsolutePath() + "'"); propDir.mkdirs(); } File propFile = new File(propDir, AU_ID_FILE); try { logger.debug3("Saving au id properties at '" + location + "'."); OutputStream os = new BufferedOutputStream(new FileOutputStream(propFile)); props.store(os, "ArchivalUnit id info"); os.close(); propFile.setReadOnly(); } catch (IOException ioe) { logger.error("Couldn't write properties for " + propFile.getPath() + ".", ioe); throw new LockssRepository.RepositoryStateException("Couldn't write au id properties file."); } } // lockss filename-specific encoding methods /** * Escapes instances of the ESCAPE_CHAR from the path. This avoids name conflicts with the * repository files, such as '#nodestate.xml'. * * @param path the path * @return the escaped path */ static String escapePath(String path) { // XXX escaping disabled because of URL encoding if (false && path.indexOf(ESCAPE_CHAR) >= 0) { return StringUtil.replaceString(path, ESCAPE_STR, ESCAPE_STR + ESCAPE_STR); } else { return path; } } /** * Escapes instances of File.separator from the query. These are safe from filename overlap, but * can't convert into extended paths and directories. * * @param query the query * @return the escaped query */ static String escapeQuery(String query) { if (query.indexOf(File.separator) >= 0) { return StringUtil.replaceString(query, File.separator, ESCAPE_STR + ENCODED_SEPARATOR_CHAR); } else { return query; } } /** * Extracts '#x' encoding and converts back to 'x'. * * @param orig the original * @return the unescaped version. */ static String unescape(String orig) { if (orig.indexOf(ESCAPE_CHAR) < 0) { // fast treatment of non-escaped strings return orig; } int index = -1; StringBuilder buffer = new StringBuilder(orig.length()); String oldStr = orig; while ((index = oldStr.indexOf(ESCAPE_CHAR)) >= 0) { buffer.append(oldStr.substring(0, index)); buffer.append(convertCode(oldStr.substring(index, index + 2))); if (oldStr.length() > 2) { oldStr = oldStr.substring(index + 2); } else { oldStr = ""; } } buffer.append(oldStr); return buffer.toString(); } /** * Returns the second char in the escaped segment, unless it is 's', which is a stand-in for the * File.separatorChar. 
* * @param code the code segment (length 2) * @return the encoded char */ static char convertCode(String code) { char encodedChar = code.charAt(1); if (encodedChar == ENCODED_SEPARATOR_CHAR) { return File.separatorChar; } else { return encodedChar; } } public static class Factory implements LockssAuManager.Factory { public LockssAuManager createAuManager(ArchivalUnit au) { return createNewLockssRepository(au); } } /** Maintains state for a local repository root dir (<i>eg</i>, auid of each au subdir). */ static class LocalRepository { String repoPath; File repoCacheFile; Map auMap; String prevAuDir; LocalRepository(String repoPath) { this.repoPath = repoPath; repoCacheFile = new File(repoPath, CACHE_ROOT_NAME); } public String getRepositoryPath() { return repoPath; } public String getPrevAuDir() { if (prevAuDir == null) { prevAuDir = lastPluginDir; } return prevAuDir; } public void setPrevAuDir(String dir) { prevAuDir = dir; } /** * Return the auid -> au-subdir-path mapping, enumerating the directories if necessary to * initialize the map. */ Map getAuMap() { if (auMap == null) { logger.debug3("Loading name map for '" + repoCacheFile + "'."); auMap = new HashMap(); if (!repoCacheFile.exists()) { logger.debug3("Creating cache dir: '" + repoCacheFile + "'."); if (!repoCacheFile.mkdirs()) { logger.critical("Couldn't create directory, check owner/permissions: " + repoCacheFile); // return empty map return auMap; } } else { // read each dir's property file and store mapping auid -> dir File[] auDirs = repoCacheFile.listFiles(); for (int ii = 0; ii < auDirs.length; ii++) { String dirName = auDirs[ii].getName(); // if (dirName.compareTo(lastPluginDir) == 1) { // // adjust the 'lastPluginDir' upwards if necessary // lastPluginDir = dirName; // } String path = auDirs[ii].getAbsolutePath(); Properties idProps = getAuIdProperties(path); if (idProps != null) { String auid = idProps.getProperty(AU_ID_PROP); StringBuilder sb = new StringBuilder(path.length() + File.separator.length()); sb.append(path); sb.append(File.separator); auMap.put(auid, sb.toString()); logger.debug3("Mapping to: " + auMap.get(auid) + ": " + auid); } else { logger.debug3("Not mapping " + path + ", no auid file."); } } } } return auMap; } public String toString() { return "[LR: " + repoPath + "]"; } } String getRootLocation() { return rootLocation; } }
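// Two small worked examples of the name-mapping machinery above. The root path
// is an arbitrary assumption; the results in the comments follow from
// mapUrlToFileLocation() and getNextDirName() as written (path encoding varies
// with RepositoryManager.isEnableLongComponents()).
// String root = "/cache/xyzplugin/a/";
// String loc = LockssRepositoryImpl.mapUrlToFileLocation(
//     root, "http://www.journal.org/issue1/index.html");
// // With long-component encoding disabled, loc is
// // /cache/xyzplugin/a/www.journal.org/http/issue1/index.html
// // (a non-default port is appended to the host with PORT_SEPARATOR,
// // e.g. "www.journal.org:8080", or "www.journal.org%8080" on Windows).
//
// // AU subdirectories are allocated in base-26 sequence:
// assert "b".equals(LockssRepositoryImpl.getNextDirName("a"));
// assert "aa".equals(LockssRepositoryImpl.getNextDirName("z"));
// assert "ab".equals(LockssRepositoryImpl.getNextDirName("aa"));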
/** Export the contents and metadata from an AU */ public abstract class Exporter { private static final Logger log = Logger.getLogger(Exporter.class); static final String PREFIX = Configuration.PREFIX + "exporter."; /** Abort export after this many errors */ public static final String PARAM_MAX_ERRORS = PREFIX + "maxErrors"; public static final int DEFAULT_MAX_ERRORS = 5; protected static int maxErrors = DEFAULT_MAX_ERRORS; protected LockssDaemon daemon; protected ArchivalUnit au; protected File dir; protected String prefix; protected long maxSize = -1; protected int maxVersions = 1; protected boolean compress = false; protected boolean excludeDirNodes = false; protected FilenameTranslation xlate = FilenameTranslation.XLATE_NONE; protected List errors = new ArrayList(); protected boolean isDiskFull = false; protected abstract void start() throws IOException; protected abstract void finish() throws IOException; protected abstract void writeCu(CachedUrl cu) throws IOException; protected Exporter(LockssDaemon daemon, ArchivalUnit au) { this.daemon = daemon; this.au = au; } /** Called by org.lockss.config.MiscConfig */ public static void setConfig( Configuration config, Configuration oldConfig, Configuration.Differences diffs) { if (diffs.contains(PREFIX)) { maxErrors = config.getInt(PARAM_MAX_ERRORS, DEFAULT_MAX_ERRORS); } } public void setCompress(boolean val) { compress = val; } public boolean getCompress() { return compress; } public void setExcludeDirNodes(boolean val) { excludeDirNodes = val; } public boolean getExcludeDirNodes() { return excludeDirNodes; } public void setFilenameTranslation(FilenameTranslation val) { xlate = val; } public FilenameTranslation getFilenameTranslation() { return xlate; } public void setDir(File val) { dir = val; } public File getDir() { return dir; } public void setPrefix(String val) { prefix = val; } public String getPrefix() { return prefix; } public void setMaxSize(long val) { maxSize = val; } public long getMaxSize() { return maxSize; } public void setMaxVersions(int val) { maxVersions = val; } public int getMaxVersions() { return maxVersions; } public List getErrors() { return errors; } protected void checkArgs() { if (getDir() == null) { throw new IllegalArgumentException("Must supply output directory"); } if (getPrefix() == null) { throw new IllegalArgumentException("Must supply file name/prefix"); } } public void export() { log.debug("export(" + au.getName() + ")"); log.debug( "dir: " + dir + ", pref: " + prefix + ", size: " + maxSize + ", ver: " + maxVersions + (compress ? ", (C)" : "")); checkArgs(); try { start(); } catch (IOException e) { recordError("Error opening file", e); return; } writeFiles(); try { finish(); } catch (IOException e) { if (!isDiskFull) { // If we already knew (and reported) disk full, also reporting it // as a close error is misleading. 
recordError("Error closing file", e); } } } protected String xlateFilename(String url) { return xlate.xlate(url); } protected void recordError(String msg, Throwable t) { log.error(msg, t); errors.add(msg + ": " + t.toString()); } protected void recordError(String msg) { log.error(msg); errors.add(msg); } protected String getSoftwareVersion() { String releaseName = BuildInfo.getBuildProperty(BuildInfo.BUILD_RELEASENAME); StringBuilder sb = new StringBuilder(); sb.append("LOCKSS Daemon "); if (releaseName != null) { sb.append(releaseName); } return sb.toString(); } protected String getHostIp() { try { IPAddr localHost = IPAddr.getLocalHost(); return localHost.getHostAddress(); } catch (UnknownHostException e) { log.error("getHostIp()", e); return "1.1.1.1"; } } protected String getHostName() { String res = ConfigManager.getPlatformHostname(); if (res == null) { try { InetAddress inet = InetAddress.getLocalHost(); return inet.getHostName(); } catch (UnknownHostException e) { log.warning("Can't get hostname", e); return "unknown"; } } return res; } protected Properties filterResponseProps(Properties props) { Properties res = new Properties(); for (Map.Entry ent : props.entrySet()) { String key = (String) ent.getKey(); if (StringUtil.startsWithIgnoreCase(key, "x-lockss") || StringUtil.startsWithIgnoreCase(key, "x_lockss") || key.equalsIgnoreCase("org.lockss.version.number")) { continue; } // We've lost the original case - capitalize them the way most people // expect res.put(StringUtil.titleCase(key, '-'), (String) ent.getValue()); } return res; } protected String getHttpResponseString(CachedUrl cu) { Properties cuProps = cu.getProperties(); Properties filteredProps = filterResponseProps(cuProps); String hdrString = PropUtil.toHeaderString(filteredProps); StringBuilder sb = new StringBuilder(hdrString.length() + 30); String line1 = inferHttpResponseCode(cu, cuProps); sb.append(line1); sb.append(Constants.CRLF); sb.append(hdrString); sb.append(Constants.CRLF); return sb.toString(); } String inferHttpResponseCode(CachedUrl cu, Properties cuProps) { if (cuProps.get("location") == null) { return "HTTP/1.1 200 OK"; } else { return "HTTP/1.1 302 Found"; } } // return the next CU with content private CachedUrl getNextCu(CuIterator iter) { return iter.hasNext() ? iter.next() : null; } /** * Return true if, interpreting URLs as filenames, dirCu is a directory containing fileCu. Used to * exclude directory content from output files, so they can be unpacked by standard utilities * (e.g., unzip). Shouldn't be called with equal URLs, but return false in that case, as we * wouldn't want to exclude the URL */ boolean isDirOf(CachedUrl dirCu, CachedUrl fileCu) { String dir = dirCu.getUrl(); String file = fileCu.getUrl(); if (!dir.endsWith("/")) { dir = dir + "/"; } return file.startsWith(dir) && !file.equals(dir); } private void writeFiles() { PlatformUtil platutil = PlatformUtil.getInstance(); CuIterator iter = AuUtil.getCuIterator(au); int errs = 0; CachedUrl curCu = null; CachedUrl nextCu = getNextCu(iter); while (nextCu != null) { curCu = nextCu; nextCu = getNextCu(iter); if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) { continue; } CachedUrl[] cuVersions = curCu.getCuVersions(maxVersions > 0 ? 
maxVersions : Integer.MAX_VALUE); for (CachedUrl cu : cuVersions) { try { log.debug2("Exporting " + cu.getUrl()); writeCu(cu); } catch (IOException e) { if (platutil.isDiskFullError(e)) { recordError("Disk full, can't write export file."); isDiskFull = true; return; } } catch (Exception e) { // XXX Would like to differentiate between errors opening or // reading CU, which shouldn't cause abort, and errors writing // to export file, which should. recordError("Unable to copy " + cu.getUrl(), e); if (errs++ >= maxErrors) { recordError("Aborting after " + errs + " errors"); return; } } } } } private Set<File> fileSet = new HashSet<File>(); private List<File> fileList = new ArrayList<File>(); protected void recordExportFile(File file) { if (fileSet.add(file)) { fileList.add(file); } } /** * Return the list of export files written * * @return List of File written */ public List<File> getExportFiles() { return fileList; } public interface Factory { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au); } static final String WINDOWS_FROM = "?<>:*|\\"; static final String WINDOWS_TO = "_______"; static final String MAC_FROM = ":"; static final String MAC_TO = "_"; /** Enum of filename translation types */ public static enum FilenameTranslation { XLATE_NONE("None") { public String xlate(String s) { return s; } }, XLATE_WINDOWS("Windows") { public String xlate(String s) { return StringUtils.replaceChars(s, WINDOWS_FROM, WINDOWS_TO); } }, XLATE_MAC("MacOS") { public String xlate(String s) { return StringUtils.replaceChars(s, MAC_FROM, MAC_TO); } }; private final String label; FilenameTranslation(String label) { this.label = label; } public String getLabel() { return label; } public abstract String xlate(String s); }; /** Enum of Exporter types, and factories */ public static enum Type implements Factory { ARC_RESOURCE("ARC (content only)") { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) { return new ArcExporter(daemon, au, false); } }, ARC_RESPONSE("ARC (response and content)") { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) { return new ArcExporter(daemon, au, true); } }, WARC_RESOURCE("WARC (content only)") { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) { return new WarcExporter(daemon, au, false); } }, WARC_RESPONSE("WARC (response and content)") { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) { return new WarcExporter(daemon, au, true); } }, ZIP("ZIP") { public Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au) { return new ZipExporter(daemon, au); } }; private final String label; Type(String label) { this.label = label; } public abstract Exporter makeExporter(LockssDaemon daemon, ArchivalUnit au); public String getLabel() { return label; } }; }
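// A minimal usage sketch for the Exporter API above, assuming a configured
// "daemon" and "au" are in hand; the output directory and prefix are arbitrary.
// Exporter exp = Exporter.Type.ZIP.makeExporter(daemon, au);
// exp.setDir(new File("/tmp/export"));  // required, see checkArgs()
// exp.setPrefix("myau");                // required file name/prefix
// exp.setMaxVersions(1);                // export current versions only
// exp.setExcludeDirNodes(true);         // omit dir nodes so unzip unpacks cleanly
// exp.setFilenameTranslation(Exporter.FilenameTranslation.XLATE_WINDOWS);
// exp.export();
// for (Object err : exp.getErrors()) {  // errors are accumulated, not thrown
//   System.err.println("export error: " + err);
// }
// List<File> written = exp.getExportFiles();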
/** * RepositoryManager is the center of the per AU repositories. It manages the repository config * parameters. */ public class RepositoryManager extends BaseLockssDaemonManager implements ConfigurableManager { private static Logger log = Logger.getLogger("RepositoryManager"); public static final String PREFIX = Configuration.PREFIX + "repository."; /** Maximum size of per-AU repository node cache */ public static final String PARAM_MAX_PER_AU_CACHE_SIZE = PREFIX + "nodeCache.size"; public static final int DEFAULT_MAX_PER_AU_CACHE_SIZE = 10; /* * This needs to be a small multiple of the number of simultaneous * polls (poller and voter), as there is a cache entry per active AU. * Each poll will have one active AU at a time. */ public static final String PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE = PREFIX + "suspectVersionsCache.size"; public static final int DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE = 10; static final String GLOBAL_CACHE_PREFIX = PREFIX + "globalNodeCache."; public static final String PARAM_MAX_GLOBAL_CACHE_SIZE = GLOBAL_CACHE_PREFIX + "size"; public static final int DEFAULT_MAX_GLOBAL_CACHE_SIZE = 500; public static final String PARAM_GLOBAL_CACHE_ENABLED = GLOBAL_CACHE_PREFIX + "enabled"; public static final boolean DEFAULT_GLOBAL_CACHE_ENABLED = false; /** Max times to loop looking for unused AU directory. */ public static final String PARAM_MAX_UNUSED_DIR_SEARCH = PREFIX + "maxUnusedDirSearch"; public static final int DEFAULT_MAX_UNUSED_DIR_SEARCH = 30000; /** If true, LocalRepository keeps track of next subdir name to probe. */ public static final String PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH = PREFIX + "enableStatefulUnusedDirSearch"; public static final boolean DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH = true; /** Max percent of time size calculation thread may run. */ public static final String PARAM_SIZE_CALC_MAX_LOAD = PREFIX + "sizeCalcMaxLoad"; public static final float DEFAULT_SIZE_CALC_MAX_LOAD = 0.5F; /** * If true, path components longer than the maximum filesystem path component are encoded as * multiple levels of directories */ public static final String PARAM_ENABLE_LONG_COMPONENTS = PREFIX + "enableLongComponents"; public static final boolean DEFAULT_ENABLE_LONG_COMPONENTS = true; /** * Prior to 1.61.6, when long component support is enabled, backslashes were normalized to %5c * instead of %5C. This is harmless if checkUnnormalized is set to Fix, as they'll be normalized. * Otherwise, this can be set true for compatibility with old repositories. Setting * checkUnnormalized to Fix is preferred. */ public static final String PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY = PREFIX + "enableLongComponentsCompatibility"; public static final boolean DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY = false; /** Maximum length of a filesystem path component. 
*/ public static final String PARAM_MAX_COMPONENT_LENGTH = PREFIX + "maxComponentLength"; public static final int DEFAULT_MAX_COMPONENT_LENGTH = 255; /** @see #PARAM_CHECK_UNNORMALIZED */ public enum CheckUnnormalizedMode { No, Log, Fix }; /** * Check for existing nodes with unnormalized names (created by very old daemon that didn't * normalize): No, Log, Fix */ public static final String PARAM_CHECK_UNNORMALIZED = PREFIX + "checkUnnormalized"; public static final CheckUnnormalizedMode DEFAULT_CHECK_UNNORMALIZED = CheckUnnormalizedMode.Log; static final String WDOG_PARAM_SIZE_CALC = "SizeCalc"; static final long WDOG_DEFAULT_SIZE_CALC = Constants.DAY; static final String PRIORITY_PARAM_SIZE_CALC = "SizeCalc"; static final int PRIORITY_DEFAULT_SIZE_CALC = Thread.NORM_PRIORITY - 1; static final String DISK_PREFIX = PREFIX + "diskSpace."; static final String PARAM_DISK_WARN_FRRE_MB = DISK_PREFIX + "warn.freeMB"; static final int DEFAULT_DISK_WARN_FRRE_MB = 5000; static final String PARAM_DISK_FULL_FRRE_MB = DISK_PREFIX + "full.freeMB"; static final int DEFAULT_DISK_FULL_FRRE_MB = 100; static final String PARAM_DISK_WARN_FRRE_PERCENT = DISK_PREFIX + "warn.freePercent"; static final double DEFAULT_DISK_WARN_FRRE_PERCENT = .02; static final String PARAM_DISK_FULL_FRRE_PERCENT = DISK_PREFIX + "full.freePercent"; static final double DEFAULT_DISK_FULL_FRRE_PERCENT = .01; private PlatformUtil platInfo = PlatformUtil.getInstance(); private List repoList = Collections.EMPTY_LIST; int paramNodeCacheSize = DEFAULT_MAX_PER_AU_CACHE_SIZE; boolean paramIsGlobalNodeCache = DEFAULT_GLOBAL_CACHE_ENABLED; int paramGlobalNodeCacheSize = DEFAULT_MAX_GLOBAL_CACHE_SIZE; int paramSuspectVersionsCacheSize = DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE; UniqueRefLruCache globalNodeCache = new UniqueRefLruCache(DEFAULT_MAX_GLOBAL_CACHE_SIZE); UniqueRefLruCache suspectVersionsCache = new UniqueRefLruCache(DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE); Map localRepos = new HashMap(); private static int maxUnusedDirSearch = DEFAULT_MAX_UNUSED_DIR_SEARCH; private static boolean isStatefulUnusedDirSearch = DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH; private static boolean enableLongComponents = DEFAULT_ENABLE_LONG_COMPONENTS; private static boolean enableLongComponentsCompatibility = DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY; private static int maxComponentLength = DEFAULT_MAX_COMPONENT_LENGTH; private static CheckUnnormalizedMode checkUnnormalized = DEFAULT_CHECK_UNNORMALIZED; PlatformUtil.DF paramDFWarn = PlatformUtil.DF.makeThreshold(DEFAULT_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_PERCENT); PlatformUtil.DF paramDFFull = PlatformUtil.DF.makeThreshold(DEFAULT_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_PERCENT); private float sizeCalcMaxLoad = DEFAULT_SIZE_CALC_MAX_LOAD; public void startService() { super.startService(); localRepos = new HashMap(); } public void setConfig( Configuration config, Configuration oldConfig, Configuration.Differences changedKeys) { // Build list of repositories from list of disk (fs) paths. Needs to // be generalized if another repository implementation is ever added. 
if (changedKeys.contains(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST)) { List lst = new ArrayList(); String dspace = config.get(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, ""); List paths = StringUtil.breakAt(dspace, ';'); if (paths != null) { for (Iterator iter = paths.iterator(); iter.hasNext(); ) { lst.add("local:" + (String) iter.next()); } } repoList = lst; } if (changedKeys.contains(PARAM_MAX_PER_AU_CACHE_SIZE)) { paramNodeCacheSize = config.getInt(PARAM_MAX_PER_AU_CACHE_SIZE, DEFAULT_MAX_PER_AU_CACHE_SIZE); for (Iterator iter = getDaemon().getAllLockssRepositories().iterator(); iter.hasNext(); ) { LockssRepository repo = (LockssRepository) iter.next(); if (repo instanceof LockssRepositoryImpl) { LockssRepositoryImpl repoImpl = (LockssRepositoryImpl) repo; repoImpl.setNodeCacheSize(paramNodeCacheSize); } } } if (changedKeys.contains(PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE)) { paramSuspectVersionsCacheSize = config.getInt( PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE, DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE); suspectVersionsCache.setMaxSize(paramSuspectVersionsCacheSize); } if (changedKeys.contains(GLOBAL_CACHE_PREFIX)) { paramIsGlobalNodeCache = config.getBoolean(PARAM_GLOBAL_CACHE_ENABLED, DEFAULT_GLOBAL_CACHE_ENABLED); if (paramIsGlobalNodeCache) { paramGlobalNodeCacheSize = config.getInt(PARAM_MAX_GLOBAL_CACHE_SIZE, DEFAULT_MAX_GLOBAL_CACHE_SIZE); log.debug("global node cache size: " + paramGlobalNodeCacheSize); globalNodeCache.setMaxSize(paramGlobalNodeCacheSize); } } if (changedKeys.contains(DISK_PREFIX)) { int minMB = config.getInt(PARAM_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_MB); double minPer = config.getPercentage(PARAM_DISK_WARN_FRRE_PERCENT, DEFAULT_DISK_WARN_FRRE_PERCENT); paramDFWarn = PlatformUtil.DF.makeThreshold(minMB, minPer); minMB = config.getInt(PARAM_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_MB); minPer = config.getPercentage(PARAM_DISK_FULL_FRRE_PERCENT, DEFAULT_DISK_FULL_FRRE_PERCENT); paramDFFull = PlatformUtil.DF.makeThreshold(minMB, minPer); } if (changedKeys.contains(PARAM_SIZE_CALC_MAX_LOAD)) { sizeCalcMaxLoad = config.getPercentage(PARAM_SIZE_CALC_MAX_LOAD, DEFAULT_SIZE_CALC_MAX_LOAD); } if (changedKeys.contains(PREFIX)) { maxUnusedDirSearch = config.getInt(PARAM_MAX_UNUSED_DIR_SEARCH, DEFAULT_MAX_UNUSED_DIR_SEARCH); isStatefulUnusedDirSearch = config.getBoolean( PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH, DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH); enableLongComponents = config.getBoolean(PARAM_ENABLE_LONG_COMPONENTS, DEFAULT_ENABLE_LONG_COMPONENTS); enableLongComponentsCompatibility = config.getBoolean( PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY, DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY); maxComponentLength = config.getInt(PARAM_MAX_COMPONENT_LENGTH, DEFAULT_MAX_COMPONENT_LENGTH); checkUnnormalized = (CheckUnnormalizedMode) config.getEnum( CheckUnnormalizedMode.class, PARAM_CHECK_UNNORMALIZED, DEFAULT_CHECK_UNNORMALIZED); } } public static boolean isEnableLongComponents() { return enableLongComponents; } public static boolean isEnableLongComponentsCompatibility() { return enableLongComponentsCompatibility; } public static int getMaxComponentLength() { return maxComponentLength; } public static CheckUnnormalizedMode getCheckUnnormalizedMode() { return checkUnnormalized; } /** * Return list of known repository names. Needs a registration mechanism if another * repository implementation is ever added. 
*/ public List<String> getRepositoryList() { return repoList; } public PlatformUtil.DF getRepositoryDF(String repoName) { String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName); log.debug("path: " + path); // try { return platInfo.getJavaDF(path); // } catch (PlatformUtil.UnsupportedException e) { // return null; // } } public Map<String, PlatformUtil.DF> getRepositoryMap() { Map<String, PlatformUtil.DF> repoMap = new LinkedMap(); for (String repo : getRepositoryList()) { repoMap.put(repo, getRepositoryDF(repo)); } return repoMap; } public String findLeastFullRepository() { return findLeastFullRepository(getRepositoryMap()); } public String findLeastFullRepository(Map<String, PlatformUtil.DF> repoMap) { String mostFree = null; for (String repo : repoMap.keySet()) { PlatformUtil.DF df = repoMap.get(repo); if (df != null) { if (mostFree == null || (repoMap.get(mostFree)).getAvail() < df.getAvail()) { mostFree = repo; } } } return mostFree; } public PlatformUtil.DF getDiskWarnThreshold() { return paramDFWarn; } public PlatformUtil.DF getDiskFullThreshold() { return paramDFFull; } public static int getMaxUnusedDirSearch() { return maxUnusedDirSearch; } public static boolean isStatefulUnusedDirSearch() { return isStatefulUnusedDirSearch; } public List findExistingRepositoriesFor(String auid) { List res = null; for (Iterator iter = getRepositoryList().iterator(); iter.hasNext(); ) { String repoName = (String) iter.next(); String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName); if (LockssRepositoryImpl.doesAuDirExist(auid, path)) { if (res == null) { res = new ArrayList(); } res.add(repoName); } } return res == null ? Collections.EMPTY_LIST : res; } // hack; only works for local repositories public synchronized LockssRepositoryImpl getRepositoryFromPath(String path) { LockssRepositoryImpl repo = (LockssRepositoryImpl) localRepos.get(path); if (repo == null) { repo = new LockssRepositoryImpl(path); repo.initService(getDaemon()); repo.startService(); localRepos.put(path, repo); } return repo; } /** * Return the disk space used by the AU, including all overhead, optionally calculating it if * necessary. * * @param repoAuPath the full path to an AU dir in a LockssRepositoryImpl * @param calcIfUnknown if true, size will be calculated if unknown (time consuming) * @return the AU's disk usage in bytes, or -1 if unknown */ public long getRepoDiskUsage(String repoAuPath, boolean calcIfUnknown) { LockssRepository repo = getRepositoryFromPath(repoAuPath); if (repo != null) { try { RepositoryNode repoNode = repo.getNode(AuCachedUrlSetSpec.URL); if (repoNode instanceof AuNodeImpl) { return ((AuNodeImpl) repoNode).getDiskUsage(calcIfUnknown); } } catch (MalformedURLException ignore) { } } return -1; } public synchronized void setRepositoryForPath(String path, LockssRepositoryImpl repo) { localRepos.put(path, repo); } public boolean isGlobalNodeCache() { return paramIsGlobalNodeCache; } public UniqueRefLruCache getGlobalNodeCache() { return globalNodeCache; } public UniqueRefLruCache getSuspectVersionsCache() { return suspectVersionsCache; } // Background thread to (re)calculate AU size and disk usage. 
private Set sizeCalcQueue = new HashSet(); private BinarySemaphore sizeCalcSem = new BinarySemaphore(); private SizeCalcThread sizeCalcThread; /** enqueue a size calculation for the AU */ public void queueSizeCalc(ArchivalUnit au) { queueSizeCalc(AuUtil.getAuRepoNode(au)); } /** enqueue a size calculation for the node */ public void queueSizeCalc(RepositoryNode node) { synchronized (sizeCalcQueue) { if (sizeCalcQueue.add(node)) { log.debug2("Queue size calc: " + node); startOrKickThread(); } } } public int sizeCalcQueueLen() { synchronized (sizeCalcQueue) { return sizeCalcQueue.size(); } } void startOrKickThread() { if (sizeCalcThread == null) { log.debug2("Starting thread"); sizeCalcThread = new SizeCalcThread(); sizeCalcThread.start(); sizeCalcThread.waitRunning(); } sizeCalcSem.give(); } void stopThread() { if (sizeCalcThread != null) { log.debug2("Stopping thread"); sizeCalcThread.stopSizeCalc(); sizeCalcThread = null; } } void doSizeCalc(RepositoryNode node) { node.getTreeContentSize(null, true); if (node instanceof AuNodeImpl) { ((AuNodeImpl) node).getDiskUsage(true); } } long sleepTimeToAchieveLoad(long runDuration, float maxLoad) { return Math.round(((double) runDuration / maxLoad) - runDuration); } private class SizeCalcThread extends LockssThread { private volatile boolean goOn = true; private SizeCalcThread() { super("SizeCalc"); } public void lockssRun() { setPriority(PRIORITY_PARAM_SIZE_CALC, PRIORITY_DEFAULT_SIZE_CALC); startWDog(WDOG_PARAM_SIZE_CALC, WDOG_DEFAULT_SIZE_CALC); triggerWDogOnExit(true); nowRunning(); while (goOn) { try { pokeWDog(); if (sizeCalcQueue.isEmpty()) { Deadline timeout = Deadline.in(Constants.HOUR); sizeCalcSem.take(timeout); } RepositoryNode node; synchronized (sizeCalcQueue) { node = (RepositoryNode) CollectionUtil.getAnElement(sizeCalcQueue); } if (node != null) { long start = TimeBase.nowMs(); log.debug2("CalcSize start: " + node); long dur = 0; try { doSizeCalc(node); dur = TimeBase.nowMs() - start; log.debug2("CalcSize finish (" + StringUtil.timeIntervalToString(dur) + "): " + node); } catch (RuntimeException e) { log.warning("doSizeCalc: " + node, e); } synchronized (sizeCalcQueue) { sizeCalcQueue.remove(node); } pokeWDog(); long sleep = sleepTimeToAchieveLoad(dur, sizeCalcMaxLoad); Deadline.in(sleep).sleep(); } } catch (InterruptedException e) { // just wakeup and check for exit } } if (!goOn) { triggerWDogOnExit(false); } } private void stopSizeCalc() { goOn = false; interrupt(); } } }
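/*
 * A minimal standalone sketch -- hypothetical class name, not part of the
 * LOCKSS source -- of the throttling arithmetic SizeCalcThread relies on
 * above. After a size calculation that ran for runDuration ms, the thread
 * sleeps long enough that the work occupies at most maxLoad of wall-clock
 * time: sleep = runDuration / maxLoad - runDuration.
 */
class SizeCalcLoadSketch {

  // Same formula as RepositoryManager.sleepTimeToAchieveLoad() above.
  static long sleepTimeToAchieveLoad(long runDuration, float maxLoad) {
    return Math.round(((double) runDuration / maxLoad) - runDuration);
  }

  public static void main(String[] args) {
    // 300ms of work at a 0.5 target load -> sleep 300ms (work fills half the cycle)
    System.out.println(sleepTimeToAchieveLoad(300, 0.5f)); // 300
    // 300ms of work at a 0.1 target load -> sleep 2700ms (work fills a tenth)
    System.out.println(sleepTimeToAchieveLoad(300, 0.1f)); // 2700
  }
}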
public abstract class V3Serializer { private static final String PREFIX = Configuration.PREFIX + "poll.v3."; protected File pollDir; protected LockssDaemon daemon; static final Logger log = Logger.getLogger("V3Serializer"); public V3Serializer(LockssDaemon daemon) throws PollSerializerException { this.daemon = daemon; File stateDir = PollUtil.ensurePollStateRoot(); if (!FileUtil.ensureDirExists(stateDir)) { throw new PollSerializerException("Could not create state directory " + stateDir); } try { this.pollDir = FileUtil.createTempDir("pollstate-", "", stateDir); } catch (IOException ex) { throw new PollSerializerException("Cannot create temp dir in state directory " + stateDir, ex); } } /** * Create a new PollSerializer from a pre-existing poll serialization directory. The directory * must not be null and must already exist; to have a fresh directory created, use the * single-argument constructor instead. * * @param dir a pre-existing serialization directory to use. * @throws PollSerializerException */ public V3Serializer(LockssDaemon daemon, File dir) throws PollSerializerException { if (dir == null) { throw new NullPointerException("Poll serialization directory must not " + "be null"); } this.daemon = daemon; this.pollDir = dir; if (!pollDir.exists()) { throw new IllegalArgumentException( "Poll directories passed as " + "arguments must already exist"); } } /** Make an XStreamSerializer */ protected ObjectSerializer getSerializer() { return new XStreamSerializer(daemon); } /** Clean up all resources used by this poll. Removes the poll directory. */ public void closePoll() { if (pollDir != null && pollDir.isDirectory() && !FileUtil.delTree(pollDir)) log.warning("Unable to delete poll state directory: " + pollDir); } /** * PollSerializerException. Simplifies exception handling by wrapping IOException and * ObjectSerializer.SerializationException. */ public static class PollSerializerException extends Exception { public PollSerializerException() { super(); } public PollSerializerException(String msg) { super(msg); } public PollSerializerException(String msg, Throwable cause) { super(msg, cause); } public PollSerializerException(Throwable cause) { super(cause); } } }
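/*
 * A hedged usage sketch -- hypothetical subclass and method, not part of the
 * LOCKSS source -- of the V3Serializer lifecycle above: the single-argument
 * constructor creates a fresh "pollstate-" temp directory under the poll
 * state root, and closePoll() deletes that directory tree when the poll is
 * done with it.
 */
class V3SerializerLifecycleSketch {

  // V3Serializer is abstract; a concrete poll serializer would extend it.
  static class MyPollSerializer extends V3Serializer {
    MyPollSerializer(LockssDaemon daemon) throws PollSerializerException {
      super(daemon); // creates pollDir via FileUtil.createTempDir("pollstate-", ...)
    }
  }

  static void runPoll(LockssDaemon daemon) throws V3Serializer.PollSerializerException {
    MyPollSerializer ser = new MyPollSerializer(daemon);
    try {
      // ... serialize poll state into the serializer's pollDir, e.g. via getSerializer() ...
    } finally {
      ser.closePoll(); // removes the poll state directory
    }
  }
}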
/** Functional tests on the simulated content generator. */ public class FuncSimulatedContent extends LockssTestCase { static final Logger log = Logger.getLogger("FuncSimulatedContent"); private PluginManager pluginMgr; private Plugin simPlugin; private SimulatedArchivalUnit sau1; private SimulatedContentGenerator scgen = null; private MockLockssDaemon theDaemon; String tempDirPath; String tempDirPath2; private static String DAMAGED_CACHED_URL = "/branch2/branch2/002file.txt"; public FuncSimulatedContent(String msg) { super(msg); } public void setUp() throws Exception { super.setUp(); tempDirPath = getTempDir().getAbsolutePath() + File.separator; theDaemon = getMockLockssDaemon(); theDaemon.getAlertManager(); theDaemon.getPluginManager().setLoadablePluginsReady(true); theDaemon.getHashService(); MockSystemMetrics metrics = new MyMockSystemMetrics(); metrics.initService(theDaemon); theDaemon.setSystemMetrics(metrics); theDaemon.setDaemonInited(true); Properties props = new Properties(); props.setProperty(SystemMetrics.PARAM_HASH_TEST_DURATION, "1000"); props.setProperty(SystemMetrics.PARAM_HASH_TEST_BYTE_STEP, "1024"); props.setProperty(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, tempDirPath); ConfigurationUtil.setCurrentConfigFromProps(props); pluginMgr = theDaemon.getPluginManager(); pluginMgr.startService(); theDaemon.getHashService().startService(); metrics.startService(); metrics.setHashSpeed(100); simPlugin = PluginTestUtil.findPlugin(SimulatedPlugin.class); } public void tearDown() throws Exception { theDaemon.getLockssRepository(sau1).stopService(); theDaemon.getNodeManager(sau1).stopService(); theDaemon.getPluginManager().stopService(); theDaemon.getHashService().stopService(); theDaemon.getSystemMetrics().stopService(); theDaemon.stopDaemon(); super.tearDown(); } SimulatedArchivalUnit setupSimAu(Configuration auConfig) throws ArchivalUnit.ConfigurationException { ArchivalUnit au = PluginTestUtil.createAndStartAu(simPlugin, auConfig); return (SimulatedArchivalUnit) au; } Configuration simAuConfig(String rootPath) { Configuration conf = ConfigManager.newConfiguration(); conf.put("root", rootPath); conf.put("depth", "2"); conf.put("branch", "2"); conf.put("numFiles", "2"); conf.put("badCachedFileLoc", "2,2"); conf.put("badCachedFileNum", "2"); return conf; } void enableFilter(SimulatedArchivalUnit sau, boolean enable) throws ArchivalUnit.ConfigurationException { Configuration auConfig = sau.getConfiguration().copy(); // no bad file when playing with filtering auConfig.remove("badCachedFileLoc"); auConfig.remove("badCachedFileNum"); if (enable) { auConfig.put(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC, "true"); } else { auConfig.remove(SimulatedPlugin.AU_PARAM_HASH_FILTER_SPEC); } sau.setConfiguration(auConfig); } public void testSimulatedContent() throws Exception { sau1 = setupSimAu(simAuConfig(tempDirPath)); createContent(sau1); crawlContent(sau1); checkContent(sau1); doDamageRemoveTest(sau1); // must be before content read again checkFilter(sau1); hashContent(sau1); // this resets AU's config, do last to avoid messing up toBeDamaged set } public void testDualContentHash() throws Exception { sau1 = setupSimAu(simAuConfig(tempDirPath)); createContent(sau1); crawlContent(sau1); CachedUrlSet set = sau1.getAuCachedUrlSet(); byte[] nameH = getHash(set, true); byte[] contentH = getHash(set, false); tempDirPath2 = getTempDir().getAbsolutePath() + File.separator; SimulatedArchivalUnit sau2 = setupSimAu(simAuConfig(tempDirPath2)); createContent(sau2); crawlContent(sau2); set = 
sau2.getAuCachedUrlSet(); byte[] nameH2 = getHash(set, true); byte[] contentH2 = getHash(set, false); assertEquals(nameH, nameH2); assertEquals(contentH, contentH2); } public void testBaseUrl() throws Exception { sau1 = setupSimAu(simAuConfig(tempDirPath)); createContent(sau1); crawlContent(sau1); CachedUrlSet cus1 = sau1.getAuCachedUrlSet(); tempDirPath2 = getTempDir().getAbsolutePath() + File.separator; Configuration config2 = simAuConfig(tempDirPath2); config2.put("base_url", "http://anotherhost.org/"); SimulatedArchivalUnit sau2 = setupSimAu(config2); createContent(sau2); crawlContent(sau2); CachedUrlSet cus2 = sau2.getAuCachedUrlSet(); Pattern pat = Pattern.compile("http://([^/]+)(/.*)$"); List<String> l1 = auUrls(sau1); List<String> l2 = auUrls(sau2); assertEquals(l1.size(), l2.size()); for (int ix = 0; ix < l1.size(); ix++) { Matcher m1 = pat.matcher(l1.get(ix)); assertTrue(m1.matches()); Matcher m2 = pat.matcher(l2.get(ix)); assertTrue(m2.matches()); assertEquals("www.example.com", m1.group(1)); assertEquals("anotherhost.org", m2.group(1)); assertEquals(m1.group(2), m2.group(2)); } } public void testBaseUrlPath() throws Exception { sau1 = setupSimAu(simAuConfig(tempDirPath)); createContent(sau1); crawlContent(sau1); CachedUrlSet cus1 = sau1.getAuCachedUrlSet(); tempDirPath2 = getTempDir().getAbsolutePath() + File.separator; Configuration config2 = simAuConfig(tempDirPath2); config2.put("base_url", "http://anotherhost.org/some/path/"); SimulatedArchivalUnit sau2 = setupSimAu(config2); createContent(sau2); crawlContent(sau2); CachedUrlSet cus2 = sau2.getAuCachedUrlSet(); Pattern pat1 = Pattern.compile("http://www\\.example\\.com(/.*)$"); Pattern pat2 = Pattern.compile("http://anotherhost\\.org/some/path(/.*)$"); List<String> l1 = auUrls(sau1); List<String> l2 = auUrls(sau2); assertEquals(l1.size(), l2.size()); for (int ix = 0; ix < l1.size(); ix++) { Matcher m1 = pat1.matcher(l1.get(ix)); assertTrue(m1.matches()); Matcher m2 = pat2.matcher(l2.get(ix)); assertTrue(m2.matches()); assertEquals(m1.group(1), m2.group(1)); } } List<String> auUrls(ArchivalUnit au) { List<String> res = new ArrayList<String>(); for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next(); if (cusn.hasContent()) { res.add(cusn.getUrl()); } } return res; } protected void createContent(SimulatedArchivalUnit sau) { log.debug("createContent()"); scgen = sau.getContentGenerator(); scgen.setFileTypes( SimulatedContentGenerator.FILE_TYPE_HTML + SimulatedContentGenerator.FILE_TYPE_TXT); scgen.setAbnormalFile("1,1", 1); scgen.setOddBranchesHaveContent(true); sau.deleteContentTree(); sau.generateContentTree(); assertTrue(scgen.isContentTree()); } protected void crawlContent(SimulatedArchivalUnit sau) { log.debug("crawlContent()"); CrawlSpec spec = new SpiderCrawlSpec(sau.getNewContentCrawlUrls(), null); Crawler crawler = new NoCrawlEndActionsNewContentCrawler(sau, spec, new MockAuState()); crawler.doCrawl(); } protected void checkContent(SimulatedArchivalUnit sau) throws IOException { log.debug("checkContent()"); checkRoot(sau); checkLeaf(sau); checkStoredContent(sau); checkDepth(sau); } protected void checkFilter(SimulatedArchivalUnit sau) throws Exception { log.debug("checkFilter()"); CachedUrl cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html"); enableFilter(sau, true); InputStream is = cu.openForHashing(); String expected =
"001file.html This is file 1, depth 0, branch 0. foobar "; assertEquals(expected, StringUtil.fromInputStream(is)); is.close(); enableFilter(sau, false); cu = sau.makeCachedUrl(sau.getUrlRoot() + "/001file.html"); is = cu.openForHashing(); expected = "<HTML><HEAD><TITLE>001file.html</TITLE></HEAD><BODY>\n" + "This is file 1, depth 0, branch 0.<br><!-- comment --> " + "Citation String foobar<br><script>" + "(defun fact (n) (cond ((= n 0) 1) (t (fact (sub1 n)))))</script>\n" + "</BODY></HTML>"; assertEquals(expected, StringUtil.fromInputStream(is)); is.close(); } private byte[] fromHex(String hex) { return ByteArray.fromHexString(hex); } protected void hashContent(SimulatedArchivalUnit sau) throws Exception { log.debug("hashContent()"); measureHashSpeed(sau); // If any changes are made to the contents or shape of the simulated // content tree, these hash values will have to be changed checkHashSet(sau, true, false, fromHex("6AB258B4E1FFD9F9B45316B4F54111FF5E5948D2")); checkHashSet(sau, true, true, fromHex("6AB258B4E1FFD9F9B45316B4F54111FF5E5948D2")); checkHashSet(sau, false, false, fromHex("409893F1A603F4C276632694DB1621B639BD5164")); checkHashSet(sau, false, true, fromHex("85E6213C3771BEAC5A4602CAF7982C6C222800D5")); } protected void checkDepth(SimulatedArchivalUnit sau) { log.debug("checkDepth()"); String URL_ROOT = sau.getUrlRoot(); assertEquals(0, sau.getLinkDepth(URL_ROOT + "/index.html")); assertEquals(0, sau.getLinkDepth(URL_ROOT + "/")); assertEquals(1, sau.getLinkDepth(URL_ROOT + "/001file.html")); assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/index.html")); assertEquals(1, sau.getLinkDepth(URL_ROOT + "/branch1/")); assertEquals(2, sau.getLinkDepth(URL_ROOT + "/branch1/001file.html")); } protected void checkRoot(SimulatedArchivalUnit sau) { log.debug("checkRoot()"); CachedUrlSet set = sau.getAuCachedUrlSet(); Iterator setIt = set.flatSetIterator(); ArrayList childL = new ArrayList(1); CachedUrlSet cus = null; while (setIt.hasNext()) { cus = (CachedUrlSet) setIt.next(); childL.add(cus.getUrl()); } String urlRoot = sau.getUrlRoot(); String[] expectedA = new String[1]; expectedA[0] = urlRoot; assertIsomorphic(expectedA, childL); setIt = cus.flatSetIterator(); childL = new ArrayList(7); while (setIt.hasNext()) { childL.add(((CachedUrlSetNode) setIt.next()).getUrl()); } expectedA = new String[] { urlRoot + "/001file.html", urlRoot + "/001file.txt", urlRoot + "/002file.html", urlRoot + "/002file.txt", urlRoot + "/branch1", urlRoot + "/branch2", urlRoot + "/index.html" }; assertIsomorphic(expectedA, childL); } protected void checkLeaf(SimulatedArchivalUnit sau) { log.debug("checkLeaf()"); String parent = sau.getUrlRoot() + "/branch1"; CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent); CachedUrlSet set = sau.makeCachedUrlSet(spec); Iterator setIt = set.contentHashIterator(); ArrayList childL = new ArrayList(16); while (setIt.hasNext()) { childL.add(((CachedUrlSetNode) setIt.next()).getUrl()); } String[] expectedA = new String[] { parent, parent + "/001file.html", parent + "/001file.txt", parent + "/002file.html", parent + "/002file.txt", parent + "/branch1", parent + "/branch1/001file.html", parent + "/branch1/001file.txt", parent + "/branch1/002file.html", parent + "/branch1/002file.txt", parent + "/branch1/index.html", parent + "/branch2", parent + "/branch2/001file.html", parent + "/branch2/001file.txt", parent + "/branch2/002file.html", parent + "/branch2/002file.txt", parent + "/branch2/index.html", parent + "/index.html", }; assertIsomorphic(expectedA, childL); } 
protected void checkUrlContent( SimulatedArchivalUnit sau, String path, int fileNum, int depth, int branchNum, boolean isAbnormal, boolean isDamaged) throws IOException { String file = sau.getUrlRoot() + path; CachedUrl url = sau.makeCachedUrl(file); String content = getUrlContent(url); String expectedContent; if (path.endsWith(".html")) { String fn = path.substring(path.lastIndexOf("/") + 1); expectedContent = scgen.getHtmlFileContent(fn, fileNum, depth, branchNum, isAbnormal); } else { expectedContent = scgen.getTxtContent(fileNum, depth, branchNum, isAbnormal); } if (isDamaged) { assertNotEquals(expectedContent, content); } else { assertEquals(expectedContent, content); } } protected void checkStoredContent(SimulatedArchivalUnit sau) throws IOException { checkUrlContent(sau, "/001file.txt", 1, 0, 0, false, false); checkUrlContent(sau, "/branch1/branch1/001file.txt", 1, 2, 1, true, false); checkUrlContent(sau, DAMAGED_CACHED_URL, 2, 2, 2, false, true); } protected void doDamageRemoveTest(SimulatedArchivalUnit sau) throws Exception { /* Cache the file again; this time the damage should be gone */ String file = sau.getUrlRoot() + DAMAGED_CACHED_URL; UrlCacher uc = sau.makeUrlCacher(file); BitSet fetchFlags = new BitSet(); fetchFlags.set(UrlCacher.REFETCH_FLAG); uc.setFetchFlags(fetchFlags); uc.cache(); checkUrlContent(sau, DAMAGED_CACHED_URL, 2, 2, 2, false, false); } private void measureHashSpeed(SimulatedArchivalUnit sau) throws Exception { MessageDigest dig = null; try { dig = MessageDigest.getInstance("SHA-1"); } catch (NoSuchAlgorithmException ex) { fail("No algorithm."); } CachedUrlSet set = sau.getAuCachedUrlSet(); CachedUrlSetHasher hasher = set.getContentHasher(dig); SystemMetrics metrics = theDaemon.getSystemMetrics(); int estimate = metrics.getBytesPerMsHashEstimate(hasher, dig); // should be protected against this being zero by MyMockSystemMetrics, // but otherwise use the proper calculation. 
This avoids test failure // due to really slow machines assertTrue(estimate > 0); long estimatedTime = set.estimatedHashDuration(); long size = ((Long) PrivilegedAccessor.getValue(set, "totalNodeSize")).longValue(); assertTrue(size > 0); System.out.println("b/ms: " + estimate); System.out.println("size: " + size); System.out.println("estimate: " + estimatedTime); assertEquals(estimatedTime, theDaemon.getHashService().padHashEstimate(size / estimate)); } private void checkHashSet( SimulatedArchivalUnit sau, boolean namesOnly, boolean filter, byte[] expected) throws Exception { enableFilter(sau, filter); CachedUrlSet set = sau.getAuCachedUrlSet(); byte[] hash = getHash(set, namesOnly); assertEquals(expected, hash); String parent = sau.getUrlRoot() + "/branch1"; CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent); set = sau.makeCachedUrlSet(spec); byte[] hash2 = getHash(set, namesOnly); assertFalse(Arrays.equals(hash, hash2)); } private byte[] getHash(CachedUrlSet set, boolean namesOnly) throws IOException { MessageDigest dig = null; try { dig = MessageDigest.getInstance("SHA-1"); } catch (NoSuchAlgorithmException ex) { fail("No algorithm."); } hash(set, dig, namesOnly); return dig.digest(); } private void hash(CachedUrlSet set, MessageDigest dig, boolean namesOnly) throws IOException { CachedUrlSetHasher hasher = null; if (namesOnly) { hasher = set.getNameHasher(dig); } else { hasher = set.getContentHasher(dig); } int bytesHashed = 0; long timeTaken = System.currentTimeMillis(); while (!hasher.finished()) { bytesHashed += hasher.hashStep(256); } timeTaken = System.currentTimeMillis() - timeTaken; if ((timeTaken > 0) && (bytesHashed > 500)) { System.out.println("Bytes hashed: " + bytesHashed); System.out.println("Time taken: " + timeTaken + "ms"); System.out.println("Bytes/sec: " + (bytesHashed * 1000 / timeTaken)); } else { System.out.println("No time taken, or insufficient bytes hashed."); System.out.println("Bytes hashed: " + bytesHashed); System.out.println("Time taken: " + timeTaken + "ms"); } } private String getUrlContent(CachedUrl url) throws IOException { InputStream content = url.getUnfilteredInputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); StreamUtil.copy(content, baos); content.close(); String contentStr = new String(baos.toByteArray()); baos.close(); return contentStr; } // this version doesn't fully override the 'measureHashSpeed()' function, but // protects against it returning '0' by returning the set speed private class MyMockSystemMetrics extends MockSystemMetrics { public int measureHashSpeed(CachedUrlSetHasher hasher, MessageDigest digest) throws IOException { int speed = super.measureHashSpeed(hasher, digest); if (speed == 0) { speed = getHashSpeed(); if (speed <= 0) { throw new RuntimeException("No hash speed set."); } } return speed; } } public static void main(String[] argv) { String[] testCaseList = {FuncSimulatedContent.class.getName()}; junit.swingui.TestRunner.main(testCaseList); } public static Test suite() { return new TestSuite(FuncSimulatedContent.class); } }
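/*
 * A hedged helper sketch -- hypothetical class, not part of the original
 * test -- for the maintenance chore noted in hashContent() above: if the
 * shape or contents of the simulated tree change, the expected SHA-1
 * constants must be regenerated. Printing the freshly computed digest as hex
 * (the format the fromHex() literals use) and pasting it back into the
 * checkHashSet() calls is one way to do that.
 */
class ExpectedHashPrinter {

  // Format a digest the way the fromHex() literals in the test are written.
  static String toHex(byte[] bytes) {
    StringBuilder sb = new StringBuilder(bytes.length * 2);
    for (byte b : bytes) {
      sb.append(String.format("%02X", b));
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    // In the test one would print toHex(getHash(set, namesOnly)) instead.
    System.out.println(toHex(new byte[] {(byte) 0x6A, (byte) 0xB2, (byte) 0x58})); // 6AB258
  }
}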
/** UI to invoke various daemon actions */ @SuppressWarnings("serial") public class DebugPanel extends LockssServlet { public static final String PREFIX = Configuration.PREFIX + "debugPanel."; /** Priority for crawls started from the debug panel */ public static final String PARAM_CRAWL_PRIORITY = PREFIX + "crawlPriority"; public static final int DEFAULT_CRAWL_PRIORITY = 10; /** If true, enable the deep crawl actions in the debug panel */ public static final String PARAM_ENABLE_DEEP_CRAWL = PREFIX + "deepCrawlEnabled"; private static final boolean DEFAULT_ENABLE_DEEP_CRAWL = false; static final String KEY_ACTION = "action"; static final String KEY_MSG = "msg"; static final String KEY_NAME_SEL = "name_sel"; static final String KEY_NAME_TYPE = "name_type"; static final String KEY_AUID = "auid"; static final String KEY_URL = "url"; static final String KEY_REFETCH_DEPTH = "depth"; static final String KEY_TIME = "time"; static final String ACTION_MAIL_BACKUP = "Mail Backup File"; static final String ACTION_THROW_IOEXCEPTION = "Throw IOException"; static final String ACTION_FIND_URL = "Find Preserved URL"; public static final String ACTION_REINDEX_METADATA = "Reindex Metadata"; public static final String ACTION_FORCE_REINDEX_METADATA = "Force Reindex Metadata"; public static final String ACTION_START_V3_POLL = "Start V3 Poll"; static final String ACTION_FORCE_START_V3_POLL = "Force V3 Poll"; public static final String ACTION_START_CRAWL = "Start Crawl"; public static final String ACTION_FORCE_START_CRAWL = "Force Start Crawl"; public static final String ACTION_START_DEEP_CRAWL = "Deep Crawl"; public static final String ACTION_FORCE_START_DEEP_CRAWL = "Force Deep Crawl"; public static final String ACTION_CHECK_SUBSTANCE = "Check Substance"; static final String ACTION_CRAWL_PLUGINS = "Crawl Plugins"; static final String ACTION_RELOAD_CONFIG = "Reload Config"; static final String ACTION_SLEEP = "Sleep"; public static final String ACTION_DISABLE_METADATA_INDEXING = "Disable Indexing"; /** Set of actions for which audit alerts shouldn't be generated */ public static final Set noAuditActions = SetUtil.set(ACTION_FIND_URL); static final String COL2 = "colspan=2"; static final String COL2CENTER = COL2 + " align=center"; static Logger log = Logger.getLogger("DebugPanel"); private LockssDaemon daemon; private PluginManager pluginMgr; private PollManager pollManager; private CrawlManager crawlMgr; private ConfigManager cfgMgr; private DbManager dbMgr; private MetadataManager metadataMgr; private RemoteApi rmtApi; boolean showResult; boolean showForcePoll; boolean showForceCrawl; boolean showForceReindexMetadata; String formAuid; String formDepth = "100"; protected void resetLocals() { resetVars(); super.resetLocals(); } void resetVars() { formAuid = null; errMsg = null; statusMsg = null; showForcePoll = false; showForceCrawl = false; showForceReindexMetadata = false; } public void init(ServletConfig config) throws ServletException { super.init(config); daemon = getLockssDaemon(); pluginMgr = daemon.getPluginManager(); pollManager = daemon.getPollManager(); crawlMgr = daemon.getCrawlManager(); cfgMgr = daemon.getConfigManager(); rmtApi = daemon.getRemoteApi(); try { dbMgr = daemon.getDbManager(); metadataMgr = daemon.getMetadataManager(); } catch (IllegalArgumentException ex) { } } public void lockssHandleRequest() throws IOException { resetVars(); boolean showForm = true; String action = getParameter(KEY_ACTION); if (!StringUtil.isNullString(action)) { formAuid = getParameter(KEY_AUID); formDepth =
getParameter(KEY_REFETCH_DEPTH); UserAccount acct = getUserAccount(); if (acct != null && !noAuditActions.contains(action)) { acct.auditableEvent("used debug panel action: " + action + " AU ID: " + formAuid); } } if (ACTION_MAIL_BACKUP.equals(action)) { doMailBackup(); } if (ACTION_RELOAD_CONFIG.equals(action)) { doReloadConfig(); } if (ACTION_SLEEP.equals(action)) { doSleep(); } if (ACTION_THROW_IOEXCEPTION.equals(action)) { doThrow(); } if (ACTION_START_V3_POLL.equals(action)) { doV3Poll(); } if (ACTION_FORCE_START_V3_POLL.equals(action)) { forceV3Poll(); } if (ACTION_START_CRAWL.equals(action)) { doCrawl(false, false); } if (ACTION_FORCE_START_CRAWL.equals(action)) { doCrawl(true, false); } if (ACTION_START_DEEP_CRAWL.equals(action)) { doCrawl(false, true); } if (ACTION_FORCE_START_DEEP_CRAWL.equals(action)) { doCrawl(true, true); } if (ACTION_CHECK_SUBSTANCE.equals(action)) { doCheckSubstance(); } if (ACTION_CRAWL_PLUGINS.equals(action)) { crawlPluginRegistries(); } if (ACTION_FIND_URL.equals(action)) { showForm = doFindUrl(); } if (ACTION_REINDEX_METADATA.equals(action)) { doReindexMetadata(); } if (ACTION_FORCE_REINDEX_METADATA.equals(action)) { forceReindexMetadata(); } if (ACTION_DISABLE_METADATA_INDEXING.equals(action)) { doDisableMetadataIndexing(); } if (showForm) { displayPage(); } } private void doMailBackup() { try { rmtApi.createConfigBackupFile(RemoteApi.BackupFileDisposition.Mail); } catch (Exception e) { errMsg = "Error: " + e.getMessage(); } } private void doReloadConfig() { cfgMgr.requestReload(); } private void doThrow() throws IOException { String msg = getParameter(KEY_MSG); throw new IOException(msg != null ? msg : "Test message"); } private void doSleep() throws IOException { String timestr = getParameter(KEY_TIME); try { long time = StringUtil.parseTimeInterval(timestr); Deadline.in(time).sleep(); statusMsg = "Slept for " + StringUtil.timeIntervalToString(time); } catch (NumberFormatException e) { errMsg = "Illegal duration: " + e; } catch (InterruptedException e) { errMsg = "Interrupted: " + e; } } private void doReindexMetadata() { ArchivalUnit au = getAu(); if (au == null) return; try { startReindexingMetadata(au, false); } catch (RuntimeException e) { log.error("Can't reindex metadata", e); errMsg = "Error: " + e.toString(); } } private void forceReindexMetadata() { ArchivalUnit au = getAu(); if (au == null) return; try { startReindexingMetadata(au, true); } catch (RuntimeException e) { log.error("Can't reindex metadata", e); errMsg = "Error: " + e.toString(); } } private void doDisableMetadataIndexing() { ArchivalUnit au = getAu(); if (au == null) return; try { disableMetadataIndexing(au, false); } catch (RuntimeException e) { log.error("Can't disable metadata indexing", e); errMsg = "Error: " + e.toString(); } } private void doCrawl(boolean force, boolean deep) { ArchivalUnit au = getAu(); if (au == null) return; try { startCrawl(au, force, deep); } catch (CrawlManagerImpl.NotEligibleException.RateLimiter e) { errMsg = "AU has crawled recently (" + e.getMessage() + "). 
Click again to override."; showForceCrawl = true; return; } catch (CrawlManagerImpl.NotEligibleException e) { errMsg = "Can't enqueue crawl: " + e.getMessage(); } } private void crawlPluginRegistries() { StringBuilder sb = new StringBuilder(); for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) { sb.append(au.getName()); sb.append(": "); try { startCrawl(au, true, false); sb.append("Queued."); } catch (CrawlManagerImpl.NotEligibleException e) { sb.append("Failed: "); sb.append(e.getMessage()); } sb.append("\n"); } statusMsg = sb.toString(); } private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep) throws CrawlManagerImpl.NotEligibleException { CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr; if (force) { RateLimiter limit = cmi.getNewContentRateLimiter(au); if (!limit.isEventOk()) { limit.unevent(); } } cmi.checkEligibleToQueueNewContentCrawl(au); String delayMsg = ""; String deepMsg = ""; try { cmi.checkEligibleForNewContentCrawl(au); } catch (CrawlManagerImpl.NotEligibleException e) { delayMsg = ", Start delayed due to: " + e.getMessage(); } Configuration config = ConfigManager.getCurrentConfig(); int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY); CrawlReq req; try { req = new CrawlReq(au); req.setPriority(pri); if (deep) { int d = Integer.parseInt(formDepth); if (d < 0) { errMsg = "Illegal refetch depth: " + d; return false; } req.setRefetchDepth(d); deepMsg = "Deep (" + req.getRefetchDepth() + ") "; } } catch (NumberFormatException e) { errMsg = "Illegal refetch depth: " + formDepth; return false; } catch (RuntimeException e) { log.error("Couldn't create CrawlReq: " + au, e); errMsg = "Couldn't create CrawlReq: " + e.toString(); return false; } cmi.startNewContentCrawl(req, null); statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg; return true; } private void doCheckSubstance() { ArchivalUnit au = getAu(); if (au == null) return; try { checkSubstance(au); } catch (RuntimeException e) { log.error("Error in SubstanceChecker", e); errMsg = "Error in SubstanceChecker; see log."; } } private void checkSubstance(ArchivalUnit au) { SubstanceChecker subChecker = new SubstanceChecker(au); if (!subChecker.isEnabled()) { errMsg = "No substance patterns defined for plugin."; return; } AuState auState = AuUtil.getAuState(au); SubstanceChecker.State oldState = auState.getSubstanceState(); SubstanceChecker.State newState = subChecker.findSubstance(); String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")"); switch (newState) { case Unknown: log.error("Shouldn't happen: SubstanceChecker returned Unknown"); errMsg = "Error in SubstanceChecker; see log."; break; case Yes: statusMsg = "AU has substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.Yes); break; case No: statusMsg = "AU has no substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.No); break; } } private boolean startReindexingMetadata(ArchivalUnit au, boolean force) { if (metadataMgr == null) { errMsg = "Metadata processing is not enabled."; return false; } if (!force) { if (!AuUtil.hasCrawled(au)) { errMsg = "Au has never crawled. Click again to reindex metadata"; showForceReindexMetadata = true; return false; } AuState auState = AuUtil.getAuState(au); switch (auState.getSubstanceState()) { case No: errMsg = "Au has no substance. Click again to reindex metadata"; showForceReindexMetadata = true; return false; case Unknown: errMsg = "Unknown substance for Au. 
Click again to reindex metadata."; showForceReindexMetadata = true; return false; case Yes: // fall through } } // Fully reindex metadata with the highest priority. Connection conn = null; PreparedStatement insertPendingAuBatchStatement = null; try { conn = dbMgr.getConnection(); insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn); if (metadataMgr.enableAndAddAuToReindex( au, conn, insertPendingAuBatchStatement, false, true)) { statusMsg = "Reindexing metadata for " + au.getName(); return true; } } catch (DbException dbe) { log.error("Cannot reindex metadata for " + au.getName(), dbe); } finally { DbManager.safeCloseStatement(insertPendingAuBatchStatement); DbManager.safeRollbackAndClose(conn); } if (force) { errMsg = "Still cannot reindex metadata for " + au.getName(); } else { errMsg = "Cannot reindex metadata for " + au.getName(); } return false; } private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) { if (metadataMgr == null) { errMsg = "Metadata processing is not enabled."; return false; } try { metadataMgr.disableAuIndexing(au); statusMsg = "Disabled metadata indexing for " + au.getName(); return true; } catch (Exception e) { errMsg = "Cannot disable metadata indexing for " + au.getName() + ": " + e.getMessage(); return false; } } private void doV3Poll() { ArchivalUnit au = getAu(); if (au == null) return; try { callV3ContentPoll(au); } catch (PollManager.NotEligibleException e) { errMsg = "AU is not eligible for poll: " + e.getMessage(); // errMsg = "Ineligible: " + e.getMessage() + // "<br>Click again to force new poll."; // showForcePoll = true; return; } catch (Exception e) { log.error("Can't start poll", e); errMsg = "Error: " + e.toString(); } } private void forceV3Poll() { ArchivalUnit au = getAu(); if (au == null) return; try { callV3ContentPoll(au); } catch (Exception e) { log.error("Can't start poll", e); errMsg = "Error: " + e.toString(); } } private void callV3ContentPoll(ArchivalUnit au) throws PollManager.NotEligibleException { log.debug("Enqueuing a V3 Content Poll on " + au.getName()); PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL); pollManager.enqueueHighPriorityPoll(au, spec); statusMsg = "Enqueued V3 poll for " + au.getName(); } private boolean doFindUrl() throws IOException { String url = getParameter(KEY_URL); String redir = srvURL( AdminServletManager.SERVLET_DAEMON_STATUS, PropUtil.fromArgs("table", ArchivalUnitStatus.AUS_WITH_URL_TABLE_NAME, "key", url)); resp.setContentLength(0); // resp.sendRedirect(resp.encodeRedirectURL(redir)); resp.sendRedirect(redir); return false; } ArchivalUnit getAu() { if (StringUtil.isNullString(formAuid)) { errMsg = "Select an AU"; return null; } ArchivalUnit au = pluginMgr.getAuFromId(formAuid); if (au == null) { errMsg = "No such AU. 
Select an AU"; return null; } return au; } private void displayPage() throws IOException { Page page = newPage(); layoutErrorBlock(page); ServletUtil.layoutExplanationBlock(page, "Debug Actions"); page.add(makeForm()); page.add("<br>"); endPage(page); } private Element makeForm() { Composite comp = new Composite(); Form frm = new Form(srvURL(myServletDescr())); frm.method("POST"); frm.add("<br><center>"); Input reload = new Input(Input.Submit, KEY_ACTION, ACTION_RELOAD_CONFIG); setTabOrder(reload); frm.add(reload); frm.add(" "); Input backup = new Input(Input.Submit, KEY_ACTION, ACTION_MAIL_BACKUP); setTabOrder(backup); frm.add(backup); frm.add(" "); Input crawlplug = new Input(Input.Submit, KEY_ACTION, ACTION_CRAWL_PLUGINS); setTabOrder(crawlplug); frm.add(crawlplug); frm.add("</center>"); ServletDescr d1 = AdminServletManager.SERVLET_HASH_CUS; if (isServletRunnable(d1)) { frm.add("<br><center>" + srvLink(d1, d1.heading) + "</center>"); } Input findUrl = new Input(Input.Submit, KEY_ACTION, ACTION_FIND_URL); Input findUrlText = new Input(Input.Text, KEY_URL); findUrlText.setSize(50); setTabOrder(findUrl); setTabOrder(findUrlText); frm.add("<br><center>" + findUrl + " " + findUrlText + "</center>"); Input thrw = new Input(Input.Submit, KEY_ACTION, ACTION_THROW_IOEXCEPTION); Input thmsg = new Input(Input.Text, KEY_MSG); setTabOrder(thrw); setTabOrder(thmsg); frm.add("<br><center>" + thrw + " " + thmsg + "</center>"); frm.add("<br><center>AU Actions: select AU</center>"); Composite ausel = ServletUtil.layoutSelectAu(this, KEY_AUID, formAuid); frm.add("<br><center>" + ausel + "</center>"); setTabOrder(ausel); Input v3Poll = new Input( Input.Submit, KEY_ACTION, (showForcePoll ? ACTION_FORCE_START_V3_POLL : ACTION_START_V3_POLL)); Input crawl = new Input( Input.Submit, KEY_ACTION, (showForceCrawl ? ACTION_FORCE_START_CRAWL : ACTION_START_CRAWL)); frm.add("<br><center>"); frm.add(v3Poll); frm.add(" "); frm.add(crawl); if (CurrentConfig.getBooleanParam(PARAM_ENABLE_DEEP_CRAWL, DEFAULT_ENABLE_DEEP_CRAWL)) { Input deepCrawl = new Input( Input.Submit, KEY_ACTION, (showForceCrawl ? ACTION_FORCE_START_DEEP_CRAWL : ACTION_START_DEEP_CRAWL)); Input depthText = new Input(Input.Text, KEY_REFETCH_DEPTH, formDepth); depthText.setSize(4); setTabOrder(depthText); frm.add(" "); frm.add(deepCrawl); frm.add(depthText); } Input checkSubstance = new Input(Input.Submit, KEY_ACTION, ACTION_CHECK_SUBSTANCE); frm.add("<br>"); frm.add(checkSubstance); if (metadataMgr != null) { Input reindex = new Input( Input.Submit, KEY_ACTION, (showForceReindexMetadata ? ACTION_FORCE_REINDEX_METADATA : ACTION_REINDEX_METADATA)); frm.add(" "); frm.add(reindex); Input disableIndexing = new Input(Input.Submit, KEY_ACTION, ACTION_DISABLE_METADATA_INDEXING); frm.add(" "); frm.add(disableIndexing); } frm.add("</center>"); comp.add(frm); return comp; } }
/** * RegistryArchivalUnit: The Archival Unit Class for RegistryPlugin. This archival unit uses a base URL * to define an archival unit. * * @author Seth Morabito * @version 1.0 */ public class RegistryArchivalUnit extends BaseArchivalUnit { protected static final Logger log = Logger.getLogger("RegistryArchivalUnit"); /** The interval between recrawls of the loadable plugin registry AUs. */ static final String PARAM_REGISTRY_CRAWL_INTERVAL = RegistryPlugin.PREFIX + "crawlInterval"; static final long DEFAULT_REGISTRY_CRAWL_INTERVAL = Constants.DAY; /** * If "au", registry AUs will crawl in parallel using individual rate limiters; if "plugin" * they'll crawl sequentially using a shared rate limiter */ static final String PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE = RegistryPlugin.PREFIX + "fetchRateLimiterSource"; static final String DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE = "au"; /** Limits fetch rate of registry crawls */ static final String PARAM_REGISTRY_FETCH_RATE = RegistryPlugin.PREFIX + "fetchRate"; static final String DEFAULT_REGISTRY_FETCH_RATE = "20/10s"; /** Run polls on Plugin registry AUs */ static final String PARAM_ENABLE_REGISTRY_POLLS = RegistryPlugin.PREFIX + "enablePolls"; static final boolean DEFAULT_ENABLE_REGISTRY_POLLS = true; private String m_registryUrl = null; private int m_maxRefetchDepth = NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH; private List m_permissionCheckers = null; private boolean recomputeRegName = true; private boolean enablePolls = DEFAULT_ENABLE_REGISTRY_POLLS; private String regName = null; public RegistryArchivalUnit(RegistryPlugin plugin) { super(plugin); } // Called by RegistryPlugin iff any config below RegistryPlugin.PREFIX // has changed protected void setConfig( Configuration config, Configuration prevConfig, Configuration.Differences changedKeys) { m_maxRefetchDepth = config.getInt( NewContentCrawler.PARAM_MAX_CRAWL_DEPTH, NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH); fetchRateLimiter = recomputeFetchRateLimiter(fetchRateLimiter); enablePolls = config.getBoolean(PARAM_ENABLE_REGISTRY_POLLS, DEFAULT_ENABLE_REGISTRY_POLLS); } public void loadAuConfigDescrs(Configuration config) throws ConfigurationException { super.loadAuConfigDescrs(config); this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey()); // Now we can construct a valid CC permission checker. m_permissionCheckers = // ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl)); ListUtil.list(new CreativeCommonsPermissionChecker()); paramMap.putLong( KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, CurrentConfig.getTimeIntervalParam( PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL)); if (log.isDebug2()) { log.debug2( "Setting Registry AU recrawl interval to " + StringUtil.timeIntervalToString( paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL))); } } /** * return a string that represents the plugin registry. This is just the base URL. * * @return The base URL. */ protected String makeName() { return "Plugin registry at '" + m_registryUrl + "'"; } public String getName() { if (recomputeRegName) { regName = recomputeRegName(); } if (regName != null) { return regName; } else { return super.getName(); } } // If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (several times, mostly from logging) before // enough of the mechanism has started to make it possible to resolve the CuUrl // below. 
return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } } boolean isStarted() { return getPlugin().getDaemon().getPluginManager().getAuFromId(getAuId()) != null; } /** * return a string that points to the plugin registry page. * * @return a string that points to the plugin registry page for this registry. This is just the * base URL. */ protected String makeStartUrl() { return m_registryUrl; } /** Call top level polls iff configured to do so. */ public boolean shouldCallTopLevelPoll(AuState aus) { if (!enablePolls) { return false; } return super.shouldCallTopLevelPoll(aus); } /** * Return a new CrawlSpec with the appropriate collect AND redistribute permissions, and with the * maximum refetch depth. * * @return CrawlSpec */ protected CrawlSpec makeCrawlSpec() throws LockssRegexpException { CrawlRule rule = makeRules(); List startUrls = getNewContentCrawlUrls(); return new SpiderCrawlSpec(startUrls, startUrls, rule, m_maxRefetchDepth, null, null); } /** * return the collection of crawl rules used to crawl and cache a list of Plugin JAR files. * * @return CrawlRule */ protected CrawlRule makeRules() { return new RegistryRule(); } // Might need to recompute name if refetch start page public UrlCacher makeUrlCacher(String url) { if (url.equals(m_registryUrl)) { recomputeRegName = true; } return super.makeUrlCacher(url); } protected RateLimiter recomputeFetchRateLimiter(RateLimiter oldLimiter) { String rate = CurrentConfig.getParam(PARAM_REGISTRY_FETCH_RATE, DEFAULT_REGISTRY_FETCH_RATE); Object limiterKey = getFetchRateLimiterKey(); if (limiterKey == null) { return RateLimiter.getRateLimiter(oldLimiter, rate, DEFAULT_REGISTRY_FETCH_RATE); } else { RateLimiter.Pool pool = RateLimiter.getPool(); return pool.findNamedRateLimiter(limiterKey, rate, DEFAULT_REGISTRY_FETCH_RATE); } } protected String getFetchRateLimiterSource() { return CurrentConfig.getParam( PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE, DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE); } // Registry AU crawl rule implementation private class RegistryRule implements CrawlRule { public int match(String url) { if (StringUtil.equalStringsIgnoreCase(url, m_registryUrl) || StringUtil.endsWithIgnoreCase(url, ".jar")) { return CrawlRule.INCLUDE; } else { return CrawlRule.EXCLUDE; } } } }
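/*
 * A hedged sketch -- hypothetical class and URL, not part of the LOCKSS
 * source -- of the crawl rule RegistryRule implements above: a registry AU
 * collects only its own start page and JAR files, excluding everything else.
 * The same include test is shown here as a standalone predicate.
 */
class RegistryRuleSketch {
  static final String REGISTRY_URL = "http://props.example.org/plugins/";

  // Mirrors RegistryRule.match(): INCLUDE for the start page or any *.jar.
  static boolean include(String url) {
    return url.equalsIgnoreCase(REGISTRY_URL)
        || url.toLowerCase().endsWith(".jar");
  }

  public static void main(String[] args) {
    System.out.println(include(REGISTRY_URL));                                // true
    System.out.println(include("http://props.example.org/plugins/foo.JAR"));  // true (case-insensitive)
    System.out.println(include("http://props.example.org/index.html"));       // false
  }
}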
/** * Class implementing the concept of the set of URLs covered by a poll. * * @author Claire Griffin * @version 1.0 */ public class PollSpec { /** * A lower bound value which indicates the poll should use a {@link SingleNodeCachedUrlSetSpec} * instead of a {@link RangeCachedUrlSetSpec}. */ public static final String SINGLE_NODE_LWRBOUND = "."; public static final String DEFAULT_PLUGIN_VERSION = "1"; private static Logger theLog = Logger.getLogger("PollSpec"); private String auId; private String pluginVersion; private String url; private String uprBound = null; private String lwrBound = null; private CachedUrlSet cus = null; private PluginManager pluginMgr = null; private int protocolVersion; // poll protocol version private int pollType; // One of the types defined by Poll private V3Poller.PollVariant variant = V3Poller.PollVariant.PoR; /** * Construct a PollSpec from a CachedUrlSet and an upper and lower bound * * @param cus the CachedUrlSet * @param lwrBound the lower boundary * @param uprBound the upper boundary * @param pollType one of the types defined by Poll */ public PollSpec(CachedUrlSet cus, String lwrBound, String uprBound, int pollType) { commonSetup(cus, lwrBound, uprBound, pollType); } /** * Construct a PollSpec from a CachedUrlSet. * * @param cus the CachedUrlSet which defines the range of interest * @param pollType one of the types defined by Poll */ public PollSpec(CachedUrlSet cus, int pollType) { CachedUrlSetSpec cuss = cus.getSpec(); if (cuss instanceof RangeCachedUrlSetSpec) { RangeCachedUrlSetSpec rcuss = (RangeCachedUrlSetSpec) cuss; commonSetup(cus, rcuss.getLowerBound(), rcuss.getUpperBound(), pollType); } else if (cuss.isSingleNode()) { commonSetup(cus, SINGLE_NODE_LWRBOUND, null, pollType); } else { commonSetup(cus, null, null, pollType); } } /** * Construct a PollSpec from a V1 LcapMessage * * @param msg the LcapMessage which defines the range of interest */ public PollSpec(V1LcapMessage msg) { auId = msg.getArchivalId(); pluginVersion = msg.getPluginVersion(); url = msg.getTargetUrl(); uprBound = msg.getUprBound(); lwrBound = msg.getLwrBound(); protocolVersion = msg.getProtocolVersion(); if (msg.isContentPoll()) { pollType = Poll.V1_CONTENT_POLL; } else if (msg.isNamePoll()) { pollType = Poll.V1_NAME_POLL; } else if (msg.isVerifyPoll()) { pollType = Poll.V1_VERIFY_POLL; } else { pollType = -1; } cus = getPluginManager().findCachedUrlSet(this); } public PollSpec(V3LcapMessage msg) { this( msg.getArchivalId(), (msg.getTargetUrl() == null) ? 
"lockssau:" : msg.getTargetUrl(), null, null, Poll.V3_POLL); protocolVersion = msg.getProtocolVersion(); pluginVersion = msg.getPluginVersion(); } /** Construct a PollSpec from explicit args */ public PollSpec(String auId, String url, String lower, String upper, int pollType) { this.auId = auId; this.url = url; uprBound = upper; lwrBound = lower; this.pollType = pollType; cus = getPluginManager().findCachedUrlSet(this); this.protocolVersion = protocolVersionFromPollType(pollType); } /** * Construct a PollSpec from another PollSpec and a poll type XXX it seems that other constructors * are not setting all fields */ public PollSpec(PollSpec ps, int pollType) { this.auId = ps.auId; this.pluginVersion = ps.pluginVersion; this.url = ps.url; this.uprBound = ps.uprBound; this.lwrBound = ps.lwrBound; this.cus = ps.cus; this.pluginMgr = ps.pluginMgr; this.protocolVersion = ps.protocolVersion; this.pollType = pollType; } /** Setup common to most constructors */ private void commonSetup(CachedUrlSet cus, String lwrBound, String uprBound, int pollType) { CachedUrlSetSpec cuss = cus.getSpec(); if (cuss instanceof PrunedCachedUrlSetSpec) { throw new IllegalArgumentException("Polls do not support PrunedCachedUrlSetSpec"); } this.cus = cus; ArchivalUnit au = cus.getArchivalUnit(); auId = au.getAuId(); this.pluginVersion = AuUtil.getPollVersion(au); url = cuss.getUrl(); this.lwrBound = lwrBound; this.uprBound = uprBound; this.protocolVersion = protocolVersionFromPollType(pollType); this.pollType = pollType; } public CachedUrlSet getCachedUrlSet() { return cus; } public String getAuId() { return auId; } public String getPluginVersion() { return (pluginVersion != null) ? pluginVersion : DEFAULT_PLUGIN_VERSION; } public String getUrl() { return url; } public String getLwrBound() { return lwrBound; } public String getUprBound() { return uprBound; } public String getRangeString() { if (StringUtil.equalStrings(lwrBound, SINGLE_NODE_LWRBOUND)) { return "single node"; } if (lwrBound != null || uprBound != null) { String lwrDisplay = lwrBound; String uprDisplay = uprBound; if (lwrBound != null && lwrBound.startsWith("/")) { lwrDisplay = lwrBound.substring(1); } if (uprBound != null && uprBound.startsWith("/")) { uprDisplay = uprBound.substring(1); } return lwrDisplay + " - " + uprDisplay; } return null; } public int getProtocolVersion() { return protocolVersion; } public int getPollType() { return pollType; } public V3Poller.PollVariant getPollVariant() { return variant; } public void setPollVariant(V3Poller.PollVariant v) { variant = v; } private PluginManager getPluginManager() { if (pluginMgr == null) { pluginMgr = (PluginManager) LockssDaemon.getManager(LockssDaemon.PLUGIN_MANAGER); } return pluginMgr; } /** * Given a poll type, return the correct version of the protocol to use. * * @param pollType * @return The protocol version to use */ private int protocolVersionFromPollType(int pollType) { switch (pollType) { case Poll.V1_CONTENT_POLL: case Poll.V1_NAME_POLL: case Poll.V1_VERIFY_POLL: return Poll.V1_PROTOCOL; case Poll.V3_POLL: return Poll.V3_PROTOCOL; default: return Poll.UNDEFINED_PROTOCOL; } } public String toString() { return "[PS: " + Poll.POLL_NAME[pollType] + " auid=" + auId + ", url=" + url + ", l=" + lwrBound + ", u=" + uprBound + ", type=" + pollType + ", plugVer=" + getPluginVersion() + ", protocol=" + protocolVersion + "]"; } }
public class TestElsevierDTD5XmlMetadataExtractor extends SourceXmlMetadataExtractorTest { private static final Logger log = Logger.getLogger(TestElsevierDTD5XmlMetadataExtractor.class); private MockLockssDaemon theDaemon; protected ArchivalUnit tarAu; private static String PLUGIN_NAME = "org.lockss.plugin.elsevier.ClockssElsevierDTD5SourcePlugin"; private static String BASE_URL = "http://www.source.org/"; private static String YEAR_NAME = "2014"; private static String TAR_A_BASE = BASE_URL + YEAR_NAME + "/CLKS003A.tar"; private static String TAR_B_BASE = BASE_URL + YEAR_NAME + "/CLKS003B.tar"; private static String SUBDIR = "!/CLKS003/"; CIProperties tarHeader; /* for testing validation */ private static Map<String, String> pubTitleMap; private static Map<String, String> dateMap; private static Map<String, String> accessUrlMap; private static Map<String, String> volMap; private static Map<String, String> issueMap; private static Map<String, List<String>> authorMap; static FileMetadataListExtractor els_mle; static FileMetadataListExtractor nocheck_mle; private static final String testDatasetFile = "testDataset.xml"; private static final String realTARFile_A = "CLKS003A.tar"; private static final String realTARFile_B = "CLKS003B.tar"; public void setUp() throws Exception { super.setUp(); tarHeader = new CIProperties(); tarHeader.put(CachedUrl.PROPERTY_CONTENT_TYPE, "application/tar"); tarAu = createTarAu(); // for tests that also check for content els_mle = new FileMetadataListExtractor( new ElsevierDTD5XmlSourceMetadataExtractorFactory .ElsevierDTD5XmlSourceMetadataExtractor()); // for tests that use a no-check-for-pdf version of the extractor nocheck_mle = new FileMetadataListExtractor(new TestElsevierDTD5MetadataExtractor()); setUpExpectedTarContent(); } protected ArchivalUnit createTarAu() throws ArchivalUnit.ConfigurationException { // in this directory this is file "test_elsevierdtd5.tdb" but it becomes xml try { ConfigurationUtil.addFromUrl(getResource("test_elsevierdtd5.xml")); } catch (IOException e) { e.printStackTrace(); } Tdb tdb = ConfigManager.getCurrentConfig().getTdb(); TdbAu tdbau1 = tdb.getTdbAusLikeName("Elsevier Source Content 2014").get(0); assertNotNull("Didn't find named TdbAu", tdbau1); return PluginTestUtil.createAndStartAu(tdbau1); } /* * The tests to run for this class */ public void testSimpleMainXML() throws Exception { log.debug3("testSimpleMainXML"); String xml_url = TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.xml"; List<ArticleMetadata> mdList = extractFromContent(xml_url, "text/xml", simpleMain, nocheck_mle, null); assertEquals(1, mdList.size()); validateSingleMainMetadataRecord(mdList.get(0), "10.1016/j.jidx.2014.07.028", "article"); } public void testSimpleDatasetXML() throws Exception { log.debug3("testSimpleDatasetXML"); String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile)); String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml"; List<ArticleMetadata> mdList = extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null); assertEquals(6, mdList.size()); Iterator<ArticleMetadata> mdIt = mdList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata) mdIt.next(); validateDatasetMetadataRecord(mdRecord); } } public void testFunctionalFromTarHierarchy() throws Exception { log.debug3("in testFunctionalFromTarHierarchy"); // load the tarballs InputStream file_input = null; try { file_input = getResourceAsStream(realTARFile_A); // UrlCacher 
uc = au.makeUrlCacher(TAR_A_BASE); // uc.storeContent(file_input, tarHeader); UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); file_input = getResourceAsStream(realTARFile_B); // uc = au.makeUrlCacher(TAR_B_BASE); // uc.storeContent(file_input, tarHeader); uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); } catch (IOException e) { e.printStackTrace(); } finally { IOUtil.safeClose(file_input); } CachedUrlSet cus = tarAu.getAuCachedUrlSet(); for (CachedUrl cu : cus.getCuIterable()) { log.debug3("AU - cu is: " + cu.getUrl()); cu.release(); } // We need to start from the level of the ArticleMetadataExtractor MyListEmitter emitter = new MyListEmitter(); ArticleMetadataExtractor amEx = new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA); Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any()); while (it.hasNext()) { ArticleFiles af = it.next(); log.debug3("Metadata test - articlefiles " + af.toString()); // CachedUrl cu = af.getFullTextCu(); CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA); log.debug3("metadata cu is " + cu.getUrl()); // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu); amEx.extract(MetadataTarget.Any(), af, emitter); List<ArticleMetadata> returnList = emitter.getAmList(); assertNotNull(returnList); log.debug3("size of returnList is " + returnList.size()); Iterator<ArticleMetadata> mdIt = returnList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata) mdIt.next(); validateCompleteMetadataRecord(mdRecord); } } } /* * The supporting methods */ private void setUpExpectedTarContent() { /* maps the DOIs in the metadata to the expected values */ log.debug3("setUpExpectedTarContent"); pubTitleMap = new HashMap<String, String>(); { pubTitleMap.put("10.1016/j.jidx.2014.07.028", "International Journal of XXX"); pubTitleMap.put("10.1016/j.jidx2.2014.05.013", "Revista"); pubTitleMap.put("10.1016/S1473-1111(14)70840-0", "The Journal"); pubTitleMap.put("10.1016/S0140-1111(14)61865-1", "The Other Journal"); pubTitleMap.put("10.1016/j.foo.2014.08.001", "Foo"); pubTitleMap.put("10.1016/j.foo.2014.08.123", "Foo"); } dateMap = new HashMap<String, String>(); { dateMap.put("10.1016/j.jidx.2014.07.028", "2014-07-30"); dateMap.put("10.1016/j.jidx2.2014.05.013", "2014-07-09"); dateMap.put("10.1016/S1473-1111(14)70840-0", "2014-09-01"); dateMap.put("10.1016/S0140-1111(14)61865-1", "2014"); // will get from main.xml as backup dateMap.put("10.1016/j.foo.2014.08.001", "2014-08-20"); dateMap.put("10.1016/j.foo.2014.08.123", "2014-08-20"); } accessUrlMap = new HashMap<String, String>(); { accessUrlMap.put( "10.1016/j.jidx.2014.07.028", TAR_A_BASE + SUBDIR + "01420615/v64sC/S0142061514004608/main.pdf"); accessUrlMap.put( "10.1016/j.jidx2.2014.05.013", TAR_A_BASE + SUBDIR + "00349356/v61i9/S0034935614001819/main.pdf"); accessUrlMap.put( "10.1016/S1473-1111(14)70840-0", TAR_A_BASE + SUBDIR + "14733099/v14i10/S1473309914708400/main.pdf"); accessUrlMap.put( "10.1016/S0140-1111(14)61865-1", TAR_B_BASE + SUBDIR + "01406736/v384sS1/S0140673614618651/main.pdf"); accessUrlMap.put( "10.1016/j.foo.2014.08.001", TAR_B_BASE + SUBDIR + "00191035/v242sC/S0019103514004151/main.pdf"); accessUrlMap.put( "10.1016/j.foo.2014.08.123", TAR_B_BASE + SUBDIR + "00191035/v242sC/S0019103514003856/main.pdf"); } 
ArrayList<String> goodAuthors = new ArrayList<String>(); { goodAuthors.add("Writer, Bob"); goodAuthors.add("Q. Text, Samantha"); } ArrayList<String> simpleAuthors = new ArrayList<String>(); { simpleAuthors.add("Simple, Josh"); } ArrayList<String> extendedAuthors = new ArrayList<String>(); { extendedAuthors.add("Writer, Bob"); extendedAuthors.add("Q. Text, Samantha"); extendedAuthors.add("The COLLABORATIVE Investigators"); } authorMap = new HashMap<String, List<String>>(); { authorMap.put("10.1016/j.jidx.2014.07.028", goodAuthors); authorMap.put("10.1016/j.jidx2.2014.05.013", goodAuthors); authorMap.put("10.1016/S1473-1111(14)70840-0", extendedAuthors); authorMap.put("10.1016/S0140-1111(14)61865-1", simpleAuthors); authorMap.put("10.1016/j.foo.2014.08.001", goodAuthors); authorMap.put("10.1016/j.foo.2014.08.123", goodAuthors); } volMap = new HashMap<String, String>(); { volMap.put("10.1016/j.jidx.2014.07.028", "64"); volMap.put("10.1016/j.jidx2.2014.05.013", "61"); volMap.put("10.1016/S1473-1111(14)70840-0", "14"); volMap.put("10.1016/S0140-1111(14)61865-1", "384"); volMap.put("10.1016/j.foo.2014.08.001", "242"); volMap.put("10.1016/j.foo.2014.08.123", "242"); } issueMap = new HashMap<String, String>(); { issueMap.put("10.1016/j.jidx.2014.07.028", "C"); issueMap.put("10.1016/j.jidx2.2014.05.013", "9"); issueMap.put("10.1016/S1473-1111(14)70840-0", "10"); issueMap.put("10.1016/S0140-1111(14)61865-1", "S1"); issueMap.put("10.1016/j.foo.2014.08.001", "C"); issueMap.put("10.1016/j.foo.2014.08.123", "C"); } } private String common_issn = "1111-1111"; private String common_article_title = "Article about Important Things"; // private String common_simple_article_title = "Simple Article Title for Update"; private String common_simple_article_title = "Newsdesk Simple Dochead"; /* * When testing a complete extraction out of the tarset, the MD record will be completely filled in * and pdf existence will be established */ private void validateCompleteMetadataRecord(ArticleMetadata am) { log.debug3("validateCompleteMetadataRecord"); String doi_val = am.get(MetadataField.FIELD_DOI); /* make sure we can pick up both types of xml article data */ log.debug3("doi val is: " + doi_val); if ("JA 5.2.0 SIMPLE-ARTICLE" .equals(am.getRaw(ElsevierDatasetXmlSchemaHelper.dataset_dtd_metadata))) { log.debug3("simple-article"); assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } else { assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } assertEquals(common_issn, am.get(MetadataField.FIELD_ISSN)); assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR)); assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE)); assertEquals(accessUrlMap.get(doi_val), am.get(MetadataField.FIELD_ACCESS_URL)); assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME)); assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE)); assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals("Elsevier", am.get(MetadataField.FIELD_PROVIDER)); assertEquals("Elsevier", am.get(MetadataField.FIELD_PUBLISHER)); log.debug3(am.ppString(2)); } /* * When testing no-pdf-check basic XML parsing, you will get partial MD records * depending on whether the info comes from dataset.xml or from main.xml */ private void validateDatasetMetadataRecord(ArticleMetadata am) { log.debug3("validateDatasetMetadataRecord"); String doi_val = am.get(MetadataField.FIELD_DOI); assertEquals(common_issn, 
am.get(MetadataField.FIELD_ISSN)); log.debug3("doi val is: " + doi_val);
// The dataset doesn't set this value; it will fall back to the main.xml value
if (doi_val.equals("10.1016/S0140-1111(14)61865-1")) { assertEquals(null, am.get(MetadataField.FIELD_DATE)); } else { assertEquals(dateMap.get(doi_val), am.get(MetadataField.FIELD_DATE)); } assertEquals(pubTitleMap.get(doi_val), am.get(MetadataField.FIELD_PUBLICATION_TITLE)); } /* * You will have to tell it the DOI and the schema because those normally come from dataset.xml */ private void validateSingleMainMetadataRecord(ArticleMetadata am, String doi_val, String schema) { log.debug3("validateSingleMainMetadataRecord"); if ("simple-article".equals(schema)) { assertEquals(common_simple_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } else { assertEquals(common_article_title, am.get(MetadataField.FIELD_ARTICLE_TITLE)); } log.debug3("doi val is: " + doi_val); assertEquals(authorMap.get(doi_val), am.getList(MetadataField.FIELD_AUTHOR)); assertEquals(volMap.get(doi_val), am.get(MetadataField.FIELD_VOLUME)); assertEquals(issueMap.get(doi_val), am.get(MetadataField.FIELD_ISSUE)); assertEquals("Comment", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_dochead)); assertEquals(doi_val, am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_doi)); assertEquals("2014", am.getRaw(ElsevierMainDTD5XmlSchemaHelper.common_copyright)); } private static final String simpleMain = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + "<!DOCTYPE article PUBLIC \"-//ES//DTD journal article DTD version 5.2.0//EN//XML\" \"art520.dtd\">" + "<article docsubtype=\"fla\" xml:lang=\"en\">" + "<item-info><jid>TEST</jid>" + "<aid>9906</aid>" + "<ce:article-number>e09906</ce:article-number>" + "<ce:pii>S9999-9994(15)00010-0</ce:pii>" + "<ce:doi>10.1016/j.jidx.2014.07.028</ce:doi>" + "<ce:copyright type=\"full-transfer\" year=\"2014\">Elsevier GmbH</ce:copyright>" + "</item-info>" + "<head>" + "<ce:dochead id=\"cedoch10\"><ce:textfn>Comment</ce:textfn></ce:dochead>" + "<ce:title id=\"tm005\">Article about Important Things</ce:title>" + "<ce:author-group id=\"ag005\">" + "<ce:author id=\"au005\">" + "<ce:given-name>Bob</ce:given-name><ce:surname>Writer</ce:surname>" + "<ce:cross-ref id=\"ar005\" refid=\"af005\"><ce:sup>a</ce:sup></ce:cross-ref>" + "<ce:cross-ref id=\"ar010\" refid=\"cor1\"><ce:sup>⁎</ce:sup></ce:cross-ref>" + "<ce:e-address id=\"em005\" type=\"email\">[email protected]</ce:e-address>" + "</ce:author>" + "<ce:author id=\"au001\">" + "<ce:given-name>Samantha</ce:given-name><ce:surname>Q. 
Text</ce:surname>" + "<ce:cross-ref id=\"ar001\" refid=\"af001\"><ce:sup>a</ce:sup></ce:cross-ref>" + "<ce:cross-ref id=\"ar010\" refid=\"cor1\"><ce:sup>⁎</ce:sup></ce:cross-ref>" + "<ce:e-address id=\"em005\" type=\"email\">[email protected]</ce:e-address>" + "</ce:author>" + "</ce:author-group>" + "<ce:date-received day=\"1\" month=\"1\" year=\"2014\"/>" + "<ce:date-revised day=\"26\" month=\"7\" year=\"2014\"/>" + "<ce:date-accepted day=\"3\" month=\"8\" year=\"2014\"/>" + "<ce:abstract class=\"author\" xml:lang=\"en\" id=\"ab005\"><ce:section-title id=\"st050\">Abstract</ce:section-title>" + "<ce:abstract-sec id=\"as005\"><ce:simple-para id=\"sp005\">Abstract goes here.</ce:simple-para></ce:abstract-sec>" + "</ce:abstract>" + "</head>" + "<body>" + "</body>" + "<tail>" + "</tail>" + "</article>"; static class MyListEmitter implements ArticleMetadataExtractor.Emitter { List<ArticleMetadata> amlst = new ArrayList<ArticleMetadata>(); public void emitMetadata(ArticleFiles af, ArticleMetadata md) { if (log.isDebug3()) log.debug3("emit(" + af + ", " + md + ")"); if (md != null) { log.debug3("add " + md + " to amlist"); amlst.add(md); } } public List<ArticleMetadata> getAmList() { return amlst; } } /* * A test version of the extractor that suppresses the file-existence check, so that basic * XML parsing can be tested without having to provide the actual file content */ public class TestElsevierDTD5MetadataExtractor extends ElsevierDTD5XmlSourceMetadataExtractorFactory.ElsevierDTD5XmlSourceMetadataExtractor {
// Override implementation of getFilenamesAssociatedWithRecord to force
// emit for testing purposes - while allowing use of the Elsevier extractor.
// If a null list is returned, preEmitCheck returns "true", allowing emit.
protected ArrayList<String> getFilenamesAssociatedWithRecord(SourceXmlSchemaHelper helper, CachedUrl cu, ArticleMetadata oneAM) { return null; } } }
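/*
 * Illustrative sketch (not part of the original tests): how the pieces above fit together.
 * An emitter-based extraction run mirrors the tar test's loop; every name used here
 * (MyListEmitter, amEx, af, validateCompleteMetadataRecord) is defined in this file,
 * nothing new is assumed:
 *
 *   MyListEmitter emitter = new MyListEmitter();
 *   amEx.extract(MetadataTarget.Any(), af, emitter);
 *   for (ArticleMetadata am : emitter.getAmList()) {
 *     validateCompleteMetadataRecord(am);
 *   }
 *
 * The TestElsevierDTD5MetadataExtractor override returns null from
 * getFilenamesAssociatedWithRecord(), so preEmitCheck() succeeds without the main.pdf
 * files being present; that is what lets the schema tests run against bare XML strings
 * such as simpleMain.
 */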
/** Implements the "add AU" command: verifies the request and, for a create action, configures and creates the archival unit */ public class AddAuConfigure extends AuActivityBase { private static String NAME = "AddAuConfigure"; private static Logger log = Logger.getLogger(NAME); public AddAuConfigure() { super(); } /** * Perform remote setup and verification * * @return true on success */ public boolean doRemoteSetupAndVerification() throws IOException { /* * Stop if any required parameters are missing (error) */ if (!verifyMinimumParameters()) { throw new ResponseException("Missing required parameters"); } /* * Initial page setup */ return commandSetup(); } /** * Populate the response body * * @return true on success */ public boolean doCommand() throws IOException { Element infoElement; /* * Return disk space */ infoElement = getXmlUtils().createElement(getResponseRoot(), AP_E_INFO); renderDiskXml(infoElement); /* * No further action if this isn't a create command (success) */ if (!isCreateCommand()) { return true; } /* * Stop if any required parameters are missing (error) */ if (!verifyTarget() || !verifyMinimumParameters() || !verifyDefiningParameters()) { throw new ResponseException("Missing required parameters"); } /* * Create the AU */ if (!commandSetup()) { return false; } return createAu(); } /* * "Helpers" */ /** * Did the client provide the minimal parameters required? * * @return true If so */ private boolean verifyMinimumParameters() { int count = 0; if (!StringUtil.isNullString(getParameter(AP_E_PUBLICATION))) count++; if (!StringUtil.isNullString(getParameter(AP_E_CLASSNAME))) count++; if (!StringUtil.isNullString(getParameter(AP_E_PLUGIN))) count++; return (count > 0); } /** * A target system is required to create an AU - was it provided? * * @return true If at least one target was specified */ private boolean verifyTarget() { if (!isCreateCommand()) { return true; } return !StringUtil.isNullString(getParameter(AP_E_TARGET)); } /** * Are all of the "defining parameters" required to create an AU available? * * @return true If so */ private boolean verifyDefiningParameters() { KeyedList parameters; int size; if (!isCreateCommand()) { return true; } parameters = ParseUtils.getDynamicFields(getXmlUtils(), getRequestDocument(), AP_MD_AUDEFINING); size = parameters.size(); for (int i = 0; i < size; i++) { if (StringUtil.isNullString((String) parameters.getValue(i))) { return false; } } return true; } /** * "Create" command? * * @return true If so... */ private boolean isCreateCommand() { return "create".equalsIgnoreCase(getParameter(AP_E_ACTION)); } /** Query the daemon for information required to set up this command */ private boolean commandSetup() { Configuration configuration = null; Collection noEditKeys = null; String key; String value; /* * Configure a well known publication? */ if ((value = getParameter(AP_E_PUBLICATION)) != null) { PluginProxy plugin = getTitlePlugin(value); /* * Set plugin and Title configuration information */ if (plugin == null) { String message = "Unknown Publication: " + value; log.warning(message); return error(message); } setPlugin(plugin); setTitleConfig(plugin.getTitleConfig(value)); configuration = getTitleConfig().getConfig(); noEditKeys = getNoEditKeys(); } else { /* * Lookup by Plugin or Class name - set the plugin * * NB: As of 23-Feb-04, this is not supported from AddAuPage.java. See * AddAuWithCompleteFunctionalityPage.java for full support. 
*/ if ((value = getParameter(AP_E_PLUGIN)) != null) { key = RemoteApi.pluginKeyFromId(value); } else if ((value = getParameter(AP_E_CLASSNAME)) != null) { key = RemoteApi.pluginKeyFromId(value); } else { return error("Supply a Publication, Plugin, or Class name"); } if (StringUtil.isNullString(key)) { return error("Supply a valid Publication, Plugin, or Class name"); } if (!pluginLoaded(key)) { return error("Plugin is not loaded: " + key); } setPlugin(getPluginProxy(key)); } /* * Finally, return an XML rendition of the Plugin and AU key set up */ generateSetupXml(configuration, noEditKeys); return true; } /** * Create an Archival Unit * * @return true If successful */ private boolean createAu() { Configuration config = getAuConfigFromForm(); AuProxy au; Element element; try { au = getRemoteApi().createAndSaveAuConfiguration(getPlugin(), config); } catch (ArchivalUnit.ConfigurationException exception) { return error("Configuration failed: " + exception.getMessage()); } catch (IOException exception) { return error("Unable to save configuration: " + exception.getMessage()); } /* * Successful creation - add the AU name and ID to the response document */ element = getXmlUtils().createElement(getResponseRoot(), AP_E_AU); XmlUtils.addText(element, au.getName()); element = getXmlUtils().createElement(getResponseRoot(), AP_E_AUID); XmlUtils.addText(element, au.getAuId()); return true; } }
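/*
 * Flow sketch (descriptive only; the AP_E_* element names are constants defined elsewhere
 * in this package): a request with action="create" must supply a target plus one of
 * publication, plugin id, or class name. doCommand() always renders the disk-info element,
 * then for a create it verifies the target and the AU "defining parameters", resolves the
 * plugin via commandSetup(), and finally createAu() saves the configuration through
 * RemoteApi and echoes the new AU's name and auId into the response document.
 */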
public class TestHighWireArticleIteratorFactory extends ArticleIteratorTestCase { static Logger log = Logger.getLogger(TestHighWireArticleIteratorFactory.class); private SimulatedArchivalUnit sau; // Simulated AU to generate content private static String PLUGIN_NAME = "org.lockss.plugin.highwire.HighWirePressPlugin"; private static String BASE_URL = "http://pediatrics.aappublications.org/"; private static String SIM_ROOT = BASE_URL + "cgi/reprint/"; public void setUp() throws Exception { super.setUp(); String tempDirPath = setUpDiskSpace(); au = createAu(); sau = PluginTestUtil.createAndStartSimAu(simAuConfig(tempDirPath)); } public void tearDown() throws Exception { sau.deleteContentTree(); // theDaemon.stopDaemon(); super.tearDown(); } Configuration simAuConfig(String rootPath) { Configuration conf = ConfigManager.newConfiguration(); conf.put("root", rootPath); conf.put("base_url", SIM_ROOT); conf.put("depth", "0"); conf.put("branch", "0"); conf.put("numFiles", "2"); conf.put( "fileTypes", "" + (SimulatedContentGenerator.FILE_TYPE_PDF | SimulatedContentGenerator.FILE_TYPE_HTML)); conf.put("binFileSize", "7"); return conf; } protected ArchivalUnit createAu() throws ArchivalUnit.ConfigurationException { return PluginTestUtil.createAndStartAu( PLUGIN_NAME, ConfigurationUtil.fromArgs( "base_url", "http://pediatrics.aappublications.org/", "volume_name", "52", "journal_issn", "1098-4275")); } public void testRoots() throws Exception { SubTreeArticleIterator artIter = createSubTreeIter(); System.out.println("Root Urls::" + getRootUrls(artIter)); assertEquals( ListUtil.list( "http://pediatrics.aappublications.org/cgi/content/full/52/", "http://pediatrics.aappublications.org/cgi/reprint/52/"), getRootUrls(artIter)); } public void testUrlsWithPrefixes() throws Exception { SubTreeArticleIterator artIter = createSubTreeIter(); Pattern pat = getPattern(artIter); assertMatchesRE( pat, "http://pediatrics.aappublications.org/cgi/reprint/foo;52/Supplement_3/S69.pdf"); assertMatchesRE( pat, "http://pediatrics.aappublications.org/cgi/reprint/52/supplement_3/S69.pdf"); assertNotMatchesRE( pat, "http://pediatrics.aappublications.org/cgi/reprin/1014174823t49006/j0143.pdfwrong"); assertNotMatchesRE( pat, "http://pediatrics.aappublications.org/cgi/reprintt/1014174823t49006/j0143.pdfwrong"); assertNotMatchesRE(pat, "http://www.example.com/content/"); assertNotMatchesRE(pat, "http://www.example.com/content/j"); assertNotMatchesRE(pat, "http://www.example.com/content/j0123/j383.pdfwrong"); } public void testCreateArticleFiles() throws Exception { PluginTestUtil.crawlSimAu(sau); String pat0 = "001file[.]html"; String rep0 = "52/1/S1"; PluginTestUtil.copyAu(sau, au, ".*[.]html$", pat0, rep0); String pat1 = "001file[.]pdf"; String rep1 = "52/1/S1.pdf"; PluginTestUtil.copyAu(sau, au, ".*[.]pdf$", pat1, rep1); String pdfurl = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1.pdf"; String url = "http://pediatrics.aappublications.org/cgi/reprint/52/1/S1"; au.makeCachedUrl(url); CachedUrl cu = au.makeCachedUrl(pdfurl); assertNotNull(cu); SubTreeArticleIterator artIter = createSubTreeIter(); assertNotNull(artIter); ArticleFiles af = artIter.next(); assertNotNull(af); System.out.println("article files::" + af); assertEquals(url, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF_LANDING_PAGE).getUrl()); assertEquals(pdfurl, af.getRoleCu(ArticleFiles.ROLE_FULL_TEXT_PDF).getUrl()); } }
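/*
 * Note (illustrative, inferred from the pat/rep arguments above): testCreateArticleFiles()
 * relies on PluginTestUtil.copyAu to rewrite the simulated-content URLs into plugin-shaped
 * ones, e.g. a simulated "001file.pdf" becomes ".../cgi/reprint/52/1/S1.pdf", so the
 * article iterator under test sees realistic HighWire reprint URLs and can pair the PDF
 * with its landing page.
 */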
/** * @author Thomas S. Robertson * @version 0.0 */ public class PluginUtil { static Logger log = Logger.getLogger("PluginUtil"); static final String PREFIX = Configuration.PREFIX + "PluginUtil."; /** * If true, the logic in getBaseUrl() that is responsible for turning the url <code>foo</code> * into the base url <code>foo/</code> (if that is the name the page was actually collected under) * will do that only if the url doesn't end with a slash and the nodeUrl in the props does. This * matches the logic in LockssResourceHandler.handleLockssRedirect(). But it's expensive, and I * don't think the nodeUrl prop should ever differ from the url in any other situation, so I * don't think it's necessary. */ public static final String PARAM_DIR_NODE_CHECK_SLASH = PREFIX + "dirNodeCheckSlash"; public static final boolean DEFAULT_DIR_NODE_CHECK_SLASH = false; private static boolean dirNodeCheckSlash = DEFAULT_DIR_NODE_CHECK_SLASH; /** Called by org.lockss.config.MiscConfig */ public static void setConfig(Configuration config, Configuration oldConfig, Configuration.Differences diffs) { if (diffs.contains(PREFIX)) { dirNodeCheckSlash = config.getBoolean(PARAM_DIR_NODE_CHECK_SLASH, DEFAULT_DIR_NODE_CHECK_SLASH); } } /** * Returns the base url of the provided CachedUrl, checking to see if it's the result of a * redirect. */ public static String getBaseUrl(CachedUrl cu) {
// See the comments in LockssResourceHandler.handleLockssRedirect();
// this is the same logic.
CIProperties props = cu.getProperties(); if (props != null) { String redir = props.getProperty(CachedUrl.PROPERTY_CONTENT_URL); if (redir != null) { return redir; } else { String url = cu.getUrl(); String nodeUrl = props.getProperty(CachedUrl.PROPERTY_NODE_URL); if (nodeUrl != null && !nodeUrl.equals(url)) { log.debug("getBaseUrl(" + url + "), nodeUrl: " + nodeUrl); if (dirNodeCheckSlash) { try {
// java.net.URI's constructor throws the checked URISyntaxException, so the
// parse must be guarded; on a bad URL we fall through to cu.getUrl() below.
URI uri = new URI(url); if (!uri.getPath().endsWith("/")) { URI nodeUri = new URI(nodeUrl); if (nodeUri.getPath().endsWith("/")) { return nodeUrl; } } } catch (URISyntaxException e) { log.warning("getBaseUrl() couldn't parse: " + url); } } else { return nodeUrl; } } } } return cu.getUrl(); } }
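/*
 * Behavior sketch (hypothetical property values, for illustration only): if a CU was
 * collected through a redirect, its PROPERTY_CONTENT_URL wins outright. Otherwise, with
 * dirNodeCheckSlash enabled, a CU url of "http://host/dir" whose PROPERTY_NODE_URL is
 * "http://host/dir/" resolves to the slash-terminated node URL; with the flag disabled
 * (the default), any node URL that differs from the CU url is returned as-is.
 */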
/** * An Exploder that ingests Internet Archive WARC files and behaves as if it had ingested each file * in the WARC file directly from its original source. * * @author David S. H. Rosenthal * @author Felix Ostrowski * @version 0.0 */ public class WarcExploder extends Exploder { private static Logger logger = Logger.getLogger("WarcExploder"); protected InputStream arcStream; protected CIProperties arcProps; /** * Constructor * * @param toExplode the FetchedUrlData for the archive to be exploded * @param crawlFacade the facade of the crawler that found the archive * @param helper the ExploderHelper used to process each archive entry */ public WarcExploder(FetchedUrlData toExplode, CrawlerFacade crawlFacade, ExploderHelper helper) { super(toExplode, crawlFacade, helper); arcStream = toExplode.input; arcProps = toExplode.headers; } /** Explode the archive into its constituent elements */ public void explode() throws CacheException { CachedUrl cachedUrl = null; int goodEntries = 0; int badEntries = 0; int ignoredEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.info((storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { if (storeArchive) { UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl)); BitSet bs = new BitSet(); bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG); uc.setFetchFlags(bs); uc.storeContent(); archiveData.resetInputStream(); arcStream = archiveData.input; }
// Wrap it in an ArchiveReader
logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
// Explode it
if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); Set stemSet = new HashSet(); logger.debug("Exploding " + fetchUrl);
// Iterate through the elements in the WARC file, except the first
Iterator i = arcReader.iterator();
// Skip first record
for (i.next(); i.hasNext(); ) {
// XXX probably not necessary
helper.pokeWDog(); if ((++entriesBetweenSleep % sleepAfter) == 0) { long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE); Deadline pause = Deadline.in(pauseTime); logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime)); while (!pause.expired()) { try { pause.sleep(); } catch (InterruptedException ie) { // no action
} } } ArchiveRecord element = (ArchiveRecord) i.next();
// Each element is a URL to be cached in a suitable AU
ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); if (elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry(elementUrl, elementLength,
0, // XXX need to convert getDate string to long
element, // ArchiveRecord extends InputStream
this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); 
goodEntries++; crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } else { ignoredEntries++; } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) try { arcReader.close(); arcReader = null; } catch (IOException ex) { throw new CacheException.ExploderException(ex); } if (cachedUrl != null) { cachedUrl.release(); } IOUtil.safeClose(arcStream); } if (badEntries == 0 && goodEntries > 0) {
// Make it look like a new crawl finished on each AU to which
// URLs were added.
for (Iterator it = touchedAus.iterator(); it.hasNext(); ) { ArchivalUnit au = (ArchivalUnit) it.next(); logger.debug3(archiveUrl + " touching " + au.toString()); AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished(); } } else { String msg = archiveUrl + ": " + badEntries + " bad entries, " + goodEntries + " good entries"; throw new CacheException.UnretryableException(msg); } } protected CIProperties makeCIProperties(ArchiveRecordHeader elementHeader) throws IOException { CIProperties ret = new CIProperties(); Set elementHeaderFieldKeys = elementHeader.getHeaderFieldKeys(); for (Iterator i = elementHeaderFieldKeys.iterator(); i.hasNext(); ) { String key = (String) i.next(); try { Object valueObject = elementHeader.getHeaderValue(key); if (valueObject == null) { logger.warning("Ignoring null value for key '" + key + "'."); } else { String value = valueObject.toString(); logger.debug3(key + ": " + value); ret.put(key, value); } } catch (ClassCastException ex) { logger.error("makeCIProperties: " + key + " threw ", ex); throw new CacheException.ExploderException(ex); } } return (ret); } protected ArchiveReader wrapStream(String url, InputStream arcStream) throws IOException { ArchiveReader ret = null; logger.debug3("Getting an ArchiveReader"); ret = ArchiveReaderFactory.get(url, arcStream, true); return (ret); } }
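/*
 * Usage sketch (assumption: any WARC stream; the file name below is hypothetical):
 * wrapStream() defers to ArchiveReaderFactory.get(url, stream, atFirstRecord), which
 * picks a reader from the URL suffix and stream contents, so the same explode() loop
 * handles both .warc and .warc.gz input:
 *
 *   ArchiveReader r = ArchiveReaderFactory.get("sample.warc.gz", in, true);
 *   Iterator it = r.iterator();
 *   it.next(); // skip the leading warcinfo record, as explode() does
 *   while (it.hasNext()) { ... } // each remaining record is a candidate URL
 */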