// If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (seveeral times, mostly from logging) before // enough mechanism has started to make it possible to resolve the CuUrl // below. return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } }
public void loadAuConfigDescrs(Configuration config) throws ConfigurationException { super.loadAuConfigDescrs(config); this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey()); // Now we can construct a valid CC permission checker. m_permissionCheckers = // ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl)); ListUtil.list(new CreativeCommonsPermissionChecker()); paramMap.putLong( KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, CurrentConfig.getTimeIntervalParam( PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL)); if (log.isDebug2()) { log.debug2( "Setting Registry AU recrawl interval to " + StringUtil.timeIntervalToString( paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL))); } }
/** * PluginArchivalUnit: The Archival Unit Class for PluginPlugin. This archival unit uses a base url * to define an archival unit. * * @author Seth Morabito * @version 1.0 */ public class RegistryArchivalUnit extends BaseArchivalUnit { protected static final Logger log = Logger.getLogger("RegistryArchivalUnit"); /** The interval between recrawls of the loadable plugin registry AUs. */ static final String PARAM_REGISTRY_CRAWL_INTERVAL = RegistryPlugin.PREFIX + "crawlInterval"; static final long DEFAULT_REGISTRY_CRAWL_INTERVAL = Constants.DAY; /** * If "au", registry AUs will crawl in parallel using individual rate limiters; if "plugin" * they'll crawl sequentially using a shared rate limiter */ static final String PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE = RegistryPlugin.PREFIX + "fetchRateLimiterSource"; static final String DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE = "au"; /** Limits fetch rate of registry crawls */ static final String PARAM_REGISTRY_FETCH_RATE = RegistryPlugin.PREFIX + "fetchRate"; static final String DEFAULT_REGISTRY_FETCH_RATE = "20/10s"; /** Run polls on Plugin registry AUs */ static final String PARAM_ENABLE_REGISTRY_POLLS = RegistryPlugin.PREFIX + "enablePolls"; static final boolean DEFAULT_ENABLE_REGISTRY_POLLS = true; private String m_registryUrl = null; private int m_maxRefetchDepth = NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH; private List m_permissionCheckers = null; private boolean recomputeRegName = true; private boolean enablePolls = DEFAULT_ENABLE_REGISTRY_POLLS; private String regName = null; public RegistryArchivalUnit(RegistryPlugin plugin) { super(plugin); } // Called by RegistryPlugin iff any config below RegistryPlugin.PREFIX // has changed protected void setConfig( Configuration config, Configuration prevConfig, Configuration.Differences changedKeys) { m_maxRefetchDepth = config.getInt( NewContentCrawler.PARAM_MAX_CRAWL_DEPTH, NewContentCrawler.DEFAULT_MAX_CRAWL_DEPTH); fetchRateLimiter = recomputeFetchRateLimiter(fetchRateLimiter); enablePolls = config.getBoolean(PARAM_ENABLE_REGISTRY_POLLS, DEFAULT_ENABLE_REGISTRY_POLLS); } public void loadAuConfigDescrs(Configuration config) throws ConfigurationException { super.loadAuConfigDescrs(config); this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey()); // Now we can construct a valid CC permission checker. m_permissionCheckers = // ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl)); ListUtil.list(new CreativeCommonsPermissionChecker()); paramMap.putLong( KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, CurrentConfig.getTimeIntervalParam( PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL)); if (log.isDebug2()) { log.debug2( "Setting Registry AU recrawl interval to " + StringUtil.timeIntervalToString( paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL))); } } /** * return a string that represents the plugin registry. This is just the base URL. * * @return The base URL. */ protected String makeName() { return "Plugin registry at '" + m_registryUrl + "'"; } public String getName() { if (recomputeRegName) { regName = recomputeRegName(); } if (regName != null) { return regName; } else { return super.getName(); } } // If there is a <title> element on the start page, use that as our AU // name. String recomputeRegName() { if (!isStarted()) { // This can get invoked (seveeral times, mostly from logging) before // enough mechanism has started to make it possible to resolve the CuUrl // below. return null; } try { CachedUrl cu = makeCachedUrl(m_registryUrl); if (cu == null) return null; URL cuUrl = CuUrl.fromCu(cu); Parser parser = new Parser(cuUrl.toString()); NodeList nodelst = parser.extractAllNodesThatMatch(new NodeClassFilter(TitleTag.class)); Node nodes[] = nodelst.toNodeArray(); recomputeRegName = false; if (nodes.length < 1) return null; // Get the first title found TitleTag tag = (TitleTag) nodes[0]; if (tag == null) return null; return tag.getTitle(); } catch (MalformedURLException e) { log.warning("recomputeRegName", e); return null; } catch (ParserException e) { if (e.getThrowable() instanceof FileNotFoundException) { log.warning("recomputeRegName: " + e.getThrowable().toString()); } else { log.warning("recomputeRegName", e); } return null; } } boolean isStarted() { return getPlugin().getDaemon().getPluginManager().getAuFromId(getAuId()) != null; } /** * return a string that points to the plugin registry page. * * @return a string that points to the plugin registry page for this registry. This is just the * base URL. */ protected String makeStartUrl() { return m_registryUrl; } /** Call top level polls iff configured to do so. */ public boolean shouldCallTopLevelPoll(AuState aus) { if (!enablePolls) { return false; } return super.shouldCallTopLevelPoll(aus); } /** * Return a new CrawlSpec with the appropriate collect AND redistribute permissions, and with the * maximum refetch depth. * * @return CrawlSpec */ protected CrawlSpec makeCrawlSpec() throws LockssRegexpException { CrawlRule rule = makeRules(); List startUrls = getNewContentCrawlUrls(); return new SpiderCrawlSpec(startUrls, startUrls, rule, m_maxRefetchDepth, null, null); } /** * return the collection of crawl rules used to crawl and cache a list of Plugin JAR files. * * @return CrawlRule */ protected CrawlRule makeRules() { return new RegistryRule(); } // Might need to recompute name if refetch start page public UrlCacher makeUrlCacher(String url) { if (url.equals(m_registryUrl)) { recomputeRegName = true; } return super.makeUrlCacher(url); } protected RateLimiter recomputeFetchRateLimiter(RateLimiter oldLimiter) { String rate = CurrentConfig.getParam(PARAM_REGISTRY_FETCH_RATE, DEFAULT_REGISTRY_FETCH_RATE); Object limiterKey = getFetchRateLimiterKey(); if (limiterKey == null) { return RateLimiter.getRateLimiter(oldLimiter, rate, DEFAULT_REGISTRY_FETCH_RATE); } else { RateLimiter.Pool pool = RateLimiter.getPool(); return pool.findNamedRateLimiter(limiterKey, rate, DEFAULT_REGISTRY_FETCH_RATE); } } protected String getFetchRateLimiterSource() { return CurrentConfig.getParam( PARAM_REGISTRY_FETCH_RATE_LIMITER_SOURCE, DEFAULT_REGISTRY_FETCH_RATE_LIMITER_SOURCE); } // Registry AU crawl rule implementation private class RegistryRule implements CrawlRule { public int match(String url) { if (StringUtil.equalStringsIgnoreCase(url, m_registryUrl) || StringUtil.endsWithIgnoreCase(url, ".jar")) { return CrawlRule.INCLUDE; } else { return CrawlRule.EXCLUDE; } } } }