/** UI to invoke various daemon actions */ @SuppressWarnings("serial") public class DebugPanel extends LockssServlet { public static final String PREFIX = Configuration.PREFIX + "debugPanel."; /** Priority for crawls started from the debug panel */ public static final String PARAM_CRAWL_PRIORITY = PREFIX + "crawlPriority"; public static final int DEFAULT_CRAWL_PRIORITY = 10; /** Priority for crawls started from the debug panel */ public static final String PARAM_ENABLE_DEEP_CRAWL = PREFIX + "deepCrawlEnabled"; private static final boolean DEFAULT_ENABLE_DEEP_CRAWL = false; static final String KEY_ACTION = "action"; static final String KEY_MSG = "msg"; static final String KEY_NAME_SEL = "name_sel"; static final String KEY_NAME_TYPE = "name_type"; static final String KEY_AUID = "auid"; static final String KEY_URL = "url"; static final String KEY_REFETCH_DEPTH = "depth"; static final String KEY_TIME = "time"; static final String ACTION_MAIL_BACKUP = "Mail Backup File"; static final String ACTION_THROW_IOEXCEPTION = "Throw IOException"; static final String ACTION_FIND_URL = "Find Preserved URL"; public static final String ACTION_REINDEX_METADATA = "Reindex Metadata"; public static final String ACTION_FORCE_REINDEX_METADATA = "Force Reindex Metadata"; public static final String ACTION_START_V3_POLL = "Start V3 Poll"; static final String ACTION_FORCE_START_V3_POLL = "Force V3 Poll"; public static final String ACTION_START_CRAWL = "Start Crawl"; public static final String ACTION_FORCE_START_CRAWL = "Force Start Crawl"; public static final String ACTION_START_DEEP_CRAWL = "Deep Crawl"; public static final String ACTION_FORCE_START_DEEP_CRAWL = "Force Deep Crawl"; public static final String ACTION_CHECK_SUBSTANCE = "Check Substance"; static final String ACTION_CRAWL_PLUGINS = "Crawl Plugins"; static final String ACTION_RELOAD_CONFIG = "Reload Config"; static final String ACTION_SLEEP = "Sleep"; public static final String ACTION_DISABLE_METADATA_INDEXING = "Disable Indexing"; /** Set of actions for which audit alerts shouldn't be generated */ public static final Set noAuditActions = SetUtil.set(ACTION_FIND_URL); static final String COL2 = "colspan=2"; static final String COL2CENTER = COL2 + " align=center"; static Logger log = Logger.getLogger("DebugPanel"); private LockssDaemon daemon; private PluginManager pluginMgr; private PollManager pollManager; private CrawlManager crawlMgr; private ConfigManager cfgMgr; private DbManager dbMgr; private MetadataManager metadataMgr; private RemoteApi rmtApi; boolean showResult; boolean showForcePoll; boolean showForceCrawl; boolean showForceReindexMetadata; String formAuid; String formDepth = "100"; protected void resetLocals() { resetVars(); super.resetLocals(); } void resetVars() { formAuid = null; errMsg = null; statusMsg = null; showForcePoll = false; showForceCrawl = false; showForceReindexMetadata = false; } public void init(ServletConfig config) throws ServletException { super.init(config); daemon = getLockssDaemon(); pluginMgr = daemon.getPluginManager(); pollManager = daemon.getPollManager(); crawlMgr = daemon.getCrawlManager(); cfgMgr = daemon.getConfigManager(); rmtApi = daemon.getRemoteApi(); try { dbMgr = daemon.getDbManager(); metadataMgr = daemon.getMetadataManager(); } catch (IllegalArgumentException ex) { } } public void lockssHandleRequest() throws IOException { resetVars(); boolean showForm = true; String action = getParameter(KEY_ACTION); if (!StringUtil.isNullString(action)) { formAuid = getParameter(KEY_AUID); formDepth = getParameter(KEY_REFETCH_DEPTH); UserAccount acct = getUserAccount(); if (acct != null && !noAuditActions.contains(action)) { acct.auditableEvent("used debug panel action: " + action + " AU ID: " + formAuid); } } if (ACTION_MAIL_BACKUP.equals(action)) { doMailBackup(); } if (ACTION_RELOAD_CONFIG.equals(action)) { doReloadConfig(); } if (ACTION_SLEEP.equals(action)) { doSleep(); } if (ACTION_THROW_IOEXCEPTION.equals(action)) { doThrow(); } if (ACTION_START_V3_POLL.equals(action)) { doV3Poll(); } if (ACTION_FORCE_START_V3_POLL.equals(action)) { forceV3Poll(); } if (ACTION_START_CRAWL.equals(action)) { doCrawl(false, false); } if (ACTION_FORCE_START_CRAWL.equals(action)) { doCrawl(true, false); } if (ACTION_START_DEEP_CRAWL.equals(action)) { doCrawl(false, true); } if (ACTION_FORCE_START_DEEP_CRAWL.equals(action)) { doCrawl(true, true); } if (ACTION_CHECK_SUBSTANCE.equals(action)) { doCheckSubstance(); } if (ACTION_CRAWL_PLUGINS.equals(action)) { crawlPluginRegistries(); } if (ACTION_FIND_URL.equals(action)) { showForm = doFindUrl(); } if (ACTION_REINDEX_METADATA.equals(action)) { doReindexMetadata(); } if (ACTION_FORCE_REINDEX_METADATA.equals(action)) { forceReindexMetadata(); } if (ACTION_DISABLE_METADATA_INDEXING.equals(action)) { doDisableMetadataIndexing(); } if (showForm) { displayPage(); } } private void doMailBackup() { try { rmtApi.createConfigBackupFile(RemoteApi.BackupFileDisposition.Mail); } catch (Exception e) { errMsg = "Error: " + e.getMessage(); } } private void doReloadConfig() { cfgMgr.requestReload(); } private void doThrow() throws IOException { String msg = getParameter(KEY_MSG); throw new IOException(msg != null ? msg : "Test message"); } private void doSleep() throws IOException { String timestr = getParameter(KEY_TIME); try { long time = StringUtil.parseTimeInterval(timestr); Deadline.in(time).sleep(); statusMsg = "Slept for " + StringUtil.timeIntervalToString(time); } catch (NumberFormatException e) { errMsg = "Illegal duration: " + e; } catch (InterruptedException e) { errMsg = "Interrupted: " + e; } } private void doReindexMetadata() { ArchivalUnit au = getAu(); if (au == null) return; try { startReindexingMetadata(au, false); } catch (RuntimeException e) { log.error("Can't reindex metadata", e); errMsg = "Error: " + e.toString(); } } private void forceReindexMetadata() { ArchivalUnit au = getAu(); if (au == null) return; try { startReindexingMetadata(au, true); } catch (RuntimeException e) { log.error("Can't reindex metadata", e); errMsg = "Error: " + e.toString(); } } private void doDisableMetadataIndexing() { ArchivalUnit au = getAu(); if (au == null) return; try { disableMetadataIndexing(au, false); } catch (RuntimeException e) { log.error("Can't disable metadata indexing", e); errMsg = "Error: " + e.toString(); } } private void doCrawl(boolean force, boolean deep) { ArchivalUnit au = getAu(); if (au == null) return; try { startCrawl(au, force, deep); } catch (CrawlManagerImpl.NotEligibleException.RateLimiter e) { errMsg = "AU has crawled recently (" + e.getMessage() + "). Click again to override."; showForceCrawl = true; return; } catch (CrawlManagerImpl.NotEligibleException e) { errMsg = "Can't enqueue crawl: " + e.getMessage(); } } private void crawlPluginRegistries() { StringBuilder sb = new StringBuilder(); for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) { sb.append(au.getName()); sb.append(": "); try { startCrawl(au, true, false); sb.append("Queued."); } catch (CrawlManagerImpl.NotEligibleException e) { sb.append("Failed: "); sb.append(e.getMessage()); } sb.append("\n"); } statusMsg = sb.toString(); } private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep) throws CrawlManagerImpl.NotEligibleException { CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr; if (force) { RateLimiter limit = cmi.getNewContentRateLimiter(au); if (!limit.isEventOk()) { limit.unevent(); } } cmi.checkEligibleToQueueNewContentCrawl(au); String delayMsg = ""; String deepMsg = ""; try { cmi.checkEligibleForNewContentCrawl(au); } catch (CrawlManagerImpl.NotEligibleException e) { delayMsg = ", Start delayed due to: " + e.getMessage(); } Configuration config = ConfigManager.getCurrentConfig(); int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY); CrawlReq req; try { req = new CrawlReq(au); req.setPriority(pri); if (deep) { int d = Integer.parseInt(formDepth); if (d < 0) { errMsg = "Illegal refetch depth: " + d; return false; } req.setRefetchDepth(d); deepMsg = "Deep (" + req.getRefetchDepth() + ") "; } } catch (NumberFormatException e) { errMsg = "Illegal refetch depth: " + formDepth; return false; } catch (RuntimeException e) { log.error("Couldn't create CrawlReq: " + au, e); errMsg = "Couldn't create CrawlReq: " + e.toString(); return false; } cmi.startNewContentCrawl(req, null); statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg; return true; } private void doCheckSubstance() { ArchivalUnit au = getAu(); if (au == null) return; try { checkSubstance(au); } catch (RuntimeException e) { log.error("Error in SubstanceChecker", e); errMsg = "Error in SubstanceChecker; see log."; } } private void checkSubstance(ArchivalUnit au) { SubstanceChecker subChecker = new SubstanceChecker(au); if (!subChecker.isEnabled()) { errMsg = "No substance patterns defined for plugin."; return; } AuState auState = AuUtil.getAuState(au); SubstanceChecker.State oldState = auState.getSubstanceState(); SubstanceChecker.State newState = subChecker.findSubstance(); String chtxt = (newState == oldState ? "(unchanged)" : "(was " + oldState.toString() + ")"); switch (newState) { case Unknown: log.error("Shouldn't happen: SubstanceChecker returned Unknown"); errMsg = "Error in SubstanceChecker; see log."; break; case Yes: statusMsg = "AU has substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.Yes); break; case No: statusMsg = "AU has no substance " + chtxt + ": " + au.getName(); auState.setSubstanceState(SubstanceChecker.State.No); break; } } private boolean startReindexingMetadata(ArchivalUnit au, boolean force) { if (metadataMgr == null) { errMsg = "Metadata processing is not enabled."; return false; } if (!force) { if (!AuUtil.hasCrawled(au)) { errMsg = "Au has never crawled. Click again to reindex metadata"; showForceReindexMetadata = true; return false; } AuState auState = AuUtil.getAuState(au); switch (auState.getSubstanceState()) { case No: errMsg = "Au has no substance. Click again to reindex metadata"; showForceReindexMetadata = true; return false; case Unknown: errMsg = "Unknown substance for Au. Click again to reindex metadata."; showForceReindexMetadata = true; return false; case Yes: // fall through } } // Fully reindex metadata with the highest priority. Connection conn = null; PreparedStatement insertPendingAuBatchStatement = null; try { conn = dbMgr.getConnection(); insertPendingAuBatchStatement = metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn); if (metadataMgr.enableAndAddAuToReindex( au, conn, insertPendingAuBatchStatement, false, true)) { statusMsg = "Reindexing metadata for " + au.getName(); return true; } } catch (DbException dbe) { log.error("Cannot reindex metadata for " + au.getName(), dbe); } finally { DbManager.safeCloseStatement(insertPendingAuBatchStatement); DbManager.safeRollbackAndClose(conn); } if (force) { errMsg = "Still cannot reindex metadata for " + au.getName(); } else { errMsg = "Cannot reindex metadata for " + au.getName(); } return false; } private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) { if (metadataMgr == null) { errMsg = "Metadata processing is not enabled."; return false; } try { metadataMgr.disableAuIndexing(au); statusMsg = "Disabled metadata indexing for " + au.getName(); return true; } catch (Exception e) { errMsg = "Cannot reindex metadata for " + au.getName() + ": " + e.getMessage(); return false; } } private void doV3Poll() { ArchivalUnit au = getAu(); if (au == null) return; try { callV3ContentPoll(au); } catch (PollManager.NotEligibleException e) { errMsg = "AU is not eligible for poll: " + e.getMessage(); // errMsg = "Ineligible: " + e.getMessage() + // "<br>Click again to force new poll."; // showForcePoll = true; return; } catch (Exception e) { log.error("Can't start poll", e); errMsg = "Error: " + e.toString(); } } private void forceV3Poll() { ArchivalUnit au = getAu(); if (au == null) return; try { callV3ContentPoll(au); } catch (Exception e) { log.error("Can't start poll", e); errMsg = "Error: " + e.toString(); } } private void callV3ContentPoll(ArchivalUnit au) throws PollManager.NotEligibleException { log.debug("Enqueuing a V3 Content Poll on " + au.getName()); PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL); pollManager.enqueueHighPriorityPoll(au, spec); statusMsg = "Enqueued V3 poll for " + au.getName(); } private boolean doFindUrl() throws IOException { String url = getParameter(KEY_URL); String redir = srvURL( AdminServletManager.SERVLET_DAEMON_STATUS, PropUtil.fromArgs("table", ArchivalUnitStatus.AUS_WITH_URL_TABLE_NAME, "key", url)); resp.setContentLength(0); // resp.sendRedirect(resp.encodeRedirectURL(redir)); resp.sendRedirect(redir); return false; } ArchivalUnit getAu() { if (StringUtil.isNullString(formAuid)) { errMsg = "Select an AU"; return null; } ArchivalUnit au = pluginMgr.getAuFromId(formAuid); if (au == null) { errMsg = "No such AU. Select an AU"; return null; } return au; } private void displayPage() throws IOException { Page page = newPage(); layoutErrorBlock(page); ServletUtil.layoutExplanationBlock(page, "Debug Actions"); page.add(makeForm()); page.add("<br>"); endPage(page); } private Element makeForm() { Composite comp = new Composite(); Form frm = new Form(srvURL(myServletDescr())); frm.method("POST"); frm.add("<br><center>"); Input reload = new Input(Input.Submit, KEY_ACTION, ACTION_RELOAD_CONFIG); setTabOrder(reload); frm.add(reload); frm.add(" "); Input backup = new Input(Input.Submit, KEY_ACTION, ACTION_MAIL_BACKUP); setTabOrder(backup); frm.add(backup); frm.add(" "); Input crawlplug = new Input(Input.Submit, KEY_ACTION, ACTION_CRAWL_PLUGINS); setTabOrder(crawlplug); frm.add(crawlplug); frm.add("</center>"); ServletDescr d1 = AdminServletManager.SERVLET_HASH_CUS; if (isServletRunnable(d1)) { frm.add("<br><center>" + srvLink(d1, d1.heading) + "</center>"); } Input findUrl = new Input(Input.Submit, KEY_ACTION, ACTION_FIND_URL); Input findUrlText = new Input(Input.Text, KEY_URL); findUrlText.setSize(50); setTabOrder(findUrl); setTabOrder(findUrlText); frm.add("<br><center>" + findUrl + " " + findUrlText + "</center>"); Input thrw = new Input(Input.Submit, KEY_ACTION, ACTION_THROW_IOEXCEPTION); Input thmsg = new Input(Input.Text, KEY_MSG); setTabOrder(thrw); setTabOrder(thmsg); frm.add("<br><center>" + thrw + " " + thmsg + "</center>"); frm.add("<br><center>AU Actions: select AU</center>"); Composite ausel = ServletUtil.layoutSelectAu(this, KEY_AUID, formAuid); frm.add("<br><center>" + ausel + "</center>"); setTabOrder(ausel); Input v3Poll = new Input( Input.Submit, KEY_ACTION, (showForcePoll ? ACTION_FORCE_START_V3_POLL : ACTION_START_V3_POLL)); Input crawl = new Input( Input.Submit, KEY_ACTION, (showForceCrawl ? ACTION_FORCE_START_CRAWL : ACTION_START_CRAWL)); frm.add("<br><center>"); frm.add(v3Poll); frm.add(" "); frm.add(crawl); if (CurrentConfig.getBooleanParam(PARAM_ENABLE_DEEP_CRAWL, DEFAULT_ENABLE_DEEP_CRAWL)) { Input deepCrawl = new Input( Input.Submit, KEY_ACTION, (showForceCrawl ? ACTION_FORCE_START_DEEP_CRAWL : ACTION_START_DEEP_CRAWL)); Input depthText = new Input(Input.Text, KEY_REFETCH_DEPTH, formDepth); depthText.setSize(4); setTabOrder(depthText); frm.add(" "); frm.add(deepCrawl); frm.add(depthText); } Input checkSubstance = new Input(Input.Submit, KEY_ACTION, ACTION_CHECK_SUBSTANCE); frm.add("<br>"); frm.add(checkSubstance); if (metadataMgr != null) { Input reindex = new Input( Input.Submit, KEY_ACTION, (showForceReindexMetadata ? ACTION_FORCE_REINDEX_METADATA : ACTION_REINDEX_METADATA)); frm.add(" "); frm.add(reindex); Input disableIndexing = new Input(Input.Submit, KEY_ACTION, ACTION_DISABLE_METADATA_INDEXING); frm.add(" "); frm.add(disableIndexing); } frm.add("</center>"); comp.add(frm); return comp; } }
public class CrawlRuleTester extends Thread { protected static Logger log = Logger.getLogger(CrawlRuleTester.class); /** Proxy host */ public static final String PARAM_PROXY_HOST = Configuration.PREFIX + "crawltest.proxy.host"; /** Proxy port */ public static final String PARAM_PROXY_PORT = Configuration.PREFIX + "crawltest.proxy.port"; public static final int DEFAULT_PROXY_PORT = -1; /** User-Agent */ public static final String PARAM_USER_AGENT = Configuration.PREFIX + "crawltest.userAgent"; /* Message Types */ public static final int ERROR_MESSAGE = 0; public static final int WARNING_MESSAGE = 1; public static final int PLAIN_MESSAGE = 2; public static final int URL_SUMMARY_MESSAGE = 3; public static final int TEST_SUMMARY_MESSAGE = 4; private String m_baseUrl; private int m_crawlDepth; private long m_crawlDelay; private int m_curDepth; private ArchivalUnit m_au; private String m_outputFile = null; private BufferedWriter m_outWriter = null; private Deadline fetchDeadline = Deadline.in(0); private boolean useLocalWriter = true; private MessageHandler m_msgHandler; private LockssUrlConnectionPool connectionPool = new LockssUrlConnectionPool(); private String proxyHost; private String userAgent; private int proxyPort; // our storage for extracted urls private TreeSet m_extracted = new TreeSet(); private TreeSet m_incls = new TreeSet(); private TreeSet m_excls = new TreeSet(); private TreeSet m_reported = new TreeSet(); public CrawlRuleTester(int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { super("crawlrule tester"); m_crawlDepth = crawlDepth; long minFetchDelay = CurrentConfig.getLongParam( BaseArchivalUnit.PARAM_MIN_FETCH_DELAY, BaseArchivalUnit.DEFAULT_MIN_FETCH_DELAY); m_crawlDelay = Math.max(crawlDelay, minFetchDelay); m_baseUrl = baseUrl; m_au = au; } /** * RuleTest * * @param outFile String * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( String outFile, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outputFile = outFile; } /** * RuleTest * * @param outWriter BufferedWriter * @param crawlDepth int * @param crawlDelay long * @param baseUrl String * @param crawlSpec CrawlSpec */ public CrawlRuleTester( BufferedWriter outWriter, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_outWriter = outWriter; } /** * RuleTest * * @param msgHandler MessageHandler to take all output * @param crawlDepth the crawl depth to use * @param crawlDelay the type to wait between fetches * @param baseUrl the url to start from * @param crawlSpec a CrawlSpec to use for url checking. */ public CrawlRuleTester( MessageHandler msgHandler, int crawlDepth, long crawlDelay, String baseUrl, ArchivalUnit au) { this(crawlDepth, crawlDelay, baseUrl, au); m_msgHandler = msgHandler; } public void run() { try { setConfig(ConfigManager.getCurrentConfig()); if (m_outWriter == null && m_msgHandler == null) { useLocalWriter = true; } else { useLocalWriter = false; } if (useLocalWriter) { openOutputFile(); } checkRules(); if (useLocalWriter) { closeOutputFile(); } } finally { if (m_msgHandler != null) { m_msgHandler.close(); } } } void setConfig(Configuration config) { log.debug("config: " + config); proxyHost = config.get(PARAM_PROXY_HOST); proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT); if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { String http_proxy = System.getenv("http_proxy"); if (!StringUtil.isNullString(http_proxy)) { try { HostPortParser hpp = new HostPortParser(http_proxy); proxyHost = hpp.getHost(); proxyPort = hpp.getPort(); } catch (HostPortParser.InvalidSpec e) { log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e); } } } if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) { proxyHost = null; } else { log.info("Proxying through " + proxyHost + ":" + proxyPort); } userAgent = config.get(PARAM_USER_AGENT); if (StringUtil.isNullString(userAgent)) { userAgent = null; } else { log.debug("Setting User-Agent to " + userAgent); } } private void openOutputFile() { if (m_outputFile != null) { try { m_outWriter = new BufferedWriter(new FileWriter(m_outputFile, false)); return; } catch (Exception ex) { System.err.println("Error opening output file, writing to stdout: " + ex); } } m_outWriter = new BufferedWriter(new OutputStreamWriter(System.out)); } private void closeOutputFile() { try { if (m_outWriter != null) { m_outWriter.close(); } } catch (IOException ex) { System.err.println("Error closing output file."); } } int[] depth_incl; int[] depth_fetched; int[] depth_parsed; private void checkRules() { outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE); outputMessage( "crawl depth: " + m_crawlDepth + " crawl delay: " + m_crawlDelay + " ms.", PLAIN_MESSAGE); TreeSet crawlList = new TreeSet(); TreeSet fetched = new TreeSet(); // inialize with the baseUrl crawlList.add(m_baseUrl); depth_incl = new int[m_crawlDepth]; depth_fetched = new int[m_crawlDepth]; depth_parsed = new int[m_crawlDepth]; long start_time = TimeBase.nowMs(); for (int depth = 1; depth <= m_crawlDepth; depth++) { if (isInterrupted()) { return; } m_curDepth = depth; if (crawlList.isEmpty() && depth <= m_crawlDepth) { outputMessage("\nNothing left to crawl, exiting after depth " + (depth - 1), PLAIN_MESSAGE); break; } String[] urls = (String[]) crawlList.toArray(new String[0]); crawlList.clear(); outputMessage("\nDepth " + depth, PLAIN_MESSAGE); for (int ix = 0; ix < urls.length; ix++) { if (isInterrupted()) { return; } pauseBeforeFetch(); String urlstr = urls[ix]; m_incls.clear(); m_excls.clear(); // crawl the page buildUrlSets(urlstr); fetched.add(urlstr); // output incl/excl results, // add the new_incls to the crawlList for next crawl depth loop crawlList.addAll(outputUrlResults(urlstr, m_incls, m_excls)); } } long elapsed_time = TimeBase.nowMs() - start_time; outputSummary(m_baseUrl, fetched, crawlList, elapsed_time); } private void buildUrlSets(String url) { try { outputMessage("\nFetching " + url, TEST_SUMMARY_MESSAGE); URL srcUrl = new URL(url); // URLConnection conn = srcUrl.openConnection(); // String type = conn.getContentType(); // type = conn.getHeaderField("content-type"); // InputStream istr = conn.getInputStream(); LockssUrlConnection conn = UrlUtil.openConnection(url, connectionPool); if (proxyHost != null) { conn.setProxy(proxyHost, proxyPort); } if (userAgent != null) { conn.setRequestProperty("user-agent", userAgent); } try { conn.execute(); int resp = conn.getResponseCode(); if (resp != 200) { outputMessage("Resp: " + resp + ": " + conn.getResponseMessage(), TEST_SUMMARY_MESSAGE); return; } depth_fetched[m_curDepth - 1]++; String cookies = conn.getResponseHeaderValue("Set-Cookie"); if (cookies != null) { outputMessage("Cookies: " + cookies, PLAIN_MESSAGE); } String type = conn.getResponseContentType(); if (type == null || !type.toLowerCase().startsWith("text/html")) { outputMessage("Type: " + type + ", not parsing", URL_SUMMARY_MESSAGE); return; } outputMessage("Type: " + type + ", extracting Urls", URL_SUMMARY_MESSAGE); InputStream istr = conn.getResponseInputStream(); InputStreamReader reader = new InputStreamReader(istr); // MyMockCachedUrl mcu = new MyMockCachedUrl(srcUrl.toString(), reader); GoslingHtmlLinkExtractor extractor = new GoslingHtmlLinkExtractor(); extractor.extractUrls(null, istr, null, srcUrl.toString(), new MyLinkExtractorCallback()); istr.close(); depth_parsed[m_curDepth - 1]++; } finally { conn.release(); } } catch (MalformedURLException murle) { murle.printStackTrace(); outputErrResults(url, "Malformed URL:" + murle.getMessage()); } catch (IOException ex) { ex.printStackTrace(); outputErrResults(url, "IOException: " + ex.getMessage()); } } private void pauseBeforeFetch() { if (!fetchDeadline.expired()) { try { fetchDeadline.sleep(); } catch (InterruptedException ie) { // no action } } fetchDeadline.expireIn(m_crawlDelay); } private void outputMessage(String msg, int msgType) { if (isInterrupted()) { return; } if (m_msgHandler != null) { m_msgHandler.outputMessage(msg + "\n", msgType); } else { try { m_outWriter.write(msg); m_outWriter.newLine(); } catch (Exception ex) { System.err.println(msg); } } } private void outputErrResults(String url, String errMsg) { outputMessage("Error: " + errMsg + " occured while processing " + url, ERROR_MESSAGE); } private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) { Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported)); Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported)); if (!m_inclset.isEmpty()) { outputMessage( "\nIncluded Urls: (" + new_incls.size() + " new, " + (m_inclset.size() - new_incls.size()) + " old)", URL_SUMMARY_MESSAGE); depth_incl[m_curDepth - 1] += new_incls.size(); } for (Iterator it = new_incls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } if (!m_exclset.isEmpty()) { outputMessage( "\nExcluded Urls: (" + new_excls.size() + " new, " + (m_exclset.size() - new_excls.size()) + " old)", URL_SUMMARY_MESSAGE); } for (Iterator it = new_excls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } m_reported.addAll(new_incls); m_reported.addAll(new_excls); if (m_outWriter != null) { try { m_outWriter.flush(); } catch (IOException ex) { } } return new_incls; } private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) { int fetchCount = fetched.size(); outputMessage( "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth, TEST_SUMMARY_MESSAGE); outputMessage( "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(), PLAIN_MESSAGE); outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE); for (int depth = 1; depth <= m_crawlDepth; depth++) { PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d"); Integer[] args = new Integer[] { new Integer(depth), new Integer(depth_fetched[depth - 1]), new Integer(depth_parsed[depth - 1]), new Integer(depth_incl[depth - 1]), }; String s = pf.sprintf(args); outputMessage(s, PLAIN_MESSAGE); } outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE); if (false) { for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) { String url = (String) iter.next(); outputMessage(url, PLAIN_MESSAGE); } } long secs = elapsedTime / Constants.SECOND; long fetchRate = 0; if (secs > 0) { fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime; } outputMessage( "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m", PLAIN_MESSAGE); } public interface MessageHandler { void outputMessage(String message, int messageType); void close(); } private class MyLinkExtractorCallback implements LinkExtractor.Callback { MyLinkExtractorCallback() {} public void foundLink(String url) { m_extracted.add(url); try { String normUrl = UrlUtil.normalizeUrl(url); if (BaseCrawler.isSupportedUrlProtocol(normUrl) && m_au.shouldBeCached(normUrl)) { m_incls.add(normUrl); } else { m_excls.add(normUrl); } } catch (MalformedURLException e) { m_excls.add(url); } } } class MyMockCachedUrl implements CachedUrl { private String url; private boolean doesExist = false; private Reader reader = null; public MyMockCachedUrl(String url, Reader reader) { this.url = url; this.reader = reader; } public ArchivalUnit getArchivalUnit() { throw new UnsupportedOperationException("Not implemented"); } public String getUrl() { return url; } public CachedUrl getCuVersion(int version) { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions() { throw new UnsupportedOperationException("Not implemented"); } public CachedUrl[] getCuVersions(int maxVersions) { throw new UnsupportedOperationException("Not implemented"); } public int getVersion() { return 1; } public Reader openForReading() { return reader; } public LinkRewriterFactory getLinkRewriterFactory() { throw new UnsupportedOperationException("Not implemented"); } public String getEncoding() { return Constants.DEFAULT_ENCODING; } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream() { throw new UnsupportedOperationException("Not implemented"); } /** * getUnfilteredInputStream * * @return InputStream */ public InputStream getUnfilteredInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream() { throw new UnsupportedOperationException("Not implemented"); } public InputStream getUncompressedInputStream(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @return InputStream */ public InputStream openForHashing() { throw new UnsupportedOperationException("Not implemented"); } /** * openForHashing * * @param hasher HashedInputStream.Hasher for unfiltered content * @return InputStream */ public InputStream openForHashing(HashedInputStream.Hasher hasher) { throw new UnsupportedOperationException("Not implemented"); } /** * getContentSize * * @return long */ public long getContentSize() { throw new UnsupportedOperationException("Not implemented"); } public String getContentType() { throw new UnsupportedOperationException("Not implemented"); } public void setOption(String option, String val) {} public boolean hasContent() { return doesExist; } public boolean isLeaf() { return true; } public int getType() { return CachedUrlSetNode.TYPE_CACHED_URL; } public CIProperties getProperties() { return null; } public void addProperty(String key, String value) {} public void release() {} public String toString() { StringBuffer sb = new StringBuffer(url.length() + 17); sb.append("[MyMockCachedUrl: "); sb.append(url); sb.append("]"); return sb.toString(); } @Override public FileMetadataExtractor getFileMetadataExtractor(MetadataTarget target) { return null; } public CachedUrl getArchiveMemberCu(ArchiveMemberSpec ams) { throw new UnsupportedOperationException("Not implemented"); } @Override public boolean isArchiveMember() { return false; } } }
/** * DefinablePlugin: a plugin which uses the data stored in an ExternalizableMap to configure itself. * * @author Claire Griffin * @version 1.0 */ public class DefinablePlugin extends BasePlugin { // configuration map keys public static final String KEY_PLUGIN_IDENTIFIER = "plugin_identifier"; public static final String KEY_PLUGIN_NAME = "plugin_name"; public static final String KEY_PLUGIN_VERSION = "plugin_version"; public static final String KEY_PLUGIN_FEATURE_VERSION_MAP = "plugin_feature_version_map"; public static final String KEY_REQUIRED_DAEMON_VERSION = "required_daemon_version"; public static final String KEY_PUBLISHING_PLATFORM = "plugin_publishing_platform"; public static final String KEY_PLUGIN_CONFIG_PROPS = "plugin_config_props"; public static final String KEY_EXCEPTION_HANDLER = "plugin_cache_result_handler"; public static final String KEY_EXCEPTION_LIST = "plugin_cache_result_list"; public static final String KEY_PLUGIN_NOTES = "plugin_notes"; public static final String KEY_CRAWL_TYPE = "plugin_crawl_type"; public static final String KEY_FOLLOW_LINKS = "plugin_follow_link"; /** Message to be displayed when user configures an AU with this plugin */ public static final String KEY_PLUGIN_AU_CONFIG_USER_MSG = "plugin_au_config_user_msg"; public static final String KEY_PER_HOST_PERMISSION_PATH = "plugin_per_host_permission_path"; public static final String KEY_PLUGIN_PARENT = "plugin_parent"; public static final String KEY_PLUGIN_PARENT_VERSION = "plugin_parent_version"; public static final String KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY = "plugin_crawl_url_comparator_factory"; public static final String KEY_PLUGIN_FETCH_RATE_LIMITER_SOURCE = "plugin_fetch_rate_limiter_source"; public static final String KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY = "plugin_article_iterator_factory"; public static final String KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY = "plugin_article_metadata_extractor_factory"; public static final String KEY_DEFAULT_ARTICLE_MIME_TYPE = "plugin_default_article_mime_type"; public static final String KEY_ARTICLE_ITERATOR_ROOT = "plugin_article_iterator_root"; public static final String KEY_ARTICLE_ITERATOR_PATTERN = "plugin_article_iterator_pattern"; public static final String DEFAULT_PLUGIN_VERSION = "1"; public static final String DEFAULT_REQUIRED_DAEMON_VERSION = "0.0.0"; public static final String MAP_SUFFIX = ".xml"; public static final String CRAWL_TYPE_HTML_LINKS = "HTML Links"; public static final String CRAWL_TYPE_OAI = "OAI"; public static final String[] CRAWL_TYPES = { CRAWL_TYPE_HTML_LINKS, CRAWL_TYPE_OAI, }; public static final String DEFAULT_CRAWL_TYPE = CRAWL_TYPE_HTML_LINKS; protected String mapName = null; static Logger log = Logger.getLogger("DefinablePlugin"); protected ExternalizableMap definitionMap = new ExternalizableMap(); protected CacheResultHandler resultHandler = null; protected List<String> loadedFromUrls; protected CrawlWindow crawlWindow; protected Map<Plugin.Feature, String> featureVersion; public void initPlugin(LockssDaemon daemon, String extMapName) throws FileNotFoundException { initPlugin(daemon, extMapName, this.getClass().getClassLoader()); } public void initPlugin(LockssDaemon daemon, String extMapName, ClassLoader loader) throws FileNotFoundException { // convert the plugin class name to an xml file name // load the configuration map from jar file ExternalizableMap defMap = loadMap(extMapName, loader); this.initPlugin(daemon, extMapName, defMap, loader); } public void initPlugin( LockssDaemon daemon, String extMapName, ExternalizableMap defMap, ClassLoader loader) { mapName = extMapName; this.classLoader = loader; this.definitionMap = defMap; super.initPlugin(daemon); initMimeMap(); initFeatureVersions(); initAuFeatureMap(); checkParamAgreement(); } private ExternalizableMap loadMap(String extMapName, ClassLoader loader) throws FileNotFoundException { String first = null; String next = extMapName; List<String> urls = new ArrayList<String>(); ExternalizableMap res = null; while (next != null) { // convert the plugin class name to an xml file name String mapFile = next.replace('.', '/') + MAP_SUFFIX; URL url = loader.getResource(mapFile); if (url != null && urls.contains(url.toString())) { throw new PluginException.InvalidDefinition("Plugin inheritance loop: " + next); } // load into map ExternalizableMap oneMap = new ExternalizableMap(); oneMap.loadMapFromResource(mapFile, loader); urls.add(url.toString()); // apply overrides one plugin at a time in inheritance chain processOverrides(oneMap); if (res == null) { res = oneMap; } else { for (Map.Entry ent : oneMap.entrySet()) { String key = (String) ent.getKey(); Object val = ent.getValue(); if (!res.containsKey(key)) { res.setMapElement(key, val); } } } if (oneMap.containsKey(KEY_PLUGIN_PARENT)) { next = oneMap.getString(KEY_PLUGIN_PARENT); } else { next = null; } } loadedFromUrls = urls; return res; } /** If in testing mode FOO, copy values from FOO_override map, if any, to main map */ void processOverrides(TypedEntryMap map) { String testMode = getTestingMode(); if (StringUtil.isNullString(testMode)) { return; } Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE); if (o == null) { return; } if (o instanceof Map) { Map overrideMap = (Map) o; for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) { String key = (String) entry.getKey(); Object val = entry.getValue(); log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val); map.setMapElement(key, val); } } } String getTestingMode() { return theDaemon == null ? null : theDaemon.getTestingMode(); } // Used by tests public void initPlugin(LockssDaemon daemon, File file) throws PluginException { ExternalizableMap oneMap = new ExternalizableMap(); oneMap.loadMap(file); if (oneMap.getErrorString() != null) { throw new PluginException(oneMap.getErrorString()); } initPlugin(daemon, file.getPath(), oneMap, null); } void initPlugin(LockssDaemon daemon, ExternalizableMap defMap) { initPlugin(daemon, defMap, this.getClass().getClassLoader()); } void initPlugin(LockssDaemon daemon, ExternalizableMap defMap, ClassLoader loader) { initPlugin(daemon, "Internal", defMap, loader); } enum PrintfContext { Regexp, URL, Display }; void checkParamAgreement() { for (Map.Entry<String, PrintfContext> ent : DefinableArchivalUnit.printfKeysContext.entrySet()) { checkParamAgreement(ent.getKey(), ent.getValue()); } } void checkParamAgreement(String key, PrintfContext context) { List<String> printfList = getElementList(key); if (printfList == null) { return; } for (String printf : printfList) { if (StringUtil.isNullString(printf)) { log.warning("Null printf string in " + key); continue; } PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf); Collection<String> p_args = p_data.getArguments(); for (String arg : p_args) { ConfigParamDescr descr = findAuConfigDescr(arg); if (descr == null) { throw new PluginException.InvalidDefinition( "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName()); } // ensure range and set params used only in legal context switch (context) { case Regexp: case Display: // everything is legal in a regexp or a display string break; case URL: // NUM_RANGE and SET legal because can enumerate. Can't // enumerate RANGE switch (descr.getType()) { case ConfigParamDescr.TYPE_RANGE: throw new PluginException.InvalidDefinition( "Range parameter (" + arg + ") used in illegal context in " + getPluginName() + ": " + key + ": " + printf); default: } } } } } public List<String> getLoadedFromUrls() { return loadedFromUrls; } public String getPluginName() { if (definitionMap.containsKey(KEY_PLUGIN_NAME)) { return definitionMap.getString(KEY_PLUGIN_NAME); } else { return getDefaultPluginName(); } } protected String getDefaultPluginName() { return StringUtil.shortName(getPluginId()); } public String getVersion() { return definitionMap.getString(KEY_PLUGIN_VERSION, DEFAULT_PLUGIN_VERSION); } public String getFeatureVersion(Plugin.Feature feat) { if (featureVersion == null) { return null; } return featureVersion.get(feat); } public String getRequiredDaemonVersion() { return definitionMap.getString(KEY_REQUIRED_DAEMON_VERSION, DEFAULT_REQUIRED_DAEMON_VERSION); } public String getPublishingPlatform() { return definitionMap.getString(KEY_PUBLISHING_PLATFORM, null); } public String getPluginNotes() { return definitionMap.getString(KEY_PLUGIN_NOTES, null); } public String getDefaultArticleMimeType() { String ret = definitionMap.getString(KEY_DEFAULT_ARTICLE_MIME_TYPE, null); log.debug3("DefaultArticleMimeType " + ret); if (ret == null) { ret = super.getDefaultArticleMimeType(); log.debug3("DefaultArticleMimeType from super " + ret); } return ret; } public List<String> getElementList(String key) { Object element = definitionMap.getMapElement(key); List<String> lst; if (element instanceof String) { return Collections.singletonList((String) element); } else if (element instanceof List) { return (List) element; } else { return null; } } public List getLocalAuConfigDescrs() throws PluginException.InvalidDefinition { List auConfigDescrs = (List) definitionMap.getCollection(KEY_PLUGIN_CONFIG_PROPS, null); if (auConfigDescrs == null) { throw new PluginException.InvalidDefinition(mapName + " missing ConfigParamDescrs"); } return auConfigDescrs; } protected ArchivalUnit createAu0(Configuration auConfig) throws ArchivalUnit.ConfigurationException { DefinableArchivalUnit au = new DefinableArchivalUnit(this, definitionMap); au.setConfiguration(auConfig); return au; } public ExternalizableMap getDefinitionMap() { return definitionMap; } CacheResultHandler getCacheResultHandler() { return resultHandler; } String stripSuffix(String str, String suffix) { return str.substring(0, str.length() - suffix.length()); } protected void initMimeMap() throws PluginException.InvalidDefinition { for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) { Map.Entry ent = (Map.Entry) iter.next(); String key = (String) ent.getKey(); Object val = ent.getValue(); if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " link extractor: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); LinkExtractorFactory fact = (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class); mti.setLinkExtractorFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) { // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY // XXX unless/until that key is changed to not be a terminal substring // XXX of this one String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " crawl filter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class); mti.setCrawlFilterFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY); if (val instanceof String) { String factName = (String) val; log.debug(mime + " filter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class); mti.setHashFilterFactory(fact); } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT); if (val instanceof String) { String rate = (String) val; log.debug(mime + " fetch rate: " + rate); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); RateLimiter limit = mti.getFetchRateLimiter(); if (limit != null) { limit.setRate(rate); } else { mti.setFetchRateLimiter(new RateLimiter(rate)); } } } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY); String factName = (String) val; log.debug(mime + " link rewriter: " + factName); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); LinkRewriterFactory fact = (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class); mti.setLinkRewriterFactory(fact); } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) { String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP); Map factNameMap = (Map) val; Map factClassMap = new HashMap(); MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime); for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) { String mdTypes = (String) it.next(); String factName = (String) factNameMap.get(mdTypes); log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName); for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) { setMdTypeFact(factClassMap, mdType, factName); } } mti.setFileMetadataExtractorFactoryMap(factClassMap); } } } private void setMdTypeFact(Map factClassMap, String mdType, String factName) { log.debug3("Metadata type: " + mdType + " factory " + factName); FileMetadataExtractorFactory fact = (FileMetadataExtractorFactory) newAuxClass(factName, FileMetadataExtractorFactory.class); factClassMap.put(mdType, fact); } protected void initResultMap() throws PluginException.InvalidDefinition { HttpResultMap hResultMap = new HttpResultMap(); // XXX Currently this only allows a CacheResultHandler class to // initialize the result map. Instead, don't use a CacheResultMap // directly, use either the plugin's CacheResultHandler, if specified, // or a default one that wraps the CacheResultMap String handler_class = null; handler_class = definitionMap.getString(KEY_EXCEPTION_HANDLER, null); if (handler_class != null) { try { resultHandler = (CacheResultHandler) newAuxClass(handler_class, CacheResultHandler.class); resultHandler.init(hResultMap); } catch (Exception ex) { throw new PluginException.InvalidDefinition( mapName + " has invalid Exception handler: " + handler_class, ex); } catch (LinkageError le) { throw new PluginException.InvalidDefinition( mapName + " has invalid Exception handler: " + handler_class, le); } } else { // Expect a list of mappings from either result code or exception // name to CacheException name Collection<String> mappings = definitionMap.getCollection(KEY_EXCEPTION_LIST, null); if (mappings != null) { // add each entry for (String entry : mappings) { if (log.isDebug2()) { log.debug2("initMap(" + entry + ")"); } String first; String ceName; try { List<String> pair = StringUtil.breakAt(entry, '=', 2, true, true); first = pair.get(0); ceName = pair.get(1); } catch (Exception ex) { throw new PluginException.InvalidDefinition( "Invalid syntax: " + entry + "in " + mapName); } Object val; // Value should be either a CacheException or CacheResultHandler // class name. PluginFetchEventResponse resp = (PluginFetchEventResponse) newAuxClass(ceName, PluginFetchEventResponse.class, null); if (resp instanceof CacheException) { val = resp.getClass(); } else if (resp instanceof CacheResultHandler) { val = WrapperUtil.wrap((CacheResultHandler) resp, CacheResultHandler.class); } else { throw new PluginException.InvalidDefinition( "Second arg not a " + "CacheException or " + "CacheResultHandler class: " + entry + ", in " + mapName); } try { int code = Integer.parseInt(first); // If parseable as an integer, it's a result code. hResultMap.storeMapEntry(code, val); } catch (NumberFormatException e) { try { Class eClass = Class.forName(first); // If a class name, it should be an exception class if (Exception.class.isAssignableFrom(eClass)) { hResultMap.storeMapEntry(eClass, val); } else { throw new PluginException.InvalidDefinition( "First arg not an " + "Exception class: " + entry + ", in " + mapName); } } catch (Exception ex) { throw new PluginException.InvalidDefinition( "First arg not a " + "number or class: " + entry + ", in " + mapName); } catch (LinkageError le) { throw new PluginException.InvalidDefinition("Can't load " + first, le); } } } } } resultMap = hResultMap; } protected void initFeatureVersions() throws PluginException.InvalidDefinition { if (definitionMap.containsKey(KEY_PLUGIN_FEATURE_VERSION_MAP)) { Map<Plugin.Feature, String> map = new HashMap<Plugin.Feature, String>(); Map<String, String> spec = (Map<String, String>) definitionMap.getMap(KEY_PLUGIN_FEATURE_VERSION_MAP); log.debug2("features: " + spec); for (Map.Entry<String, String> ent : spec.entrySet()) { try { // Prefix version string with feature name to create separate // namespace for each feature String key = ent.getKey(); map.put(Plugin.Feature.valueOf(key), key + "_" + ent.getValue()); } catch (RuntimeException e) { log.warning( getPluginName() + " set unknown feature: " + ent.getKey() + " to version " + ent.getValue(), e); throw new PluginException.InvalidDefinition("Unknown feature: " + ent.getKey(), e); } } featureVersion = map; } else { featureVersion = null; } } protected void initAuFeatureMap() { if (definitionMap.containsKey(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP)) { Map<String, ?> featMap = definitionMap.getMap(DefinableArchivalUnit.KEY_AU_FEATURE_URL_MAP); for (Map.Entry ent : featMap.entrySet()) { Object val = ent.getValue(); if (val instanceof Map) { ent.setValue(MapUtil.expandAlternativeKeyLists((Map) val)); } } } } /** Create a CrawlWindow if necessary and return it. The CrawlWindow must be thread-safe. */ protected CrawlWindow makeCrawlWindow() { if (crawlWindow != null) { return crawlWindow; } CrawlWindow window = (CrawlWindow) definitionMap.getMapElement(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW_SER); if (window == null) { String window_class = definitionMap.getString(DefinableArchivalUnit.KEY_AU_CRAWL_WINDOW, null); if (window_class != null) { ConfigurableCrawlWindow ccw = (ConfigurableCrawlWindow) newAuxClass(window_class, ConfigurableCrawlWindow.class); try { window = ccw.makeCrawlWindow(); } catch (PluginException e) { throw new RuntimeException(e); } } } crawlWindow = window; return window; } LoginPageChecker loginChecker; protected LoginPageChecker makeLoginPageChecker() { if (loginChecker == null) { String loginPageCheckerClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_LOGIN_PAGE_CHECKER, null); if (loginPageCheckerClass != null) { loginChecker = (LoginPageChecker) newAuxClass(loginPageCheckerClass, LoginPageChecker.class); } } return loginChecker; } PermissionCheckerFactory permissionCheckerFact; protected PermissionCheckerFactory getPermissionCheckerFactory() { if (permissionCheckerFact == null) { String permissionCheckerFactoryClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_PERMISSION_CHECKER_FACTORY, null); if (permissionCheckerFactoryClass != null) { permissionCheckerFact = (PermissionCheckerFactory) newAuxClass(permissionCheckerFactoryClass, PermissionCheckerFactory.class); log.debug2("Loaded PermissionCheckerFactory: " + permissionCheckerFact); } } return permissionCheckerFact; } protected UrlNormalizer urlNorm; protected UrlNormalizer getUrlNormalizer() { if (urlNorm == null) { String normalizerClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_URL_NORMALIZER, null); if (normalizerClass != null) { urlNorm = (UrlNormalizer) newAuxClass(normalizerClass, UrlNormalizer.class); } else { urlNorm = NullUrlNormalizer.INSTANCE; } } return urlNorm; } protected ExploderHelper exploderHelper = null; protected ExploderHelper getExploderHelper() { if (exploderHelper == null) { String helperClass = definitionMap.getString(DefinableArchivalUnit.KEY_AU_EXPLODER_HELPER, null); if (helperClass != null) { exploderHelper = (ExploderHelper) newAuxClass(helperClass, ExploderHelper.class); } } return exploderHelper; } protected CrawlUrlComparatorFactory crawlUrlComparatorFactory = null; protected CrawlUrlComparatorFactory getCrawlUrlComparatorFactory() { if (crawlUrlComparatorFactory == null) { String factClass = definitionMap.getString(DefinablePlugin.KEY_PLUGIN_CRAWL_URL_COMPARATOR_FACTORY, null); if (factClass != null) { crawlUrlComparatorFactory = (CrawlUrlComparatorFactory) newAuxClass(factClass, CrawlUrlComparatorFactory.class); } } return crawlUrlComparatorFactory; } protected Comparator<CrawlUrl> getCrawlUrlComparator(ArchivalUnit au) throws PluginException.LinkageError { CrawlUrlComparatorFactory fact = getCrawlUrlComparatorFactory(); if (fact == null) { return null; } return fact.createCrawlUrlComparator(au); } protected FilterRule constructFilterRule(String contentType) { String mimeType = HeaderUtil.getMimeTypeFromContentType(contentType); Object filter_el = definitionMap.getMapElement(mimeType + DefinableArchivalUnit.SUFFIX_FILTER_RULE); if (filter_el instanceof String) { log.debug("Loading filter " + filter_el); return (FilterRule) newAuxClass((String) filter_el, FilterRule.class); } else if (filter_el instanceof List) { if (((List) filter_el).size() > 0) { return new DefinableFilterRule((List) filter_el); } } return super.constructFilterRule(mimeType); } protected ArticleIteratorFactory articleIteratorFact = null; protected ArticleMetadataExtractorFactory articleMetadataFact = null; /** * Returns the plugin's article iterator factory, if any * * @return the ArticleIteratorFactory */ public ArticleIteratorFactory getArticleIteratorFactory() { if (articleIteratorFact == null) { String factClass = definitionMap.getString(KEY_PLUGIN_ARTICLE_ITERATOR_FACTORY, null); if (factClass != null) { articleIteratorFact = (ArticleIteratorFactory) newAuxClass(factClass, ArticleIteratorFactory.class); } } return articleIteratorFact; } /** * Returns the article iterator factory for the content type, if any * * @param contentType the content type * @return the ArticleIteratorFactory */ public ArticleMetadataExtractorFactory getArticleMetadataExtractorFactory(MetadataTarget target) { if (articleMetadataFact == null) { String factClass = definitionMap.getString(KEY_PLUGIN_ARTICLE_METADATA_EXTRACTOR_FACTORY, null); if (factClass != null) { articleMetadataFact = (ArticleMetadataExtractorFactory) newAuxClass(factClass, ArticleMetadataExtractorFactory.class); } } return articleMetadataFact; } public String getPluginId() { String className; if (mapName != null) { className = mapName; } else { // @TODO: eliminate this when we eliminate subclasses className = this.getClass().getName(); } return className; } }