public int match(String url) { if (StringUtil.equalStringsIgnoreCase(url, m_registryUrl) || StringUtil.endsWithIgnoreCase(url, ".jar")) { return CrawlRule.INCLUDE; } else { return CrawlRule.EXCLUDE; } }
private void doSleep() throws IOException { String timestr = getParameter(KEY_TIME); try { long time = StringUtil.parseTimeInterval(timestr); Deadline.in(time).sleep(); statusMsg = "Slept for " + StringUtil.timeIntervalToString(time); } catch (NumberFormatException e) { errMsg = "Illegal duration: " + e; } catch (InterruptedException e) { errMsg = "Interrupted: " + e; } }
private List getSummaryInfo(CrawlerStatus status) { List res = new ArrayList(); StatusTable.SummaryInfo statusSi = new StatusTable.SummaryInfo( "Status", ColumnDescriptor.TYPE_STRING, status.getCrawlStatusMsg()); ArchivalUnit au = status.getAu(); if (au != null) { AuState aus = AuUtil.getAuState(au); if (status.getCrawlStatus() == Crawler.STATUS_SUCCESSFUL && aus.hasNoSubstance()) { statusSi.setValueFootnote(FOOT_NO_SUBSTANCE_CRAWL_STATUS); } } res.add(statusSi); String sources = StringUtil.separatedString(status.getSources()); res.add(new StatusTable.SummaryInfo("Source", ColumnDescriptor.TYPE_STRING, sources)); String startUrls = StringUtil.separatedString(status.getStartUrls()); res.add( new StatusTable.SummaryInfo("Starting Url(s)", ColumnDescriptor.TYPE_STRING, startUrls)); return res; }
ArchivalUnit getAu() { if (StringUtil.isNullString(formAuid)) { errMsg = "Select an AU"; return null; } ArchivalUnit au = pluginMgr.getAuFromId(formAuid); if (au == null) { errMsg = "No such AU. Select an AU"; return null; } return au; }
public void loadAuConfigDescrs(Configuration config) throws ConfigurationException { super.loadAuConfigDescrs(config); this.m_registryUrl = config.get(ConfigParamDescr.BASE_URL.getKey()); // Now we can construct a valid CC permission checker. m_permissionCheckers = // ListUtil.list(new CreativeCommonsPermissionChecker(m_registryUrl)); ListUtil.list(new CreativeCommonsPermissionChecker()); paramMap.putLong( KEY_AU_NEW_CONTENT_CRAWL_INTERVAL, CurrentConfig.getTimeIntervalParam( PARAM_REGISTRY_CRAWL_INTERVAL, DEFAULT_REGISTRY_CRAWL_INTERVAL)); if (log.isDebug2()) { log.debug2( "Setting Registry AU recrawl interval to " + StringUtil.timeIntervalToString( paramMap.getLong(KEY_AU_NEW_CONTENT_CRAWL_INTERVAL))); } }
public void lockssHandleRequest() throws IOException { resetVars(); boolean showForm = true; String action = getParameter(KEY_ACTION); if (!StringUtil.isNullString(action)) { formAuid = getParameter(KEY_AUID); formDepth = getParameter(KEY_REFETCH_DEPTH); UserAccount acct = getUserAccount(); if (acct != null && !noAuditActions.contains(action)) { acct.auditableEvent("used debug panel action: " + action + " AU ID: " + formAuid); } } if (ACTION_MAIL_BACKUP.equals(action)) { doMailBackup(); } if (ACTION_RELOAD_CONFIG.equals(action)) { doReloadConfig(); } if (ACTION_SLEEP.equals(action)) { doSleep(); } if (ACTION_THROW_IOEXCEPTION.equals(action)) { doThrow(); } if (ACTION_START_V3_POLL.equals(action)) { doV3Poll(); } if (ACTION_FORCE_START_V3_POLL.equals(action)) { forceV3Poll(); } if (ACTION_START_CRAWL.equals(action)) { doCrawl(false, false); } if (ACTION_FORCE_START_CRAWL.equals(action)) { doCrawl(true, false); } if (ACTION_START_DEEP_CRAWL.equals(action)) { doCrawl(false, true); } if (ACTION_FORCE_START_DEEP_CRAWL.equals(action)) { doCrawl(true, true); } if (ACTION_CHECK_SUBSTANCE.equals(action)) { doCheckSubstance(); } if (ACTION_CRAWL_PLUGINS.equals(action)) { crawlPluginRegistries(); } if (ACTION_FIND_URL.equals(action)) { showForm = doFindUrl(); } if (ACTION_REINDEX_METADATA.equals(action)) { doReindexMetadata(); } if (ACTION_FORCE_REINDEX_METADATA.equals(action)) { forceReindexMetadata(); } if (ACTION_DISABLE_METADATA_INDEXING.equals(action)) { doDisableMetadataIndexing(); } if (showForm) { displayPage(); } }
/** Explode the archive into its constituent elements */ public void explode() throws CacheException { CachedUrl cachedUrl = null; int goodEntries = 0; int badEntries = 0; int ignoredEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.info( (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { if (storeArchive) { UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl)); BitSet bs = new BitSet(); bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG); uc.setFetchFlags(bs); uc.storeContent(); archiveData.resetInputStream(); arcStream = archiveData.input; } // Wrap it in an ArchiveReader logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null")); // Explode it if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); Set stemSet = new HashSet(); logger.debug("Exploding " + fetchUrl); // Iterate through the elements in the WARC file, except the first Iterator i = arcReader.iterator(); // Skip first record for (i.next(); i.hasNext(); ) { // XXX probably not necessary helper.pokeWDog(); if ((++entriesBetweenSleep % sleepAfter) == 0) { long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE); Deadline pause = Deadline.in(pauseTime); logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime)); while (!pause.expired()) { try { pause.sleep(); } catch (InterruptedException ie) { // no action } } } ArchiveRecord element = (ArchiveRecord) i.next(); // Each element is a URL to be cached in a suitable AU ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); if (elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry( elementUrl, elementLength, 0, // XXX need to convert getDate string to long element, // ArchiveRecord extends InputStream this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); goodEntries++; crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } else { ignoredEntries++; } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) try { arcReader.close(); arcReader = null; } catch (IOException ex) { throw new CacheException.ExploderException(ex); } if (cachedUrl != null) { cachedUrl.release(); } IOUtil.safeClose(arcStream); } if (badEntries == 0 && goodEntries > 0) { // Make it look like a new crawl finished on each AU to which // URLs were added. for (Iterator it = touchedAus.iterator(); it.hasNext(); ) { ArchivalUnit au = (ArchivalUnit) it.next(); logger.debug3(archiveUrl + " touching " + au.toString()); AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished(); } } else { ArchivalUnit au = crawlFacade.getAu(); String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries"; throw new CacheException.UnretryableException(msg); } }