private void checkSubstance(ArchivalUnit au) {
  SubstanceChecker subChecker = new SubstanceChecker(au);
  if (!subChecker.isEnabled()) {
    errMsg = "No substance patterns defined for plugin.";
    return;
  }
  AuState auState = AuUtil.getAuState(au);
  SubstanceChecker.State oldState = auState.getSubstanceState();
  SubstanceChecker.State newState = subChecker.findSubstance();
  String chtxt = (newState == oldState
                  ? "(unchanged)"
                  : "(was " + oldState.toString() + ")");
  switch (newState) {
  case Unknown:
    log.error("Shouldn't happen: SubstanceChecker returned Unknown");
    errMsg = "Error in SubstanceChecker; see log.";
    break;
  case Yes:
    statusMsg = "AU has substance " + chtxt + ": " + au.getName();
    auState.setSubstanceState(SubstanceChecker.State.Yes);
    break;
  case No:
    statusMsg = "AU has no substance " + chtxt + ": " + au.getName();
    auState.setSubstanceState(SubstanceChecker.State.No);
    break;
  }
}
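/**
 * Queue a full, high-priority metadata reindex of the AU.  Unless force is
 * set, this declines and asks for a confirming click if the AU has never
 * been crawled or its substance state is No or Unknown.
 * @return true if the AU was queued for reindexing, false otherwise
 */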
private boolean startReindexingMetadata(ArchivalUnit au, boolean force) {
  if (metadataMgr == null) {
    errMsg = "Metadata processing is not enabled.";
    return false;
  }
  if (!force) {
    if (!AuUtil.hasCrawled(au)) {
      errMsg = "Au has never crawled. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    }
    AuState auState = AuUtil.getAuState(au);
    switch (auState.getSubstanceState()) {
    case No:
      errMsg = "Au has no substance. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    case Unknown:
      errMsg = "Unknown substance for Au. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    case Yes:
      // fall through
    }
  }
  // Fully reindex metadata with the highest priority.
  Connection conn = null;
  PreparedStatement insertPendingAuBatchStatement = null;
  try {
    conn = dbMgr.getConnection();
    insertPendingAuBatchStatement =
        metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn);
    if (metadataMgr.enableAndAddAuToReindex(
        au, conn, insertPendingAuBatchStatement, false, true)) {
      statusMsg = "Reindexing metadata for " + au.getName();
      return true;
    }
  } catch (DbException dbe) {
    log.error("Cannot reindex metadata for " + au.getName(), dbe);
  } finally {
    DbManager.safeCloseStatement(insertPendingAuBatchStatement);
    DbManager.safeRollbackAndClose(conn);
  }
  if (force) {
    errMsg = "Still cannot reindex metadata for " + au.getName();
  } else {
    errMsg = "Cannot reindex metadata for " + au.getName();
  }
  return false;
}
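/**
 * Disable metadata indexing for the AU via the metadata manager.  (The
 * force argument is not consulted here.)
 * @return true if indexing was disabled, false otherwise
 */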
private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) {
  if (metadataMgr == null) {
    errMsg = "Metadata processing is not enabled.";
    return false;
  }
  try {
    metadataMgr.disableAuIndexing(au);
    statusMsg = "Disabled metadata indexing for " + au.getName();
    return true;
  } catch (Exception e) {
    errMsg = "Cannot disable metadata indexing for " + au.getName() + ": "
        + e.getMessage();
    return false;
  }
}
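/**
 * Request a forced new content crawl of every plugin registry AU,
 * accumulating each AU's outcome into the status message.
 */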
private void crawlPluginRegistries() {
  StringBuilder sb = new StringBuilder();
  for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) {
    sb.append(au.getName());
    sb.append(": ");
    try {
      startCrawl(au, true, false);
      sb.append("Queued.");
    } catch (CrawlManagerImpl.NotEligibleException e) {
      sb.append("Failed: ");
      sb.append(e.getMessage());
    }
    sb.append("\n");
  }
  statusMsg = sb.toString();
}
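/**
 * Queue a new content crawl of the AU at the configured priority.  If
 * force is set, an event is backed out of the AU's new-content rate
 * limiter so the request isn't refused on rate-limit grounds; if deep is
 * set, the refetch depth is taken from the form input.
 * @return true if the crawl was requested, false on bad input
 */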
private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep)
    throws CrawlManagerImpl.NotEligibleException {
  CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr;
  if (force) {
    RateLimiter limit = cmi.getNewContentRateLimiter(au);
    if (!limit.isEventOk()) {
      limit.unevent();
    }
  }
  cmi.checkEligibleToQueueNewContentCrawl(au);
  String delayMsg = "";
  String deepMsg = "";
  try {
    cmi.checkEligibleForNewContentCrawl(au);
  } catch (CrawlManagerImpl.NotEligibleException e) {
    delayMsg = ", Start delayed due to: " + e.getMessage();
  }
  Configuration config = ConfigManager.getCurrentConfig();
  int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY);
  CrawlReq req;
  try {
    req = new CrawlReq(au);
    req.setPriority(pri);
    if (deep) {
      int d = Integer.parseInt(formDepth);
      if (d < 0) {
        errMsg = "Illegal refetch depth: " + d;
        return false;
      }
      req.setRefetchDepth(d);
      deepMsg = "Deep (" + req.getRefetchDepth() + ") ";
    }
  } catch (NumberFormatException e) {
    errMsg = "Illegal refetch depth: " + formDepth;
    return false;
  } catch (RuntimeException e) {
    log.error("Couldn't create CrawlReq: " + au, e);
    errMsg = "Couldn't create CrawlReq: " + e.toString();
    return false;
  }
  cmi.startNewContentCrawl(req, null);
  statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg;
  return true;
}
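/**
 * Enqueue a high-priority V3 content poll on the AU's top-level cached
 * URL set.
 */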
private void callV3ContentPoll(ArchivalUnit au)
    throws PollManager.NotEligibleException {
  log.debug("Enqueuing a V3 Content Poll on " + au.getName());
  PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL);
  pollManager.enqueueHighPriorityPoll(au, spec);
  statusMsg = "Enqueued V3 poll for " + au.getName();
}
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;
  logger.info((storeArchive ? "Storing" : "Fetching") + " WARC file: "
      + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc =
          au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns "
        + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for "
          + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
            CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE,
                                               DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for "
            + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
            new ArchiveEntry(elementUrl,
                             elementLength,
                             0, // XXX need to convert getDate string to long
                             element, // ArchiveRecord extends InputStream
                             this,
                             fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName()
            + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw",
                                                     ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }
  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which
    // URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    ArchivalUnit au = crawlFacade.getAu();
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries
        + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}