private void checkSubstance(ArchivalUnit au) {
  SubstanceChecker subChecker = new SubstanceChecker(au);
  if (!subChecker.isEnabled()) {
    errMsg = "No substance patterns defined for plugin.";
    return;
  }
  AuState auState = AuUtil.getAuState(au);
  SubstanceChecker.State oldState = auState.getSubstanceState();
  SubstanceChecker.State newState = subChecker.findSubstance();
  String chtxt = (newState == oldState
                  ? "(unchanged)"
                  : "(was " + oldState.toString() + ")");
  switch (newState) {
  case Unknown:
    log.error("Shouldn't happen: SubstanceChecker returned Unknown");
    errMsg = "Error in SubstanceChecker; see log.";
    break;
  case Yes:
    statusMsg = "AU has substance " + chtxt + ": " + au.getName();
    auState.setSubstanceState(SubstanceChecker.State.Yes);
    break;
  case No:
    statusMsg = "AU has no substance " + chtxt + ": " + au.getName();
    auState.setSubstanceState(SubstanceChecker.State.No);
    break;
  }
}
private void deleteBlock(CachedUrl cu) throws IOException {
  log.info("deleting " + cu.getUrl());
  CachedUrlSetSpec cuss = new SingleNodeCachedUrlSetSpec(cu.getUrl());
  ArchivalUnit au = cu.getArchivalUnit();
  CachedUrlSet cus = au.makeCachedUrlSet(cuss);
  NodeManager nm = au.getPlugin().getDaemon().getNodeManager(au);
  nm.deleteNode(cus);
}
public void testFunctionalFromTarHierarchy() throws Exception {
  log.debug3("in testFromTarHierarchy");
  // Load the tarballs into the AU. Let any IOException propagate and fail
  // the test rather than swallowing it; the finally block still closes the
  // stream on either path.
  InputStream file_input = null;
  try {
    file_input = getResourceAsStream(realTARFile_A);
    UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
    uc.storeContent();
    IOUtil.safeClose(file_input);

    file_input = getResourceAsStream(realTARFile_B);
    uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
    uc.storeContent();
  } finally {
    IOUtil.safeClose(file_input);
  }

  CachedUrlSet cus = tarAu.getAuCachedUrlSet();
  for (CachedUrl cu : cus.getCuIterable()) {
    log.debug3("AU - cu is: " + cu.getUrl());
    cu.release();
  }

  // We need to start from the level of the ArticleMetadataExtractor
  MyListEmitter emitter = new MyListEmitter();
  ArticleMetadataExtractor amEx =
      new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

  Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
  while (it.hasNext()) {
    ArticleFiles af = it.next();
    log.debug3("Metadata test - articlefiles " + af.toString());
    CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
    log.debug3("metadata cu is " + cu.getUrl());
    amEx.extract(MetadataTarget.Any(), af, emitter);
    List<ArticleMetadata> returnList = emitter.getAmList();
    assertNotNull(returnList);
    log.debug3("size of returnList is " + returnList.size());
    for (ArticleMetadata mdRecord : returnList) {
      validateCompleteMetadataRecord(mdRecord);
    }
  }
}
private boolean startReindexingMetadata(ArchivalUnit au, boolean force) {
  if (metadataMgr == null) {
    errMsg = "Metadata processing is not enabled.";
    return false;
  }
  if (!force) {
    if (!AuUtil.hasCrawled(au)) {
      errMsg = "Au has never crawled. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    }
    AuState auState = AuUtil.getAuState(au);
    switch (auState.getSubstanceState()) {
    case No:
      errMsg = "Au has no substance. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    case Unknown:
      errMsg = "Unknown substance for Au. Click again to reindex metadata.";
      showForceReindexMetadata = true;
      return false;
    case Yes:
      // fall through
    }
  }
  // Fully reindex metadata with the highest priority.
  Connection conn = null;
  PreparedStatement insertPendingAuBatchStatement = null;
  try {
    conn = dbMgr.getConnection();
    insertPendingAuBatchStatement =
        metadataMgr.getPrioritizedInsertPendingAuBatchStatement(conn);
    if (metadataMgr.enableAndAddAuToReindex(
        au, conn, insertPendingAuBatchStatement, false, true)) {
      statusMsg = "Reindexing metadata for " + au.getName();
      return true;
    }
  } catch (DbException dbe) {
    log.error("Cannot reindex metadata for " + au.getName(), dbe);
  } finally {
    DbManager.safeCloseStatement(insertPendingAuBatchStatement);
    DbManager.safeRollbackAndClose(conn);
  }
  if (force) {
    errMsg = "Still cannot reindex metadata for " + au.getName();
  } else {
    errMsg = "Cannot reindex metadata for " + au.getName();
  }
  return false;
}
public void testArticleCountAndType() throws Exception {
  int expCount = 28;
  PluginTestUtil.crawlSimAu(sau);
  String pat1 = "branch(\\d+)/(\\d+file\\.html)";
  String rep1 = "aps/journal/v123/n$1/full/$2";
  PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
  String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
  String rep2 = "aps/journal/v123/n$1/pdf/$2";
  PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

  // Remove some URLs
  int deleted = 0;
  for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) {
    CachedUrlSetNode cusn = (CachedUrlSetNode) it.next();
    if (cusn instanceof CachedUrl) {
      CachedUrl cu = (CachedUrl) cusn;
      String url = cu.getUrl();
      if (url.contains("/journal/")
          && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
        deleteBlock(cu);
        ++deleted;
      }
    }
  }
  assertEquals(8, deleted);

  Iterator<ArticleFiles> it = nau.getArticleIterator();
  int count = 0;
  int countHtmlOnly = 0;
  int countPdfOnly = 0;
  while (it.hasNext()) {
    ArticleFiles af = it.next();
    log.info(af.toString());
    CachedUrl cu = af.getFullTextCu();
    // Check for null before dereferencing
    assertNotNull(cu);
    String url = cu.getUrl();
    String contentType = cu.getContentType();
    log.debug("count " + count + " url " + url + " " + contentType);
    count++;
    if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) {
      ++countHtmlOnly;
    }
    // Compare strings with equals(), not ==
    if (url.equals(af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF))) {
      ++countPdfOnly;
    }
  }
  log.debug("Article count is " + count);
  assertEquals(expCount, count);
  assertEquals(4, countHtmlOnly);
  assertEquals(4, countPdfOnly);
}
private boolean disableMetadataIndexing(ArchivalUnit au, boolean force) {
  if (metadataMgr == null) {
    errMsg = "Metadata processing is not enabled.";
    return false;
  }
  try {
    metadataMgr.disableAuIndexing(au);
    statusMsg = "Disabled metadata indexing for " + au.getName();
    return true;
  } catch (Exception e) {
    errMsg = "Cannot disable metadata indexing for " + au.getName()
        + ": " + e.getMessage();
    return false;
  }
}
private void crawlPluginRegistries() {
  StringBuilder sb = new StringBuilder();
  for (ArchivalUnit au : pluginMgr.getAllRegistryAus()) {
    sb.append(au.getName());
    sb.append(": ");
    try {
      startCrawl(au, true, false);
      sb.append("Queued.");
    } catch (CrawlManagerImpl.NotEligibleException e) {
      sb.append("Failed: ");
      sb.append(e.getMessage());
    }
    sb.append("\n");
  }
  statusMsg = sb.toString();
}
/* private support methods */

private List<ArticleMetadata> setupContentForAU(
    ArchivalUnit au, String url, String content, boolean isHtmlExtractor)
    throws IOException, PluginException {
  FileMetadataExtractor me;
  InputStream input = IOUtils.toInputStream(content, "utf-8");
  CIProperties props;
  if (isHtmlExtractor) {
    props = getContentHtmlProperties();
    me = new BaseAtyponHtmlMetadataExtractorFactory()
        .createFileMetadataExtractor(MetadataTarget.Any(), "text/html");
  } else {
    props = getContentRisProperties();
    me = new BaseAtyponRisMetadataExtractorFactory()
        .createFileMetadataExtractor(MetadataTarget.Any(), "text/plain");
  }
  UrlData ud = new UrlData(input, props, url);
  UrlCacher uc = au.makeUrlCacher(ud);
  uc.storeContent();
  CachedUrl cu = uc.getCachedUrl();
  FileMetadataListExtractor mle = new FileMetadataListExtractor(me);
  return mle.extract(MetadataTarget.Any(), cu);
}
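// A minimal usage sketch for setupContentForAU(). The test name, URL, and RIS
// content below are hypothetical illustration data, not from the original
// code; it assumes an AU fixture such as bau1 created in setUp().
public void testExtractsRisRecord() throws Exception {
  String risUrl =
      "http://www.example.com/action/downloadCitation?doi=10.9999%2Ffoo&format=ris";
  String risContent = "TY  - JOUR\nT1  - A Sample Title\nJF  - Sample Journal\nER  -";
  // false selects the RIS extractor branch
  List<ArticleMetadata> mdList = setupContentForAU(bau1, risUrl, risContent, false);
  assertNotNull(mdList);
  assertFalse(mdList.isEmpty());
}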
public void export() {
  log.debug("export(" + au.getName() + ")");
  log.debug("dir: " + dir + ", pref: " + prefix + ", size: " + maxSize
            + ", ver: " + maxVersions + (compress ? ", (C)" : ""));
  checkArgs();
  try {
    start();
  } catch (IOException e) {
    recordError("Error opening file", e);
    return;
  }
  writeFiles();
  try {
    finish();
  } catch (IOException e) {
    if (!isDiskFull) {
      // If we already knew (and reported) disk full, also reporting it
      // as a close error is misleading.
      recordError("Error closing file", e);
    }
  }
}
public static String getRepositorySpec(ArchivalUnit au) {
  Configuration auConfig = au.getConfiguration();
  if (auConfig != null) { // can be null in unit tests
    String repoSpec = auConfig.get(PluginManager.AU_PARAM_REPOSITORY);
    if (repoSpec != null && repoSpec.startsWith("local:")) {
      return repoSpec;
    }
  }
  return "local:" + CurrentConfig.getParam(PARAM_CACHE_LOCATION);
}
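// A minimal sketch of how the spec can be interpreted (this helper is
// hypothetical, not part of the original code): an AU whose config names a
// "local:..." repository returns that spec; all other AUs collapse to the
// daemon-wide default "local:<cache dir>".
static boolean usesDefaultRepository(ArchivalUnit au) {
  String spec = getRepositorySpec(au);
  return spec.equals("local:" + CurrentConfig.getParam(PARAM_CACHE_LOCATION));
}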
List<String> auUrls(ArchivalUnit au) {
  List<String> res = new ArrayList<String>();
  for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) {
    CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next();
    if (cusn.hasContent()) {
      res.add(cusn.getUrl());
    }
  }
  return res;
}
public BePressArticleIterator(
    ArchivalUnit au, SubTreeArticleIterator.Spec spec, boolean isSection) {
  super(au, spec);
  String volumeAsString =
      au.getConfiguration().get(ConfigParamDescr.VOLUME_NUMBER.getKey());
  String journalAbbr =
      au.getConfiguration().get(ConfigParamDescr.JOURNAL_ABBR.getKey());
  if (isSection) {
    journalAbbr = journalAbbr + "/" + au.getConfiguration().get("journal_section");
  }
  // Pick up issue level and lower; make (art)?[0-9]+ optional because a few
  // AUs have articles at the issue level
  this.pattern = Pattern.compile(
      String.format(
          "/%s/((([^0-9]+/)?(vol)?%s/(iss)?[0-9]+(/(art)?[0-9]+)?)|(vol%s/(?-i:[A-Z])[0-9]+))$",
          journalAbbr, volumeAsString, volumeAsString),
      Pattern.CASE_INSENSITIVE);
  this.TOC_pattern = Pattern.compile(
      String.format("/%s/([^0-9]+/)?(vol)?%s/(iss)?[0-9]+$", journalAbbr, volumeAsString),
      Pattern.CASE_INSENSITIVE);
}
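// Worked examples of the two patterns above, with a hypothetical journal
// abbreviation "bis" and volume "4" (illustration only, not from the
// original code):
//
//   /bis/vol4/iss2        matches TOC_pattern, and also matches pattern,
//                         since the trailing (/(art)?[0-9]+)? group is optional
//   /bis/vol4/iss2/art5   matches pattern (article within an issue)
//   /bis/vol4/A3          matches pattern via the second alternative,
//                         vol<num>/<uppercase letter><num>; note (?-i:[A-Z])
//                         keeps the letter case-sensitive despite
//                         Pattern.CASE_INSENSITIVE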
/**
 * Factory method to create new LockssRepository instances.
 *
 * @param au the {@link ArchivalUnit}
 * @return the new LockssRepository instance
 */
public static LockssRepository createNewLockssRepository(ArchivalUnit au) {
  String root = getRepositoryRoot(au);
  if (root == null || root.equals("null")) {
    logger.error("No repository dir set in config");
    throw new LockssRepository.RepositoryStateException("No repository dir set in config");
  }
  String auDir = LockssRepositoryImpl.mapAuToFileLocation(root, au);
  if (logger.isDebug2()) {
    logger.debug2("repo: " + auDir + ", au: " + au.getName());
  }
  staticCacheLocation = extendCacheLocation(root);
  LockssRepositoryImpl repo = new LockssRepositoryImpl(auDir);
  Plugin plugin = au.getPlugin();
  if (plugin != null) {
    LockssDaemon daemon = plugin.getDaemon();
    if (daemon != null) {
      RepositoryManager mgr = daemon.getRepositoryManager();
      if (mgr != null) {
        mgr.setRepositoryForPath(auDir, repo);
      }
    }
  }
  return repo;
}
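// A minimal caller sketch (the method name below is hypothetical): since
// createNewLockssRepository() throws RepositoryStateException when no
// repository dir is configured, callers that can tolerate a missing
// repository may want to catch it rather than propagate.
static LockssRepository repositoryForAuOrNull(ArchivalUnit au) {
  try {
    return createNewLockssRepository(au);
  } catch (LockssRepository.RepositoryStateException e) {
    logger.error("Could not create repository for " + au.getName(), e);
    return null;
  }
}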
private boolean startCrawl(ArchivalUnit au, boolean force, boolean deep)
    throws CrawlManagerImpl.NotEligibleException {
  CrawlManagerImpl cmi = (CrawlManagerImpl) crawlMgr;
  if (force) {
    RateLimiter limit = cmi.getNewContentRateLimiter(au);
    if (!limit.isEventOk()) {
      limit.unevent();
    }
  }
  cmi.checkEligibleToQueueNewContentCrawl(au);
  String delayMsg = "";
  String deepMsg = "";
  try {
    cmi.checkEligibleForNewContentCrawl(au);
  } catch (CrawlManagerImpl.NotEligibleException e) {
    delayMsg = ", Start delayed due to: " + e.getMessage();
  }
  Configuration config = ConfigManager.getCurrentConfig();
  int pri = config.getInt(PARAM_CRAWL_PRIORITY, DEFAULT_CRAWL_PRIORITY);
  CrawlReq req;
  try {
    req = new CrawlReq(au);
    req.setPriority(pri);
    if (deep) {
      int d = Integer.parseInt(formDepth);
      if (d < 0) {
        errMsg = "Illegal refetch depth: " + d;
        return false;
      }
      req.setRefetchDepth(d);
      deepMsg = "Deep (" + req.getRefetchDepth() + ") ";
    }
  } catch (NumberFormatException e) {
    errMsg = "Illegal refetch depth: " + formDepth;
    return false;
  } catch (RuntimeException e) {
    log.error("Couldn't create CrawlReq: " + au, e);
    errMsg = "Couldn't create CrawlReq: " + e.toString();
    return false;
  }
  cmi.startNewContentCrawl(req, null);
  statusMsg = deepMsg + "Crawl requested for " + au.getName() + delayMsg;
  return true;
}
public void setUp() throws Exception {
  super.setUp();
  setUpDiskSpace(); // needed for startService() to work properly
  theDaemon = getMockLockssDaemon();
  theDaemon.getAlertManager();
  theDaemon.getPluginManager().setLoadablePluginsReady(true);
  theDaemon.setDaemonInited(true);
  theDaemon.getPluginManager().startService();
  theDaemon.getCrawlManager();

  // The tdb file in this directory is "test_baseatypon.tdb", but it is
  // loaded as XML
  ConfigurationUtil.addFromUrl(getResource("test_baseatypon.xml"));
  Tdb tdb = ConfigManager.getCurrentConfig().getTdb();
  TdbAu tdbau1 = tdb.getTdbAusLikeName(goodJournal + " Volume " + goodVolume).get(0);
  assertNotNull("Didn't find named TdbAu", tdbau1);
  bau1 = PluginTestUtil.createAndStartAu(tdbau1);
  assertNotNull(bau1);
  TypedEntryMap auConfig = bau1.getProperties();
  assertEquals(BASE_URL, auConfig.getString(BASE_URL_KEY));
}
/**
 * Finds the directory for this AU. If none is found in the map and
 * <code>create</code> is true, designates a new dir for it.
 *
 * @param au the AU
 * @param repoRoot root of the repository
 * @param create true to allocate a new directory if none is mapped
 * @return the dir {@link String}
 */
static String getAuDir(ArchivalUnit au, String repoRoot, boolean create) {
  return getAuDir(au.getAuId(), repoRoot, create);
}
private void callV3ContentPoll(ArchivalUnit au)
    throws PollManager.NotEligibleException {
  log.debug("Enqueuing a V3 Content Poll on " + au.getName());
  PollSpec spec = new PollSpec(au.getAuCachedUrlSet(), Poll.V3_POLL);
  pollManager.enqueueHighPriorityPoll(au, spec);
  statusMsg = "Enqueued V3 poll for " + au.getName();
}
public static V1Poll createCompletedPoll(
    LockssDaemon daemon,
    ArchivalUnit au,
    V1LcapMessage testmsg,
    int numAgree,
    int numDisagree,
    PollManager pollmanager)
    throws Exception {
  log.debug("createCompletedPoll: au: " + au.toString()
            + " peer " + testmsg.getOriginatorId()
            + " votes " + numAgree + "/" + numDisagree);
  CachedUrlSetSpec cusSpec = null;
  if ((testmsg.getLwrBound() != null)
      && (testmsg.getLwrBound().equals(PollSpec.SINGLE_NODE_LWRBOUND))) {
    cusSpec = new SingleNodeCachedUrlSetSpec(testmsg.getTargetUrl());
  } else {
    cusSpec = new RangeCachedUrlSetSpec(
        testmsg.getTargetUrl(), testmsg.getLwrBound(), testmsg.getUprBound());
  }
  CachedUrlSet cus = au.makeCachedUrlSet(cusSpec);
  PollSpec spec = new PollSpec(cus, Poll.V1_CONTENT_POLL);
  ((MockCachedUrlSet) spec.getCachedUrlSet()).setHasContent(false);
  V1Poll p = null;
  if (testmsg.isContentPoll()) {
    p = new V1ContentPoll(
        spec,
        pollmanager,
        testmsg.getOriginatorId(),
        testmsg.getChallenge(),
        testmsg.getDuration(),
        testmsg.getHashAlgorithm());
  } else if (testmsg.isNamePoll()) {
    p = new V1NamePoll(
        spec,
        pollmanager,
        testmsg.getOriginatorId(),
        testmsg.getChallenge(),
        testmsg.getDuration(),
        testmsg.getHashAlgorithm());
  } else if (testmsg.isVerifyPoll()) {
    p = new V1VerifyPoll(
        spec,
        pollmanager,
        testmsg.getOriginatorId(),
        testmsg.getChallenge(),
        testmsg.getDuration(),
        testmsg.getHashAlgorithm(),
        testmsg.getVerifier());
  }
  assertNotNull(p);
  p.setMessage(testmsg);
  p.m_tally.quorum = numAgree + numDisagree;
  p.m_tally.numAgree = numAgree;
  p.m_tally.numDisagree = numDisagree;
  p.m_tally.wtAgree = 2000;
  p.m_tally.wtDisagree = 200;
  p.m_tally.localEntries = makeEntries(1, 3);
  p.m_tally.votedEntries = makeEntries(1, 5);
  p.m_tally.votedEntries.remove(1);
  p.m_pollstate = V1Poll.PS_COMPLETE;
  p.m_callerID = testmsg.getOriginatorId();
  log.debug3("poll " + p.toString());
  p.m_tally.tallyVotes();
  return p;
}
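// A hypothetical call site (the test name and the fixtures theDaemon, testau,
// testmsg, and pollmanager are assumed to exist elsewhere in the test class;
// they are not defined here): build a completed poll with five agreeing and
// two disagreeing votes, then assert on the resulting tally.
public void testCompletedPollTally() throws Exception {
  V1Poll poll = createCompletedPoll(theDaemon, testau, testmsg, 5, 2, pollmanager);
  assertEquals(V1Poll.PS_COMPLETE, poll.m_pollstate);
  assertEquals(5, poll.m_tally.numAgree);
  assertEquals(2, poll.m_tally.numDisagree);
}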
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;
  logger.info((storeArchive ? "Storing" : "Fetching") + " WARC file: "
              + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
            CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
            new ArchiveEntry(elementUrl,
                             elementLength,
                             0, // XXX need to convert getDate string to long
                             element, // ArchiveRecord extends InputStream
                             this,
                             fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }
  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which
    // URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}