/** Create LockssKeystores from config subtree below {@link #PARAM_KEYSTORE} */ void configureKeyStores(Configuration config) { Configuration allKs = config.getConfigTree(PARAM_KEYSTORE); for (Iterator iter = allKs.nodeIterator(); iter.hasNext(); ) { String id = (String) iter.next(); Configuration oneKs = allKs.getConfigTree(id); try { LockssKeyStore lk = createLockssKeyStore(oneKs); String name = lk.getName(); if (name == null) { log.error("KeyStore definition missing name: " + oneKs); continue; } LockssKeyStore old = keystoreMap.get(name); if (old != null && !lk.equals(old)) { log.warning( "Keystore " + name + " redefined. " + "New definition may not take effect until daemon restart"); } log.debug("Adding keystore " + name); keystoreMap.put(name, lk); } catch (Exception e) { log.error("Couldn't create keystore: " + oneKs, e); } } }
/** * Return a TitleConfig for the AU. Returns matching entry from the title db if found, else * creates one */ TitleConfig titleConfigFromAu(InactiveAuProxy au) { PluginProxy plugin = au.getPlugin(); String auname = au.getName(); Configuration auConfig = au.getConfiguration(); TitleConfig tc = AuUtil.findTitleConfig(auConfig, plugin.getPlugin()); if (tc != null) { return tc; } tc = new TitleConfig(auname, plugin.getPluginId()); ArrayList<ConfigParamAssignment> params = new ArrayList(); for (Iterator iter = auConfig.keyIterator(); iter.hasNext(); ) { String key = (String) iter.next(); if (!ConfigParamDescr.isReservedParam(key)) { String val = auConfig.get(key); ConfigParamDescr descr = plugin.findAuConfigDescr(key); if (descr != null) { ConfigParamAssignment cpa = new ConfigParamAssignment(descr, val); params.add(cpa); } else { log.warning("Unknown parameter key: " + key + " in au: " + auname); } } } params.trimToSize(); tc.setParams(params); return tc; }
protected void checkLeaf(SimulatedArchivalUnit sau) { log.debug("checkLeaf()"); String parent = sau.getUrlRoot() + "/branch1"; CachedUrlSetSpec spec = new RangeCachedUrlSetSpec(parent); CachedUrlSet set = sau.makeCachedUrlSet(spec); Iterator setIt = set.contentHashIterator(); ArrayList childL = new ArrayList(16); while (setIt.hasNext()) { childL.add(((CachedUrlSetNode) setIt.next()).getUrl()); } String[] expectedA = new String[] { parent, parent + "/001file.html", parent + "/001file.txt", parent + "/002file.html", parent + "/002file.txt", parent + "/branch1", parent + "/branch1/001file.html", parent + "/branch1/001file.txt", parent + "/branch1/002file.html", parent + "/branch1/002file.txt", parent + "/branch1/index.html", parent + "/branch2", parent + "/branch2/001file.html", parent + "/branch2/001file.txt", parent + "/branch2/002file.html", parent + "/branch2/002file.txt", parent + "/branch2/index.html", parent + "/index.html", }; assertIsomorphic(expectedA, childL); }
/** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractGoodRisContent() throws Exception { String goodContent = createGoodRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); assertEquals(goodVolume, md.get(MetadataField.FIELD_VOLUME)); assertEquals(goodIssue, md.get(MetadataField.FIELD_ISSUE)); assertEquals(goodStartPage, md.get(MetadataField.FIELD_START_PAGE)); assertEquals(goodEndPage, md.get(MetadataField.FIELD_END_PAGE)); assertEquals(goodIssn, md.get(MetadataField.FIELD_ISSN)); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); assertEquals(goodDOI, md.get(MetadataField.FIELD_DOI)); // This shouldn't get set. It will default later to fuill_text_cu assertNotEquals(doiURL, md.get(MetadataField.FIELD_ACCESS_URL)); }
private V3LcapMessage makeTestVoteMessage(Collection voteBlocks) throws IOException { mPollMgr.setStateDir("key", tempDir); V3LcapMessage msg = new V3LcapMessage( "ArchivalID_2", "key", "Plug42", m_testBytes, m_testBytes, V3LcapMessage.MSG_VOTE, 987654321, m_testID, tempDir, theDaemon); // Set msg vote blocks. for (Iterator ix = voteBlocks.iterator(); ix.hasNext(); ) { msg.addVoteBlock((VoteBlock) ix.next()); } msg.setHashAlgorithm(LcapMessage.getDefaultHashAlgorithm()); msg.setArchivalId(m_archivalID); msg.setPluginVersion("PlugVer42"); return msg; }
protected void checkRoot(SimulatedArchivalUnit sau) { log.debug("checkRoot()"); CachedUrlSet set = sau.getAuCachedUrlSet(); Iterator setIt = set.flatSetIterator(); ArrayList childL = new ArrayList(1); CachedUrlSet cus = null; while (setIt.hasNext()) { cus = (CachedUrlSet) setIt.next(); childL.add(cus.getUrl()); } String urlRoot = sau.getUrlRoot(); String[] expectedA = new String[1]; expectedA[0] = urlRoot; assertIsomorphic(expectedA, childL); setIt = cus.flatSetIterator(); childL = new ArrayList(7); while (setIt.hasNext()) { childL.add(((CachedUrlSetNode) setIt.next()).getUrl()); } expectedA = new String[] { urlRoot + "/001file.html", urlRoot + "/001file.txt", urlRoot + "/002file.html", urlRoot + "/002file.txt", urlRoot + "/branch1", urlRoot + "/branch2", urlRoot + "/index.html" }; assertIsomorphic(expectedA, childL); }
/** * Return the titles in the set. * * @return a collection of TitleConfig */ public Collection<TitleConfig> getTitles() { Collection aus = daemon.getRemoteApi().getInactiveAus(); ArrayList<TitleConfig> res = new ArrayList<TitleConfig>(aus.size()); for (Iterator iter = aus.iterator(); iter.hasNext(); ) { InactiveAuProxy aup = (InactiveAuProxy) iter.next(); res.add(titleConfigFromAu(aup)); } res.trimToSize(); return res; }
List<String> auUrls(ArchivalUnit au) { List<String> res = new ArrayList<String>(); for (Iterator iter = au.getAuCachedUrlSet().contentHashIterator(); iter.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) iter.next(); if (cusn.hasContent()) { res.add(cusn.getUrl()); } } return res; }
/** Concatenate params for URL string */ String concatParams(Properties props) { if (props == null) { return null; } java.util.List list = new ArrayList(); for (Iterator iter = props.keySet().iterator(); iter.hasNext(); ) { String key = (String) iter.next(); String val = props.getProperty(key); if (!StringUtil.isNullString(val)) { list.add(key + "=" + urlEncode(val)); } } return StringUtil.separatedString(list, "&"); }
public List findExistingRepositoriesFor(String auid) { List res = null; for (Iterator iter = getRepositoryList().iterator(); iter.hasNext(); ) { String repoName = (String) iter.next(); String path = LockssRepositoryImpl.getLocalRepositoryPath(repoName); if (LockssRepositoryImpl.doesAuDirExist(auid, path)) { if (res == null) { res = new ArrayList(); } res.add(repoName); } } return res == null ? Collections.EMPTY_LIST : res; }
public void testSimpleDatasetXML() throws Exception { log.debug3("testSimpleDatasetXML"); String file_input = StringUtil.fromInputStream(getResourceAsStream(testDatasetFile)); String xml_url = TAR_A_BASE + SUBDIR + "dataset.xml"; List<ArticleMetadata> mdList = extractFromContent(xml_url, "text/xml", file_input, nocheck_mle, null); assertEquals(6, mdList.size()); Iterator<ArticleMetadata> mdIt = mdList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata) mdIt.next(); validateDatasetMetadataRecord(mdRecord); } }
/** * Checks the consistency of the node, and continues with its children if it's consistent. * * @param node RepositoryNodeImpl the node to check */ private void recurseConsistencyCheck(RepositoryNodeImpl node) { logger.debug2("Checking node '" + node.getNodeUrl() + "'..."); // check consistency at each node // correct/deactivate as necessary // 'checkNodeConsistency()' will repair if possible if (node.checkNodeConsistency()) { logger.debug3("Node consistent; recursing on children..."); List children = node.getNodeList(null, false); Iterator iter = children.iterator(); while (iter.hasNext()) { RepositoryNodeImpl child = (RepositoryNodeImpl) iter.next(); recurseConsistencyCheck(child); } } else { logger.debug3("Node inconsistent; deactivating..."); deactivateInconsistentNode(node); } }
/** * Method that creates a simulated Cached URL from the source code provided by the goodContent * String. It then asserts that the metadata extracted, by using the * MetaPressRisMetadataExtractorFactory, match the metadata in the source code. * * @throws Exception */ public void testExtractAlternateRisContent() throws Exception { String goodContent = createAlternateRisContent(); log.debug3(goodContent); List<ArticleMetadata> mdlist = setupContentForAU(bau1, RIS_URL, goodContent, false); assertNotEmpty(mdlist); ArticleMetadata md = mdlist.get(0); assertNotNull(md); Iterator<String> actAuthIter = md.getList(MetadataField.FIELD_AUTHOR).iterator(); for (String expAuth : goodAuthors) { assertEquals(expAuth, actAuthIter.next()); } assertEquals(goodTitle, md.get(MetadataField.FIELD_ARTICLE_TITLE)); assertEquals(goodJournal, md.get(MetadataField.FIELD_PUBLICATION_TITLE)); assertEquals(goodDate, md.get(MetadataField.FIELD_DATE)); assertEquals(goodPublisher, md.get(MetadataField.FIELD_PUBLISHER)); }
public void testRequestMessageCreation() throws Exception { V3LcapMessage reqMsg = new V3LcapMessage( "ArchivalID_2", "key", "Plug42", m_testBytes, m_testBytes, V3LcapMessage.MSG_REPAIR_REQ, 987654321, m_testID, tempDir, theDaemon); reqMsg.setTargetUrl("http://foo.com/"); for (Iterator ix = m_testVoteBlocks.iterator(); ix.hasNext(); ) { reqMsg.addVoteBlock((VoteBlock) ix.next()); } assertEquals(3, reqMsg.getProtocolVersion()); assertEquals("Plug42", reqMsg.getPluginVersion()); assertTrue(m_testID == reqMsg.getOriginatorId()); assertEquals(V3LcapMessage.MSG_REPAIR_REQ, reqMsg.getOpcode()); assertEquals("ArchivalID_2", reqMsg.getArchivalId()); assertEquals("http://foo.com/", reqMsg.getTargetUrl()); assertEquals(m_testBytes, reqMsg.getPollerNonce()); assertEquals(m_testBytes, reqMsg.getVoterNonce()); assertEquals(null, reqMsg.getVoterNonce2()); List aBlocks = new ArrayList(); List bBlocks = new ArrayList(); for (VoteBlocksIterator iter = m_testMsg.getVoteBlockIterator(); iter.hasNext(); ) { aBlocks.add(iter.next()); } for (VoteBlocksIterator iter = reqMsg.getVoteBlockIterator(); iter.hasNext(); ) { bBlocks.add(iter.next()); } assertEquals(aBlocks, bBlocks); // Actual size of test vote blocks is unpredictable assertTrue(reqMsg.getEstimatedEncodedLength() > V3LcapMessage.EST_ENCODED_HEADER_LENGTH); }
private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) { int fetchCount = fetched.size(); outputMessage( "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth, TEST_SUMMARY_MESSAGE); outputMessage( "\nUrls fetched: " + fetchCount + " Urls extracted: " + m_extracted.size(), PLAIN_MESSAGE); outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE); for (int depth = 1; depth <= m_crawlDepth; depth++) { PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d"); Integer[] args = new Integer[] { new Integer(depth), new Integer(depth_fetched[depth - 1]), new Integer(depth_parsed[depth - 1]), new Integer(depth_incl[depth - 1]), }; String s = pf.sprintf(args); outputMessage(s, PLAIN_MESSAGE); } outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE); if (false) { for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) { String url = (String) iter.next(); outputMessage(url, PLAIN_MESSAGE); } } long secs = elapsedTime / Constants.SECOND; long fetchRate = 0; if (secs > 0) { fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime; } outputMessage( "\nElapsed Time: " + secs + " secs." + " Fetch Rate: " + fetchRate + " p/m", PLAIN_MESSAGE); }
public void testFunctionalFromTarHierarchy() throws Exception { log.debug3("in testFromTarHierarchy"); // load the tarballs InputStream file_input = null; try { file_input = getResourceAsStream(realTARFile_A); // UrlCacher uc = au.makeUrlCacher(TAR_A_BASE); // uc.storeContent(file_input, tarHeader); UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); file_input = getResourceAsStream(realTARFile_B); // uc = au.makeUrlCacher(TAR_B_BASE); // uc.storeContent(file_input, tarHeader); uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE)); uc.storeContent(); IOUtil.safeClose(file_input); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { IOUtil.safeClose(file_input); } CachedUrlSet cus = tarAu.getAuCachedUrlSet(); for (CachedUrl cu : cus.getCuIterable()) { log.debug3("AU - cu is: " + cu.getUrl()); cu.release(); } // We need to start from the level of the ArticleMetadataExtractor MyListEmitter emitter = new MyListEmitter(); ArticleMetadataExtractor amEx = new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA); Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any()); while (it.hasNext()) { ArticleFiles af = it.next(); log.debug3("Metadata test - articlefiles " + af.toString()); // CachedUrl cu = af.getFullTextCu(); CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA); log.debug3("metadata cu is " + cu.getUrl()); // List<ArticleMetadata> mdlist = mle.extract(MetadataTarget.Any(), cu); amEx.extract(MetadataTarget.Any(), af, emitter); List<ArticleMetadata> returnList = emitter.getAmList(); assertNotNull(returnList); log.debug3("size of returnList is " + returnList.size()); Iterator<ArticleMetadata> mdIt = returnList.iterator(); ArticleMetadata mdRecord = null; while (mdIt.hasNext()) { mdRecord = (ArticleMetadata) mdIt.next(); validateCompleteMetadataRecord(mdRecord); } } }
protected CIProperties makeCIProperties(ArchiveRecordHeader elementHeader) throws IOException { CIProperties ret = new CIProperties(); Set elementHeaderFieldKeys = elementHeader.getHeaderFieldKeys(); for (Iterator i = elementHeaderFieldKeys.iterator(); i.hasNext(); ) { String key = (String) i.next(); try { Object valueObject = elementHeader.getHeaderValue(key); if (valueObject == null) { logger.warning("Ignoring null value for key '" + key + "'."); } else { String value = valueObject.toString(); logger.debug3(key + ": " + value); ret.put(key, value); } } catch (ClassCastException ex) { logger.error("makeCIProperties: " + key + " threw ", ex); throw new CacheException.ExploderException(ex); } } return (ret); }
public void testArticleCountAndType() throws Exception { int expCount = 28; PluginTestUtil.crawlSimAu(sau); String pat1 = "branch(\\d+)/(\\d+file\\.html)"; String rep1 = "aps/journal/v123/n$1/full/$2"; PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1); String pat2 = "branch(\\d+)/(\\d+file\\.pdf)"; String rep2 = "aps/journal/v123/n$1/pdf/$2"; PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2); // Remove some URLs int deleted = 0; for (Iterator it = nau.getAuCachedUrlSet().contentHashIterator(); it.hasNext(); ) { CachedUrlSetNode cusn = (CachedUrlSetNode) it.next(); if (cusn instanceof CachedUrl) { CachedUrl cu = (CachedUrl) cusn; String url = cu.getUrl(); if (url.contains("/journal/") && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) { deleteBlock(cu); ++deleted; } } } assertEquals(8, deleted); Iterator<ArticleFiles> it = nau.getArticleIterator(); int count = 0; int countHtmlOnly = 0; int countPdfOnly = 0; while (it.hasNext()) { ArticleFiles af = it.next(); log.info(af.toString()); CachedUrl cu = af.getFullTextCu(); String url = cu.getUrl(); assertNotNull(cu); String contentType = cu.getContentType(); log.debug("count " + count + " url " + url + " " + contentType); count++; if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == null) { ++countHtmlOnly; } if (af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF) == url) { ++countPdfOnly; } } log.debug("Article count is " + count); assertEquals(expCount, count); assertEquals(4, countHtmlOnly); assertEquals(4, countPdfOnly); }
private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) { Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported)); Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported)); if (!m_inclset.isEmpty()) { outputMessage( "\nIncluded Urls: (" + new_incls.size() + " new, " + (m_inclset.size() - new_incls.size()) + " old)", URL_SUMMARY_MESSAGE); depth_incl[m_curDepth - 1] += new_incls.size(); } for (Iterator it = new_incls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } if (!m_exclset.isEmpty()) { outputMessage( "\nExcluded Urls: (" + new_excls.size() + " new, " + (m_exclset.size() - new_excls.size()) + " old)", URL_SUMMARY_MESSAGE); } for (Iterator it = new_excls.iterator(); it.hasNext(); ) { outputMessage(it.next().toString(), PLAIN_MESSAGE); } m_reported.addAll(new_incls); m_reported.addAll(new_excls); if (m_outWriter != null) { try { m_outWriter.flush(); } catch (IOException ex) { } } return new_incls; }
public void setConfig( Configuration config, Configuration oldConfig, Configuration.Differences changedKeys) { // Build list of repositories from list of disk (fs) paths). Needs to // be generalized if ever another repository implementation. if (changedKeys.contains(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST)) { List lst = new ArrayList(); String dspace = config.get(ConfigManager.PARAM_PLATFORM_DISK_SPACE_LIST, ""); List paths = StringUtil.breakAt(dspace, ';'); if (paths != null) { for (Iterator iter = paths.iterator(); iter.hasNext(); ) { lst.add("local:" + (String) iter.next()); } } repoList = lst; } if (changedKeys.contains(PARAM_MAX_PER_AU_CACHE_SIZE)) { paramNodeCacheSize = config.getInt(PARAM_MAX_PER_AU_CACHE_SIZE, DEFAULT_MAX_PER_AU_CACHE_SIZE); for (Iterator iter = getDaemon().getAllLockssRepositories().iterator(); iter.hasNext(); ) { LockssRepository repo = (LockssRepository) iter.next(); if (repo instanceof LockssRepositoryImpl) { LockssRepositoryImpl repoImpl = (LockssRepositoryImpl) repo; repoImpl.setNodeCacheSize(paramNodeCacheSize); } } } if (changedKeys.contains(PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE)) { paramSuspectVersionsCacheSize = config.getInt( PARAM_MAX_SUSPECT_VERSIONS_CACHE_SIZE, DEFAULT_MAX_SUSPECT_VERSIONS_CACHE_SIZE); suspectVersionsCache.setMaxSize(paramSuspectVersionsCacheSize); } if (changedKeys.contains(GLOBAL_CACHE_PREFIX)) { paramIsGlobalNodeCache = config.getBoolean(PARAM_GLOBAL_CACHE_ENABLED, DEFAULT_GLOBAL_CACHE_ENABLED); if (paramIsGlobalNodeCache) { paramGlobalNodeCacheSize = config.getInt(PARAM_MAX_GLOBAL_CACHE_SIZE, DEFAULT_MAX_GLOBAL_CACHE_SIZE); log.debug("global node cache size: " + paramGlobalNodeCacheSize); globalNodeCache.setMaxSize(paramGlobalNodeCacheSize); } } if (changedKeys.contains(DISK_PREFIX)) { int minMB = config.getInt(PARAM_DISK_WARN_FRRE_MB, DEFAULT_DISK_WARN_FRRE_MB); double minPer = config.getPercentage(PARAM_DISK_WARN_FRRE_PERCENT, DEFAULT_DISK_WARN_FRRE_PERCENT); paramDFWarn = PlatformUtil.DF.makeThreshold(minMB, minPer); minMB = config.getInt(PARAM_DISK_FULL_FRRE_MB, DEFAULT_DISK_FULL_FRRE_MB); minPer = config.getPercentage(PARAM_DISK_FULL_FRRE_PERCENT, DEFAULT_DISK_FULL_FRRE_PERCENT); paramDFFull = PlatformUtil.DF.makeThreshold(minMB, minPer); } if (changedKeys.contains(PARAM_SIZE_CALC_MAX_LOAD)) { sizeCalcMaxLoad = config.getPercentage(PARAM_SIZE_CALC_MAX_LOAD, DEFAULT_SIZE_CALC_MAX_LOAD); } if (changedKeys.contains(PREFIX)) { maxUnusedDirSearch = config.getInt(PARAM_MAX_UNUSED_DIR_SEARCH, DEFAULT_MAX_UNUSED_DIR_SEARCH); isStatefulUnusedDirSearch = config.getBoolean( PARAM_IS_STATEFUL_UNUSED_DIR_SEARCH, DEFAULT_IS_STATEFUL_UNUSED_DIR_SEARCH); enableLongComponents = config.getBoolean(PARAM_ENABLE_LONG_COMPONENTS, DEFAULT_ENABLE_LONG_COMPONENTS); enableLongComponentsCompatibility = config.getBoolean( PARAM_ENABLE_LONG_COMPONENTS_COMPATIBILITY, DEFAULT_ENABLE_LONG_COMPONENTS_COMPATIBILITY); maxComponentLength = config.getInt(PARAM_MAX_COMPONENT_LENGTH, DEFAULT_MAX_COMPONENT_LENGTH); checkUnnormalized = (CheckUnnormalizedMode) config.getEnum( CheckUnnormalizedMode.class, PARAM_CHECK_UNNORMALIZED, DEFAULT_CHECK_UNNORMALIZED); } }
/** Explode the archive into its constituent elements */ public void explode() throws CacheException { CachedUrl cachedUrl = null; int goodEntries = 0; int badEntries = 0; int ignoredEntries = 0; int entriesBetweenSleep = 0; ArchiveReader arcReader = null; logger.info( (storeArchive ? "Storing" : "Fetching") + " WARC file: " + origUrl + " will explode"); try { if (storeArchive) { UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl)); BitSet bs = new BitSet(); bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG); uc.setFetchFlags(bs); uc.storeContent(); archiveData.resetInputStream(); arcStream = archiveData.input; } // Wrap it in an ArchiveReader logger.debug3("About to wrap stream"); arcReader = wrapStream(fetchUrl, arcStream); logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null")); // Explode it if (arcReader == null) { throw new CacheException.ExploderException("no WarcReader for " + origUrl); } ArchivalUnit au = crawlFacade.getAu(); Set stemSet = new HashSet(); logger.debug("Exploding " + fetchUrl); // Iterate through the elements in the WARC file, except the first Iterator i = arcReader.iterator(); // Skip first record for (i.next(); i.hasNext(); ) { // XXX probably not necessary helper.pokeWDog(); if ((++entriesBetweenSleep % sleepAfter) == 0) { long pauseTime = CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE); Deadline pause = Deadline.in(pauseTime); logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime)); while (!pause.expired()) { try { pause.sleep(); } catch (InterruptedException ie) { // no action } } } ArchiveRecord element = (ArchiveRecord) i.next(); // Each element is a URL to be cached in a suitable AU ArchiveRecordHeader elementHeader = element.getHeader(); String elementUrl = elementHeader.getUrl(); String elementMimeType = elementHeader.getMimetype(); long elementLength = elementHeader.getLength(); logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType); if (elementUrl.startsWith("http:")) { ArchiveEntry ae = new ArchiveEntry( elementUrl, elementLength, 0, // XXX need to convert getDate string to long element, // ArchiveRecord extends InputStream this, fetchUrl); ae.setHeaderFields(makeCIProperties(elementHeader)); long bytesStored = elementLength; logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored); try { helper.process(ae); } catch (PluginException ex) { throw new CacheException.ExploderException("helper.process() threw", ex); } if (ae.getBaseUrl() != null) { if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) { storeEntry(ae); handleAddText(ae); goodEntries++; crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored); } else { ignoredEntries++; } } else { badEntries++; logger.debug2("Can't map " + elementUrl + " from " + archiveUrl); } } } } catch (IOException ex) { throw new CacheException.ExploderException(ex); } finally { if (arcReader != null) try { arcReader.close(); arcReader = null; } catch (IOException ex) { throw new CacheException.ExploderException(ex); } if (cachedUrl != null) { cachedUrl.release(); } IOUtil.safeClose(arcStream); } if (badEntries == 0 && goodEntries > 0) { // Make it look like a new crawl finished on each AU to which // URLs were added. for (Iterator it = touchedAus.iterator(); it.hasNext(); ) { ArchivalUnit au = (ArchivalUnit) it.next(); logger.debug3(archiveUrl + " touching " + au.toString()); AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished(); } } else { ArchivalUnit au = crawlFacade.getAu(); String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries"; throw new CacheException.UnretryableException(msg); } }