/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  int goodEntries = 0;
  int badEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;

  logger.debug((storeArchive ? "Storing" : "Fetching") +
               " WARC file: " + origUrl + " will explode");
  try {
    // Wrap the input stream in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " +
                  (arcReader == null ? "null" : "non-null"));
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the records in the WARC file, skipping the first
    // (the warcinfo record, which describes the archive itself)
    Iterator<ArchiveRecord> iter = arcReader.iterator();
    if (iter.hasNext()) {
      iter.next();
    }
    while (iter.hasNext()) {
      helper.pokeWDog();
      // Pause between entries if necessary
      handlePause(++entriesBetweenSleep);
      // Each record is a URL to be cached in our AU
      ArchiveRecord element = iter.next();
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      long elementDate;
      try {
        elementDate = ArchiveUtils.parse14DigitDate(elementHeader.getDate()).getTime();
      } catch (ParseException e) {
        elementDate = 0;
      }
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      // Only process http: URLs that the AU says should be cached
      if (au.shouldBeCached(elementUrl) && elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
          new ArchiveEntry(elementUrl,
                           elementLength,
                           elementDate,
                           element, // ArchiveRecord extends InputStream
                           this,
                           fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null &&
            ae.getRestOfUrl() != null &&
            ae.getHeaderFields() != null) {
          storeEntry(ae);
          handleAddText(ae);
          goodEntries++;
          // XXX this needs to use the correct crawl depth - how?
          CrawlUrlData cud = new CrawlUrlData(elementUrl, 0);
          crawlFacade.addToParseQueue(cud);
          crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
        } else {
          // helper.process() could not map this record to an AU URL
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    IOUtil.safeClose(arcStream);
  }
  // Report bad entries; any at all make the whole fetch fail unretryably
  if (badEntries != 0) {
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}
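
// A minimal sketch of how wrapStream() might be implemented, assuming the
// webarchive-commons (Heritrix) WARCReaderFactory is used to produce the
// ArchiveReader consumed by explode() above. Illustrative only, not
// necessarily the class's actual implementation; it would need
// import org.archive.io.warc.WARCReaderFactory;
protected ArchiveReader wrapStream(String url, InputStream arcStream)
    throws IOException {
  // WARCReaderFactory.get() detects whether the stream is a plain or
  // gzipped WARC; "true" asks for a reader positioned at the first record.
  return WARCReaderFactory.get(url, arcStream, true);
}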