private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
  Set new_incls = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
  Set new_excls = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));
  if (!m_inclset.isEmpty()) {
    outputMessage("\nIncluded Urls: (" + new_incls.size() + " new, "
                  + (m_inclset.size() - new_incls.size()) + " old)",
                  URL_SUMMARY_MESSAGE);
    depth_incl[m_curDepth - 1] += new_incls.size();
  }
  for (Iterator it = new_incls.iterator(); it.hasNext(); ) {
    outputMessage(it.next().toString(), PLAIN_MESSAGE);
  }
  if (!m_exclset.isEmpty()) {
    outputMessage("\nExcluded Urls: (" + new_excls.size() + " new, "
                  + (m_exclset.size() - new_excls.size()) + " old)",
                  URL_SUMMARY_MESSAGE);
  }
  for (Iterator it = new_excls.iterator(); it.hasNext(); ) {
    outputMessage(it.next().toString(), PLAIN_MESSAGE);
  }
  m_reported.addAll(new_incls);
  m_reported.addAll(new_excls);
  if (m_outWriter != null) {
    try {
      m_outWriter.flush();
    } catch (IOException ex) {
      // Ignore flush failures; this output is best-effort.
    }
  }
  return new_incls;
}
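// Illustrative sketch (not from the original source): how the "new vs. old"
// accounting above works. CollectionUtils.subtract(a, b) yields the elements
// of a that are not in b, so anything already in m_reported counts as "old".
// The URL values below are hypothetical.
//
//   Set reported = new TreeSet(Arrays.asList("http://x/a", "http://x/b"));
//   Set inclset  = new TreeSet(Arrays.asList("http://x/b", "http://x/c"));
//   Set newIncls = new TreeSet(CollectionUtils.subtract(inclset, reported));
//   // newIncls contains only "http://x/c", so the header line would read
//   // "Included Urls: (1 new, 1 old)".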
private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
  int fetchCount = fetched.size();
  outputMessage("\n\nSummary for starting Url: " + baseUrl
                + " and depth: " + m_crawlDepth,
                TEST_SUMMARY_MESSAGE);
  outputMessage("\nUrls fetched: " + fetchCount
                + " Urls extracted: " + m_extracted.size(),
                PLAIN_MESSAGE);
  outputMessage("\nDepth Fetched Parsed New URLs", PLAIN_MESSAGE);
  for (int depth = 1; depth <= m_crawlDepth; depth++) {
    PrintfFormat pf = new PrintfFormat("%5d %7d %6d %8d");
    Integer[] args = new Integer[] {
      new Integer(depth),
      new Integer(depth_fetched[depth - 1]),
      new Integer(depth_parsed[depth - 1]),
      new Integer(depth_incl[depth - 1]),
    };
    String s = pf.sprintf(args);
    outputMessage(s, PLAIN_MESSAGE);
  }
  outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);
  if (false) {
    // Disabled: dump of the remaining unfetched URLs.
    for (Iterator iter = toCrawl.iterator(); iter.hasNext(); ) {
      String url = (String) iter.next();
      outputMessage(url, PLAIN_MESSAGE);
    }
  }
  long secs = elapsedTime / Constants.SECOND;
  long fetchRate = 0;
  if (secs > 0) {
    // elapsedTime is in milliseconds; this yields pages per minute.
    fetchRate = fetchCount * 60 * Constants.SECOND / elapsedTime;
  }
  outputMessage("\nElapsed Time: " + secs + " secs."
                + " Fetch Rate: " + fetchRate + " p/m",
                PLAIN_MESSAGE);
}
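// Sketch of the fetch-rate arithmetic above, assuming Constants.SECOND is
// 1000 (milliseconds per second) and elapsedTime is in milliseconds:
//
//   fetchCount = 120, elapsedTime = 90000 (90 secs)
//   fetchRate  = 120 * 60 * 1000 / 90000 = 80 pages per minute ("p/m")
//
// The secs > 0 guard leaves the rate at 0 for sub-second runs and also
// ensures elapsedTime is non-zero before the division.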
protected CIProperties makeCIProperties(ArchiveRecordHeader elementHeader)
    throws IOException {
  CIProperties ret = new CIProperties();
  Set elementHeaderFieldKeys = elementHeader.getHeaderFieldKeys();
  for (Iterator i = elementHeaderFieldKeys.iterator(); i.hasNext(); ) {
    String key = (String) i.next();
    try {
      Object valueObject = elementHeader.getHeaderValue(key);
      if (valueObject == null) {
        logger.warning("Ignoring null value for key '" + key + "'.");
      } else {
        String value = valueObject.toString();
        logger.debug3(key + ": " + value);
        ret.put(key, value);
      }
    } catch (ClassCastException ex) {
      logger.error("makeCIProperties: " + key + " threw ", ex);
      throw new CacheException.ExploderException(ex);
    }
  }
  return ret;
}
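// Hypothetical usage sketch (header names are illustrative): each WARC record
// header field is copied into the CIProperties via toString(), null values
// are skipped with a warning, and a non-String key surfaces as a
// ClassCastException, rethrown as an ExploderException.
//
//   ArchiveRecordHeader hdr = record.getHeader();
//   CIProperties props = makeCIProperties(hdr);
//   // e.g. a header field "WARC-Type" -> "response" becomes
//   // props.getProperty("WARC-Type") == "response"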
protected void initMimeMap() throws PluginException.InvalidDefinition {
  for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) {
    Map.Entry ent = (Map.Entry) iter.next();
    String key = (String) ent.getKey();
    Object val = ent.getValue();
    if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) {
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " link extractor: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        LinkExtractorFactory fact =
            (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class);
        mti.setLinkExtractorFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) {
      // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY
      // XXX unless/until that key is changed to not be a terminal substring
      // XXX of this one
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " crawl filter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
        mti.setCrawlFilterFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) {
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " filter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
        mti.setHashFilterFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) {
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT);
      if (val instanceof String) {
        String rate = (String) val;
        log.debug(mime + " fetch rate: " + rate);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        RateLimiter limit = mti.getFetchRateLimiter();
        if (limit != null) {
          limit.setRate(rate);
        } else {
          mti.setFetchRateLimiter(new RateLimiter(rate));
        }
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) {
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY);
      String factName = (String) val;
      log.debug(mime + " link rewriter: " + factName);
      MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
      LinkRewriterFactory fact =
          (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class);
      mti.setLinkRewriterFactory(fact);
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) {
      String mime =
          stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP);
      Map factNameMap = (Map) val;
      Map factClassMap = new HashMap();
      MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
      for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) {
        String mdTypes = (String) it.next();
        String factName = (String) factNameMap.get(mdTypes);
        log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName);
        for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) {
          setMdTypeFact(factClassMap, mdType, factName);
        }
      }
      mti.setFileMetadataExtractorFactoryMap(factClassMap);
    }
  }
}
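// Sketch of the key convention initMimeMap() relies on (the example key is
// hypothetical): definition-map keys are a MIME type followed by a role
// suffix, and stripSuffix(key, suffix) is assumed to return the key minus
// that trailing suffix.
//
//   "text/html_link_extractor_factory"
//       -> mime "text/html", factory class name taken from the map value
//
// The XXX ordering caveat matters because the hash-filter suffix is a
// terminal substring of the crawl-filter suffix, so an endsWith() test on
// the hash-filter suffix would also match crawl-filter keys if run first.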
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;
  logger.info((storeArchive ? "Storing" : "Fetching")
              + " WARC file: " + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
            CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae = new ArchiveEntry(
            elementUrl,
            elementLength,
            0, // XXX need to convert getDate string to long
            element, // ArchiveRecord extends InputStream
            this,
            fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }
  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which
    // URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    ArchivalUnit au = crawlFacade.getAu();
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}
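// Outcome summary for explode(), as written above (not a behavior change):
//   badEntries == 0 && goodEntries > 0  -> each AU in touchedAus is notified
//                                          as if a new content crawl finished
//   any bad entries, or nothing stored  -> CacheException.UnretryableException
// The failure message formats as "<badEntries>/<goodEntries> bad entries",
// e.g. "3/97 bad entries" for 3 bad records out of 97 good ones.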