/** Set proxy and user-agent values from the configuration, falling back
 * to the http_proxy environment variable if no proxy is configured. */
void setConfig(Configuration config) {
  log.debug("config: " + config);
  proxyHost = config.get(PARAM_PROXY_HOST);
  proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
  if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
    // Fall back to the http_proxy environment variable
    String http_proxy = System.getenv("http_proxy");
    if (!StringUtil.isNullString(http_proxy)) {
      try {
        HostPortParser hpp = new HostPortParser(http_proxy);
        proxyHost = hpp.getHost();
        proxyPort = hpp.getPort();
      } catch (HostPortParser.InvalidSpec e) {
        log.warning("Can't parse http_proxy environment var, ignoring: "
                    + http_proxy + ": " + e);
      }
    }
  }
  if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
    proxyHost = null;
  } else {
    log.info("Proxying through " + proxyHost + ":" + proxyPort);
  }
  userAgent = config.get(PARAM_USER_AGENT);
  if (StringUtil.isNullString(userAgent)) {
    userAgent = null;
  } else {
    log.debug("Setting User-Agent to " + userAgent);
  }
}
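// Illustrative example (hypothetical values): with no proxy configured and
// the environment containing
//   http_proxy=proxy.example.com:3128
// the fallback above would set proxyHost to "proxy.example.com" and
// proxyPort to 3128, assuming HostPortParser accepts the host:port form.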
void checkParamAgreement(String key, PrintfContext context) {
  List<String> printfList = getElementList(key);
  if (printfList == null) {
    return;
  }
  for (String printf : printfList) {
    if (StringUtil.isNullString(printf)) {
      log.warning("Null printf string in " + key);
      continue;
    }
    PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf);
    Collection<String> p_args = p_data.getArguments();
    for (String arg : p_args) {
      ConfigParamDescr descr = findAuConfigDescr(arg);
      if (descr == null) {
        throw new PluginException.InvalidDefinition(
            "Not a declared parameter: " + arg + " in " + printf + " in "
            + getPluginName());
      }
      // Ensure range and set params are used only in a legal context
      switch (context) {
      case Regexp:
      case Display:
        // Everything is legal in a regexp or a display string
        break;
      case URL:
        // NUM_RANGE and SET are legal because they can be enumerated;
        // RANGE can't be enumerated
        switch (descr.getType()) {
        case ConfigParamDescr.TYPE_RANGE:
          throw new PluginException.InvalidDefinition(
              "Range parameter (" + arg + ") used in illegal context in "
              + getPluginName() + ": " + key + ": " + printf);
        default:
        }
      }
    }
  }
}
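// Example (hypothetical plugin definition values): a printf spec such as
//   "%sissue/%d/index.html", base_url, volume
// is legal in any context, while one interpolating a RANGE parameter,
// say "%s%s", base_url, volume_range, would be rejected in the URL
// context above, since a RANGE can't be enumerated into concrete URLs.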
/** If in testing mode FOO, copy values from the FOO_override map, if any,
 * to the main map */
void processOverrides(TypedEntryMap map) {
  String testMode = getTestingMode();
  if (StringUtil.isNullString(testMode)) {
    return;
  }
  Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE);
  if (o == null) {
    return;
  }
  if (o instanceof Map) {
    Map overrideMap = (Map) o;
    for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) {
      String key = (String) entry.getKey();
      Object val = entry.getValue();
      log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val);
      map.setMapElement(key, val);
    }
  }
}
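// Sketch, echoing the javadoc above: with testMode "FOO" and a map element
// keyed "FOO" + SUFFIX_OVERRIDE whose value is a Map such as
//   { "au_start_url" -> <test start url> }
// the loop replaces au_start_url in the main map with the test value.
// (au_start_url is a hypothetical key; any override entries are copied
// verbatim.)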
protected void initResultMap() throws PluginException.InvalidDefinition {
  HttpResultMap hResultMap = new HttpResultMap();
  // XXX Currently this only allows a CacheResultHandler class to
  // initialize the result map. Instead, don't use a CacheResultMap
  // directly; use either the plugin's CacheResultHandler, if specified,
  // or a default one that wraps the CacheResultMap
  String handler_class = definitionMap.getString(KEY_EXCEPTION_HANDLER, null);
  if (handler_class != null) {
    try {
      resultHandler =
        (CacheResultHandler) newAuxClass(handler_class, CacheResultHandler.class);
      resultHandler.init(hResultMap);
    } catch (Exception ex) {
      throw new PluginException.InvalidDefinition(
          mapName + " has invalid Exception handler: " + handler_class, ex);
    } catch (LinkageError le) {
      throw new PluginException.InvalidDefinition(
          mapName + " has invalid Exception handler: " + handler_class, le);
    }
  } else {
    // Expect a list of mappings from either result code or exception
    // name to CacheException name
    Collection<String> mappings =
      definitionMap.getCollection(KEY_EXCEPTION_LIST, null);
    if (mappings != null) {
      // Add each entry
      for (String entry : mappings) {
        if (log.isDebug2()) {
          log.debug2("initMap(" + entry + ")");
        }
        String first;
        String ceName;
        try {
          List<String> pair = StringUtil.breakAt(entry, '=', 2, true, true);
          first = pair.get(0);
          ceName = pair.get(1);
        } catch (Exception ex) {
          throw new PluginException.InvalidDefinition(
              "Invalid syntax: " + entry + " in " + mapName);
        }
        Object val;
        // Value should be either a CacheException or CacheResultHandler
        // class name.
        PluginFetchEventResponse resp =
          (PluginFetchEventResponse) newAuxClass(ceName,
                                                 PluginFetchEventResponse.class,
                                                 null);
        if (resp instanceof CacheException) {
          val = resp.getClass();
        } else if (resp instanceof CacheResultHandler) {
          val = WrapperUtil.wrap((CacheResultHandler) resp,
                                 CacheResultHandler.class);
        } else {
          throw new PluginException.InvalidDefinition(
              "Second arg not a CacheException or CacheResultHandler class: "
              + entry + ", in " + mapName);
        }
        try {
          int code = Integer.parseInt(first);
          // If parseable as an integer, it's a result code.
          hResultMap.storeMapEntry(code, val);
        } catch (NumberFormatException e) {
          try {
            Class eClass = Class.forName(first);
            // If a class name, it should be an exception class
            if (Exception.class.isAssignableFrom(eClass)) {
              hResultMap.storeMapEntry(eClass, val);
            } else {
              throw new PluginException.InvalidDefinition(
                  "First arg not an Exception class: " + entry + ", in " + mapName);
            }
          } catch (Exception ex) {
            throw new PluginException.InvalidDefinition(
                "First arg not a number or class: " + entry + ", in " + mapName);
          } catch (LinkageError le) {
            throw new PluginException.InvalidDefinition("Can't load " + first, le);
          }
        }
      }
    }
  }
  resultMap = hResultMap;
}
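// Example (class names are illustrative, not required): each exception-list
// entry is a "first=second" string, e.g.
//   "404=org.lockss.util.urlconn.CacheException$NoRetryDeadLinkException"
//   "java.io.IOException=org.example.MyRetryHandler"
// where the left side is an HTTP result code or an exception class name and
// the right side names a CacheException or CacheResultHandler class.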
protected void initMimeMap() throws PluginException.InvalidDefinition {
  for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) {
    Map.Entry ent = (Map.Entry) iter.next();
    String key = (String) ent.getKey();
    Object val = ent.getValue();
    if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) {
      String mime =
        stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " link extractor: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        LinkExtractorFactory fact =
          (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class);
        mti.setLinkExtractorFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) {
      // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY
      // XXX unless/until that key is changed to not be a terminal substring
      // XXX of this one
      String mime =
        stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " crawl filter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        FilterFactory fact =
          (FilterFactory) newAuxClass(factName, FilterFactory.class);
        mti.setCrawlFilterFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) {
      String mime =
        stripSuffix(key, DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY);
      if (val instanceof String) {
        String factName = (String) val;
        log.debug(mime + " hash filter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        FilterFactory fact =
          (FilterFactory) newAuxClass(factName, FilterFactory.class);
        mti.setHashFilterFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) {
      String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT);
      if (val instanceof String) {
        String rate = (String) val;
        log.debug(mime + " fetch rate: " + rate);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        RateLimiter limit = mti.getFetchRateLimiter();
        if (limit != null) {
          limit.setRate(rate);
        } else {
          mti.setFetchRateLimiter(new RateLimiter(rate));
        }
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) {
      String mime =
        stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY);
      if (val instanceof String) { // guard added for consistency with the other clauses
        String factName = (String) val;
        log.debug(mime + " link rewriter: " + factName);
        MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
        LinkRewriterFactory fact =
          (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class);
        mti.setLinkRewriterFactory(fact);
      }
    } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) {
      String mime =
        stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP);
      Map factNameMap = (Map) val;
      Map factClassMap = new HashMap();
      MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
      for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) {
        String mdTypes = (String) it.next();
        String factName = (String) factNameMap.get(mdTypes);
        log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName);
        for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) {
          setMdTypeFact(factClassMap, mdType, factName);
        }
      }
      mti.setFileMetadataExtractorFactoryMap(factClassMap);
    }
  }
}
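// Sketch (key spelling assumes the SUFFIX_* constants used above): a plugin
// definition entry such as
//   "text/html" + SUFFIX_LINK_EXTRACTOR_FACTORY -> "org.example.MyLinkExtractorFactory"
// causes the loop to instantiate the (hypothetical) factory class and attach
// it to the text/html MimeTypeInfo; the other suffixes work the same way.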
protected String getDefaultPluginName() {
  return StringUtil.shortName(getPluginId());
}
/** Explode the archive into its constituent elements */
public void explode() throws CacheException {
  CachedUrl cachedUrl = null;
  int goodEntries = 0;
  int badEntries = 0;
  int ignoredEntries = 0;
  int entriesBetweenSleep = 0;
  ArchiveReader arcReader = null;
  logger.info((storeArchive ? "Storing" : "Fetching") + " WARC file: "
              + origUrl + " will explode");
  try {
    if (storeArchive) {
      UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
      BitSet bs = new BitSet();
      bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
      uc.setFetchFlags(bs);
      uc.storeContent();
      archiveData.resetInputStream();
      arcStream = archiveData.input;
    }
    // Wrap it in an ArchiveReader
    logger.debug3("About to wrap stream");
    arcReader = wrapStream(fetchUrl, arcStream);
    logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
    // Explode it
    if (arcReader == null) {
      throw new CacheException.ExploderException("no WarcReader for " + origUrl);
    }
    ArchivalUnit au = crawlFacade.getAu();
    Set stemSet = new HashSet();
    logger.debug("Exploding " + fetchUrl);
    // Iterate through the elements in the WARC file, except the first
    Iterator i = arcReader.iterator();
    // Skip first record
    for (i.next(); i.hasNext(); ) {
      // XXX probably not necessary
      helper.pokeWDog();
      // Pause periodically so the exploder doesn't monopolize resources
      if ((++entriesBetweenSleep % sleepAfter) == 0) {
        long pauseTime =
          CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
        Deadline pause = Deadline.in(pauseTime);
        logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
        while (!pause.expired()) {
          try {
            pause.sleep();
          } catch (InterruptedException ie) {
            // no action
          }
        }
      }
      ArchiveRecord element = (ArchiveRecord) i.next();
      // Each element is a URL to be cached in a suitable AU
      ArchiveRecordHeader elementHeader = element.getHeader();
      String elementUrl = elementHeader.getUrl();
      String elementMimeType = elementHeader.getMimetype();
      long elementLength = elementHeader.getLength();
      logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
      if (elementUrl.startsWith("http:")) {
        ArchiveEntry ae =
          new ArchiveEntry(elementUrl,
                           elementLength,
                           0, // XXX need to convert getDate string to long
                           element, // ArchiveRecord extends InputStream
                           this,
                           fetchUrl);
        ae.setHeaderFields(makeCIProperties(elementHeader));
        long bytesStored = elementLength;
        logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
        try {
          helper.process(ae);
        } catch (PluginException ex) {
          throw new CacheException.ExploderException("helper.process() threw", ex);
        }
        if (ae.getBaseUrl() != null) {
          if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
            storeEntry(ae);
            handleAddText(ae);
            goodEntries++;
            crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
          } else {
            ignoredEntries++;
          }
        } else {
          badEntries++;
          logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
        }
      }
    }
  } catch (IOException ex) {
    throw new CacheException.ExploderException(ex);
  } finally {
    if (arcReader != null) {
      try {
        arcReader.close();
        arcReader = null;
      } catch (IOException ex) {
        throw new CacheException.ExploderException(ex);
      }
    }
    if (cachedUrl != null) {
      cachedUrl.release();
    }
    IOUtil.safeClose(arcStream);
  }
  if (badEntries == 0 && goodEntries > 0) {
    // Make it look like a new crawl finished on each AU to which
    // URLs were added.
    for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
      ArchivalUnit au = (ArchivalUnit) it.next();
      logger.debug3(archiveUrl + " touching " + au.toString());
      AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
    }
  } else {
    ArchivalUnit au = crawlFacade.getAu();
    String msg = archiveUrl + ": " + badEntries + "/" + goodEntries + " bad entries";
    throw new CacheException.UnretryableException(msg);
  }
}
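// Outcome sketch: if every record mapped cleanly (badEntries == 0 and
// goodEntries > 0), each AU in touchedAus is notified as though a new
// content crawl had finished; otherwise explode() fails with an
// UnretryableException reporting the archive URL and bad/good counts.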