Example #1
 void setConfig(Configuration config) {
   log.debug("config: " + config);
   proxyHost = config.get(PARAM_PROXY_HOST);
   proxyPort = config.getInt(PARAM_PROXY_PORT, DEFAULT_PROXY_PORT);
   if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
     String http_proxy = System.getenv("http_proxy");
     if (!StringUtil.isNullString(http_proxy)) {
       try {
         HostPortParser hpp = new HostPortParser(http_proxy);
         proxyHost = hpp.getHost();
         proxyPort = hpp.getPort();
       } catch (HostPortParser.InvalidSpec e) {
         log.warning("Can't parse http_proxy environment var, ignoring: " + http_proxy + ": " + e);
       }
     }
   }
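    // Re-check: the env-var fallback may not have yielded a usable proxy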
   if (StringUtil.isNullString(proxyHost) || proxyPort <= 0) {
     proxyHost = null;
   } else {
     log.info("Proxying through " + proxyHost + ":" + proxyPort);
   }
   userAgent = config.get(PARAM_USER_AGENT);
   if (StringUtil.isNullString(userAgent)) {
     userAgent = null;
   } else {
     log.debug("Setting User-Agent to " + userAgent);
   }
 }
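
A hedged illustration of the http_proxy fallback above: HostPortParser's exact grammar isn't shown, but a URL-shaped value such as "http://proxy.example.com:3128" (made up here) can be split the same way with java.net.URI from the JDK.

 import java.net.URI;

 // Sketch only: parse an http_proxy value of the URL form. The real code
 // delegates to HostPortParser; the value below is hypothetical.
 class ProxyEnvSketch {
   public static void main(String[] args) throws Exception {
     String httpProxy = "http://proxy.example.com:3128";
     URI uri = new URI(httpProxy);
     String host = uri.getHost(); // "proxy.example.com"
     int port = uri.getPort();    // 3128, or -1 if no port was given
     System.out.println(host + ":" + port);
   }
 }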
Example #2
 void checkParamAgreement(String key, PrintfContext context) {
   List<String> printfList = getElementList(key);
   if (printfList == null) {
     return;
   }
   for (String printf : printfList) {
     if (StringUtil.isNullString(printf)) {
       log.warning("Null printf string in " + key);
       continue;
     }
     PrintfUtil.PrintfData p_data = PrintfUtil.stringToPrintf(printf);
     Collection<String> p_args = p_data.getArguments();
     for (String arg : p_args) {
       ConfigParamDescr descr = findAuConfigDescr(arg);
       if (descr == null) {
         throw new PluginException.InvalidDefinition(
             "Not a declared parameter: " + arg + " in " + printf + " in " + getPluginName());
       }
       // ensure range and set params used only in legal context
       switch (context) {
         case Regexp:
         case Display:
           // everything is legal in a regexp or a display string
           break;
         case URL:
            // NUM_RANGE and SET are legal because they can be enumerated;
            // a RANGE can't be enumerated
           switch (descr.getType()) {
             case ConfigParamDescr.TYPE_RANGE:
               throw new PluginException.InvalidDefinition(
                   "Range parameter ("
                       + arg
                       + ") used in illegal context in "
                       + getPluginName()
                       + ": "
                       + key
                       + ": "
                       + printf);
             default:
           }
       }
     }
   }
 }
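
A self-contained sketch of the context rule enforced above, under the assumption that only RANGE-typed parameters are rejected in URL templates. The enum and the type constant are stand-ins for the real LOCKSS types, not the actual values.

 // Stand-in for the real PrintfContext parameter type
 enum PrintfContext { Regexp, Display, URL }

 class ContextRuleSketch {
   static final int TYPE_RANGE = 7; // hypothetical constant

   // Range params can't be expanded in URL templates; everything is
   // legal in a regexp or a display string, as in the example above.
   static void checkLegal(int paramType, PrintfContext context) {
     if (context == PrintfContext.URL && paramType == TYPE_RANGE) {
       throw new IllegalArgumentException(
           "Range parameter used in illegal context: " + context);
     }
   }

   public static void main(String[] args) {
     checkLegal(TYPE_RANGE, PrintfContext.Display); // ok
     try {
       checkLegal(TYPE_RANGE, PrintfContext.URL);   // rejected
     } catch (IllegalArgumentException e) {
       System.out.println(e.getMessage());
     }
   }
 }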
Example #3
 /** If in testing mode FOO, copy values from the FOO_override map, if any, into the main map. */
 void processOverrides(TypedEntryMap map) {
   String testMode = getTestingMode();
   if (StringUtil.isNullString(testMode)) {
     return;
   }
   Object o = map.getMapElement(testMode + DefinableArchivalUnit.SUFFIX_OVERRIDE);
   if (o == null) {
     return;
   }
   if (o instanceof Map) {
     Map overrideMap = (Map) o;
     for (Map.Entry entry : (Set<Map.Entry>) overrideMap.entrySet()) {
       String key = (String) entry.getKey();
       Object val = entry.getValue();
       log.debug(getDefaultPluginName() + ": Overriding " + key + " with " + val);
       map.setMapElement(key, val);
     }
   }
 }
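
The override scheme can be shown with plain JDK maps. This sketch assumes SUFFIX_OVERRIDE is the literal "_override" (an assumption based on the Javadoc) and uses made-up key names.

 import java.util.HashMap;
 import java.util.Map;

 // When test mode "screen" is active, entries under "screen_override"
 // replace the same-named top-level entries, as in the method above.
 class OverrideSketch {
   public static void main(String[] args) {
     Map<String, Object> map = new HashMap<>();
     map.put("au_crawl_depth", 2);
     Map<String, Object> screenOverride = new HashMap<>();
     screenOverride.put("au_crawl_depth", 5);
     map.put("screen_override", screenOverride);

     String testMode = "screen"; // would come from getTestingMode()
     Object o = map.get(testMode + "_override");
     if (o instanceof Map) {
       for (Map.Entry<?, ?> e : ((Map<?, ?>) o).entrySet()) {
         map.put((String) e.getKey(), e.getValue());
       }
     }
     System.out.println(map.get("au_crawl_depth")); // prints 5
   }
 }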
Example #4
  protected void initResultMap() throws PluginException.InvalidDefinition {
    HttpResultMap hResultMap = new HttpResultMap();
    // XXX Currently this only allows a CacheResultHandler class to
    // initialize the result map.  Instead, don't use a CacheResultMap
    // directly, use either the plugin's CacheResultHandler, if specified,
    // or a default one that wraps the CacheResultMap

     String handler_class = definitionMap.getString(KEY_EXCEPTION_HANDLER, null);
    if (handler_class != null) {
      try {
        resultHandler = (CacheResultHandler) newAuxClass(handler_class, CacheResultHandler.class);
        resultHandler.init(hResultMap);
      } catch (Exception ex) {
        throw new PluginException.InvalidDefinition(
            mapName + " has invalid Exception handler: " + handler_class, ex);
      } catch (LinkageError le) {
        throw new PluginException.InvalidDefinition(
            mapName + " has invalid Exception handler: " + handler_class, le);
      }
    } else {
      // Expect a list of mappings from either result code or exception
      // name to CacheException name
      Collection<String> mappings = definitionMap.getCollection(KEY_EXCEPTION_LIST, null);
      if (mappings != null) {
        // add each entry
        for (String entry : mappings) {
          if (log.isDebug2()) {
            log.debug2("initMap(" + entry + ")");
          }
          String first;
          String ceName;
          try {
            List<String> pair = StringUtil.breakAt(entry, '=', 2, true, true);
            first = pair.get(0);
            ceName = pair.get(1);
          } catch (Exception ex) {
            throw new PluginException.InvalidDefinition(
                "Invalid syntax: " + entry + "in " + mapName);
          }
          Object val;

          // Value should be either a CacheException or CacheResultHandler
          // class name.
          PluginFetchEventResponse resp =
              (PluginFetchEventResponse) newAuxClass(ceName, PluginFetchEventResponse.class, null);
          if (resp instanceof CacheException) {
            val = resp.getClass();
          } else if (resp instanceof CacheResultHandler) {
            val = WrapperUtil.wrap((CacheResultHandler) resp, CacheResultHandler.class);
          } else {
            throw new PluginException.InvalidDefinition(
                "Second arg not a CacheException or CacheResultHandler class: "
                    + entry + ", in " + mapName);
          }
          try {
            int code = Integer.parseInt(first);
            // If parseable as an integer, it's a result code.
            hResultMap.storeMapEntry(code, val);
          } catch (NumberFormatException e) {
            try {
              Class eClass = Class.forName(first);
              // If a class name, it should be an exception class
              if (Exception.class.isAssignableFrom(eClass)) {
                hResultMap.storeMapEntry(eClass, val);
              } else {
                throw new PluginException.InvalidDefinition(
                    "First arg not an Exception class: " + entry + ", in " + mapName);
              }
            } catch (PluginException.InvalidDefinition pe) {
              // Rethrow so the catch-all below can't mask the specific error
              throw pe;
            } catch (Exception ex) {
              throw new PluginException.InvalidDefinition(
                  "First arg not a number or class: " + entry + ", in " + mapName);
            } catch (LinkageError le) {
              throw new PluginException.InvalidDefinition("Can't load " + first, le);
            }
          }
        }
      }
    }
    resultMap = hResultMap;
  }
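
The left-hand side of each "lhs=rhs" mapping is tried first as an integer result code, then as an exception class name. A runnable sketch of just that dispatch, printing instead of storing into an HttpResultMap, using only JDK classes:

 class ResultKeySketch {
   static void classify(String first) {
     try {
       int code = Integer.parseInt(first);
       System.out.println("result code: " + code);
     } catch (NumberFormatException e) {
       try {
         Class<?> cls = Class.forName(first);
         if (Exception.class.isAssignableFrom(cls)) {
           System.out.println("exception class: " + cls.getName());
         } else {
           System.out.println("not an Exception class: " + first);
         }
       } catch (ClassNotFoundException cnfe) {
         System.out.println("neither number nor class: " + first);
       }
     }
   }

   public static void main(String[] args) {
     classify("404");                 // result code
     classify("java.io.IOException"); // exception class
     classify("java.lang.String");    // not an Exception class
   }
 }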
Example #5
 protected void initMimeMap() throws PluginException.InvalidDefinition {
   for (Iterator iter = definitionMap.entrySet().iterator(); iter.hasNext(); ) {
     Map.Entry ent = (Map.Entry) iter.next();
     String key = (String) ent.getKey();
     Object val = ent.getValue();
     if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY)) {
       String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_EXTRACTOR_FACTORY);
       if (val instanceof String) {
         String factName = (String) val;
         log.debug(mime + " link extractor: " + factName);
         MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
         LinkExtractorFactory fact =
             (LinkExtractorFactory) newAuxClass(factName, LinkExtractorFactory.class);
         mti.setLinkExtractorFactory(fact);
       }
     } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY)) {
       // XXX This clause must precede the one for SUFFIX_HASH_FILTER_FACTORY
       // XXX unless/until that key is changed to not be a terminal substring
       // XXX of this one
       String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_CRAWL_FILTER_FACTORY);
       if (val instanceof String) {
         String factName = (String) val;
         log.debug(mime + " crawl filter: " + factName);
         MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
         FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
         mti.setCrawlFilterFactory(fact);
       }
     } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY)) {
       String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY);
       if (val instanceof String) {
         String factName = (String) val;
         log.debug(mime + " filter: " + factName);
         MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
         FilterFactory fact = (FilterFactory) newAuxClass(factName, FilterFactory.class);
         mti.setHashFilterFactory(fact);
       }
     } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT)) {
       String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_FETCH_RATE_LIMIT);
       if (val instanceof String) {
         String rate = (String) val;
         log.debug(mime + " fetch rate: " + rate);
         MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
         RateLimiter limit = mti.getFetchRateLimiter();
         if (limit != null) {
           limit.setRate(rate);
         } else {
           mti.setFetchRateLimiter(new RateLimiter(rate));
         }
       }
      } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY)) {
        String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_LINK_REWRITER_FACTORY);
        // Guard against non-String values, as the sibling branches do
        if (val instanceof String) {
          String factName = (String) val;
          log.debug(mime + " link rewriter: " + factName);
          MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
          LinkRewriterFactory fact =
              (LinkRewriterFactory) newAuxClass(factName, LinkRewriterFactory.class);
          mti.setLinkRewriterFactory(fact);
        }
     } else if (key.endsWith(DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP)) {
       String mime = stripSuffix(key, DefinableArchivalUnit.SUFFIX_METADATA_EXTRACTOR_FACTORY_MAP);
       Map factNameMap = (Map) val;
       Map factClassMap = new HashMap();
       MimeTypeInfo.Mutable mti = mimeMap.modifyMimeTypeInfo(mime);
       for (Iterator it = factNameMap.keySet().iterator(); it.hasNext(); ) {
         String mdTypes = (String) it.next();
         String factName = (String) factNameMap.get(mdTypes);
         log.debug(mime + " (" + mdTypes + ") metadata extractor: " + factName);
         for (String mdType : (List<String>) StringUtil.breakAt(mdTypes, ";")) {
           setMdTypeFact(factClassMap, mdType, factName);
         }
       }
       mti.setFileMetadataExtractorFactoryMap(factClassMap);
     }
   }
 }
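
Each branch strips a key suffix to recover the MIME type, which is why check order matters when one suffix ends another (see the XXX comment on the crawl filter branch). A minimal sketch with a made-up key and a suffix string modeled on the names above, not the actual LOCKSS constants:

 class SuffixSketch {
   static String stripSuffix(String key, String suffix) {
     return key.substring(0, key.length() - suffix.length());
   }

   public static void main(String[] args) {
     String key = "text/html_link_extractor_factory"; // hypothetical key
     String suffix = "_link_extractor_factory";       // assumed suffix form
     if (key.endsWith(suffix)) {
       // -> "text/html"; the map value would name the factory class to load
       System.out.println("mime type: " + stripSuffix(key, suffix));
     }
   }
 }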
Example #6
 protected String getDefaultPluginName() {
   return StringUtil.shortName(getPluginId());
 }
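
StringUtil.shortName presumably drops the package prefix of the fully qualified plugin id. A sketch of that presumed behavior (an assumption based on the method's use, not the actual LOCKSS implementation):

 class ShortNameSketch {
   // Keep only the part after the last '.'; whole string if there is none
   static String shortName(String id) {
     return id.substring(id.lastIndexOf('.') + 1);
   }

   public static void main(String[] args) {
     // hypothetical plugin id
     System.out.println(shortName("org.lockss.plugin.example.ExamplePlugin"));
     // -> "ExamplePlugin"
   }
 }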
Example #7
  /** Explode the archive into its constituent elements */
  public void explode() throws CacheException {
    CachedUrl cachedUrl = null;
    int goodEntries = 0;
    int badEntries = 0;
    int ignoredEntries = 0;
    int entriesBetweenSleep = 0;
    ArchiveReader arcReader = null;

     logger.info(
         (storeArchive ? "Storing" : "Fetching") + " and exploding WARC file: " + origUrl);
    try {
      if (storeArchive) {
        UrlCacher uc = au.makeUrlCacher(new UrlData(arcStream, arcProps, fetchUrl));
        BitSet bs = new BitSet();
        bs.set(UrlCacher.DONT_CLOSE_INPUT_STREAM_FLAG);
        uc.setFetchFlags(bs);
        uc.storeContent();
        archiveData.resetInputStream();
        arcStream = archiveData.input;
      }
      // Wrap it in an ArchiveReader
      logger.debug3("About to wrap stream");
      arcReader = wrapStream(fetchUrl, arcStream);
      logger.debug3("wrapStream() returns " + (arcReader == null ? "null" : "non-null"));
      // Explode it
      if (arcReader == null) {
        throw new CacheException.ExploderException("no WarcReader for " + origUrl);
      }
      logger.debug("Exploding " + fetchUrl);
      // Iterate through the elements in the WARC file, except the first
      Iterator i = arcReader.iterator();
      // Skip first record
      for (i.next(); i.hasNext(); ) {
        // XXX probably not necessary
        helper.pokeWDog();
        if ((++entriesBetweenSleep % sleepAfter) == 0) {
          long pauseTime =
              CurrentConfig.getTimeIntervalParam(PARAM_RETRY_PAUSE, DEFAULT_RETRY_PAUSE);
          Deadline pause = Deadline.in(pauseTime);
          logger.debug3("Sleeping for " + StringUtil.timeIntervalToString(pauseTime));
          while (!pause.expired()) {
            try {
              pause.sleep();
            } catch (InterruptedException ie) {
              // no action
            }
          }
        }
        ArchiveRecord element = (ArchiveRecord) i.next();
        // Each element is a URL to be cached in a suitable AU
        ArchiveRecordHeader elementHeader = element.getHeader();
        String elementUrl = elementHeader.getUrl();
        String elementMimeType = elementHeader.getMimetype();
        long elementLength = elementHeader.getLength();
        logger.debug2("WARC url " + elementUrl + " mime " + elementMimeType);
        if (elementUrl.startsWith("http:")) {
          ArchiveEntry ae =
              new ArchiveEntry(
                  elementUrl,
                  elementLength,
                  0, // XXX need to convert getDate string to long
                  element, // ArchiveRecord extends InputStream
                  this,
                  fetchUrl);
          ae.setHeaderFields(makeCIProperties(elementHeader));
          long bytesStored = elementLength;
          logger.debug3("ArchiveEntry: " + ae.getName() + " bytes " + bytesStored);
          try {
            helper.process(ae);
          } catch (PluginException ex) {
            throw new CacheException.ExploderException("helper.process() threw", ex);
          }
          if (ae.getBaseUrl() != null) {
            if (ae.getRestOfUrl() != null && ae.getHeaderFields() != null) {
              storeEntry(ae);
              handleAddText(ae);
              goodEntries++;
              crawlFacade.getCrawlerStatus().addContentBytesFetched(bytesStored);
            } else {
              ignoredEntries++;
            }
          } else {
            badEntries++;
            logger.debug2("Can't map " + elementUrl + " from " + archiveUrl);
          }
        }
      }
    } catch (IOException ex) {
      throw new CacheException.ExploderException(ex);
    } finally {
      if (arcReader != null) {
        try {
          arcReader.close();
          arcReader = null;
        } catch (IOException ex) {
          // Note: throwing from finally can mask an exception from the try block
          throw new CacheException.ExploderException(ex);
        }
      }
      if (cachedUrl != null) {
        cachedUrl.release();
      }
      IOUtil.safeClose(arcStream);
    }
    if (badEntries == 0 && goodEntries > 0) {
      // Make it look like a new crawl finished on each AU to which
      // URLs were added.
      for (Iterator it = touchedAus.iterator(); it.hasNext(); ) {
        ArchivalUnit au = (ArchivalUnit) it.next();
        logger.debug3(archiveUrl + " touching " + au.toString());
        AuUtil.getDaemon(au).getNodeManager(au).newContentCrawlFinished();
      }
     } else {
       String msg =
           archiveUrl + ": " + badEntries + "/" + (goodEntries + badEntries) + " bad entries";
       throw new CacheException.UnretryableException(msg);
    }
  }
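
The pause inside the loop sleeps on a Deadline until it expires, retrying through interrupts. The same pattern in plain Java, as a sketch:

 class PauseSketch {
   // Sleep until a fixed deadline, retrying if interrupted, as the
   // Deadline loop in explode() does
   static void pauseFor(long ms) {
     long deadline = System.currentTimeMillis() + ms;
     long left;
     while ((left = deadline - System.currentTimeMillis()) > 0) {
       try {
         Thread.sleep(left);
       } catch (InterruptedException ie) {
         // ignored, matching the original loop; production code should
         // usually restore the interrupt flag instead
       }
     }
   }

   public static void main(String[] args) {
     long t0 = System.currentTimeMillis();
     pauseFor(200);
     System.out.println("paused " + (System.currentTimeMillis() - t0) + " ms");
   }
 }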