Example #1
0
/**
 * Distribute application-specific large, read-only files efficiently.
 *
 * <p><code>DistributedCache</code> is a facility provided by the Map-Reduce framework to cache
 * files (text, archives, jars etc.) needed by applications.
 *
 * <p>Applications specify the files, via urls (hdfs:// or http://) to be cached via the {@link
 * org.apache.hadoop.mapred.JobConf}. The <code>DistributedCache</code> assumes that the files
 * specified via hdfs:// urls are already present on the {@link FileSystem} at the path specified by
 * the url.
 *
 * <p>The framework will copy the necessary files on to the slave node before any tasks for the job
 * are executed on that node. Its efficiency stems from the fact that the files are only copied once
 * per job and the ability to cache archives which are un-archived on the slaves.
 *
 * <p><code>DistributedCache</code> can be used to distribute simple, read-only data/text files
 * and/or more complex types such as archives, jars etc. Archives (zip, tar and tgz/tar.gz files)
 * are un-archived at the slave nodes. Jars may be optionally added to the classpath of the tasks, a
 * rudimentary software distribution mechanism. Files have execution permissions. Optionally users
 * can also direct it to symlink the distributed cache file(s) into the working directory of the
 * task.
 *
 * <p><code>DistributedCache</code> tracks modification timestamps of the cache files. Clearly the
 * cache files should not be modified by the application or externally while the job is executing.
 *
 * <p>Here is an illustrative example on how to use the <code>DistributedCache</code>:
 *
 * <p>
 *
 * <blockquote>
 *
 * <pre>
 *     // Setting up the cache for the application
 *
 *     1. Copy the requisite files to the <code>FileSystem</code>:
 *
 *     $ bin/hadoop fs -copyFromLocal lookup.dat /myapp/lookup.dat
 *     $ bin/hadoop fs -copyFromLocal map.zip /myapp/map.zip
 *     $ bin/hadoop fs -copyFromLocal mylib.jar /myapp/mylib.jar
 *     $ bin/hadoop fs -copyFromLocal mytar.tar /myapp/mytar.tar
 *     $ bin/hadoop fs -copyFromLocal mytgz.tgz /myapp/mytgz.tgz
 *     $ bin/hadoop fs -copyFromLocal mytargz.tar.gz /myapp/mytargz.tar.gz
 *
 *     2. Setup the application's <code>JobConf</code>:
 *
 *     JobConf job = new JobConf();
 *     DistributedCache.addCacheFile(new URI("/myapp/lookup.dat#lookup.dat"),
 *                                   job);
 *     DistributedCache.addCacheArchive(new URI("/myapp/map.zip", job);
 *     DistributedCache.addFileToClassPath(new Path("/myapp/mylib.jar"), job);
 *     DistributedCache.addCacheArchive(new URI("/myapp/mytar.tar", job);
 *     DistributedCache.addCacheArchive(new URI("/myapp/mytgz.tgz", job);
 *     DistributedCache.addCacheArchive(new URI("/myapp/mytargz.tar.gz", job);
 *
 *     3. Use the cached files in the {@link org.apache.hadoop.mapred.Mapper}
 *     or {@link org.apache.hadoop.mapred.Reducer}:
 *
 *     public static class MapClass extends MapReduceBase
 *     implements Mapper&lt;K, V, K, V&gt; {
 *
 *       private Path[] localArchives;
 *       private Path[] localFiles;
 *
 *       public void configure(JobConf job) {
 *         // Get the cached archives/files
 *         localArchives = DistributedCache.getLocalCacheArchives(job);
 *         localFiles = DistributedCache.getLocalCacheFiles(job);
 *       }
 *
 *       public void map(K key, V value,
 *                       OutputCollector&lt;K, V&gt; output, Reporter reporter)
 *       throws IOException {
 *         // Use data from the cached archives/files here
 *         // ...
 *         // ...
 *         output.collect(k, v);
 *       }
 *     }
 *
 * </pre>
 *
 * </blockquote>
 *
 * @see org.apache.hadoop.mapred.JobConf
 * @see org.apache.hadoop.mapred.JobClient
 */
public class DistributedCache {
  // cacheID to cacheStatus mapping
  private static TreeMap<String, CacheStatus> cachedArchives = new TreeMap<String, CacheStatus>();

  private static TreeMap<Path, Long> baseDirSize = new TreeMap<Path, Long>();
  private static TreeMap<Path, Integer> baseDirNumberSubDir = new TreeMap<Path, Integer>();

  // default total cache size
  private static final long DEFAULT_CACHE_SIZE = 10737418240L;
  private static final long DEFAULT_CACHE_SUBDIR_LIMIT = 10000;

  private static final Log LOG = LogFactory.getLog(DistributedCache.class);
  private static Random random = new Random();

  /**
   * Get the locally cached file or archive; it could either be previously cached (and valid) or
   * copy it from the {@link FileSystem} now.
   *
   * @param cache the cache to be localized, this should be specified as new
   *     URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is
   *     provided the file is assumed to be in the filesystem being used in the Configuration
   * @param conf The Confguration file which contains the filesystem
   * @param baseDir The base cache Dir where you wnat to localize the files/archives
   * @param fileStatus The file status on the dfs.
   * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or
   *     .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred
   *     automatically and the directory where the archive is unzipped/unjarred/untarred is returned
   *     as the Path. In case of a file, the path to the file is returned
   * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be
   *     cached hasn't changed since the job started
   * @param currentWorkDir this is the directory where you would want to create symlinks for the
   *     locally cached files/archives
   * @return the path to directory where the archives are unjarred in case of archives, the path to
   *     the file where the file is copied locally
   * @throws IOException
   */
  public static Path getLocalCache(
      URI cache,
      Configuration conf,
      Path baseDir,
      FileStatus fileStatus,
      boolean isArchive,
      long confFileStamp,
      Path currentWorkDir,
      MRAsyncDiskService asyncDiskService)
      throws IOException {
    return getLocalCache(
        cache,
        conf,
        baseDir,
        fileStatus,
        isArchive,
        confFileStamp,
        fileStatus.getLen(),
        currentWorkDir,
        true,
        asyncDiskService,
        new LocalDirAllocator("mapred.local.dir"));
  }

  public static Path getLocalCacheFromTimestamps(
      URI cache,
      Configuration conf,
      Path subDir,
      FileStatus fileStatus,
      boolean isArchive,
      long confFileStamp,
      long fileLength,
      Path currentWorkDir,
      boolean honorSymLinkConf,
      MRAsyncDiskService asyncDiskService,
      LocalDirAllocator lDirAllocator)
      throws IOException {
    return getLocalCache(
        cache,
        conf,
        subDir,
        fileStatus,
        isArchive,
        confFileStamp,
        fileLength,
        currentWorkDir,
        honorSymLinkConf,
        asyncDiskService,
        lDirAllocator);
  }

  public static Path getLocalCacheFromURI(
      URI cache,
      Configuration conf,
      Path subDir,
      boolean isArchive,
      long fileLength,
      Path currentWorkDir,
      boolean honorSymLinkConf,
      MRAsyncDiskService asyncDiskService,
      LocalDirAllocator lDirAllocator)
      throws IOException {
    return getLocalCache(
        cache,
        conf,
        subDir,
        null,
        isArchive,
        0,
        fileLength,
        currentWorkDir,
        honorSymLinkConf,
        asyncDiskService,
        lDirAllocator);
  }

  /** Added for back compatibility. */
  public static Path getLocalCache(
      URI cache,
      Configuration conf,
      Path subdir,
      FileStatus fileStatus,
      boolean isArchive,
      long confFileStamp,
      Path currentWorkDir,
      boolean honorSymLinkConf,
      MRAsyncDiskService asyncDiskService,
      LocalDirAllocator lDirAllocator)
      throws IOException {
    return getLocalCache(
        cache,
        conf,
        subdir,
        fileStatus,
        isArchive,
        confFileStamp,
        fileStatus.getLen(),
        currentWorkDir,
        honorSymLinkConf,
        asyncDiskService,
        lDirAllocator);
  }

  /**
   * Get the locally cached file or archive; it could either be previously cached (and valid) or
   * copy it from the {@link FileSystem} now.
   *
   * @param cache the cache to be localized, this should be specified as new
   *     URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is
   *     provided the file is assumed to be in the filesystem being used in the Configuration
   * @param conf The Confguration file which contains the filesystem
   * @param subDir The sub cache Dir where you want to localize the files/archives
   * @param fileStatus The file status on the dfs.
   * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or
   *     .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred
   *     automatically and the directory where the archive is unzipped/unjarred/untarred is returned
   *     as the Path. In case of a file, the path to the file is returned
   * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be
   *     cached hasn't changed since the job started
   * @param fileLength this is the length of the cache file
   * @param currentWorkDir this is the directory where you would want to create symlinks for the
   *     locally cached files/archives
   * @param honorSymLinkConf if this is false, then the symlinks are not created even if conf says
   *     so (this is required for an optimization in task launches
   * @param lDirAllocator LocalDirAllocator of the tracker
   * @return the path to directory where the archives are unjarred in case of archives, the path to
   *     the file where the file is copied locally
   * @throws IOException
   */
  private static Path getLocalCache(
      URI cache,
      Configuration conf,
      Path subDir,
      FileStatus fileStatus,
      boolean isArchive,
      long confFileStamp,
      long fileLength,
      Path currentWorkDir,
      boolean honorSymLinkConf,
      MRAsyncDiskService asyncDiskService,
      LocalDirAllocator lDirAllocator)
      throws IOException {
    String key = getKey(cache, conf, confFileStamp);

    CacheStatus lcacheStatus;
    Path localizedPath;
    synchronized (cachedArchives) {
      lcacheStatus = cachedArchives.get(key);
      if (lcacheStatus == null) {
        // was never localized
        Path uniqueParentDir = new Path(subDir, String.valueOf(random.nextLong()));
        String cachePath = new Path(uniqueParentDir, makeRelative(cache, conf)).toString();
        Path localPath = lDirAllocator.getLocalPathForWrite(cachePath, fileLength, conf);
        lcacheStatus =
            new CacheStatus(
                new Path(localPath.toString().replace(cachePath, "")), localPath, uniqueParentDir);
        cachedArchives.put(key, lcacheStatus);
      }
      lcacheStatus.refcount++;
    }
    boolean initSuccessful = false;
    try {
      synchronized (lcacheStatus) {
        if (!lcacheStatus.isInited()) {
          localizedPath = localizeCache(conf, cache, confFileStamp, lcacheStatus, isArchive);
          lcacheStatus.initComplete();
        } else {
          if (fileStatus != null) {
            localizedPath =
                checkCacheStatusValidity(
                    conf, cache, confFileStamp, lcacheStatus, fileStatus, isArchive);
          } else {
            // if fileStatus is null, then the md5 must be correct
            // so there is no need to check for cache validity
            localizedPath = lcacheStatus.localizedLoadPath;
          }
        }
        createSymlink(conf, cache, lcacheStatus, isArchive, currentWorkDir, honorSymLinkConf);
      }

      // try deleting stuff if you can
      long size = 0;
      int numberSubDir = 0;
      synchronized (lcacheStatus) {
        synchronized (baseDirSize) {
          Long get = baseDirSize.get(lcacheStatus.getBaseDir());
          if (get != null) {
            size = get.longValue();
          } else {
            LOG.warn("Cannot find size of baseDir: " + lcacheStatus.getBaseDir());
          }
        }
        synchronized (baseDirNumberSubDir) {
          Integer get = baseDirNumberSubDir.get(lcacheStatus.getBaseDir());
          if (get != null) {
            numberSubDir = get.intValue();
          } else {
            LOG.warn("Cannot find subdirectories limit of baseDir: " + lcacheStatus.getBaseDir());
          }
        }
      }
      // setting the cache size to a default of 10GB
      long allowedSize = conf.getLong("local.cache.size", DEFAULT_CACHE_SIZE);
      long allowedNumberSubDir =
          conf.getLong("local.cache.numbersubdir", DEFAULT_CACHE_SUBDIR_LIMIT);
      if (allowedSize < size || allowedNumberSubDir < numberSubDir) {
        // try some cache deletions
        LOG.debug(
            "Start deleting released cache because"
                + " [size, allowedSize, numberSubDir, allowedNumberSubDir] ="
                + " ["
                + size
                + ", "
                + allowedSize
                + ", "
                + numberSubDir
                + ", "
                + allowedNumberSubDir
                + "]");
        deleteCache(conf, asyncDiskService);
      }
      initSuccessful = true;
      return localizedPath;
    } finally {
      if (!initSuccessful) {
        synchronized (cachedArchives) {
          lcacheStatus.refcount--;
        }
      }
    }
  }

  /**
   * Get the locally cached file or archive; it could either be previously cached (and valid) or
   * copy it from the {@link FileSystem} now.
   *
   * @param cache the cache to be localized, this should be specified as new
   *     URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no schema or hostname:port is
   *     provided the file is assumed to be in the filesystem being used in the Configuration
   * @param conf The Confguration file which contains the filesystem
   * @param baseDir The base cache Dir where you wnat to localize the files/archives
   * @param isArchive if the cache is an archive or a file. In case it is an archive with a .zip or
   *     .jar or .tar or .tgz or .tar.gz extension it will be unzipped/unjarred/untarred
   *     automatically and the directory where the archive is unzipped/unjarred/untarred is returned
   *     as the Path. In case of a file, the path to the file is returned
   * @param confFileStamp this is the hdfs file modification timestamp to verify that the file to be
   *     cached hasn't changed since the job started
   * @param currentWorkDir this is the directory where you would want to create symlinks for the
   *     locally cached files/archives
   * @return the path to directory where the archives are unjarred in case of archives, the path to
   *     the file where the file is copied locally
   * @throws IOException
   */
  public static Path getLocalCache(
      URI cache,
      Configuration conf,
      Path baseDir,
      boolean isArchive,
      long confFileStamp,
      Path currentWorkDir,
      MRAsyncDiskService asyncDiskService)
      throws IOException {
    return getLocalCache(
        cache, conf, baseDir, null, isArchive, confFileStamp, currentWorkDir, asyncDiskService);
  }

  /**
   * This is the opposite of getlocalcache. When you are done with using the cache, you need to
   * release the cache
   *
   * @param cache The cache URI to be released
   * @param conf configuration which contains the filesystem the cache is contained in.
   * @throws IOException
   */
  public static void releaseCache(URI cache, Configuration conf, long timeStamp)
      throws IOException {
    String cacheId = getKey(cache, conf, timeStamp);

    synchronized (cachedArchives) {
      CacheStatus lcacheStatus = cachedArchives.get(cacheId);
      if (lcacheStatus == null) {
        LOG.warn(
            "Cannot find localized cache: " + cache + " (key: " + cacheId + ") in releaseCache!");
        return;
      }
      lcacheStatus.refcount--;
    }
  }

  /** Runnable which removes the cache directories from the disk */
  private static class CacheFileCleanTask implements Runnable {
    private MRAsyncDiskService asyncDiskService;
    private LocalFileSystem fs;
    private List<CacheStatus> toBeDeletedCache;

    public CacheFileCleanTask(
        MRAsyncDiskService asyncDiskService,
        LocalFileSystem fs,
        List<CacheStatus> toBeDeletedCache) {
      this.asyncDiskService = asyncDiskService;
      this.fs = fs;
      this.toBeDeletedCache = toBeDeletedCache;
    }

    @Override
    public void run() {
      for (CacheStatus lcacheStatus : toBeDeletedCache) {
        synchronized (lcacheStatus) {
          Path fullUniqueParentDir =
              new Path(lcacheStatus.localizedBaseDir, lcacheStatus.uniqueParentDir);
          try {
            LOG.info("Deleting local cached path: " + fullUniqueParentDir.toString());
            deleteLocalPath(asyncDiskService, fs, fullUniqueParentDir);
            // decrement the size of the cache from baseDirSize
            deleteCacheInfoUpdate(lcacheStatus);
            LOG.info("Removed cache " + lcacheStatus.localizedLoadPath);
          } catch (IOException e) {
            LOG.warn("Error when deleting " + fullUniqueParentDir, e);
          }
        }
      }
    }
  }
  // To delete the caches which have a refcount of zero

  private static void deleteCache(Configuration conf, MRAsyncDiskService asyncDiskService)
      throws IOException {
    List<CacheStatus> deleteSet = new LinkedList<CacheStatus>();
    // try deleting cache Status with refcount of zero
    synchronized (cachedArchives) {
      for (Iterator<String> it = cachedArchives.keySet().iterator(); it.hasNext(); ) {
        String cacheId = (String) it.next();
        CacheStatus lcacheStatus = cachedArchives.get(cacheId);
        if (lcacheStatus.refcount == 0) {
          // delete this cache entry from the global list
          // and mark the localized file for deletion
          deleteSet.add(lcacheStatus);
          it.remove();
        }
      }
    }
    // do the deletion asynchronously, after releasing the global lock
    Thread cacheFileCleaner =
        new Thread(new CacheFileCleanTask(asyncDiskService, FileSystem.getLocal(conf), deleteSet));
    cacheFileCleaner.start();
  }

  /**
   * Delete a local path with asyncDiskService if available, or otherwise synchronously with local
   * file system.
   */
  private static void deleteLocalPath(
      MRAsyncDiskService asyncDiskService, LocalFileSystem fs, Path path) throws IOException {
    boolean deleted = false;
    if (asyncDiskService != null) {
      // Try to delete using asyncDiskService
      String localPathToDelete = path.toUri().getPath();
      deleted = asyncDiskService.moveAndDeleteAbsolutePath(localPathToDelete);
      if (!deleted) {
        LOG.warn(
            "Cannot find DistributedCache path "
                + localPathToDelete
                + " on any of the asyncDiskService volumes!");
      }
    }
    if (!deleted) {
      // If no asyncDiskService, we will delete the files synchronously
      fs.delete(path, true);
    }
    LOG.info("Deleted path " + path);
  }

  /*
   * Returns the relative path of the dir this cache will be localized in
   * relative path that this cache will be localized in. For
   * hdfs://hostname:port/absolute_path -- the relative path is
   * hostname/absolute path -- if it is just /absolute_path -- then the
   * relative path is hostname of DFS this mapred cluster is running
   * on/absolute_path
   */
  public static String makeRelative(URI cache, Configuration conf) throws IOException {
    String host = cache.getHost();
    if (host == null) {
      host = cache.getScheme();
    }
    if (host == null) {
      URI defaultUri = FileSystem.get(conf).getUri();
      host = defaultUri.getHost();
      if (host == null) {
        host = defaultUri.getScheme();
      }
    }
    String path = host + cache.getPath();
    path = path.replace(":/", "/"); // remove windows device colon
    return path;
  }

  static String getKey(URI cache, Configuration conf, long timeStamp) throws IOException {
    return makeRelative(cache, conf) + String.valueOf(timeStamp);
  }

  private static Path checkCacheStatusValidity(
      Configuration conf,
      URI cache,
      long confFileStamp,
      CacheStatus cacheStatus,
      FileStatus fileStatus,
      boolean isArchive)
      throws IOException {
    FileSystem fs = FileSystem.get(cache, conf);
    // Has to be
    if (!ifExistsAndFresh(conf, fs, cache, confFileStamp, cacheStatus, fileStatus)) {
      throw new IOException(
          "Stale cache file: " + cacheStatus.localizedLoadPath + " for cache-file: " + cache);
    }
    LOG.info(
        String.format(
            "Using existing cache of %s->%s", cache.toString(), cacheStatus.localizedLoadPath));
    return cacheStatus.localizedLoadPath;
  }

  private static void createSymlink(
      Configuration conf,
      URI cache,
      CacheStatus cacheStatus,
      boolean isArchive,
      Path currentWorkDir,
      boolean honorSymLinkConf)
      throws IOException {
    boolean doSymlink = honorSymLinkConf && DistributedCache.getSymlink(conf);
    if (cache.getFragment() == null) {
      doSymlink = false;
    }
    String link = currentWorkDir.toString() + Path.SEPARATOR + cache.getFragment();
    File flink = new File(link);
    if (doSymlink) {
      if (!flink.exists()) {
        FileUtil.symLink(cacheStatus.localizedLoadPath.toString(), link);
      }
    }
  }

  // the method which actually copies the caches locally and unjars/unzips them
  // and does chmod for the files
  private static Path localizeCache(
      Configuration conf, URI cache, long confFileStamp, CacheStatus cacheStatus, boolean isArchive)
      throws IOException {
    FileSystem fs = getFileSystem(cache, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    Path parchive = null;

    if (isArchive) {
      parchive =
          new Path(
              cacheStatus.localizedLoadPath, new Path(cacheStatus.localizedLoadPath.getName()));
    } else {
      parchive = cacheStatus.localizedLoadPath;
    }
    if (!localFs.mkdirs(parchive.getParent())) {
      throw new IOException(
          "Mkdirs failed to create directory " + cacheStatus.localizedLoadPath.toString());
    }
    String cacheId = cache.getPath();

    fs.copyToLocalFile(new Path(cacheId), parchive);
    if (isArchive) {
      String tmpArchive = parchive.toString().toLowerCase();
      File srcFile = new File(parchive.toString());
      File destDir = new File(parchive.getParent().toString());
      if (tmpArchive.endsWith(".jar")) {
        RunJar.unJar(srcFile, destDir);
      } else if (tmpArchive.endsWith(".zip")) {
        FileUtil.unZip(srcFile, destDir);
      } else if (isTarFile(tmpArchive)) {
        FileUtil.unTar(srcFile, destDir);
      }
      // else will not do anyhting
      // and copy the file into the dir as it is
    }
    long cacheSize = FileUtil.getDU(new File(parchive.getParent().toString()));
    cacheStatus.size = cacheSize;
    addCacheInfoUpdate(cacheStatus);

    // do chmod here
    try {
      // Setting recursive permission to grant everyone read and execute
      Path localDir = new Path(cacheStatus.localizedBaseDir, cacheStatus.uniqueParentDir);
      LOG.info("Doing chmod on localdir :" + localDir);
      FileUtil.chmod(localDir.toString(), "ugo+rx", true);
    } catch (InterruptedException e) {
      LOG.warn("Exception in chmod" + e.toString());
    }

    // update cacheStatus to reflect the newly cached file
    cacheStatus.mtime = getTimestamp(conf, cache);
    return cacheStatus.localizedLoadPath;
  }

  private static boolean isTarFile(String filename) {
    return (filename.endsWith(".tgz") || filename.endsWith(".tar.gz") || filename.endsWith(".tar"));
  }

  // Checks if the cache has already been localized and is fresh
  private static boolean ifExistsAndFresh(
      Configuration conf,
      FileSystem fs,
      URI cache,
      long confFileStamp,
      CacheStatus lcacheStatus,
      FileStatus fileStatus)
      throws IOException {
    // check for existence of the cache
    long dfsFileStamp;
    if (fileStatus != null) {
      dfsFileStamp = fileStatus.getModificationTime();
    } else {
      dfsFileStamp = getTimestamp(conf, cache);
    }

    // ensure that the file on hdfs hasn't been modified since the job started
    if (dfsFileStamp != confFileStamp) {
      LOG.fatal("File: " + cache + " has changed on HDFS since job started");
      throw new IOException("File: " + cache + " has changed on HDFS since job started");
    }

    if (dfsFileStamp != lcacheStatus.mtime) {
      // needs refreshing
      return false;
    }

    return true;
  }

  /**
   * Returns mtime of a given cache file on hdfs.
   *
   * @param conf configuration
   * @param cache cache file
   * @return mtime of a given cache file on hdfs
   * @throws IOException
   */
  public static long getTimestamp(Configuration conf, URI cache) throws IOException {
    FileSystem fileSystem = FileSystem.get(cache, conf);
    Path filePath = new Path(cache.getPath());

    return fileSystem.getFileStatus(filePath).getModificationTime();
  }

  /**
   * Returns the status of a given cache file on hdfs.
   *
   * @param conf configuration
   * @param cache cache file
   * @return FileStatus object of the file
   * @throws IOException
   */
  public static FileStatus getFileStatus(Configuration conf, URI cache) throws IOException {
    FileSystem fileSystem = FileSystem.get(cache, conf);
    Path filePath = new Path(cache.getPath());

    return fileSystem.getFileStatus(filePath);
  }

  /**
   * This method create symlinks for all files in a given dir in another directory
   *
   * @param conf the configuration
   * @param jobCacheDir the target directory for creating symlinks
   * @param workDir the directory in which the symlinks are created
   * @throws IOException
   */
  public static void createAllSymlink(Configuration conf, File jobCacheDir, File workDir)
      throws IOException {
    if ((jobCacheDir == null || !jobCacheDir.isDirectory())
        || workDir == null
        || (!workDir.isDirectory())) {
      return;
    }
    boolean createSymlink = getSymlink(conf);
    if (createSymlink) {
      File[] list = jobCacheDir.listFiles();
      for (int i = 0; i < list.length; i++) {
        FileUtil.symLink(
            list[i].getAbsolutePath(), new File(workDir, list[i].getName()).toString());
      }
    }
  }

  private static String getFileSysName(URI url) {
    String fsname = url.getScheme();
    if ("hdfs".equals(fsname)) {
      String host = url.getHost();
      int port = url.getPort();
      return (port == (-1)) ? host : (host + ":" + port);
    } else {
      return null;
    }
  }

  private static FileSystem getFileSystem(URI cache, Configuration conf) throws IOException {
    String fileSysName = getFileSysName(cache);
    if (fileSysName != null) return FileSystem.getNamed(fileSysName, conf);
    else return FileSystem.get(conf);
  }

  /**
   * Set the configuration with the given set of archives
   *
   * @param archives The list of archives that need to be localized
   * @param conf Configuration which will be changed
   */
  public static void setCacheArchives(URI[] archives, Configuration conf) {
    String sarchives = StringUtils.uriToString(archives);
    conf.set("mapred.cache.archives", sarchives);
  }

  /**
   * Set the configuration with the given set of files
   *
   * @param files The list of files that need to be localized
   * @param conf Configuration which will be changed
   */
  public static void setCacheFiles(URI[] files, Configuration conf) {
    String sfiles = StringUtils.uriToString(files);
    conf.set("mapred.cache.files", sfiles);
  }

  /**
   * Get cache archives set in the Configuration
   *
   * @param conf The configuration which contains the archives
   * @return A URI array of the caches set in the Configuration
   * @throws IOException
   */
  public static URI[] getCacheArchives(Configuration conf) throws IOException {
    return StringUtils.stringToURI(conf.getStrings("mapred.cache.archives"));
  }

  /**
   * Get cache archives set in the Configuration
   *
   * @param conf The configuration which contains the archives
   * @return A URI array of the caches set in the Configuration
   * @throws IOException
   */
  public static URI[] getSharedCacheArchives(Configuration conf) throws IOException {
    return StringUtils.stringToURI(conf.getStrings("mapred.cache.shared.archives"));
  }

  /**
   * Get cache files set in the Configuration
   *
   * @param conf The configuration which contains the files
   * @return A URI array of the files set in the Configuration
   * @throws IOException
   */
  public static URI[] getCacheFiles(Configuration conf) throws IOException {
    return StringUtils.stringToURI(conf.getStrings("mapred.cache.files"));
  }

  /**
   * Get cache files set in the Configuration
   *
   * @param conf The configuration which contains the files
   * @return A URI array of the files set in the Configuration
   * @throws IOException
   */
  public static URI[] getSharedCacheFiles(Configuration conf) throws IOException {
    return StringUtils.stringToURI(conf.getStrings("mapred.cache.shared.files"));
  }

  /**
   * Return the path array of the localized caches
   *
   * @param conf Configuration that contains the localized archives
   * @return A path array of localized caches
   * @throws IOException
   */
  public static Path[] getLocalCacheArchives(Configuration conf) throws IOException {
    return StringUtils.stringToPath(conf.getStrings("mapred.cache.localArchives"));
  }

  /**
   * Return the path array of the localized caches
   *
   * @param conf Configuration that contains the localized archives
   * @return A path array of localized caches
   * @throws IOException
   */
  public static Path[] getLocalSharedCacheArchives(Configuration conf) throws IOException {
    return StringUtils.stringToPath(conf.getStrings("mapred.cache.shared.localArchives"));
  }

  /**
   * Return the path array of the localized files
   *
   * @param conf Configuration that contains the localized files
   * @return A path array of localized files
   * @throws IOException
   */
  public static Path[] getLocalCacheFiles(Configuration conf) throws IOException {
    return StringUtils.stringToPath(conf.getStrings("mapred.cache.localFiles"));
  }

  /**
   * Return the path array of the localized files
   *
   * @param conf Configuration that contains the localized files
   * @return A path array of localized files
   * @throws IOException
   */
  public static Path[] getLocalSharedCacheFiles(Configuration conf) throws IOException {
    return StringUtils.stringToPath(conf.getStrings("mapred.cache.shared.localFiles"));
  }

  /**
   * Get the timestamps of the archives
   *
   * @param conf The configuration which stored the timestamps
   * @return a string array of timestamps
   * @throws IOException
   */
  public static String[] getArchiveTimestamps(Configuration conf) {
    return conf.getStrings("mapred.cache.archives.timestamps");
  }

  /**
   * Get the timestamps of the files
   *
   * @param conf The configuration which stored the timestamps
   * @return a string array of timestamps
   * @throws IOException
   */
  public static String[] getFileTimestamps(Configuration conf) {
    return conf.getStrings("mapred.cache.files.timestamps");
  }

  public static String[] getSharedArchiveLength(Configuration conf) {
    return conf.getStrings("mapred.cache.shared.archives.length");
  }

  public static String[] getSharedFileLength(Configuration conf) {
    return conf.getStrings("mapred.cache.shared.files.length");
  }

  /**
   * This is to check the timestamp of the archives to be localized
   *
   * @param conf Configuration which stores the timestamp's
   * @param timestamps comma separated list of timestamps of archives. The order should be the same
   *     as the order in which the archives are added.
   */
  public static void setArchiveTimestamps(Configuration conf, String timestamps) {
    conf.set("mapred.cache.archives.timestamps", timestamps);
  }

  public static void setSharedArchiveLength(Configuration conf, String length) {
    conf.set("mapred.cache.shared.archives.length", length);
  }

  /**
   * This is to check the timestamp of the files to be localized
   *
   * @param conf Configuration which stores the timestamp's
   * @param timestamps comma separated list of timestamps of files. The order should be the same as
   *     the order in which the files are added.
   */
  public static void setFileTimestamps(Configuration conf, String timestamps) {
    conf.set("mapred.cache.files.timestamps", timestamps);
  }

  public static void setSharedFileLength(Configuration conf, String length) {
    conf.set("mapred.cache.shared.files.length", length);
  }

  /**
   * Set the conf to contain the location for localized archives
   *
   * @param conf The conf to modify to contain the localized caches
   * @param str a comma separated list of local archives
   */
  public static void setLocalArchives(Configuration conf, String str) {
    conf.set("mapred.cache.localArchives", str);
  }

  /**
   * Set the conf to contain the location for localized archives
   *
   * @param conf The conf to modify to contain the localized caches
   * @param str a comma separated list of local archives
   */
  public static void setLocalSharedArchives(Configuration conf, String str) {
    conf.set("mapred.cache.shared.localArchives", str);
  }

  /**
   * Set the conf to contain the location for localized files
   *
   * @param conf The conf to modify to contain the localized caches
   * @param str a comma separated list of local files
   */
  public static void setLocalFiles(Configuration conf, String str) {
    conf.set("mapred.cache.localFiles", str);
  }

  /**
   * Set the conf to contain the location for localized files
   *
   * @param conf The conf to modify to contain the localized caches
   * @param str a comma separated list of local files
   */
  public static void setLocalSharedFiles(Configuration conf, String str) {
    conf.set("mapred.cache.shared.localFiles", str);
  }

  /**
   * Add a archives to be localized to the conf
   *
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   */
  public static void addCacheArchive(URI uri, Configuration conf) {
    String archives = conf.get("mapred.cache.archives");
    conf.set(
        "mapred.cache.archives",
        archives == null ? uri.toString() : archives + "," + uri.toString());
  }

  /**
   * Add a archives to be localized to the conf
   *
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   */
  public static void addSharedCacheArchive(URI uri, Configuration conf) {
    String archives = conf.get("mapred.cache.shared.archives");
    conf.set(
        "mapred.cache.shared.archives",
        archives == null ? uri.toString() : archives + "," + uri.toString());
  }

  /**
   * Add a file to be localized to the conf
   *
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   */
  public static void addCacheFile(URI uri, Configuration conf) {
    String files = conf.get("mapred.cache.files");
    conf.set("mapred.cache.files", files == null ? uri.toString() : files + "," + uri.toString());
  }

  /**
   * Add a file to be localized to the conf
   *
   * @param uri The uri of the cache to be localized
   * @param conf Configuration to add the cache to
   */
  public static void addSharedCacheFile(URI uri, Configuration conf) {
    String files = conf.get("mapred.cache.shared.files");
    conf.set(
        "mapred.cache.shared.files", files == null ? uri.toString() : files + "," + uri.toString());
  }

  /**
   * Add an file path to the current set of classpath entries It adds the file to cache as well.
   *
   * @param file Path of the file to be added
   * @param conf Configuration that contains the classpath setting
   */
  public static void addFileToClassPath(Path file, Configuration conf) throws IOException {
    String classpath = conf.get("mapred.job.classpath.files");
    conf.set(
        "mapred.job.classpath.files",
        classpath == null
            ? file.toString()
            : classpath + System.getProperty("path.separator") + file.toString());
    URI uri = file.makeQualified(file.getFileSystem(conf)).toUri();

    addCacheFile(uri, conf);
  }

  /**
   * Get the file entries in classpath as an array of Path
   *
   * @param conf Configuration that contains the classpath setting
   */
  public static Path[] getFileClassPaths(Configuration conf) {
    String classpath = conf.get("mapred.job.classpath.files");
    if (classpath == null) return null;
    ArrayList list =
        Collections.list(new StringTokenizer(classpath, System.getProperty("path.separator")));
    Path[] paths = new Path[list.size()];
    for (int i = 0; i < list.size(); i++) {
      paths[i] = new Path((String) list.get(i));
    }
    return paths;
  }

  private static URI addArchiveToClassPathHelper(Path archive, Configuration conf)
      throws IOException {

    String classpath = conf.get("mapred.job.classpath.archives");

    // the scheme/authority use ':' as separator. put the unqualified path in classpath
    String archivePath = archive.toUri().getPath();

    conf.set(
        "mapred.job.classpath.archives",
        classpath == null
            ? archivePath
            : classpath + System.getProperty("path.separator") + archivePath);
    return archive.makeQualified(archive.getFileSystem(conf)).toUri();
  }

  /**
   * Add an archive path to the current set of classpath entries. It adds the archive to cache as
   * well.
   *
   * @param archive Path of the archive to be added
   * @param conf Configuration that contains the classpath setting
   */
  public static void addArchiveToClassPath(Path archive, Configuration conf) throws IOException {
    URI uri = addArchiveToClassPathHelper(archive, conf);
    addCacheArchive(uri, conf);
  }

  /**
   * Add an archive path to the current set of classpath entries. It adds the archive to cache as
   * well.
   *
   * @param archive Path of the archive to be added
   * @param conf Configuration that contains the classpath setting
   */
  public static void addSharedArchiveToClassPath(Path archive, Configuration conf)
      throws IOException {
    URI uri = addArchiveToClassPathHelper(archive, conf);
    addSharedCacheArchive(uri, conf);
  }

  /**
   * Get the archive entries in classpath as an array of Path
   *
   * @param conf Configuration that contains the classpath setting
   */
  public static Path[] getArchiveClassPaths(Configuration conf) {
    String classpath = conf.get("mapred.job.classpath.archives");
    if (classpath == null) return null;
    ArrayList list =
        Collections.list(new StringTokenizer(classpath, System.getProperty("path.separator")));
    Path[] paths = new Path[list.size()];
    for (int i = 0; i < list.size(); i++) {
      paths[i] = new Path((String) list.get(i));
    }
    return paths;
  }

  /**
   * This method allows you to create symlinks in the current working directory of the task to all
   * the cache files/archives
   *
   * @param conf the jobconf
   */
  public static void createSymlink(Configuration conf) {
    conf.set("mapred.create.symlink", "yes");
  }

  /**
   * This method checks to see if symlinks are to be create for the localized cache files in the
   * current working directory
   *
   * @param conf the jobconf
   * @return true if symlinks are to be created- else return false
   */
  public static boolean getSymlink(Configuration conf) {
    String result = conf.get("mapred.create.symlink");
    if ("yes".equals(result)) {
      return true;
    }
    return false;
  }

  /**
   * This method checks if there is a conflict in the fragment names of the uris. Also makes sure
   * that each uri has a fragment. It is only to be called if you want to create symlinks for the
   * various archives and files.
   *
   * @param uriFiles The uri array of urifiles
   * @param uriArchives the uri array of uri archives
   */
  public static boolean checkURIs(URI[] uriFiles, URI[] uriArchives) {
    if ((uriFiles == null) && (uriArchives == null)) {
      return true;
    }
    if (uriFiles != null) {
      for (int i = 0; i < uriFiles.length; i++) {
        String frag1 = uriFiles[i].getFragment();
        if (frag1 == null) return false;
        for (int j = i + 1; j < uriFiles.length; j++) {
          String frag2 = uriFiles[j].getFragment();
          if (frag2 == null) return false;
          if (frag1.equalsIgnoreCase(frag2)) return false;
        }
        if (uriArchives != null) {
          for (int j = 0; j < uriArchives.length; j++) {
            String frag2 = uriArchives[j].getFragment();
            if (frag2 == null) {
              return false;
            }
            if (frag1.equalsIgnoreCase(frag2)) return false;
            for (int k = j + 1; k < uriArchives.length; k++) {
              String frag3 = uriArchives[k].getFragment();
              if (frag3 == null) return false;
              if (frag2.equalsIgnoreCase(frag3)) return false;
            }
          }
        }
      }
    }
    return true;
  }

  private static class CacheStatus {
    // the local load path of this cache
    Path localizedLoadPath;

    // the base dir where the cache lies
    Path localizedBaseDir;

    // the unique directory in localizedBaseDir, where the cache lies
    Path uniqueParentDir;

    // the size of this cache
    long size;

    // number of instances using this cache
    int refcount;

    // the cache-file modification time
    long mtime;

    // is it initialized
    boolean inited = false;

    public CacheStatus(Path baseDir, Path localLoadPath, Path uniqueParentDir) {
      super();
      this.localizedLoadPath = localLoadPath;
      this.refcount = 0;
      this.mtime = -1;
      this.localizedBaseDir = baseDir;
      this.size = 0;
      this.uniqueParentDir = uniqueParentDir;
    }

    // get the base dir for the cache
    Path getBaseDir() {
      return localizedBaseDir;
    }

    // Is it initialized?
    boolean isInited() {
      return inited;
    }

    // mark it as initalized
    void initComplete() {
      inited = true;
    }
  }

  /**
   * Clear the entire contents of the cache and delete the backing files. This should only be used
   * when the server is reinitializing, because the users are going to lose their files.
   */
  public static void purgeCache(Configuration conf, MRAsyncDiskService service) throws IOException {
    synchronized (cachedArchives) {
      LocalFileSystem localFs = FileSystem.getLocal(conf);
      for (Map.Entry<String, CacheStatus> f : cachedArchives.entrySet()) {
        try {
          deleteLocalPath(service, localFs, f.getValue().localizedLoadPath);
        } catch (IOException ie) {
          LOG.debug("Error cleaning up cache", ie);
        }
      }
      cachedArchives.clear();
    }
  }

  /**
   * Update the maps baseDirSize and baseDirNumberSubDir when deleting cache.
   *
   * @param cacheStatus cache status of the cache is deleted
   */
  private static void deleteCacheInfoUpdate(CacheStatus cacheStatus) {
    if (!cacheStatus.isInited()) {
      // if it is not created yet, do nothing.
      return;
    }
    synchronized (baseDirSize) {
      Long dirSize = baseDirSize.get(cacheStatus.getBaseDir());
      if (dirSize != null) {
        dirSize -= cacheStatus.size;
        baseDirSize.put(cacheStatus.getBaseDir(), dirSize);
      }
    }
    synchronized (baseDirNumberSubDir) {
      Integer dirSubDir = baseDirNumberSubDir.get(cacheStatus.getBaseDir());
      if (dirSubDir != null) {
        dirSubDir--;
        baseDirNumberSubDir.put(cacheStatus.getBaseDir(), dirSubDir);
      }
    }
  }

  /**
   * Update the maps baseDirSize and baseDirNumberSubDir when adding cache.
   *
   * @param cacheStatus cache status of the cache is added
   */
  private static void addCacheInfoUpdate(CacheStatus cacheStatus) {
    long cacheSize = cacheStatus.size;
    synchronized (baseDirSize) {
      Long dirSize = baseDirSize.get(cacheStatus.getBaseDir());
      if (dirSize == null) {
        dirSize = Long.valueOf(cacheSize);
      } else {
        dirSize += cacheSize;
      }
      baseDirSize.put(cacheStatus.getBaseDir(), dirSize);
    }
    synchronized (baseDirNumberSubDir) {
      Integer dirSubDir = baseDirNumberSubDir.get(cacheStatus.getBaseDir());
      if (dirSubDir == null) {
        dirSubDir = 1;
      } else {
        dirSubDir += 1;
      }
      baseDirNumberSubDir.put(cacheStatus.getBaseDir(), dirSubDir);
    }
  }
}
public class TestMRSequenceFileAsBinaryOutputFormat extends TestCase {
  private static final Log LOG =
      LogFactory.getLog(TestMRSequenceFileAsBinaryOutputFormat.class.getName());

  private static final int RECORDS = 10000;

  public void testBinary() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    FileOutputFormat.setOutputPath(job, outdir);

    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, BytesWritable> outputFormat =
        new SequenceFileAsBinaryOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
      for (int i = 0; i < RECORDS; ++i) {
        iwritable = new IntWritable(r.nextInt());
        iwritable.write(outbuf);
        bkey.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        dwritable = new DoubleWritable(r.nextDouble());
        dwritable.write(outbuf);
        bval.set(outbuf.getData(), 0, outbuf.getLength());
        outbuf.reset();
        writer.write(bkey, bval);
      }
    } finally {
      writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    InputFormat<IntWritable, DoubleWritable> iformat =
        new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    r.setSeed(seed);
    SequenceFileInputFormat.setInputPaths(job, outdir);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job)) {
      RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
      MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
          new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
              job.getConfiguration(),
              context.getTaskAttemptID(),
              reader,
              null,
              null,
              MapReduceTestUtil.createDummyReporter(),
              split);
      reader.initialize(split, mcontext);
      try {
        int sourceInt;
        double sourceDouble;
        while (reader.nextKeyValue()) {
          sourceInt = r.nextInt();
          sourceDouble = r.nextDouble();
          iwritable = reader.getCurrentKey();
          dwritable = reader.getCurrentValue();
          assertEquals(
              "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
              sourceInt,
              iwritable.get());
          assertTrue(
              "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
              Double.compare(dwritable.get(), sourceDouble) == 0);
          ++count;
        }
      } finally {
        reader.close();
      }
    }
    assertEquals("Some records not found", RECORDS, count);
  }

  public void testSequenceOutputClassDefaultsToMapRedOutputClass() throws IOException {
    Job job = new Job();
    // Setting Random class to test getSequenceFileOutput{Key,Value}Class
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(BooleanWritable.class);

    assertEquals(
        "SequenceFileOutputKeyClass should default to ouputKeyClass",
        FloatWritable.class,
        SequenceFileAsBinaryOutputFormat.getSequenceFileOutputKeyClass(job));
    assertEquals(
        "SequenceFileOutputValueClass should default to " + "ouputValueClass",
        BooleanWritable.class,
        SequenceFileAsBinaryOutputFormat.getSequenceFileOutputValueClass(job));

    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

    assertEquals(
        "SequenceFileOutputKeyClass not updated",
        IntWritable.class,
        SequenceFileAsBinaryOutputFormat.getSequenceFileOutputKeyClass(job));
    assertEquals(
        "SequenceFileOutputValueClass not updated",
        DoubleWritable.class,
        SequenceFileAsBinaryOutputFormat.getSequenceFileOutputValueClass(job));
  }

  public void testcheckOutputSpecsForbidRecordCompression() throws IOException {
    Job job = Job.getInstance(new Configuration(), "testcheckOutputSpecsForbidRecordCompression");
    FileSystem fs = FileSystem.getLocal(job.getConfiguration());
    Path outputdir = new Path(System.getProperty("test.build.data", "/tmp") + "/output");
    fs.delete(outputdir, true);

    // Without outputpath, FileOutputFormat.checkoutputspecs will throw
    // InvalidJobConfException
    FileOutputFormat.setOutputPath(job, outputdir);

    // SequenceFileAsBinaryOutputFormat doesn't support record compression
    // It should throw an exception when checked by checkOutputSpecs
    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);

    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    try {
      new SequenceFileAsBinaryOutputFormat().checkOutputSpecs(job);
    } catch (Exception e) {
      fail(
          "Block compression should be allowed for "
              + "SequenceFileAsBinaryOutputFormat:Caught "
              + e.getClass().getName());
    }

    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.RECORD);
    try {
      new SequenceFileAsBinaryOutputFormat().checkOutputSpecs(job);
      fail("Record compression should not be allowed for " + "SequenceFileAsBinaryOutputFormat");
    } catch (InvalidJobConfException ie) {
      // expected
    } catch (Exception e) {
      fail(
          "Expected "
              + InvalidJobConfException.class.getName()
              + "but caught "
              + e.getClass().getName());
    }
  }
}
Example #3
0
/**
 * Base class that runs a task in a separate process. Tasks are run in a separate process in order
 * to isolate the map/reduce system code from bugs in user supplied map and reduce functions.
 */
abstract class TaskRunner extends Thread {
  public static final Log LOG = LogFactory.getLog(TaskRunner.class);

  volatile boolean killed = false;
  private TaskTracker.TaskInProgress tip;
  private Task t;
  private Object lock = new Object();
  private volatile boolean done = false;
  private int exitCode = -1;
  private boolean exitCodeSet = false;

  private TaskTracker tracker;

  protected JobConf conf;
  JvmManager jvmManager;

  /** for cleaning up old map outputs */
  protected MapOutputFile mapOutputFile;

  public TaskRunner(TaskTracker.TaskInProgress tip, TaskTracker tracker, JobConf conf) {
    this.tip = tip;
    this.t = tip.getTask();
    this.tracker = tracker;
    this.conf = conf;
    this.mapOutputFile = new MapOutputFile(t.getJobID());
    this.mapOutputFile.setConf(conf);
    this.jvmManager = tracker.getJvmManagerInstance();
  }

  public Task getTask() {
    return t;
  }

  public TaskTracker.TaskInProgress getTaskInProgress() {
    return tip;
  }

  public TaskTracker getTracker() {
    return tracker;
  }

  /**
   * Called to assemble this task's input. This method is run in the parent process before the child
   * is spawned. It should not execute user code, only system code.
   */
  public boolean prepare() throws IOException {
    return true;
  }

  /**
   * Called when this task's output is no longer needed. This method is run in the parent process
   * after the child exits. It should not execute user code, only system code.
   */
  public void close() throws IOException {}

  private static String stringifyPathArray(Path[] p) {
    if (p == null) {
      return null;
    }
    StringBuffer str = new StringBuffer(p[0].toString());
    for (int i = 1; i < p.length; i++) {
      str.append(",");
      str.append(p[i].toString());
    }
    return str.toString();
  }

  @Override
  public final void run() {
    try {

      // before preparing the job localize
      // all the archives
      TaskAttemptID taskid = t.getTaskID();
      LocalDirAllocator lDirAlloc = new LocalDirAllocator("mapred.local.dir");
      File jobCacheDir = null;
      if (conf.getJar() != null) {
        jobCacheDir = new File(new Path(conf.getJar()).getParent().toString());
      }
      File workDir =
          new File(
              lDirAlloc
                  .getLocalPathToRead(
                      TaskTracker.getJobCacheSubdir()
                          + Path.SEPARATOR
                          + t.getJobID()
                          + Path.SEPARATOR
                          + t.getTaskID()
                          + Path.SEPARATOR
                          + MRConstants.WORKDIR,
                      conf)
                  .toString());

      URI[] archives = DistributedCache.getCacheArchives(conf);
      URI[] files = DistributedCache.getCacheFiles(conf);
      FileStatus fileStatus;
      FileSystem fileSystem;
      Path localPath;
      String baseDir;

      if ((archives != null) || (files != null)) {
        if (archives != null) {
          String[] archivesTimestamps = DistributedCache.getArchiveTimestamps(conf);
          Path[] p = new Path[archives.length];
          for (int i = 0; i < archives.length; i++) {
            fileSystem = FileSystem.get(archives[i], conf);
            fileStatus = fileSystem.getFileStatus(new Path(archives[i].getPath()));
            String cacheId = DistributedCache.makeRelative(archives[i], conf);
            String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId;
            if (lDirAlloc.ifExists(cachePath, conf)) {
              localPath = lDirAlloc.getLocalPathToRead(cachePath, conf);
            } else {
              localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf);
            }
            baseDir = localPath.toString().replace(cacheId, "");
            p[i] =
                DistributedCache.getLocalCache(
                    archives[i],
                    conf,
                    new Path(baseDir),
                    fileStatus,
                    true,
                    Long.parseLong(archivesTimestamps[i]),
                    new Path(workDir.getAbsolutePath()),
                    false);
          }
          DistributedCache.setLocalArchives(conf, stringifyPathArray(p));
        }
        if ((files != null)) {
          String[] fileTimestamps = DistributedCache.getFileTimestamps(conf);
          Path[] p = new Path[files.length];
          for (int i = 0; i < files.length; i++) {
            fileSystem = FileSystem.get(files[i], conf);
            fileStatus = fileSystem.getFileStatus(new Path(files[i].getPath()));
            String cacheId = DistributedCache.makeRelative(files[i], conf);
            String cachePath = TaskTracker.getCacheSubdir() + Path.SEPARATOR + cacheId;
            if (lDirAlloc.ifExists(cachePath, conf)) {
              localPath = lDirAlloc.getLocalPathToRead(cachePath, conf);
            } else {
              localPath = lDirAlloc.getLocalPathForWrite(cachePath, fileStatus.getLen(), conf);
            }
            baseDir = localPath.toString().replace(cacheId, "");
            p[i] =
                DistributedCache.getLocalCache(
                    files[i],
                    conf,
                    new Path(baseDir),
                    fileStatus,
                    false,
                    Long.parseLong(fileTimestamps[i]),
                    new Path(workDir.getAbsolutePath()),
                    false);
          }
          DistributedCache.setLocalFiles(conf, stringifyPathArray(p));
        }
        Path localTaskFile = new Path(t.getJobFile());
        FileSystem localFs = FileSystem.getLocal(conf);
        localFs.delete(localTaskFile, true);
        OutputStream out = localFs.create(localTaskFile);
        try {
          conf.writeXml(out);
        } finally {
          out.close();
        }
      }

      if (!prepare()) {
        return;
      }

      String sep = System.getProperty("path.separator");
      StringBuffer classPath = new StringBuffer();
      // start with same classpath as parent process
      classPath.append(System.getProperty("java.class.path"));
      classPath.append(sep);
      if (!workDir.mkdirs()) {
        if (!workDir.isDirectory()) {
          LOG.fatal("Mkdirs failed to create " + workDir.toString());
        }
      }

      String jar = conf.getJar();
      if (jar != null) {
        // if jar exists, it into workDir
        File[] libs = new File(jobCacheDir, "lib").listFiles();
        if (libs != null) {
          for (int i = 0; i < libs.length; i++) {
            classPath.append(sep); // add libs from jar to classpath
            classPath.append(libs[i]);
          }
        }
        classPath.append(sep);
        classPath.append(new File(jobCacheDir, "classes"));
        classPath.append(sep);
        classPath.append(jobCacheDir);
      }

      // include the user specified classpath

      // archive paths
      Path[] archiveClasspaths = DistributedCache.getArchiveClassPaths(conf);
      if (archiveClasspaths != null && archives != null) {
        Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
        if (localArchives != null) {
          for (int i = 0; i < archives.length; i++) {
            for (int j = 0; j < archiveClasspaths.length; j++) {
              if (archives[i].getPath().equals(archiveClasspaths[j].toString())) {
                classPath.append(sep);
                classPath.append(localArchives[i].toString());
              }
            }
          }
        }
      }
      // file paths
      Path[] fileClasspaths = DistributedCache.getFileClassPaths(conf);
      if (fileClasspaths != null && files != null) {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
        if (localFiles != null) {
          for (int i = 0; i < files.length; i++) {
            for (int j = 0; j < fileClasspaths.length; j++) {
              if (files[i].getPath().equals(fileClasspaths[j].toString())) {
                classPath.append(sep);
                classPath.append(localFiles[i].toString());
              }
            }
          }
        }
      }

      classPath.append(sep);
      classPath.append(workDir);
      //  Build exec child jmv args.
      Vector<String> vargs = new Vector<String>(8);
      File jvm = // use same jvm as parent
          new File(new File(System.getProperty("java.home"), "bin"), "java");

      vargs.add(jvm.toString());

      // Add child (task) java-vm options.
      //
      // The following symbols if present in mapred.child.java.opts value are
      // replaced:
      // + @[email protected] is interpolated with value of TaskID.
      // Other occurrences of @ will not be altered.
      //
      // Example with multiple arguments and substitutions, showing
      // jvm GC logging, and start of a passwordless JVM JMX agent so can
      // connect with jconsole and the likes to watch child memory, threads
      // and get thread dumps.
      //
      //  <property>
      //    <name>mapred.child.java.opts</name>
      //    <value>-verbose:gc -Xloggc:/tmp/@[email protected] \
      //           -Dcom.sun.management.jmxremote.authenticate=false \
      //           -Dcom.sun.management.jmxremote.ssl=false \
      //    </value>
      //  </property>
      //
      String javaOpts = conf.get("mapred.child.java.opts", "-Xmx200m");
      javaOpts = javaOpts.replace("@[email protected]", taskid.toString());
      String[] javaOptsSplit = javaOpts.split(" ");

      // Add java.library.path; necessary for loading native libraries.
      //
      // 1. To support native-hadoop library i.e. libhadoop.so, we add the
      //    parent processes' java.library.path to the child.
      // 2. We also add the 'cwd' of the task to it's java.library.path to help
      //    users distribute native libraries via the DistributedCache.
      // 3. The user can also specify extra paths to be added to the
      //    java.library.path via mapred.child.java.opts.
      //
      String libraryPath = System.getProperty("java.library.path");
      if (libraryPath == null) {
        libraryPath = workDir.getAbsolutePath();
      } else {
        libraryPath += sep + workDir;
      }
      boolean hasUserLDPath = false;
      for (int i = 0; i < javaOptsSplit.length; i++) {
        if (javaOptsSplit[i].startsWith("-Djava.library.path=")) {
          javaOptsSplit[i] += sep + libraryPath;
          hasUserLDPath = true;
          break;
        }
      }
      if (!hasUserLDPath) {
        vargs.add("-Djava.library.path=" + libraryPath);
      }
      for (int i = 0; i < javaOptsSplit.length; i++) {
        vargs.add(javaOptsSplit[i]);
      }

      // add java.io.tmpdir given by mapred.child.tmp
      String tmp = conf.get("mapred.child.tmp", "./tmp");
      Path tmpDir = new Path(tmp);

      // if temp directory path is not absolute
      // prepend it with workDir.
      if (!tmpDir.isAbsolute()) {
        tmpDir = new Path(workDir.toString(), tmp);
      }
      FileSystem localFs = FileSystem.getLocal(conf);
      if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) {
        throw new IOException("Mkdirs failed to create " + tmpDir.toString());
      }
      vargs.add("-Djava.io.tmpdir=" + tmpDir.toString());

      // Add classpath.
      vargs.add("-classpath");
      vargs.add(classPath.toString());

      // Setup the log4j prop
      long logSize = TaskLog.getTaskLogLength(conf);
      vargs.add(
          "-Dhadoop.log.dir=" + new File(System.getProperty("hadoop.log.dir")).getAbsolutePath());
      vargs.add("-Dhadoop.root.logger=INFO,TLA");
      vargs.add("-Dhadoop.tasklog.taskid=" + taskid);
      vargs.add("-Dhadoop.tasklog.totalLogFileSize=" + logSize);

      if (conf.getProfileEnabled()) {
        if (conf.getProfileTaskRange(t.isMapTask()).isIncluded(t.getPartition())) {
          File prof = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.PROFILE);
          vargs.add(String.format(conf.getProfileParams(), prof.toString()));
        }
      }

      // Add main class and its arguments
      vargs.add(Child.class.getName()); // main of Child
      // pass umbilical address
      InetSocketAddress address = tracker.getTaskTrackerReportAddress();
      vargs.add(address.getAddress().getHostAddress());
      vargs.add(Integer.toString(address.getPort()));
      vargs.add(taskid.toString()); // pass task identifier

      String pidFile = null;
      if (tracker.isTaskMemoryManagerEnabled()) {
        pidFile =
            lDirAlloc
                .getLocalPathForWrite(
                    (TaskTracker.getPidFilesSubdir() + Path.SEPARATOR + taskid), this.conf)
                .toString();
      }

      // set memory limit using ulimit if feasible and necessary ...
      String[] ulimitCmd = Shell.getUlimitMemoryCommand(conf);
      List<String> setup = null;
      if (ulimitCmd != null) {
        setup = new ArrayList<String>();
        for (String arg : ulimitCmd) {
          setup.add(arg);
        }
      }

      // Set up the redirection of the task's stdout and stderr streams
      File stdout = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDOUT);
      File stderr = TaskLog.getTaskLogFile(taskid, TaskLog.LogName.STDERR);
      stdout.getParentFile().mkdirs();
      tracker.getTaskTrackerInstrumentation().reportTaskLaunch(taskid, stdout, stderr);

      Map<String, String> env = new HashMap<String, String>();
      StringBuffer ldLibraryPath = new StringBuffer();
      ldLibraryPath.append(workDir.toString());
      String oldLdLibraryPath = null;
      oldLdLibraryPath = System.getenv("LD_LIBRARY_PATH");
      if (oldLdLibraryPath != null) {
        ldLibraryPath.append(sep);
        ldLibraryPath.append(oldLdLibraryPath);
      }
      env.put("LD_LIBRARY_PATH", ldLibraryPath.toString());
      jvmManager.launchJvm(
          this,
          jvmManager.constructJvmEnv(
              setup, vargs, stdout, stderr, logSize, workDir, env, pidFile, conf));
      synchronized (lock) {
        while (!done) {
          lock.wait();
        }
      }
      tracker.getTaskTrackerInstrumentation().reportTaskEnd(t.getTaskID());
      if (exitCodeSet) {
        if (!killed && exitCode != 0) {
          if (exitCode == 65) {
            tracker.getTaskTrackerInstrumentation().taskFailedPing(t.getTaskID());
          }
          throw new IOException("Task process exit with nonzero status of " + exitCode + ".");
        }
      }
    } catch (FSError e) {
      LOG.fatal("FSError", e);
      try {
        tracker.fsError(t.getTaskID(), e.getMessage());
      } catch (IOException ie) {
        LOG.fatal(t.getTaskID() + " reporting FSError", ie);
      }
    } catch (Throwable throwable) {
      LOG.warn(t.getTaskID() + " Child Error", throwable);
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      throwable.printStackTrace(new PrintStream(baos));
      try {
        tracker.reportDiagnosticInfo(t.getTaskID(), baos.toString());
      } catch (IOException e) {
        LOG.warn(t.getTaskID() + " Reporting Diagnostics", e);
      }
    } finally {
      try {
        URI[] archives = DistributedCache.getCacheArchives(conf);
        URI[] files = DistributedCache.getCacheFiles(conf);
        if (archives != null) {
          for (int i = 0; i < archives.length; i++) {
            DistributedCache.releaseCache(archives[i], conf);
          }
        }
        if (files != null) {
          for (int i = 0; i < files.length; i++) {
            DistributedCache.releaseCache(files[i], conf);
          }
        }
      } catch (IOException ie) {
        LOG.warn("Error releasing caches : Cache files might not have been cleaned up");
      }
      tracker.reportTaskFinished(t.getTaskID(), false);
      if (t.isMapTask()) {
        tracker.addFreeMapSlot();
      } else {
        tracker.addFreeReduceSlot();
      }
    }
  }

  // Mostly for setting up the symlinks. Note that when we setup the distributed
  // cache, we didn't create the symlinks. This is done on a per task basis
  // by the currently executing task.
  public static void setupWorkDir(JobConf conf) throws IOException {
    File workDir = new File(".").getAbsoluteFile();
    FileUtil.fullyDelete(workDir);
    if (DistributedCache.getSymlink(conf)) {
      URI[] archives = DistributedCache.getCacheArchives(conf);
      URI[] files = DistributedCache.getCacheFiles(conf);
      Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
      Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
      if (archives != null) {
        for (int i = 0; i < archives.length; i++) {
          String link = archives[i].getFragment();
          if (link != null) {
            link = workDir.toString() + Path.SEPARATOR + link;
            File flink = new File(link);
            if (!flink.exists()) {
              FileUtil.symLink(localArchives[i].toString(), link);
            }
          }
        }
      }
      if (files != null) {
        for (int i = 0; i < files.length; i++) {
          String link = files[i].getFragment();
          if (link != null) {
            link = workDir.toString() + Path.SEPARATOR + link;
            File flink = new File(link);
            if (!flink.exists()) {
              FileUtil.symLink(localFiles[i].toString(), link);
            }
          }
        }
      }
    }
    File jobCacheDir = null;
    if (conf.getJar() != null) {
      jobCacheDir = new File(new Path(conf.getJar()).getParent().toString());
    }

    // create symlinks for all the files in job cache dir in current
    // workingdir for streaming
    try {
      DistributedCache.createAllSymlink(conf, jobCacheDir, workDir);
    } catch (IOException ie) {
      // Do not exit even if symlinks have not been created.
      LOG.warn(StringUtils.stringifyException(ie));
    }
    // add java.io.tmpdir given by mapred.child.tmp
    String tmp = conf.get("mapred.child.tmp", "./tmp");
    Path tmpDir = new Path(tmp);

    // if temp directory path is not absolute
    // prepend it with workDir.
    if (!tmpDir.isAbsolute()) {
      tmpDir = new Path(workDir.toString(), tmp);
      FileSystem localFs = FileSystem.getLocal(conf);
      if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) {
        throw new IOException("Mkdirs failed to create " + tmpDir.toString());
      }
    }
  }

  /** Kill the child process */
  public void kill() {
    killed = true;
    jvmManager.taskKilled(this);
    signalDone();
  }

  public void signalDone() {
    synchronized (lock) {
      done = true;
      lock.notify();
    }
  }

  public void setExitCode(int exitCode) {
    this.exitCodeSet = true;
    this.exitCode = exitCode;
  }
}