예제 #1
0
 /**
  * Backward-compatible overload that takes no explicit file length.
  *
  * <p>The length is read from {@code fileStatus} and, together with every other argument
  * (forwarded unchanged), handed to the overload that accepts an explicit length.
  *
  * @param cache forwarded unchanged to the delegate overload
  * @param conf forwarded unchanged to the delegate overload
  * @param subdir forwarded unchanged to the delegate overload
  * @param fileStatus status of the file to localize; its {@code getLen()} supplies the length
  *     argument, so it must not be {@code null}
  * @param isArchive forwarded unchanged to the delegate overload
  * @param confFileStamp forwarded unchanged to the delegate overload
  * @param currentWorkDir forwarded unchanged to the delegate overload
  * @param honorSymLinkConf forwarded unchanged to the delegate overload
  * @param asyncDiskService forwarded unchanged to the delegate overload
  * @param lDirAllocator forwarded unchanged to the delegate overload
  * @return whatever the delegate overload returns
  * @throws IOException propagated from the delegate overload
  */
 public static Path getLocalCache(
     URI cache,
     Configuration conf,
     Path subdir,
     FileStatus fileStatus,
     boolean isArchive,
     long confFileStamp,
     Path currentWorkDir,
     boolean honorSymLinkConf,
     MRAsyncDiskService asyncDiskService,
     LocalDirAllocator lDirAllocator)
     throws IOException {
   // The only value this overload derives itself: the length reported by the file status.
   final long fileLength = fileStatus.getLen();
   return getLocalCache(
       cache, conf, subdir, fileStatus, isArchive, confFileStamp,
       fileLength,
       currentWorkDir, honorSymLinkConf, asyncDiskService, lDirAllocator);
 }
예제 #2
0
  /**
   * Generates the list of input files and turns them into {@link FileSplit}s. This is a copy of
   * the stock split logic with two additions: files whose path is rejected by
   * {@code isPathAcceptable} are skipped, and when the job has exactly one input file the
   * {@code org.systemsbiology.jxtandem.DesiredXMLInputMappers} setting can lower the maximum
   * split size to force more splits.
   *
   * <p>Per accepted file:
   *
   * <ul>
   *   <li>non-empty and splitable: one split per {@code splitSize} chunk while
   *       {@code withinSlop} holds, plus a final split for any remaining bytes;
   *   <li>non-empty but not splitable: a single split covering the whole file;
   *   <li>empty: a single zero-length split with an empty hosts array.
   * </ul>
   *
   * @param job the job context supplying the configuration and the input file listing
   * @return the splits for all accepted input files, in listing order
   * @throws IOException if listing the inputs or querying block locations fails
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    long desiredMappers =
        job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> fileStatuses = listStatus(job);
    // The desired-mapper hint is only honored when there is exactly one input file.
    boolean forceNumberMappers = fileStatuses.size() == 1;
    for (FileStatus file : fileStatuses) {
      Path path = file.getPath();
      if (!isPathAcceptable(path)) {
        continue; // filter acceptable data
      }
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        // use desired mappers to force more splits
        if (forceNumberMappers && desiredMappers > 0) {
          // Integer division yields 0 when the file has fewer bytes than desiredMappers. Clamp
          // to one byte so the forced maximum can never be zero; a zero maximum could collapse
          // the split size to zero (depending on minSize and computeSplitSize, which are not
          // visible here) and the loop below would then stop making progress.
          maxSize = Math.min(maxSize, Math.max(1L, length / desiredMappers));
        }

        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (withinSlop(splitSize, bytesRemaining)) {
          // Offset of this split is the number of bytes already handed out.
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        // Whatever is left after the loop becomes one final split on the last block's hosts.
        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        // Not splitable: one split for the whole file, placed on the first block's hosts.
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits;
  }
예제 #3
0
 /**
  * Get the locally cached file or archive; it is either already cached (and still valid) or is
  * copied from the {@link FileSystem} now.
  *
  * <p>This overload delegates to the overload that takes an explicit file length. It supplies
  * {@code fileStatus.getLen()} as that length, {@code true} for the boolean argument that follows
  * {@code currentWorkDir}, and a new {@code LocalDirAllocator} built on
  * {@code "mapred.local.dir"}.
  *
  * @param cache the cache to be localized, specified as new
  *     URI(hdfs://hostname:port/absolute_path_to_file#LINKNAME). If no scheme or hostname:port is
  *     provided, the file is assumed to be in the filesystem being used in the Configuration
  * @param conf the Configuration which contains the filesystem
  * @param baseDir the base cache directory where you want to localize the files/archives
  * @param fileStatus the file status on the dfs; must not be {@code null} because its length is
  *     read here
  * @param isArchive whether the cache is an archive or a file. An archive with a .zip, .jar,
  *     .tar, .tgz or .tar.gz extension is unzipped/unjarred/untarred automatically and the
  *     directory where it was unpacked is returned as the Path. For a file, the path to the file
  *     is returned
  * @param confFileStamp the hdfs file modification timestamp, used to verify that the file to be
  *     cached hasn't changed since the job started
  * @param currentWorkDir the directory where you would want to create symlinks for the locally
  *     cached files/archives
  * @param asyncDiskService forwarded unchanged to the delegate overload
  * @return the path to the directory where the archive is unpacked in case of archives, or the
  *     path to the local copy in case of a file
  * @throws IOException propagated from the delegate overload
  */
 public static Path getLocalCache(
     URI cache,
     Configuration conf,
     Path baseDir,
     FileStatus fileStatus,
     boolean isArchive,
     long confFileStamp,
     Path currentWorkDir,
     MRAsyncDiskService asyncDiskService)
     throws IOException {
   // Computed in the same order as the original argument list: length first, allocator second.
   final long fileLength = fileStatus.getLen();
   final LocalDirAllocator localDirAllocator = new LocalDirAllocator("mapred.local.dir");
   return getLocalCache(
       cache, conf, baseDir, fileStatus, isArchive, confFileStamp,
       fileLength,
       currentWorkDir,
       true,
       asyncDiskService,
       localDirAllocator);
 }