Example #1
 private void addBlocks(
     VolumeManager fs,
     String host,
     ArrayList<String> files,
     Map<String, Long> totalBlocks,
     Map<String, Long> localBlocks)
     throws Exception {
   long allBlocks = 0;
   long matchingBlocks = 0;
   if (!totalBlocks.containsKey(host)) {
     totalBlocks.put(host, 0L);
     localBlocks.put(host, 0L);
   }
   for (String file : files) {
     Path filePath = new Path(file);
     FileSystem ns = fs.getFileSystemByPath(filePath);
     FileStatus fileStatus = ns.getFileStatus(filePath);
     BlockLocation[] fileBlockLocations =
         ns.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
     for (BlockLocation blockLocation : fileBlockLocations) {
       allBlocks++;
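        // Count the block as local when any replica lives on the requested host.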
       for (String location : blockLocation.getHosts()) {
         HostAndPort hap = HostAndPort.fromParts(location, 0);
         if (hap.getHostText().equals(host)) {
           matchingBlocks++;
           break;
         }
       }
     }
   }
   totalBlocks.put(host, allBlocks + totalBlocks.get(host));
   localBlocks.put(host, matchingBlocks + localBlocks.get(host));
 }
Example #2
     OneFileInfo(
        Path path,
        Configuration conf,
        HashMap<String, List<OneBlockInfo>> rackToBlocks,
        HashMap<OneBlockInfo, String[]> blockToNodes,
        HashMap<String, List<OneBlockInfo>> nodeToBlocks)
        throws IOException {
      this.fileSize = 0;

      // get block locations from file system
      FileSystem fs = path.getFileSystem(conf);
      FileStatus stat = fs.getFileStatus(path);
      BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, stat.getLen());
      // create a list of all blocks and their locations
      if (locations == null) {
        blocks = new OneBlockInfo[0];
      } else {
        blocks = new OneBlockInfo[locations.length];
        for (int i = 0; i < locations.length; i++) {

          fileSize += locations[i].getLength();
          OneBlockInfo oneblock =
              new OneBlockInfo(
                  path,
                  locations[i].getOffset(),
                  locations[i].getLength(),
                  locations[i].getHosts(),
                  locations[i].getTopologyPaths());
          blocks[i] = oneblock;

          // add this block to the block --> node locations map
          blockToNodes.put(oneblock, oneblock.hosts);

          // add this block to the rack --> block map
          for (int j = 0; j < oneblock.racks.length; j++) {
            String rack = oneblock.racks[j];
            List<OneBlockInfo> blklist = rackToBlocks.get(rack);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              rackToBlocks.put(rack, blklist);
            }
            blklist.add(oneblock);
            // Add this host to rackToNodes map
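            // racks[j] and hosts[j] are parallel arrays derived from the block's topology paths.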
            addHostToRack(oneblock.racks[j], oneblock.hosts[j]);
          }

          // add this block to the node --> block map
          for (int j = 0; j < oneblock.hosts.length; j++) {
            String node = oneblock.hosts[j];
            List<OneBlockInfo> blklist = nodeToBlocks.get(node);
            if (blklist == null) {
              blklist = new ArrayList<OneBlockInfo>();
              nodeToBlocks.put(node, blklist);
            }
            blklist.add(oneblock);
          }
        }
      }
    }
Example #3
 @Override
 public BlockLocation[] getLocations(FileSystem fs, FileStatus status) throws IOException {
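   // A LocatedFileStatus already carries its block locations, so no extra filesystem call is needed.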
   if (status instanceof LocatedFileStatus) {
     return ((LocatedFileStatus) status).getBlockLocations();
   } else {
     return fs.getFileBlockLocations(status, 0, status.getLen());
   }
 }
Example #4
  /**
   * Test {@code BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len)}
   * under different combinations of start and len.
   */
  @Test
  public void basicBlockLocationTest() throws Exception {
    long start = 0;
    long len = 0;
    FileStatus fStatus = sTFS.getFileStatus(new Path("/testFile1"));

    // block0.offset = start < start+len < block1.offset
    start = 0;
    len = BLOCK_SIZE - 1;
    Assert.assertEquals(1, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset < start < start+len < block1.offset
    start = 1;
    len = BLOCK_SIZE - 2;
    Assert.assertEquals(1, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset < start = start+len < block1.offset
    start = 1;
    len = 0;
    Assert.assertEquals(1, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset = start < start+len = block1.offset
    start = 0;
    len = BLOCK_SIZE;
    Assert.assertEquals(2, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset = start < block1.offset < start+len < block2.offset
    start = 0;
    len = BLOCK_SIZE + 1;
    Assert.assertEquals(2, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset < start < block1.offset < start+len < block2.offset
    start = 1;
    len = BLOCK_SIZE;
    Assert.assertEquals(2, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset = start < start+len = block2.offset
    start = 0;
    len = BLOCK_SIZE * 2;
    Assert.assertEquals(3, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // block0.offset = start < start+len = file.len
    start = 0;
    len = FILE_LEN;
    Assert.assertEquals(3, sTFS.getFileBlockLocations(fStatus, start, len).length);

    // file.len < start < start+len
    start = FILE_LEN + 1;
    len = 1;
    Assert.assertEquals(0, sTFS.getFileBlockLocations(fStatus, start, len).length);
  }
Example #5
  /**
   * Generate the list of files and make them into FileSplits. This is copied from the superclass
   * so that a filter on acceptable data can be inserted.
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    long desiredMappers =
        job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> fileStatuses = listStatus(job);
    boolean forceNumberMappers = fileStatuses.size() == 1;
    for (FileStatus file : fileStatuses) {
      Path path = file.getPath();
      if (!isPathAcceptable(path)) { // skip unacceptable data
        continue;
      }
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        // use desired mappers to force more splits
        if (forceNumberMappers && desiredMappers > 0)
          maxSize = Math.min(maxSize, (length / desiredMappers));

        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (withinSlop(splitSize, bytesRemaining)) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        // Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }
    System.out.println("Total # of splits: " + splits.size());
    //     LOG.debug("Total # of splits: " + splits.size());
    return splits;
  }
Example #6
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
    if (0 == numMapTasks) {
      return super.getSplits(job);
    }

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    for (FileStatus file : files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        long splitSize = Math.max(computeSplitSize(JAVA_OPTS, numMapTasks, length), blockSize);
        long splitLength = (long) (length / Math.ceil((double) length / splitSize));
        long bytesRemaining = length;

        while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(
              new FileSplit(
                  path, length - bytesRemaining, splitLength, blkLocations[blkIndex].getHosts()));

          bytesRemaining -= splitLength;
        }

        if (bytesRemaining != 0) {
          splits.add(
              new FileSplit(
                  path,
                  length - bytesRemaining,
                  bytesRemaining,
                  blkLocations[blkLocations.length - 1].getHosts()));
        }
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else {
        splits.add(new FileSplit(path, 0, length, new String[0]));
      }
    }

    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    job.getConfiguration().setInt("admm.iteration.num.map.tasks", splits.size());
    return splits;
  }
Example #7
 @Override
 public List<String> getFileLocations(String path, long offset) throws IOException {
   List<String> ret = new ArrayList<String>();
   try {
     FileStatus fStatus = mFs.getFileStatus(new Path(path));
     BlockLocation[] bLocations = mFs.getFileBlockLocations(fStatus, offset, 1);
     if (bLocations.length > 0) {
       String[] names = bLocations[0].getNames();
       Collections.addAll(ret, names);
     }
   } catch (IOException e) {
     LOG.error("Unable to get file location for " + path, e);
   }
   return ret;
 }
Example #8
  private void waitForBlocks(FileSystem fileSys, Path name) throws IOException {
    // wait until we have at least one block in the file to read.
    boolean done = false;

    while (!done) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
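        // Interruption is ignored; the loop simply polls again.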
      }
      done = true;
      BlockLocation[] locations =
          fileSys.getFileBlockLocations(fileSys.getFileStatus(name), 0, blockSize);
      if (locations.length < 1) {
        done = false;
        continue;
      }
    }
  }
Example #9
  static void checkFullFile(FileSystem fs, Path name) throws IOException {
    FileStatus stat = fs.getFileStatus(name);
    BlockLocation[] locations = fs.getFileBlockLocations(stat, 0, fileSize);
    for (int idx = 0; idx < locations.length; idx++) {
      String[] hosts = locations[idx].getNames();
      for (int i = 0; i < hosts.length; i++) {
        System.out.print(hosts[i] + " ");
      }
      System.out.println(
          " off " + locations[idx].getOffset() + " len " + locations[idx].getLength());
    }

    byte[] expected = AppendTestUtil.randomBytes(seed, fileSize);
    FSDataInputStream stm = fs.open(name);
    byte[] actual = new byte[fileSize];
    stm.readFully(0, actual);
    checkData(actual, 0, expected, "Read 2");
    stm.close();
  }
Example #10
  @Override
  public BlockLocation[] getFileBlockLocations(
      final FileStatus file, final long start, final long len) throws IOException {
    if (!(file instanceof HadoopFileStatus)) {
      throw new IOException("file is not an instance of DistributedFileStatus");
    }

    final HadoopFileStatus f = (HadoopFileStatus) file;

    final org.apache.hadoop.fs.BlockLocation[] blkLocations =
        fs.getFileBlockLocations(f.getInternalFileStatus(), start, len);

    // Wrap up HDFS specific block location objects
    final HadoopBlockLocation[] distBlkLocations = new HadoopBlockLocation[blkLocations.length];
    for (int i = 0; i < distBlkLocations.length; i++) {
      distBlkLocations[i] = new HadoopBlockLocation(blkLocations[i]);
    }

    return distBlkLocations;
  }
Example #11
    /**
     * @param hadoopConf the Hadoop configuration
     * @param bucket bucket to be processed by this split
     * @param files actual files this split should process. It is assumed the caller has already
     *     parsed out the files in base and deltas to populate this list.
     * @param base directory of the base, or the partition/table location if the files are in old
     *     style. Can be null.
     * @param deltas directories of the delta files.
     * @throws IOException
     */
    CompactorInputSplit(
        Configuration hadoopConf, int bucket, List<Path> files, Path base, Path[] deltas)
        throws IOException {
      bucketNum = bucket;
      this.base = base;
      this.deltas = deltas;
      locations = new ArrayList<String>();

      for (Path path : files) {
        FileSystem fs = path.getFileSystem(hadoopConf);
        FileStatus stat = fs.getFileStatus(path);
        length += stat.getLen();
        BlockLocation[] locs = fs.getFileBlockLocations(stat, 0, stat.getLen());
        for (int i = 0; i < locs.length; i++) {
          String[] hosts = locs[i].getHosts();
          for (int j = 0; j < hosts.length; j++) {
            locations.add(hosts[j]);
          }
        }
      }
    }
Example #12
  //
  // Verify that the data written to the full blocks is sane.
  //
  private void checkFile(FileSystem fileSys, Path name, int repl) throws IOException {
    boolean done = false;

    // wait till all full blocks are confirmed by the datanodes.
    while (!done) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
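        // Interruption is ignored; the loop polls again.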
      }
      done = true;
      BlockLocation[] locations =
          fileSys.getFileBlockLocations(fileSys.getFileStatus(name), 0, fileSize);
      if (locations.length < numBlocks) {
        done = false;
        continue;
      }
      for (int idx = 0; idx < locations.length; idx++) {
        if (locations[idx].getHosts().length < repl) {
          done = false;
          break;
        }
      }
    }
    FSDataInputStream stm = fileSys.open(name);
    final byte[] expected;
    if (simulatedStorage) {
      expected = new byte[numBlocks * blockSize];
      for (int i = 0; i < expected.length; i++) {
        expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
      }
    } else {
      expected = AppendTestUtil.randomBytes(seed, numBlocks * blockSize);
    }
    // do a sanity check. Read the file
    byte[] actual = new byte[numBlocks * blockSize];
    stm.readFully(0, actual);
    stm.close();
    checkData(actual, 0, expected, "Read 1");
  }
Example #13
 /** Wait for the file's replication to be done. */
 public static void waitReplication(FileSystem fs, Path fileName, short replFactor)
     throws IOException {
   boolean good;
   do {
     good = true;
     BlockLocation[] locs =
         fs.getFileBlockLocations(fs.getFileStatus(fileName), 0, Long.MAX_VALUE);
     for (int j = 0; j < locs.length; j++) {
       String[] hostnames = locs[j].getNames();
       if (hostnames.length != replFactor) {
         String hostNameList = "";
         for (String h : hostnames) hostNameList += h + " ";
         System.out.println(
             "Block "
                 + j
                 + " of file "
                 + fileName
                 + " has replication factor "
                 + hostnames.length
                 + "; locations "
                 + hostNameList);
         good = false;
         try {
           System.out.println("Waiting for replication factor to drain");
           Thread.sleep(100);
         } catch (InterruptedException e) {
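            // Ignored: fall through and retry after the pause.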
         }
         break;
       }
     }
     if (good) {
       System.out.println(
           "All blocks of file "
               + fileName
               + " verified to have replication factor "
               + replFactor);
     }
   } while (!good);
 }
Example #14
  private void testDataNodeRedirect(Path path) throws IOException {
    // Create the file
    if (hdfs.exists(path)) {
      hdfs.delete(path, true);
    }
    FSDataOutputStream out = hdfs.create(path, (short) 1);
    out.writeBytes("0123456789");
    out.close();

    // Get the path's block location so we can determine
    // if we were redirected to the right DN.
    FileStatus status = hdfs.getFileStatus(path);
    BlockLocation[] locations = hdfs.getFileBlockLocations(status, 0, 10);
    String locationName = locations[0].getNames()[0];

    // Connect to the NN to get redirected
    URL u =
        hftpFs.getNamenodeURL(
            "/data" + ServletUtil.encodePath(path.toUri().getPath()), "ugi=userx,groupy");
    HttpURLConnection conn = (HttpURLConnection) u.openConnection();
    HttpURLConnection.setFollowRedirects(true);
    conn.connect();
    conn.getInputStream();

    boolean checked = false;
    // Find the datanode that has the block according to locations
    // and check that the URL was redirected to this DN's info port
    for (DataNode node : cluster.getDataNodes()) {
      DatanodeRegistration dnR = node.dnRegistration;
      if (dnR.getName().equals(locationName)) {
        checked = true;
        assertEquals(dnR.getInfoPort(), conn.getURL().getPort());
      }
    }
    assertTrue(
        "The test never checked that the location of the block and the hftp destination are the same",
        checked);
  }
Example #15
  //
  // Verify that the data written to the full blocks is sane.
  //
  private void checkFile(FileSystem fileSys, Path name, int repl) throws IOException {
    boolean done = false;

    // wait till all full blocks are confirmed by the datanodes.
    while (!done) {
      try {
        Thread.sleep(1000);
      } catch (InterruptedException e) {
      }
      done = true;
      BlockLocation[] locations =
          fileSys.getFileBlockLocations(fileSys.getFileStatus(name), 0, AppendTestUtil.FILE_SIZE);
      if (locations.length < AppendTestUtil.NUM_BLOCKS) {
        System.out.println("Number of blocks found " + locations.length);
        done = false;
        continue;
      }
      for (int idx = 0; idx < AppendTestUtil.NUM_BLOCKS; idx++) {
        if (locations[idx].getHosts().length < repl) {
          System.out.println("Block index " + idx + " not yet replciated.");
          done = false;
          break;
        }
      }
    }
    byte[] expected = new byte[AppendTestUtil.NUM_BLOCKS * AppendTestUtil.BLOCK_SIZE];
    if (simulatedStorage) {
      for (int i = 0; i < expected.length; i++) {
        expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
      }
    } else {
      System.arraycopy(fileContents, 0, expected, 0, expected.length);
    }
    // do a sanity check. Read the file
    AppendTestUtil.checkFullFile(
        fileSys, name, AppendTestUtil.NUM_BLOCKS * AppendTestUtil.BLOCK_SIZE, expected, "Read 1");
  }
Example #16
 static int blocksInFile(FileSystem fs, Path path, long len) throws IOException {
   FileStatus f = fs.getFileStatus(path);
   return fs.getFileBlockLocations(f, 0L, len).length;
 }
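A minimal usage sketch (not part of the original example; the configuration and the path /tmp/data.bin are hypothetical), showing how such a helper can be driven end to end:

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;

 public class BlockCountDemo {
   static int blocksInFile(FileSystem fs, Path path, long len) throws java.io.IOException {
     FileStatus f = fs.getFileStatus(path);
     return fs.getFileBlockLocations(f, 0L, len).length;
   }

   public static void main(String[] args) throws Exception {
     // Uses whatever Hadoop configuration is on the classpath; defaults to the local filesystem.
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     Path path = new Path("/tmp/data.bin"); // hypothetical input file
     long len = fs.getFileStatus(path).getLen();
     System.out.println(path + " spans " + blocksInFile(fs, path, len) + " block(s)");
   }
 }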
Example #17
  /**
   * Generate the list of files and make them into FileSplits.
   *
   * @param job the job context
   * @return the list of input splits
   * @throws IOException
   */
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    Stopwatch sw = new Stopwatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
      Path path = file.getPath();
      long length = file.getLen();
      if (length != 0) {
        BlockLocation[] blkLocations;
        if (file instanceof LocatedFileStatus) {
          blkLocations = ((LocatedFileStatus) file).getBlockLocations();
        } else {
          FileSystem fs = path.getFileSystem(job.getConfiguration());
          blkLocations = fs.getFileBlockLocations(file, 0, length);
        }
        if (isSplitable(job, path)) {
          long blockSize = file.getBlockSize();
          long splitSize = computeSplitSize(blockSize, minSize, maxSize);

          long bytesRemaining = length;
          while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
            splits.add(
                makeSplit(
                    path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
            bytesRemaining -= splitSize;
          }

          if (bytesRemaining != 0) {
            int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
            splits.add(
                makeSplit(
                    path,
                    length - bytesRemaining,
                    bytesRemaining,
                    blkLocations[blkIndex].getHosts()));
          }
        } else { // not splitable
          splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
        }
      } else {
        // Create empty hosts array for zero length files
        splits.add(makeSplit(path, 0, length, new String[0]));
      }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LogGlobal.isDebugEnabled()) {
      /* LOG.debug("Total # of splits generated by getSplits: "+splits.size()+", TimeTaken: "+sw.elapsedMillis()) */
      LOG.total_splits_generated_getsplits_timetak(
              String.valueOf(splits.size()), String.valueOf(sw.elapsedMillis()))
          .tag("methodCall")
          .debug();
    }
    return splits;
  }
Example #18
  @Test
  public void testHighAvailability() throws IOException {
    Configuration conf = new HdfsConfiguration();

    // Create cluster with 3 readers and 1 writer
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(conf)
            .nnTopology(MiniDFSNNTopology.simpleHOPSTopology(4))
            .numDataNodes(2)
            .format(true)
            .build();
    cluster.waitActive();

    try {

      // Get the filesystem and create a directory
      FileSystem fs = cluster.getFileSystem(0);

      // Write operation should work since we have one writer
      assertTrue(fs.mkdirs(dir));

      // Write operation - Create a file and write something to it
      Path file1 = new Path(dir, "file1");
      createFile(fs, file1);

      // Read operation - The file should exist.
      assertTrue(fs.exists(file1));

      // Read operation - List files in this directory
      assertEquals(1, list(fs));

      // Read operation - Get file status
      FileStatus fileStatus = fs.listStatus(dir)[0];

      // Read operation - Get block locations
      assertNotSame(0, fs.getFileBlockLocations(file1, 0, 1).length);

      // Now we kill all namenodes except the last two
      cluster.getNameNode(0).stop();
      cluster.getNameNode(1).stop();

      // Now let's read again - these operations should still be possible
      assertTrue(fs.exists(file1));

      // Write operation - create more files
      Path file2 = new Path(dir, "file2");
      createFile(fs, file2);
      assertTrue(fs.exists(file2));
      Path file3 = new Path(dir, "file3");
      createFile(fs, file3);
      assertTrue(fs.exists(file3));
      Path file4 = new Path(dir, "file4");

      // Read operation - list files (3 files created now under this directory)
      assertEquals(3, list(fs));

      // Write operation - rename
      // [S] commented out because rename is not yet supported
      // ((DistributedFileSystem) fs).rename(file1, file4);

      // Kill another namenode
      cluster.getNameNode(2).stop();

      // Read operation - File status
      fs.getFileStatus(file2);

      // Write operation - Delete
      assertTrue(fs.delete(dir, true));

    } catch (IOException ex) {
      // If we run into any connectivity issues here, there is a problem;
      // all connectivity issues should have been handled by the code above.
      LOG.error(ex);
      ex.printStackTrace();
      assertFalse("Cannot be any connectivity issues", ex instanceof ConnectException);
      fail();
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
  @SuppressWarnings("unchecked")
  @Override
  /**
   * Splits the input collection into sets of files where each Map task gets about the same number
   * of files
   */
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
      return new InputSplit[0];
    }

    if (numSplits > paths.length) {
      numSplits = paths.length;
    } else if (numSplits < 1) {
      numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits =
        new ArrayList<PositionAwareSplit<CombineFileSplit>>(numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    TObjectLongHashMap<String>[] locations =
        (TObjectLongHashMap<String>[]) Array.newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);
    for (int i = 0; i < paths.length; i++) {
      final FileStatus fss = fs.getFileStatus(paths[i]);
      lengths[i] = fss.getLen();
      final TObjectLongHashMap<String> location2size =
          locations[i] = new TObjectLongHashMap<String>();
      final long normalblocksize = fss.getBlockSize();
      for (long offset = 0; offset < lengths[i]; offset += normalblocksize) {
        final long blocksize = Math.min(offset + normalblocksize, lengths[i]);
        final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
        for (BlockLocation bl : blockLocations) {
          for (String host : bl.getHosts()) {
            location2size.adjustOrPutValue(host, blocksize, blocksize);
          }
        }
      }
    }

    // we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);

    int pathsUsed = 0;
    int splitnum = 0;
    CombineFileSplit mfs;
    // for each split except the last one (which may be smaller than numberOfFilesPerSplit)
    while (pathsUsed < numPaths) {
      /* calculate split size for this task - usually numberOfFilesPerSplit, but
       * less than this for the last split */
      final int splitSizeForThisSplit =
          numberOfFilesPerSplit + pathsUsed > numPaths
              ? numPaths - pathsUsed
              : numberOfFilesPerSplit;
      // arrays of information for split
      Path[] splitPaths = new Path[splitSizeForThisSplit];
      long[] splitLengths = new long[splitSizeForThisSplit];
      long[] splitStarts = new long[splitSizeForThisSplit];
      final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
      String[] splitLocations = null; // final recommended locations for this split.
      for (int i = 0; i < splitSizeForThisSplit; i++) {
        locations[pathsUsed + i].forEachEntry(
            new TObjectLongProcedure<String>() {
              public boolean execute(String a, long b) {
                allLocationsForSplit.adjustOrPutValue(a, b, b);
                return true;
              }
            });
        if (allLocationsForSplit.size() <= 3) {
          splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
        } else {
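          // More than three candidate hosts: keep the three that hold the most bytes for this split.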
          String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
          Arrays.sort(
              hosts,
              new Comparator<String>() {
                public int compare(String o1, String o2) {
                  long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2);
                  if (diffamount > 0) {
                    return -1;
                  } else if (diffamount < 0) {
                    return 1;
                  }
                  return 0;
                }
              });
          splitLocations = new String[3];
          System.arraycopy(hosts, 0, splitLocations, 0, 3);
        }
      }

      // copy information for this split
      System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
      System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
      // count the number of paths consumed
      pathsUsed += splitSizeForThisSplit;

      // make the actual split object
      // logger.info("New split of size " + splitSizeForThisSplit);
      mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
      splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
      splitnum++;
    }

    if (pathsUsed != paths.length) {
      throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
  }