/**
 * Accumulates HDFS block statistics for {@code host} over the given files.
 *
 * <p>For every file, every block is counted into {@code totalBlocks}; a block is additionally
 * counted into {@code localBlocks} when any of its replica locations matches {@code host}.
 * Both maps are guaranteed to contain an entry for {@code host} afterwards, even when
 * {@code files} is empty.
 *
 * @param fs volume manager used to resolve each path to its filesystem
 * @param host host name (port-less) to tally local blocks for
 * @param files paths of the files whose blocks are inspected
 * @param totalBlocks running per-host count of all blocks seen (updated in place)
 * @param localBlocks running per-host count of host-local blocks (updated in place)
 * @throws Exception propagated from filesystem metadata lookups
 */
private void addBlocks(
    VolumeManager fs,
    String host,
    ArrayList<String> files,
    Map<String, Long> totalBlocks,
    Map<String, Long> localBlocks)
    throws Exception {
  // Seed both tallies so the host is present even if no files are listed.
  if (!totalBlocks.containsKey(host)) {
    totalBlocks.put(host, 0L);
    localBlocks.put(host, 0L);
  }
  long seen = 0;
  long onHost = 0;
  for (String file : files) {
    Path path = new Path(file);
    FileSystem ns = fs.getFileSystemByPath(path);
    FileStatus status = ns.getFileStatus(path);
    for (BlockLocation block : ns.getFileBlockLocations(status, 0, status.getLen())) {
      seen++;
      if (hasReplicaOn(block, host)) {
        onHost++;
      }
    }
  }
  totalBlocks.put(host, totalBlocks.get(host) + seen);
  localBlocks.put(host, localBlocks.get(host) + onHost);
}

/** Returns true when any replica location of {@code block} reports the given host name. */
private static boolean hasReplicaOn(BlockLocation block, String host) throws Exception {
  for (String location : block.getHosts()) {
    if (HostAndPort.fromParts(location, 0).getHostText().equals(host)) {
      return true;
    }
  }
  return false;
}
/**
 * Creates a fragment of {@code tableName} covering exactly the span of {@code blockLocation}
 * within the file at {@code uri}.
 *
 * <p>Delegates to the main constructor, forwarding the block's offset, length, and replica
 * hosts. The trailing argument is passed as {@code null} — presumably per-replica disk/volume
 * ids; confirm against the delegated constructor's signature.
 *
 * @param tableName name of the table this fragment belongs to
 * @param uri path of the underlying file
 * @param blockLocation block whose offset/length/hosts define the fragment
 * @throws IOException propagated from {@link BlockLocation#getHosts()}
 */
public FileFragment(String tableName, Path uri, BlockLocation blockLocation) throws IOException {
  this(
      tableName,
      uri,
      blockLocation.getOffset(),
      blockLocation.getLength(),
      blockLocation.getHosts(),
      null);
}
@SuppressWarnings("unchecked") @Override /** * Splits the input collection into sets of files where each Map task gets about the same number * of files */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { Path[] paths = FileInputFormat.getInputPaths(job); // HADOOP-1818: Manage splits only if there are paths if (paths.length == 0) { return new InputSplit[0]; } if (numSplits > paths.length) { numSplits = paths.length; } else if (numSplits < 1) { numSplits = 1; } logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks"); List<PositionAwareSplit<CombineFileSplit>> splits = new ArrayList<PositionAwareSplit<CombineFileSplit>>(numSplits); final int numPaths = paths.length; long[] lengths = new long[numPaths]; TObjectLongHashMap<String>[] locations = (TObjectLongHashMap<String>[]) Array.newInstance(TObjectLongHashMap.class, numPaths); final FileSystem fs = FileSystem.get(job); for (int i = 0; i < paths.length; i++) { final FileStatus fss = fs.getFileStatus(paths[i]); lengths[i] = fss.getLen(); final TObjectLongHashMap<String> location2size = locations[i] = new TObjectLongHashMap<String>(); final long normalblocksize = fss.getBlockSize(); for (long offset = 0; offset < lengths[i]; offset += normalblocksize) { final long blocksize = Math.min(offset + normalblocksize, lengths[i]); final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize); for (BlockLocation bl : blockLocations) { for (String host : bl.getHosts()) { location2size.adjustOrPutValue(host, blocksize, blocksize); } } } } // we need to over-estimate using ceil, to ensure that the last split is not /too/ big final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits); int pathsUsed = 0; int splitnum = 0; CombineFileSplit mfs; // for each split except the last one (which may be smaller than numberOfFilesPerSplit) while (pathsUsed < numPaths) { /* caclulate split size for this task - usually 
numberOfFilesPerSplit, but * less than this for the last split */ final int splitSizeForThisSplit = numberOfFilesPerSplit + pathsUsed > numPaths ? numPaths - pathsUsed : numberOfFilesPerSplit; // arrays of information for split Path[] splitPaths = new Path[splitSizeForThisSplit]; long[] splitLengths = new long[splitSizeForThisSplit]; long[] splitStarts = new long[splitSizeForThisSplit]; final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>(); String[] splitLocations = null; // final recommended locations for this split. for (int i = 0; i < splitSizeForThisSplit; i++) { locations[pathsUsed + i].forEachEntry( new TObjectLongProcedure<String>() { public boolean execute(String a, long b) { allLocationsForSplit.adjustOrPutValue(a, b, b); return true; } }); if (allLocationsForSplit.size() <= 3) { splitLocations = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); } else { String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]); Arrays.sort( hosts, new Comparator<String>() { public int compare(String o1, String o2) { long diffamount = allLocationsForSplit.get(o1) - allLocationsForSplit.get(o2); if (diffamount > 0) { return -1; } else if (diffamount < 0) { return 1; } return 0; } }); splitLocations = new String[3]; System.arraycopy(hosts, 0, splitLocations, 0, 3); } } // copy information for this split System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit); System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit); // count the number of paths consumed pathsUsed += splitSizeForThisSplit; // make the actual split object // logger.info("New split of size " + splitSizeForThisSplit); mfs = new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations); splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum)); splitnum++; } if (!(pathsUsed == paths.length)) { throw new IOException("Number of used paths does not equal total available 
paths!"); } return splits.toArray(new PositionAwareSplit[splits.size()]); }