Ejemplo n.º 1
0
 /**
  * Counts the blocks of the given files and adds the counts to the running totals kept for
  * {@code host}.
  *
  * <p>Every block location reported for a file adds one to the total count. A block adds one to
  * the local count when at least one of the hosts listed for it equals {@code host}.
  *
  * @param fs used to obtain the file system responsible for each file path
  * @param host host name compared against each block's hosts; also the key updated in both maps
  * @param files paths of the files whose blocks are counted
  * @param totalBlocks running count of all blocks, keyed by host; the entry for {@code host} is
  *     increased by the number of blocks seen
  * @param localBlocks running count of blocks that list the host, keyed by host; the entry for
  *     {@code host} is increased by the number of matching blocks
  * @throws Exception propagated from the file status and block location lookups
  */
 private void addBlocks(
     VolumeManager fs,
     String host,
     ArrayList<String> files,
     Map<String, Long> totalBlocks,
     Map<String, Long> localBlocks)
     throws Exception {
   long allBlocks = 0;
   long matchingBlocks = 0;
   if (!totalBlocks.containsKey(host)) {
     totalBlocks.put(host, 0L);
     localBlocks.put(host, 0L);
   } else if (!localBlocks.containsKey(host)) {
     // The maps may be out of step (host known to totalBlocks only). Seed the missing entry so
     // the unboxing of localBlocks.get(host) below cannot throw a NullPointerException.
     localBlocks.put(host, 0L);
   }
   for (String file : files) {
     Path filePath = new Path(file);
     FileSystem ns = fs.getFileSystemByPath(filePath);
     FileStatus fileStatus = ns.getFileStatus(filePath);
     // Ask for the locations of every block in the file: range [0, file length).
     BlockLocation[] fileBlockLocations =
         ns.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
     for (BlockLocation blockLocation : fileBlockLocations) {
       allBlocks++;
       for (String location : blockLocation.getHosts()) {
         HostAndPort hap = HostAndPort.fromParts(location, 0);
         if (hap.getHostText().equals(host)) {
           matchingBlocks++;
           // Count each block at most once, however many of its hosts match.
           break;
         }
       }
     }
   }
   totalBlocks.put(host, allBlocks + totalBlocks.get(host));
   localBlocks.put(host, matchingBlocks + localBlocks.get(host));
 }
Ejemplo n.º 2
0
 /**
  * Creates a fragment from a block location.
  *
  * <p>Delegates to the six-argument constructor, passing the offset, length and hosts read from
  * {@code blockLocation} and {@code null} as the final argument.
  * NOTE(review): the meaning of the final {@code null} argument is defined by the delegate
  * constructor, which is not visible here - confirm against its declaration.
  *
  * @param tableName passed through unchanged to the delegate constructor
  * @param uri passed through unchanged to the delegate constructor
  * @param blockLocation supplies the offset, length and hosts for the fragment
  * @throws IOException declared by this constructor; presumably raised by
  *     {@code blockLocation.getHosts()} or the delegate constructor - confirm
  */
 public FileFragment(String tableName, Path uri, BlockLocation blockLocation) throws IOException {
   this(
       tableName,
       uri,
       blockLocation.getOffset(),
       blockLocation.getLength(),
       blockLocation.getHosts(),
       null);
 }
  /**
   * Splits the input collection into sets of files where each Map task gets about the same number
   * of files.
   *
   * <p>Files are assigned to splits in input-path order, {@code ceil(paths / numSplits)} files
   * per split, so the number of splits returned can be smaller than the number requested. For
   * each split, at most three hosts are recommended as locations: those holding the most bytes
   * of the split's files, as reported by the file system's block locations.
   *
   * @param job job configuration from which the input paths and the file system are obtained
   * @param numSplits requested number of splits; values above the number of input paths are
   *     reduced to it and values below 1 are raised to 1
   * @return the splits, each wrapped in a {@code PositionAwareSplit} carrying its index, or an
   *     empty array when the job has no input paths
   * @throws IOException propagated from the file system lookups, or thrown if not every path
   *     ended up in a split
   */
  @SuppressWarnings("unchecked")
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    Path[] paths = FileInputFormat.getInputPaths(job);
    // HADOOP-1818: Manage splits only if there are paths
    if (paths.length == 0) {
      return new InputSplit[0];
    }

    if (numSplits > paths.length) {
      numSplits = paths.length;
    } else if (numSplits < 1) {
      numSplits = 1;
    }
    logger.info("Allocating " + paths.length + " files across " + numSplits + " map tasks");
    List<PositionAwareSplit<CombineFileSplit>> splits =
        new ArrayList<PositionAwareSplit<CombineFileSplit>>(numSplits);
    final int numPaths = paths.length;
    long[] lengths = new long[numPaths];
    // Arrays of a generic type cannot be created directly, hence the reflective allocation and
    // the unchecked cast.
    TObjectLongHashMap<String>[] locations =
        (TObjectLongHashMap<String>[]) Array.newInstance(TObjectLongHashMap.class, numPaths);
    final FileSystem fs = FileSystem.get(job);

    // For every file, record how many bytes of it each host holds.
    for (int i = 0; i < numPaths; i++) {
      final FileStatus fss = fs.getFileStatus(paths[i]);
      lengths[i] = fss.getLen();
      final TObjectLongHashMap<String> location2size = new TObjectLongHashMap<String>();
      locations[i] = location2size;
      final long normalblocksize = fss.getBlockSize();
      // A non-positive block size would never advance the offset; treat the file as one block.
      final long step = normalblocksize > 0 ? normalblocksize : lengths[i];
      for (long offset = 0; offset < lengths[i]; offset += step) {
        // Length of this block: a full block, or whatever remains of the file for the last one.
        // (This must be a length, not an end offset: it is both the range length passed to
        // getFileBlockLocations and the weight credited to each host.)
        final long blocksize = Math.min(step, lengths[i] - offset);
        final BlockLocation[] blockLocations = fs.getFileBlockLocations(fss, offset, blocksize);
        for (BlockLocation bl : blockLocations) {
          for (String host : bl.getHosts()) {
            location2size.adjustOrPutValue(host, blocksize, blocksize);
          }
        }
      }
    }

    // we need to over-estimate using ceil, to ensure that the last split is not /too/ big
    final int numberOfFilesPerSplit = (int) Math.ceil((double) paths.length / (double) numSplits);
    // maximum number of hosts recommended per split
    final int maxLocations = 3;

    int pathsUsed = 0;
    int splitnum = 0;
    // one iteration per split; the last split may hold fewer than numberOfFilesPerSplit files
    while (pathsUsed < numPaths) {
      /* calculate split size for this task - usually numberOfFilesPerSplit, but
       * less than this for the last split */
      final int splitSizeForThisSplit = Math.min(numberOfFilesPerSplit, numPaths - pathsUsed);
      // arrays of information for split
      Path[] splitPaths = new Path[splitSizeForThisSplit];
      long[] splitLengths = new long[splitSizeForThisSplit];
      // every file is read from its beginning, so all start offsets stay 0
      long[] splitStarts = new long[splitSizeForThisSplit];

      // sum the per-host byte counts over all files of this split
      final TObjectLongHashMap<String> allLocationsForSplit = new TObjectLongHashMap<String>();
      for (int i = 0; i < splitSizeForThisSplit; i++) {
        locations[pathsUsed + i].forEachEntry(
            new TObjectLongProcedure<String>() {
              public boolean execute(String a, long b) {
                allLocationsForSplit.adjustOrPutValue(a, b, b);
                return true;
              }
            });
      }

      // final recommended locations for this split: all hosts if there are at most
      // maxLocations of them, otherwise the maxLocations hosts holding the most bytes
      final String[] hosts = allLocationsForSplit.keys(new String[allLocationsForSplit.size()]);
      final String[] splitLocations;
      if (hosts.length <= maxLocations) {
        splitLocations = hosts;
      } else {
        Arrays.sort(
            hosts,
            new Comparator<String>() {
              public int compare(String o1, String o2) {
                // descending by number of bytes held
                final long size1 = allLocationsForSplit.get(o1);
                final long size2 = allLocationsForSplit.get(o2);
                if (size1 > size2) {
                  return -1;
                } else if (size1 < size2) {
                  return 1;
                }
                return 0;
              }
            });
        splitLocations = new String[maxLocations];
        System.arraycopy(hosts, 0, splitLocations, 0, maxLocations);
      }

      // copy information for this split
      System.arraycopy(lengths, pathsUsed, splitLengths, 0, splitSizeForThisSplit);
      System.arraycopy(paths, pathsUsed, splitPaths, 0, splitSizeForThisSplit);
      // count the number of paths consumed
      pathsUsed += splitSizeForThisSplit;

      // make the actual split object
      final CombineFileSplit mfs =
          new CombineFileSplit(job, splitPaths, splitStarts, splitLengths, splitLocations);
      splits.add(new PositionAwareSplit<CombineFileSplit>(mfs, splitnum));
      splitnum++;
    }

    // sanity check: every input path must have been placed in exactly one split
    if (pathsUsed != paths.length) {
      throw new IOException("Number of used paths does not equal total available paths!");
    }
    return splits.toArray(new PositionAwareSplit[splits.size()]);
  }