Example #1
0
 /**
  * Creates a new pool from the given filters and registers it. A pathname belongs to the pool
  * if any one of the filters accepts it; a split never mixes files from different pools.
  */
 protected void createPool(PathFilter... filters) {
   MultiPathFilter pool = new MultiPathFilter();
   for (int i = 0; i < filters.length; i++) {
     pool.add(filters[i]);
   }
   pools.add(pool);
 }
Example #2
0
  /**
   * Computes the input splits for the job, grouping files so that each split contains files
   * from at most one pool.
   *
   * <p>Split sizing: values set programmatically via the set*SplitSize() methods take
   * precedence; otherwise the corresponding configuration keys are consulted. A value of 0
   * means "not set". The three sizes are validated for mutual consistency before any splits
   * are built.
   *
   * @param job the job context supplying the configuration and input paths
   * @return the list of generated splits (empty if there are no input paths)
   * @throws IOException if the split-size settings are inconsistent (a minimum exceeding the
   *     maximum, or the per-node minimum exceeding the per-rack minimum), or if listing the
   *     input fails
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setxxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
      minSizeNode = minSplitSizeNode;
    } else {
      minSizeNode = conf.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
      minSizeRack = minSplitSizeRack;
    } else {
      minSizeRack = conf.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
      maxSize = maxSplitSize;
    } else {
      maxSize = conf.getLong("mapred.max.split.size", 0);
    }
    // Validate: each configured minimum must not exceed the configured maximum,
    // and the per-node minimum must not exceed the per-rack minimum.
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
      throw new IOException(
          "Minimum split size per node "
              + minSizeNode
              + " cannot be larger than maximum split size "
              + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
      throw new IOException(
          "Minimum split size per rack "
              + minSizeRack
              + " cannot be larger than maximum split size "
              + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
      throw new IOException(
          "Minimum split size per node "
              + minSizeNode
              + " cannot be smaller than minimum split "
              + "size per rack "
              + minSizeRack);
    }

    // all the files in the input set
    Path[] paths = FileUtil.stat2Paths(listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
      return splits;
    }

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
      ArrayList<Path> myPaths = new ArrayList<Path>();

      // pick one input path. If it matches all the filters in a pool,
      // add it to the output set
      for (int i = 0; i < paths.length; i++) {
        if (paths[i] == null) { // already processed
          continue;
        }
        // Match against the path portion only (scheme/authority stripped).
        Path p = new Path(paths[i].toUri().getPath());
        if (onepool.accept(p)) {
          myPaths.add(paths[i]); // add it to my output set
          paths[i] = null; // mark as processed so later pools skip it
        }
      }
      // create splits for all files in this pool.
      getMoreSplits(conf, myPaths.toArray(new Path[0]), maxSize, minSizeNode, minSizeRack, splits);
    }

    // Finally, process all paths that do not belong to any pool.
    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (int i = 0; i < paths.length; i++) {
      if (paths[i] == null) { // already processed
        continue;
      }
      myPaths.add(paths[i]);
    }
    // create splits for all files that are not in any pool.
    getMoreSplits(conf, myPaths.toArray(new Path[0]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
  }