/** * splits the input files into tasks handled by a single node we have to read the input files to * do this based on a number of items in a sequence */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long filesPerTask = DistBlockFixer.filesPerTask(job.getConfiguration()); Path[] inPaths = getInputPaths(job); List<InputSplit> splits = new ArrayList<InputSplit>(); long fileCounter = 0; for (Path inPath : inPaths) { FileSystem fs = inPath.getFileSystem(job.getConfiguration()); if (!fs.getFileStatus(inPath).isDir()) { throw new IOException(inPath.toString() + " is not a directory"); } FileStatus[] inFiles = fs.listStatus(inPath); for (FileStatus inFileStatus : inFiles) { Path inFile = inFileStatus.getPath(); if (!inFileStatus.isDir() && (inFile.getName().equals(job.getJobName() + IN_FILE_SUFFIX))) { fileCounter++; SequenceFile.Reader inFileReader = new SequenceFile.Reader(fs, inFile, job.getConfiguration()); long startPos = inFileReader.getPosition(); long counter = 0; // create an input split every filesPerTask items in the sequence LongWritable key = new LongWritable(); Text value = new Text(); try { while (inFileReader.next(key, value)) { if (counter % filesPerTask == filesPerTask - 1L) { splits.add( new FileSplit(inFile, startPos, inFileReader.getPosition() - startPos, null)); startPos = inFileReader.getPosition(); } counter++; } // create input split for remaining items if necessary // this includes the case where no splits were created by the loop if (startPos != inFileReader.getPosition()) { splits.add( new FileSplit(inFile, startPos, inFileReader.getPosition() - startPos, null)); } } finally { inFileReader.close(); } } } } LOG.info("created " + splits.size() + " input splits from " + fileCounter + " files"); return splits; }
public DistBlockFixer(Configuration conf) { super(conf); filesPerTask = DistBlockFixer.filesPerTask(getConf()); maxPendingJobs = DistBlockFixer.maxPendingJobs(getConf()); maxFixTimeForFile = DistBlockFixer.maxFixTimeForFile(getConf()); }