/**
 * Reports how far the reader has advanced through its input split.
 *
 * <p>The value is the reader's current position relative to {@code start}, divided by the
 * length of the byte range {@code [start, end)}, and capped at 1.0. An empty range
 * ({@code end == start}) always reports 0.0.
 *
 * @return 0.0 to 1.0 of the input byte range
 * @throws IOException if the underlying reader cannot report its position
 */
public float getProgress() throws IOException, InterruptedException {
  // Guard against dividing by a zero-length range.
  if (end == start) {
    return 0.0f;
  }
  final double consumed = in.getPosition() - start;
  final double total = end - start;
  final float fraction = (float) (consumed / total);
  // The reader may run past `end`, so cap the result.
  return Math.min(1.0f, fraction);
}
/**
 * Read raw bytes from a SequenceFile.
 *
 * <p>The key is set to the reader position at which the record begins. When a record is
 * available, the value is populated with the raw key length, the raw value size, the number of
 * bytes the reader advanced while consuming the record, and the reader's sync-seen flag.
 *
 * <p>The reader is marked done (and this call returns {@code false}) when the end of file is
 * hit, or when the record started at or beyond {@code end} and a sync marker has been seen.
 *
 * @return {@code true} if the reader has not been marked done by this call
 * @throws IOException if the underlying reader fails
 */
public synchronized boolean nextKeyValue() throws IOException, InterruptedException {
  if (done) {
    return false;
  }

  final long recordStart = in.getPosition();
  key.set(recordStart);
  info.setPosition(recordStart);

  final boolean hasRecord = in.nextRawKey(buffer) != -1;
  if (hasRecord) {
    in.nextRawValue(vbytes);
    value.set(
        buffer.getLength(),
        vbytes.getSize(),
        (int) (in.getPosition() - recordStart),
        in.syncSeen());
  }
  // The key bytes have been measured; clear the buffer for the next record.
  buffer.reset();

  if (!hasRecord) {
    done = true;
  } else {
    // Past the split boundary: stop once a sync marker has been seen.
    done = recordStart >= end && in.syncSeen();
  }
  return !done;
}
private List<InputSplit> getSplits( Configuration configuration, int numSplits, long totalSizeBytes) throws IOException { List<InputSplit> splits = new ArrayList<InputSplit>(numSplits); long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits); CopyListingFileStatus srcFileStatus = new CopyListingFileStatus(); Text srcRelPath = new Text(); long currentSplitSize = 0; long lastSplitStart = 0; long lastPosition = 0; final Path listingFilePath = getListingFilePath(configuration); if (LOG.isDebugEnabled()) { LOG.debug( "Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes); } SequenceFile.Reader reader = null; try { reader = getListingFileReader(configuration); while (reader.next(srcRelPath, srcFileStatus)) { // If adding the current file would cause the bytes per map to exceed // limit. Add the current file to new split if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); lastSplitStart = lastPosition; currentSplitSize = 0; } currentSplitSize += srcFileStatus.getLen(); lastPosition = reader.getPosition(); } if (lastPosition > lastSplitStart) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); } } finally { IOUtils.closeStream(reader); } return splits; }
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Path path = ((FileSplit) split).getPath(); Configuration conf = context.getConfiguration(); FileSystem fs = path.getFileSystem(conf); this.in = new SequenceFile.Reader(fs, path, conf); this.end = ((FileSplit) split).getStart() + split.getLength(); if (((FileSplit) split).getStart() > in.getPosition()) { in.sync(((FileSplit) split).getStart()); // sync to start } this.start = in.getPosition(); vbytes = in.createValueBytes(); done = start >= end; info = InputInfo.getInstance(); info.setSplit((FileSplit) split); System.err.println("input split = " + split); }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { final int srcCount = job.getInt(OP_COUNT_LABEL, -1); final int targetcount = srcCount / numSplits; String srclist = job.get(OP_LIST_LABEL, ""); if (srcCount < 0 || "".equals(srclist)) { throw new RuntimeException( "Invalid metadata: #files(" + srcCount + ") listuri(" + srclist + ")"); } Path srcs = new Path(srclist); FileSystem fs = srcs.getFileSystem(job); List<FileSplit> splits = new ArrayList<FileSplit>(numSplits); Text key = new Text(); PolicyInfo value = new PolicyInfo(); SequenceFile.Reader in = null; long prev = 0L; int count = 0; // count src try { for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value); ) { long curr = in.getPosition(); long delta = curr - prev; if (++count > targetcount) { count = 0; splits.add(new FileSplit(srcs, prev, delta, (String[]) null)); prev = curr; } } } finally { in.close(); } long remaining = fs.getFileStatus(srcs).getLen() - prev; if (remaining != 0) { splits.add(new FileSplit(srcs, prev, remaining, (String[]) null)); } LOG.info( "jobname= " + jobName + " numSplits=" + numSplits + ", splits.size()=" + splits.size()); return splits.toArray(new FileSplit[splits.size()]); }
/** * splits the input files into tasks handled by a single node we have to read the input files to * do this based on a number of items in a sequence */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long filesPerTask = DistBlockFixer.filesPerTask(job.getConfiguration()); Path[] inPaths = getInputPaths(job); List<InputSplit> splits = new ArrayList<InputSplit>(); long fileCounter = 0; for (Path inPath : inPaths) { FileSystem fs = inPath.getFileSystem(job.getConfiguration()); if (!fs.getFileStatus(inPath).isDir()) { throw new IOException(inPath.toString() + " is not a directory"); } FileStatus[] inFiles = fs.listStatus(inPath); for (FileStatus inFileStatus : inFiles) { Path inFile = inFileStatus.getPath(); if (!inFileStatus.isDir() && (inFile.getName().equals(job.getJobName() + IN_FILE_SUFFIX))) { fileCounter++; SequenceFile.Reader inFileReader = new SequenceFile.Reader(fs, inFile, job.getConfiguration()); long startPos = inFileReader.getPosition(); long counter = 0; // create an input split every filesPerTask items in the sequence LongWritable key = new LongWritable(); Text value = new Text(); try { while (inFileReader.next(key, value)) { if (counter % filesPerTask == filesPerTask - 1L) { splits.add( new FileSplit(inFile, startPos, inFileReader.getPosition() - startPos, null)); startPos = inFileReader.getPosition(); } counter++; } // create input split for remaining items if necessary // this includes the case where no splits were created by the loop if (startPos != inFileReader.getPosition()) { splits.add( new FileSplit(inFile, startPos, inFileReader.getPosition() - startPos, null)); } } finally { inFileReader.close(); } } } } LOG.info("created " + splits.size() + " input splits from " + fileCounter + " files"); return splits; }