@Override
public boolean seekToNewSource(long targetPos) throws IOException {
  // Map the data offset to its position in the checksum file, report the
  // checksum failure, then try to re-read both streams from another source.
  long sumsPos = getChecksumFilePos(targetPos);
  fs.reportChecksumFailure(file, datas, targetPos, sums, sumsPos);
  boolean newDataSource = datas.seekToNewSource(targetPos);
  return sums.seekToNewSource(sumsPos) || newDataSource;
}
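// --- Illustrative sketch (not part of the class above) ---
// A minimal, hypothetical version of the offset math behind getChecksumFilePos(),
// assuming the on-disk layout the snippets below write: a 4-byte CHECKSUM_VERSION
// magic, a 4-byte bytesPerSum field, then one 4-byte CRC-32 per bytesPerSum data
// bytes. The class and method names here are made up for illustration.
public final class ChecksumPosMath {
  private static final int HEADER_LENGTH = 4 + 4; // version magic + bytesPerSum
  private static final int CHECKSUM_SIZE = 4;     // one CRC-32 per data chunk

  static long checksumFilePos(long dataPos, int bytesPerSum) {
    // Chunk index of the data offset, scaled to 4-byte CRC slots, after the header.
    return HEADER_LENGTH + (dataPos / bytesPerSum) * CHECKSUM_SIZE;
  }

  public static void main(String[] args) {
    // With bytesPerSum = 512, data byte 1024 is covered by the third CRC slot.
    System.out.println(checksumFilePos(1024, 512)); // 16 = 8-byte header + 2 * 4
  }
}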
public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file) throws IOException {
  this(fs, file,
      fs.getConf().getInt(
          LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_KEY,
          LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_DEFAULT));
}
public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file, int bufferSize)
    throws IOException {
  super(file, fs.getFileStatus(file).getReplication());
  this.datas = fs.getRawFileSystem().open(file, bufferSize);
  this.fs = fs;
  Path sumFile = fs.getChecksumFile(file);
  try {
    int sumBufferSize = fs.getSumBufferSize(fs.getBytesPerSum(), bufferSize);
    sums = fs.getRawFileSystem().open(sumFile, sumBufferSize);

    // Validate the checksum-file header before trusting its contents.
    byte[] version = new byte[CHECKSUM_VERSION.length];
    sums.readFully(version);
    if (!Arrays.equals(version, CHECKSUM_VERSION)) {
      throw new IOException("Not a checksum file: " + sumFile);
    }
    this.bytesPerSum = sums.readInt();
    set(fs.verifyChecksum, new CRC32(), bytesPerSum, 4); // 4-byte CRC per chunk
  } catch (FileNotFoundException e) { // quietly ignore
    set(fs.verifyChecksum, null, 1, 0);
  } catch (IOException e) { // loudly ignore
    LOG.warn("Problem opening checksum file: " + file
        + ". Ignoring exception: " + StringUtils.stringifyException(e));
    set(fs.verifyChecksum, null, 1, 0);
  }
}
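// --- Illustrative usage sketch ---
// LocalFileSystem extends ChecksumFileSystem, so a plain open()/read() goes
// through ChecksumFSInputChecker and verifies each chunk against the hidden
// ".crc" sibling. The path and buffer handling here are illustrative only.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

public class ReadWithChecksums {
  public static void main(String[] args) throws IOException {
    LocalFileSystem fs = FileSystem.getLocal(new Configuration());
    Path file = new Path("/tmp/example.txt"); // hypothetical path
    try (FSDataInputStream in = fs.open(file)) {
      byte[] buf = new byte[4096];
      while (in.read(buf) > 0) {
        // a ChecksumException surfaces here if a chunk fails verification
      }
    }
  }
}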
public ChecksumFSOutputSummer(ChecksumFileSystem fs, Path file, boolean overwrite,
    int bufferSize, short replication, long blockSize, Progressable progress)
    throws IOException {
  super(new CRC32(), fs.getBytesPerSum(), 4);
  int bytesPerSum = fs.getBytesPerSum();
  this.datas = fs.getRawFileSystem().create(file, overwrite, bufferSize,
      replication, blockSize, progress);
  int sumBufferSize = fs.getSumBufferSize(bytesPerSum, bufferSize);
  this.sums = fs.getRawFileSystem().create(fs.getChecksumFile(file), true,
      sumBufferSize, replication, blockSize);
  // Write the checksum-file header: the version magic followed by bytesPerSum.
  sums.write(CHECKSUM_VERSION, 0, CHECKSUM_VERSION.length);
  sums.writeInt(bytesPerSum);
}
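// --- Illustrative usage sketch ---
// Creating a file through LocalFileSystem (a ChecksumFileSystem) routes writes
// through ChecksumFSOutputSummer, which produces the hidden ".crc" sibling whose
// header the constructor above writes. The path and contents are illustrative.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

public class WriteWithChecksums {
  public static void main(String[] args) throws IOException {
    LocalFileSystem fs = FileSystem.getLocal(new Configuration());
    Path file = new Path("/tmp/example.txt"); // hypothetical path
    try (FSDataOutputStream out = fs.create(file, true)) { // overwrite = true
      out.writeBytes("hello checksums\n"); // CRCs accumulate per bytesPerSum chunk
    }
    System.out.println(fs.getChecksumFile(file)); // e.g. /tmp/.example.txt.crc
  }
}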
/* Return the file length, fetched lazily and cached. */
private long getFileLength() throws IOException {
  if (fileLen == -1L) {
    fileLen = fs.getContentSummary(file).getLength();
  }
  return fileLen;
}
@Override
public void merge(List<CompressAwarePath> inputs) throws IOException {
  // sanity check
  if (inputs == null || inputs.isEmpty()) {
    LOG.info("No ondisk files to merge...");
    return;
  }

  long approxOutputSize = 0;
  int bytesPerSum = jobConf.getInt("io.bytes.per.checksum", 512);
  LOG.info("OnDiskMerger: We have " + inputs.size()
      + " map outputs on disk. Triggering merge...");

  // 1. Prepare the list of files to be merged.
  for (CompressAwarePath file : inputs) {
    approxOutputSize += localFS.getFileStatus(file).getLen();
  }

  // add the checksum length
  approxOutputSize += ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

  // 2. Start the on-disk merge process
  Path outputPath = localDirAllocator
      .getLocalPathForWrite(inputs.get(0).toString(), approxOutputSize, jobConf)
      .suffix(Task.MERGED_OUTPUT_PREFIX);

  Writer<K, V> writer = new Writer<K, V>(jobConf, rfs, outputPath,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(),
      codec, null);

  RawKeyValueIterator iter = null;
  CompressAwarePath compressAwarePath;
  Path tmpDir = new Path(reduceId.toString());
  try {
    iter = Merger.merge(jobConf, rfs,
        (Class<K>) jobConf.getMapOutputKeyClass(),
        (Class<V>) jobConf.getMapOutputValueClass(),
        codec, inputs.toArray(new Path[inputs.size()]),
        true, ioSortFactor, tmpDir,
        (RawComparator<K>) jobConf.getOutputKeyComparator(),
        reporter, spilledRecordsCounter, null,
        mergedMapOutputsCounter, null);

    Merger.writeFile(iter, writer, reporter, jobConf);
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());
  } catch (IOException e) {
    // Clean up the partial output before propagating the failure.
    localFS.delete(outputPath, true);
    throw e;
  }

  closeOnDiskFile(compressAwarePath);

  LOG.info(reduceId + " Finished merging " + inputs.size()
      + " map output files on disk of total-size " + approxOutputSize + "."
      + " Local output file is " + outputPath + " of size "
      + localFS.getFileStatus(outputPath).getLen());
}
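// --- Illustrative sketch (not the Hadoop source) ---
// The checksum-length arithmetic the size estimate above relies on, written out
// as a standalone helper under the same layout assumption as earlier: one 4-byte
// CRC per bytesPerSum data bytes, plus an 8-byte header. Names are hypothetical.
public final class ChecksumLengthMath {
  static long checksumLength(long dataSize, int bytesPerSum) {
    // ceil(dataSize / bytesPerSum) CRC entries of 4 bytes each, after the header.
    return ((dataSize + bytesPerSum - 1) / bytesPerSum) * 4L + 8L;
  }

  public static void main(String[] args) {
    // A 1 MiB merge output with 512-byte chunks needs 2048 CRCs: 8200 bytes.
    System.out.println(checksumLength(1L << 20, 512)); // 8200
  }
}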
RawKeyValueIterator merge(Class<K> keyClass, Class<V> valueClass,
    int factor, int inMem, Path tmpDir) throws IOException {
  LOG.info("Merging " + segments.size() + " sorted segments");

  // create the MergeStreams from the sorted map created in the constructor
  // and dump the final output to a file
  int numSegments = segments.size();
  int origFactor = factor;
  int passNo = 1;
  do {
    // get the factor for this pass of merge. We assume in-memory segments
    // are the first entries in the segment list and that the pass factor
    // doesn't apply to them
    factor = getPassFactor(factor, passNo, numSegments - inMem);
    if (1 == passNo) {
      factor += inMem;
    }
    List<Segment<K, V>> segmentsToMerge = new ArrayList<Segment<K, V>>();
    int segmentsConsidered = 0;
    int numSegmentsToConsider = factor;
    while (true) {
      // extract the smallest 'factor' number of segments
      // Call cleanup on the empty segments (no key/value data)
      List<Segment<K, V>> mStream = getSegmentDescriptors(numSegmentsToConsider);
      for (Segment<K, V> segment : mStream) {
        // Initialize the segment at the last possible moment;
        // this helps in ensuring we don't use buffers until we need them
        segment.init();
        long startPos = segment.getPosition();
        boolean hasNext = segment.next();
        long endPos = segment.getPosition();
        totalBytesProcessed += endPos - startPos;
        mergeProgress.set(totalBytesProcessed * progPerByte);
        if (hasNext) {
          segmentsToMerge.add(segment);
          segmentsConsidered++;
        } else {
          segment.close();
          numSegments--; // we ignore this segment for the merge
        }
      }
      // if we have the desired number of segments
      // or looked at all available segments, we break
      if (segmentsConsidered == factor || segments.size() == 0) {
        break;
      }
      numSegmentsToConsider = factor - segmentsConsidered;
    }

    // feed the streams to the priority queue
    initialize(segmentsToMerge.size());
    clear();
    for (Segment<K, V> segment : segmentsToMerge) {
      put(segment);
    }

    // if we have fewer segments remaining than the factor, just return the
    // iterator; else do another single-level merge
    if (numSegments <= factor) {
      // calculate the length of the remaining segments. Required for
      // calculating the merge progress
      long totalBytes = 0;
      for (int i = 0; i < segmentsToMerge.size(); i++) {
        totalBytes += segmentsToMerge.get(i).getLength();
      }
      if (totalBytes != 0) { // being paranoid
        progPerByte = 1.0f / (float) totalBytes;
        mergeProgress.set(totalBytesProcessed * progPerByte);
      } else {
        mergeProgress.set(1.0f); // Last pass and no segments left - we're done
      }
      LOG.info("Down to the last merge-pass, with " + numSegments
          + " segments left of total size: " + totalBytes + " bytes");
      return this;
    } else {
      LOG.info("Merging " + segmentsToMerge.size()
          + " intermediate segments out of a total of "
          + (segments.size() + segmentsToMerge.size()));

      // we want to spread the creation of temp files on multiple disks if
      // available under the space constraints
      long approxOutputSize = 0;
      for (Segment<K, V> s : segmentsToMerge) {
        approxOutputSize += s.getLength()
            + ChecksumFileSystem.getApproxChkSumLength(s.getLength());
      }
      Path tmpFilename = new Path(tmpDir, "intermediate").suffix("." + passNo);

      Path outputFile = lDirAlloc.getLocalPathForWrite(
          tmpFilename.toString(), approxOutputSize, conf);

      Writer<K, V> writer =
          new Writer<K, V>(conf, fs, outputFile, keyClass, valueClass, codec);
      writeFile(this, writer, reporter);
      writer.close();

      // we finished one single level merge; now clean up the priority queue
      this.close();

      // Add the newly created segment to the list of segments to be merged
      Segment<K, V> tempSegment =
          new Segment<K, V>(conf, fs, outputFile, codec, false);
      segments.add(tempSegment);
      numSegments = segments.size();
      Collections.sort(segments, segmentComparator);

      passNo++;
    }
    // we are worried about only the first pass merge factor. So reset the
    // factor to what it originally was
    factor = origFactor;
  } while (true);
}
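// --- Illustrative sketch (not the Merger source) ---
// The first-pass factor computation the loop above depends on, per the classic
// multi-pass merge argument: shrink only the first pass so that (numSegments - 1)
// becomes divisible by (factor - 1), letting every later pass be a full
// factor-way merge and minimizing the number of passes. Names are hypothetical.
public final class PassFactorMath {
  static int passFactor(int factor, int passNo, int numSegments) {
    // Later passes, small inputs, and factor == 1 always use the full factor.
    if (passNo > 1 || numSegments <= factor || factor == 1) {
      return factor;
    }
    int mod = (numSegments - 1) % (factor - 1);
    return mod == 0 ? factor : mod + 1;
  }

  public static void main(String[] args) {
    // 23 segments, factor 10: merge 5 first; 19 remain, which two full
    // 10-way passes (19 -> 10 -> 1) finish.
    System.out.println(passFactor(10, 1, 23)); // 5
  }
}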