private void mergeParts() throws IOException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  Path[] filename = new Path[numSpills];
  Path[] indexFileName = new Path[numSpills];
  FileSystem localFs = FileSystem.getLocal(job);

  for (int i = 0; i < numSpills; i++) {
    filename[i] = mapOutputFile.getSpillFile(getTaskID(), i);
    indexFileName[i] = mapOutputFile.getSpillIndexFile(getTaskID(), i);
    finalOutFileSize += localFs.getFileStatus(filename[i]).getLen();
  }

  if (numSpills == 1) {
    // the spill is the final output: no merge needed, just rename in place
    localFs.rename(filename[0],
        new Path(filename[0].getParent(), "file.out"));
    localFs.rename(indexFileName[0],
        new Path(indexFileName[0].getParent(), "file.out.index"));
    return;
  }

  // make correction in the length to include the sequence file header
  // lengths for each partition
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
  Path finalOutputFile =
      mapOutputFile.getOutputFileForWrite(getTaskID(), finalOutFileSize);
  Path finalIndexFile =
      mapOutputFile.getOutputIndexFileForWrite(getTaskID(), finalIndexFileSize);

  // The output stream for the final single output file
  FSDataOutputStream finalOut = localFs.create(finalOutputFile, true, 4096);
  // The final index file output stream
  FSDataOutputStream finalIndexOut = localFs.create(finalIndexFile, true, 4096);

  if (numSpills == 0) {
    // no map output at all: create dummy files with empty segments
    for (int i = 0; i < partitions; i++) {
      long segmentStart = finalOut.getPos();
      Writer<K, V> writer =
          new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
      writer.close();
      writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
    }
    finalOut.close();
    finalIndexOut.close();
    return;
  }
  {
    for (int parts = 0; parts < partitions; parts++) {
      // create the segments to be merged
      List<Segment<K, V>> segmentList =
          new ArrayList<Segment<K, V>>(numSpills);
      for (int i = 0; i < numSpills; i++) {
        FSDataInputStream indexIn = localFs.open(indexFileName[i]);
        indexIn.seek(parts * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        long segmentOffset = indexIn.readLong();
        long rawSegmentLength = indexIn.readLong();
        long segmentLength = indexIn.readLong();
        indexIn.close();
        Segment<K, V> s =
            new Segment<K, V>(job, localFs, filename[i], segmentOffset,
                              segmentLength, codec, true);
        segmentList.add(i, s);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Index: (" + indexFileName[i] + ", " + segmentOffset +
                    ", " + rawSegmentLength + ", " + segmentLength + ")");
        }
      }

      // merge
      @SuppressWarnings("unchecked")
      RawKeyValueIterator kvIter =
          Merger.merge(job, localFs, keyClass, valClass, segmentList,
                       job.getInt("io.sort.factor", 100),
                       new Path(getTaskID().toString()),
                       job.getOutputKeyComparator(), reporter);

      // write merged output to disk
      long segmentStart = finalOut.getPos();
      Writer<K, V> writer =
          new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
      if (null == combinerClass || job.getCombineOnceOnly() ||
          numSpills < minSpillsForCombine) {
        Merger.writeFile(kvIter, writer, reporter, job);
      } else {
        combineCollector.setWriter(writer);
        combineAndSpill(kvIter, combineInputCounter);
      }

      // close
      writer.close();

      // write index record
      writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
    }
    finalOut.close();
    finalIndexOut.close();

    // cleanup: the per-spill files have been merged and are no longer needed
    for (int i = 0; i < numSpills; i++) {
      localFs.delete(filename[i], true);
      localFs.delete(indexFileName[i], true);
    }
  }
}
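/*
 * A minimal sketch (not part of the Hadoop source above) of the spill-index
 * record layout that mergeParts() relies on. Each partition gets one
 * fixed-width record of three longs -- segment offset, raw (uncompressed)
 * length, and on-disk segment length -- which is why the merge loop can seek
 * directly to parts * MAP_OUTPUT_INDEX_RECORD_LENGTH and issue three
 * readLong() calls. The constant value of 24 (3 x 8 bytes) is inferred from
 * that access pattern; it is not shown in this excerpt.
 */
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

class SpillIndexRecordSketch {
  static final int MAP_OUTPUT_INDEX_RECORD_LENGTH = 3 * 8; // 24 bytes/partition

  // Mirrors what writeIndexRecord() presumably appends for one partition.
  static void write(DataOutput out, long segmentOffset, long rawLength,
                    long segmentLength) throws IOException {
    out.writeLong(segmentOffset); // where the partition's segment starts
    out.writeLong(rawLength);     // uncompressed length of the segment
    out.writeLong(segmentLength); // on-disk (possibly compressed) length
  }

  // Mirrors the three readLong() calls in the merge loop above.
  static long[] read(DataInput in) throws IOException {
    return new long[] { in.readLong(), in.readLong(), in.readLong() };
  }
}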
@SuppressWarnings("unchecked")
public MapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job,
                       Reporter reporter) throws IOException {
  this.job = job;
  this.reporter = reporter;
  localFs = FileSystem.getLocal(job);
  partitions = job.getNumReduceTasks();
  partitioner =
      (Partitioner) ReflectionUtils.newInstance(job.getPartitionerClass(), job);

  // sanity checks
  final float spillper = job.getFloat("io.sort.spill.percent", (float) 0.8);
  final float recper = job.getFloat("io.sort.record.percent", (float) 0.05);
  final int sortmb = job.getInt("io.sort.mb", 100);
  if (spillper > (float) 1.0 || spillper < (float) 0.0) {
    throw new IOException("Invalid \"io.sort.spill.percent\": " + spillper);
  }
  if (recper > (float) 1.0 || recper < (float) 0.01) {
    throw new IOException("Invalid \"io.sort.record.percent\": " + recper);
  }
  // sortmb must fit in 11 bits (i.e. be < 2048) so sortmb << 20 cannot
  // overflow an int
  if ((sortmb & 0x7FF) != sortmb) {
    throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
  }
  sorter = (IndexedSorter) ReflectionUtils.newInstance(
      job.getClass("map.sort.class", QuickSort.class), job);
  LOG.info("io.sort.mb = " + sortmb);

  // buffers and accounting
  int maxMemUsage = sortmb << 20;
  int recordCapacity = (int) (maxMemUsage * recper);
  recordCapacity -= recordCapacity % RECSIZE; // round down to whole records
  kvbuffer = new byte[maxMemUsage - recordCapacity];
  bufvoid = kvbuffer.length;
  recordCapacity /= RECSIZE;
  kvoffsets = new int[recordCapacity];
  kvindices = new int[recordCapacity * ACCTSIZE];
  softBufferLimit = (int) (kvbuffer.length * spillper);
  softRecordLimit = (int) (kvoffsets.length * spillper);
  LOG.info("data buffer = " + softBufferLimit + "/" + kvbuffer.length);
  LOG.info("record buffer = " + softRecordLimit + "/" + kvoffsets.length);

  // k/v serialization
  comparator = job.getOutputKeyComparator();
  keyClass = (Class<K>) job.getMapOutputKeyClass();
  valClass = (Class<V>) job.getMapOutputValueClass();
  serializationFactory = new SerializationFactory(job);
  keySerializer = serializationFactory.getSerializer(keyClass);
  keySerializer.open(bb);
  valSerializer = serializationFactory.getSerializer(valClass);
  valSerializer.open(bb);

  // counters
  Counters counters = getCounters();
  mapOutputByteCounter = counters.findCounter(MAP_OUTPUT_BYTES);
  mapOutputRecordCounter = counters.findCounter(MAP_OUTPUT_RECORDS);
  combineInputCounter = counters.findCounter(COMBINE_INPUT_RECORDS);
  combineOutputCounter = counters.findCounter(COMBINE_OUTPUT_RECORDS);

  // compression
  if (job.getCompressMapOutput()) {
    Class<? extends CompressionCodec> codecClass =
        job.getMapOutputCompressorClass(DefaultCodec.class);
    codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job);
  }

  // combiner
  combinerClass = job.getCombinerClass();
  combineCollector = (null != combinerClass)
      ? new CombineOutputCollector(combineOutputCounter)
      : null;
  minSpillsForCombine = job.getInt("min.num.spills.for.combine", 3);
}
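/*
 * A worked sketch of the buffer-sizing arithmetic in the constructor above.
 * It assumes Hadoop's accounting constants ACCTSIZE = 3 and
 * RECSIZE = (ACCTSIZE + 1) * 4 = 16 bytes per record, which are not shown in
 * this excerpt. With the defaults io.sort.mb = 100, io.sort.record.percent =
 * 0.05, and io.sort.spill.percent = 0.8, roughly 5 MB of the 100 MB sort
 * space goes to record accounting and 95 MB to raw key/value bytes.
 */
public class SortBufferMathSketch {
  static final int ACCTSIZE = 3;                 // ints per record in kvindices
  static final int RECSIZE = (ACCTSIZE + 1) * 4; // accounting bytes per record

  public static void main(String[] args) {
    int sortmb = 100;      // io.sort.mb
    float recper = 0.05f;  // io.sort.record.percent
    float spillper = 0.8f; // io.sort.spill.percent

    int maxMemUsage = sortmb << 20;                    // 104857600 bytes
    int recordCapacity = (int) (maxMemUsage * recper); // 5242880 bytes
    recordCapacity -= recordCapacity % RECSIZE;        // align to RECSIZE
    int dataBufferLen = maxMemUsage - recordCapacity;  // 99614720 bytes
    recordCapacity /= RECSIZE;                         // 327680 records

    // Matches the two LOG.info lines above: a spill starts once either soft
    // limit is crossed, so spilling can overlap continued collection.
    System.out.println("data buffer = " + (int) (dataBufferLen * spillper)
        + "/" + dataBufferLen);    // 79691776/99614720
    System.out.println("record buffer = " + (int) (recordCapacity * spillper)
        + "/" + recordCapacity);   // 262144/327680
  }
}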