private void mergeParts() throws IOException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  Path[] filename = new Path[numSpills];
  Path[] indexFileName = new Path[numSpills];
  FileSystem localFs = FileSystem.getLocal(job);

  for (int i = 0; i < numSpills; i++) {
    filename[i] = mapOutputFile.getSpillFile(getTaskID(), i);
    indexFileName[i] = mapOutputFile.getSpillIndexFile(getTaskID(), i);
    finalOutFileSize += localFs.getFileStatus(filename[i]).getLen();
  }

  if (numSpills == 1) {
    // the single spill is already the final output; just rename it
    localFs.rename(filename[0],
        new Path(filename[0].getParent(), "file.out"));
    localFs.rename(indexFileName[0],
        new Path(indexFileName[0].getParent(), "file.out.index"));
    return;
  }

  // make correction in the length to include the sequence file header
  // lengths for each partition
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;

  Path finalOutputFile =
      mapOutputFile.getOutputFileForWrite(getTaskID(), finalOutFileSize);
  Path finalIndexFile =
      mapOutputFile.getOutputIndexFileForWrite(getTaskID(), finalIndexFileSize);

  // The output stream for the final single output file
  FSDataOutputStream finalOut = localFs.create(finalOutputFile, true, 4096);
  // The final index file output stream
  FSDataOutputStream finalIndexOut = localFs.create(finalIndexFile, true, 4096);

  if (numSpills == 0) {
    // no spills at all: create dummy (empty) segments for every partition
    for (int i = 0; i < partitions; i++) {
      long segmentStart = finalOut.getPos();
      Writer<K, V> writer =
          new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
      writer.close();
      writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
    }
    finalOut.close();
    finalIndexOut.close();
    return;
  }

  {
    for (int parts = 0; parts < partitions; parts++) {
      // create the segments to be merged
      List<Segment<K, V>> segmentList =
          new ArrayList<Segment<K, V>>(numSpills);
      for (int i = 0; i < numSpills; i++) {
        // each index record holds three longs for this partition:
        // segment offset, raw (uncompressed) length, compressed length
        FSDataInputStream indexIn = localFs.open(indexFileName[i]);
        indexIn.seek(parts * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        long segmentOffset = indexIn.readLong();
        long rawSegmentLength = indexIn.readLong();
        long segmentLength = indexIn.readLong();
        indexIn.close();

        Segment<K, V> s =
            new Segment<K, V>(job, localFs, filename[i], segmentOffset,
                              segmentLength, codec, true);
        segmentList.add(i, s);

        if (LOG.isDebugEnabled()) {
          LOG.debug("Index: (" + indexFileName[i] + ", " + segmentOffset +
                    ", " + rawSegmentLength + ", " + segmentLength + ")");
        }
      }

      // merge
      @SuppressWarnings("unchecked")
      RawKeyValueIterator kvIter =
          Merger.merge(job, localFs,
                       keyClass, valClass,
                       segmentList, job.getInt("io.sort.factor", 100),
                       new Path(getTaskID().toString()),
                       job.getOutputKeyComparator(), reporter);

      // write merged output to disk
      long segmentStart = finalOut.getPos();
      Writer<K, V> writer =
          new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
      if (null == combinerClass || job.getCombineOnceOnly() ||
          numSpills < minSpillsForCombine) {
        Merger.writeFile(kvIter, writer, reporter, job);
      } else {
        combineCollector.setWriter(writer);
        combineAndSpill(kvIter, combineInputCounter);
      }

      // close
      writer.close();

      // write index record
      writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
    }
    finalOut.close();
    finalIndexOut.close();

    // cleanup: the per-spill data and index files are no longer needed
    for (int i = 0; i < numSpills; i++) {
      localFs.delete(filename[i], true);
      localFs.delete(indexFileName[i], true);
    }
  }
}
/**
 * Handles the degenerate case where serialization fails to fit in the
 * in-memory buffer, so we must spill the record from collect directly
 * to a spill file. Consider this "losing".
 */
@SuppressWarnings("unchecked")
private void spillSingleRecord(final K key, final V value)
    throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  FSDataOutputStream indexOut = null;
  final int partition = partitioner.getPartition(key, value, partitions);
  try {
    // create spill file
    Path filename =
        mapOutputFile.getSpillFileForWrite(getTaskID(), numSpills, size);
    out = localFs.create(filename);
    // create spill index
    Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
        getTaskID(), numSpills,
        partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
    indexOut = localFs.create(indexFilename);
    // we don't run the combiner for a single record
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer(job, out, keyClass, valClass, codec);

        if (i == partition) {
          if (job.getCombineOnceOnly()) {
            Reducer combiner =
                (Reducer) ReflectionUtils.newInstance(combinerClass, job);
            combineCollector.setWriter(writer);
            combiner.reduce(key, new Iterator<V>() {
                private boolean done = false;
                public boolean hasNext() { return !done; }
                public V next() {
                  if (done) {
                    throw new NoSuchElementException();
                  }
                  done = true;
                  return value;
                }
                public void remove() {
                  throw new UnsupportedOperationException();
                }
              }, combineCollector, reporter);
          } else {
            final long recordStart = out.getPos();
            writer.append(key, value);
            // Note that our map byte count will not be accurate with
            // compression
            mapOutputByteCounter.increment(out.getPos() - recordStart);
          }
        }
        writer.close();

        // index record
        writeIndexRecord(indexOut, out, segmentStart, writer);
      } catch (IOException e) {
        if (null != writer) {
          writer.close();
        }
        throw e;
      }
    }
    ++numSpills;
  } finally {
    if (out != null) {
      out.close();
    }
    if (indexOut != null) {
      indexOut.close();
    }
  }
}
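
// Both methods above delegate index bookkeeping to writeIndexRecord(), which
// is not shown in this excerpt. The sketch below is a hedged reconstruction,
// not the actual implementation: it assumes each index record is three longs
// (segment start offset, raw length, compressed on-disk length), which is
// consistent with the three readLong() calls in mergeParts() and with one
// MAP_OUTPUT_INDEX_RECORD_LENGTH-sized record per partition. The
// writer.getRawLength() accessor is an assumption as well.
private void writeIndexRecord(FSDataOutputStream indexOut,
                              FSDataOutputStream out, long start,
                              Writer<K, V> writer) throws IOException {
  // offset of this partition's segment within the data file
  indexOut.writeLong(start);
  // uncompressed length of the segment, as tracked by the IFile writer
  indexOut.writeLong(writer.getRawLength());
  // compressed length actually occupied on disk
  indexOut.writeLong(out.getPos() - start);
}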