private ValuesIterator createEmptyIterator(boolean inMemory) throws IOException, InterruptedException { if (!inMemory) { streamPaths = new Path[0]; // This will return EmptyIterator rawKeyValueIterator = TezMerger.merge( conf, fs, keyClass, valClass, null, false, -1, 1024, streamPaths, false, mergeFactor, tmpDir, comparator, new ProgressReporter(), null, null, null, null); } else { List<TezMerger.Segment> segments = Lists.newLinkedList(); // This will return EmptyIterator rawKeyValueIterator = TezMerger.merge( conf, fs, keyClass, valClass, segments, mergeFactor, tmpDir, comparator, new ProgressReporter(), new GenericCounter("readsCounter", "y"), new GenericCounter("writesCounter", "y1"), new GenericCounter("bytesReadCounter", "y2"), new Progress()); } return new ValuesIterator( rawKeyValueIterator, comparator, keyClass, valClass, conf, (TezCounter) new GenericCounter("inputKeyCounter", "y3"), (TezCounter) new GenericCounter("inputValueCounter", "y4")); }
/** * Create sample data (in memory), with an attached counter and return ValuesIterator * * @param inMemory * @param keyCounter * @param tupleCounter * @return ValuesIterator * @throws IOException */ private ValuesIterator createCountedIterator( boolean inMemory, TezCounter keyCounter, TezCounter tupleCounter) throws IOException, InterruptedException { if (!inMemory) { streamPaths = createFiles(); // Merge all files to get KeyValueIterator rawKeyValueIterator = TezMerger.merge( conf, fs, keyClass, valClass, null, false, -1, 1024, streamPaths, false, mergeFactor, tmpDir, comparator, new ProgressReporter(), null, null, null, null); } else { List<TezMerger.Segment> segments = createInMemStreams(); rawKeyValueIterator = TezMerger.merge( conf, fs, keyClass, valClass, segments, mergeFactor, tmpDir, comparator, new ProgressReporter(), new GenericCounter("readsCounter", "y"), new GenericCounter("writesCounter", "y1"), new GenericCounter("bytesReadCounter", "y2"), new Progress()); } return new ValuesIterator( rawKeyValueIterator, comparator, keyClass, valClass, conf, keyCounter, tupleCounter); }
private void mergeParts() throws IOException { // get the approximate size of the final output/index files long finalOutFileSize = 0; long finalIndexFileSize = 0; final Path[] filename = new Path[numSpills]; final String taskIdentifier = outputContext.getUniqueIdentifier(); for (int i = 0; i < numSpills; i++) { filename[i] = mapOutputFile.getSpillFile(i); finalOutFileSize += rfs.getFileStatus(filename[i]).getLen(); } if (numSpills == 1) { // the spill is the final output sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0])); if (indexCacheList.size() == 0) { sameVolRename( mapOutputFile.getSpillIndexFile(0), mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0])); } else { indexCacheList .get(0) .writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), conf); } return; } // read in paged indices for (int i = indexCacheList.size(); i < numSpills; ++i) { Path indexFileName = mapOutputFile.getSpillIndexFile(i); indexCacheList.add(new TezSpillRecord(indexFileName, conf)); } // make correction in the length to include the sequence file header // lengths for each partition finalOutFileSize += partitions * APPROX_HEADER_LENGTH; finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH; Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize); Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize); // The output stream for the final single output file FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096); if (numSpills == 0) { // TODO Change event generation to say there is no data rather than generating a dummy file // create dummy files TezSpillRecord sr = new TezSpillRecord(partitions); try { for (int i = 0; i < partitions; i++) { long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null); writer.close(); TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); // Covers the case of multiple spills. outputBytesWithOverheadCounter.increment(writer.getRawLength()); sr.putIndex(rec, i); } sr.writeToFile(finalIndexFile, conf); } finally { finalOut.close(); } return; } else { final TezSpillRecord spillRec = new TezSpillRecord(partitions); for (int parts = 0; parts < partitions; parts++) { // create the segments to be merged List<Segment> segmentList = new ArrayList<Segment>(numSpills); for (int i = 0; i < numSpills; i++) { TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts); Segment s = new Segment( conf, rfs, filename[i], indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true); segmentList.add(i, s); if (LOG.isDebugEnabled()) { LOG.debug( "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "(" + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")"); } } int mergeFactor = this.conf.getInt( TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT); // sort the segments only if there are intermediate merges boolean sortSegments = segmentList.size() > mergeFactor; // merge TezRawKeyValueIterator kvIter = TezMerger.merge( conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(taskIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null); // Not using any Progress in TezMerger. Should just work. // write merged output to disk long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null); if (combiner == null || numSpills < minSpillsForCombine) { TezMerger.writeFile( kvIter, writer, nullProgressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT); } else { runCombineProcessor(kvIter, writer); } writer.close(); // record offsets final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); spillRec.putIndex(rec, parts); } spillRec.writeToFile(finalIndexFile, conf); finalOut.close(); for (int i = 0; i < numSpills; i++) { rfs.delete(filename[i], true); } } }