public void initialize(TezOutputContext outputContext, Configuration conf, int numOutputs) throws IOException { this.outputContext = outputContext; this.conf = conf; this.partitions = numOutputs; rfs = ((LocalFileSystem) FileSystem.getLocal(this.conf)).getRaw(); // sorter sorter = ReflectionUtils.newInstance( this.conf.getClass( TezJobConfig.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, QuickSort.class, IndexedSorter.class), this.conf); comparator = ConfigUtils.getIntermediateOutputKeyComparator(this.conf); // k/v serialization keyClass = ConfigUtils.getIntermediateOutputKeyClass(this.conf); valClass = ConfigUtils.getIntermediateOutputValueClass(this.conf); serializationFactory = new SerializationFactory(this.conf); keySerializer = serializationFactory.getSerializer(keyClass); valSerializer = serializationFactory.getSerializer(valClass); // counters mapOutputByteCounter = outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_BYTES); mapOutputRecordCounter = outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_RECORDS); fileOutputByteCounter = outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES); spilledRecordsCounter = outputContext.getCounters().findCounter(TaskCounter.SPILLED_RECORDS); // compression if (ConfigUtils.shouldCompressIntermediateOutput(this.conf)) { Class<? extends CompressionCodec> codecClass = ConfigUtils.getIntermediateOutputCompressorClass(this.conf, DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, this.conf); } else { codec = null; } // Task outputs mapOutputFile = TezRuntimeUtils.instantiateTaskOutputManager(conf, outputContext); LOG.info( "Instantiating Partitioner: [" + conf.get(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS) + "]"); this.conf.setInt(TezJobConfig.TEZ_RUNTIME_NUM_EXPECTED_PARTITIONS, this.partitions); this.partitioner = TezRuntimeUtils.instantiatePartitioner(this.conf); this.combiner = TezRuntimeUtils.instantiateCombiner(this.conf, outputContext); }
private void mergeParts() throws IOException { // get the approximate size of the final output/index files long finalOutFileSize = 0; long finalIndexFileSize = 0; final Path[] filename = new Path[numSpills]; final String taskIdentifier = outputContext.getUniqueIdentifier(); for (int i = 0; i < numSpills; i++) { filename[i] = mapOutputFile.getSpillFile(i); finalOutFileSize += rfs.getFileStatus(filename[i]).getLen(); } if (numSpills == 1) { // the spill is the final output sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0])); if (indexCacheList.size() == 0) { sameVolRename( mapOutputFile.getSpillIndexFile(0), mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0])); } else { indexCacheList .get(0) .writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), conf); } return; } // read in paged indices for (int i = indexCacheList.size(); i < numSpills; ++i) { Path indexFileName = mapOutputFile.getSpillIndexFile(i); indexCacheList.add(new TezSpillRecord(indexFileName, conf)); } // make correction in the length to include the sequence file header // lengths for each partition finalOutFileSize += partitions * APPROX_HEADER_LENGTH; finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH; Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize); Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize); // The output stream for the final single output file FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096); if (numSpills == 0) { // TODO Change event generation to say there is no data rather than generating a dummy file // create dummy files TezSpillRecord sr = new TezSpillRecord(partitions); try { for (int i = 0; i < partitions; i++) { long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null); writer.close(); TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); // Covers the case of multiple spills. outputBytesWithOverheadCounter.increment(writer.getRawLength()); sr.putIndex(rec, i); } sr.writeToFile(finalIndexFile, conf); } finally { finalOut.close(); } return; } else { final TezSpillRecord spillRec = new TezSpillRecord(partitions); for (int parts = 0; parts < partitions; parts++) { // create the segments to be merged List<Segment> segmentList = new ArrayList<Segment>(numSpills); for (int i = 0; i < numSpills; i++) { TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts); Segment s = new Segment( conf, rfs, filename[i], indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true); segmentList.add(i, s); if (LOG.isDebugEnabled()) { LOG.debug( "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "(" + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")"); } } int mergeFactor = this.conf.getInt( TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT); // sort the segments only if there are intermediate merges boolean sortSegments = segmentList.size() > mergeFactor; // merge TezRawKeyValueIterator kvIter = TezMerger.merge( conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(taskIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null); // Not using any Progress in TezMerger. Should just work. // write merged output to disk long segmentStart = finalOut.getPos(); Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null); if (combiner == null || numSpills < minSpillsForCombine) { TezMerger.writeFile( kvIter, writer, nullProgressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT); } else { runCombineProcessor(kvIter, writer); } writer.close(); // record offsets final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()); spillRec.putIndex(rec, parts); } spillRec.writeToFile(finalIndexFile, conf); finalOut.close(); for (int i = 0; i < numSpills; i++) { rfs.delete(filename[i], true); } } }
public Fetcher( Configuration job, ShuffleScheduler scheduler, MergeManager merger, ShuffleClientMetrics metrics, Shuffle shuffle, SecretKey jobTokenSecret, TezInputContext inputContext) throws IOException { this.job = job; this.scheduler = scheduler; this.merger = merger; this.metrics = metrics; this.shuffle = shuffle; this.id = ++nextId; this.jobTokenSecret = jobTokenSecret; ioErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.IO_ERROR.toString()); wrongLengthErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_LENGTH.toString()); badIdErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.BAD_ID.toString()); wrongMapErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_MAP.toString()); connectionErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.CONNECTION.toString()); wrongReduceErrs = inputContext .getCounters() .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_REDUCE.toString()); if (ConfigUtils.isIntermediateInputCompressed(job)) { Class<? extends CompressionCodec> codecClass = ConfigUtils.getIntermediateInputCompressorClass(job, DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, job); decompressor = CodecPool.getDecompressor(codec); } else { codec = null; decompressor = null; } this.connectionTimeout = job.getInt( TezJobConfig.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_STALLED_COPY_TIMEOUT); this.readTimeout = job.getInt( TezJobConfig.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT); setName("fetcher#" + id); setDaemon(true); synchronized (Fetcher.class) { sslShuffle = job.getBoolean( TezJobConfig.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_ENABLE_SSL); if (sslShuffle && sslFactory == null) { sslFactory = new SSLFactory(SSLFactory.Mode.CLIENT, job); try { sslFactory.init(); } catch (Exception ex) { sslFactory.destroy(); throw new RuntimeException(ex); } } } }