コード例 #1
0
 /**
  * Merge zero or more spill files together, choosing the fastest merging strategy based on the
  * number of spills and the IO compression codec.
  *
  * @return the partition lengths in the merged file.
  */
 private long[] mergeSpills(SpillInfo[] spills) throws IOException {
   final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
   final boolean compressionEnabled = sparkConf.getBoolean("spark.shuffle.compress", true);
   final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf);
   final boolean fastMergeEnabled =
       sparkConf.getBoolean("spark.shuffle.unsafe.fastMergeEnabled", true);
   final boolean fastMergeIsSupported =
       !compressionEnabled || compressionCodec instanceof LZFCompressionCodec;
   try {
     if (spills.length == 0) {
       new FileOutputStream(outputFile).close(); // Create an empty file
       return new long[partitioner.numPartitions()];
     } else if (spills.length == 1) {
       // Here, we don't need to perform any metrics updates because the bytes written to this
       // output file would have already been counted as shuffle bytes written.
       Files.move(spills[0].file, outputFile);
       return spills[0].partitionLengths;
     } else {
       final long[] partitionLengths;
       // There are multiple spills to merge, so none of these spill files' lengths were counted
       // towards our shuffle write count or shuffle write time. If we use the slow merge path,
       // then the final output file's size won't necessarily be equal to the sum of the spill
       // files' sizes. To guard against this case, we look at the output file's actual size when
       // computing shuffle bytes written.
       //
       // We allow the individual merge methods to report their own IO times since different merge
       // strategies use different IO techniques.  We count IO during merge towards the shuffle
       // shuffle write time, which appears to be consistent with the "not bypassing merge-sort"
       // branch in ExternalSorter.
       if (fastMergeEnabled && fastMergeIsSupported) {
         // Compression is disabled or we are using an IO compression codec that supports
         // decompression of concatenated compressed streams, so we can perform a fast spill merge
         // that doesn't need to interpret the spilled bytes.
         if (transferToEnabled) {
           logger.debug("Using transferTo-based fast merge");
           partitionLengths = mergeSpillsWithTransferTo(spills, outputFile);
         } else {
           logger.debug("Using fileStream-based fast merge");
           partitionLengths = mergeSpillsWithFileStream(spills, outputFile, null);
         }
       } else {
         logger.debug("Using slow merge");
         partitionLengths = mergeSpillsWithFileStream(spills, outputFile, compressionCodec);
       }
       // When closing an UnsafeShuffleExternalSorter that has already spilled once but also has
       // in-memory records, we write out the in-memory records to a file but do not count that
       // final write as bytes spilled (instead, it's accounted as shuffle write). The merge needs
       // to be counted as shuffle write, but this will lead to double-counting of the final
       // SpillInfo's bytes.
       writeMetrics.decShuffleBytesWritten(spills[spills.length - 1].file.length());
       writeMetrics.incShuffleBytesWritten(outputFile.length());
       return partitionLengths;
     }
   } catch (IOException e) {
     if (outputFile.exists() && !outputFile.delete()) {
       logger.error("Unable to delete output file {}", outputFile.getPath());
     }
     throw e;
   }
 }
コード例 #2
0
 public UnsafeShuffleWriter(
     BlockManager blockManager,
     IndexShuffleBlockResolver shuffleBlockResolver,
     TaskMemoryManager memoryManager,
     ShuffleMemoryManager shuffleMemoryManager,
     UnsafeShuffleHandle<K, V> handle,
     int mapId,
     TaskContext taskContext,
     SparkConf sparkConf)
     throws IOException {
   final int numPartitions = handle.dependency().partitioner().numPartitions();
   if (numPartitions > UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()) {
     throw new IllegalArgumentException(
         "UnsafeShuffleWriter can only be used for shuffles with at most "
             + UnsafeShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS()
             + " reduce partitions");
   }
   this.blockManager = blockManager;
   this.shuffleBlockResolver = shuffleBlockResolver;
   this.memoryManager = memoryManager;
   this.shuffleMemoryManager = shuffleMemoryManager;
   this.mapId = mapId;
   final ShuffleDependency<K, V, V> dep = handle.dependency();
   this.shuffleId = dep.shuffleId();
   this.serializer = Serializer.getSerializer(dep.serializer()).newInstance();
   this.partitioner = dep.partitioner();
   this.writeMetrics = new ShuffleWriteMetrics();
   taskContext.taskMetrics().shuffleWriteMetrics_$eq(Option.apply(writeMetrics));
   this.taskContext = taskContext;
   this.sparkConf = sparkConf;
   this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
   open();
 }