/**
  * Combines this task's spill files into the single shuffle data file, picking a merge strategy
  * from the number of spills, the shuffle compression setting and the IO compression codec.
  *
  * @param spills the spill files to merge; may be empty.
  * @return the partition lengths in the merged file.
  * @throws IOException if merging fails; an attempt is made to delete the output file before the
  *     exception is rethrown.
  */
 private long[] mergeSpills(SpillInfo[] spills) throws IOException {
   final File mergedFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
   final boolean shuffleCompressed = sparkConf.getBoolean("spark.shuffle.compress", true);
   final CompressionCodec codec = CompressionCodec$.MODULE$.createCodec(sparkConf);
   final boolean fastMergeRequested =
       sparkConf.getBoolean("spark.shuffle.unsafe.fastMergeEnabled", true);
   // Byte-level concatenation is only possible when nothing is compressed, or when the codec is
   // LZF.
   final boolean codecAllowsConcatenation =
       !shuffleCompressed || codec instanceof LZFCompressionCodec;
   try {
     if (spills.length == 0) {
       // Nothing was spilled: leave behind an empty data file and all-zero partition lengths.
       new FileOutputStream(mergedFile).close();
       return new long[partitioner.numPartitions()];
     }
     if (spills.length == 1) {
       // A lone spill simply becomes the output file. No metrics are touched here because the
       // bytes in that file were already counted as shuffle bytes written.
       Files.move(spills[0].file, mergedFile);
       return spills[0].partitionLengths;
     }

     // Two or more spills. None of their lengths have been counted towards the shuffle write
     // size or time yet. The slow path may produce an output whose size differs from the sum of
     // the spill sizes, so the bytes written are taken from the output file's real length below.
     //
     // Each merge method reports its own IO time, since the strategies use different IO
     // techniques. IO performed during the merge counts towards the shuffle write time, which
     // appears to be consistent with the "not bypassing merge-sort" branch in ExternalSorter.
     final boolean useFastMerge = fastMergeRequested && codecAllowsConcatenation;
     final long[] mergedLengths;
     if (!useFastMerge) {
       // NOTE(review): this branch is also taken when compression is disabled but fast merge has
       // been switched off, and the codec passed is still non-null in that case. Confirm that
       // mergeSpillsWithFileStream copes with that combination.
       logger.debug("Using slow merge");
       mergedLengths = mergeSpillsWithFileStream(spills, mergedFile, codec);
     } else if (transferToEnabled) {
       // Spilled bytes can be concatenated without being interpreted.
       logger.debug("Using transferTo-based fast merge");
       mergedLengths = mergeSpillsWithTransferTo(spills, mergedFile);
     } else {
       logger.debug("Using fileStream-based fast merge");
       mergedLengths = mergeSpillsWithFileStream(spills, mergedFile, null);
     }

     // When an UnsafeShuffleExternalSorter that has already spilled is closed while it still
     // holds in-memory records, those records are written to a final file that is accounted as
     // shuffle write rather than as bytes spilled. The merged output must be counted as shuffle
     // write too, so the last SpillInfo's bytes are subtracted to avoid counting them twice.
     final SpillInfo lastSpill = spills[spills.length - 1];
     writeMetrics.decShuffleBytesWritten(lastSpill.file.length());
     writeMetrics.incShuffleBytesWritten(mergedFile.length());
     return mergedLengths;
   } catch (IOException e) {
     // Best-effort removal of whatever was written before the failure.
     final boolean leftoverRemains = mergedFile.exists() && !mergedFile.delete();
     if (leftoverRemains) {
       logger.error("Unable to delete output file {}", mergedFile.getPath());
     }
     throw e;
   }
 }
  @Override
  public void insertAll(Iterator<Product2<K, V>> records) throws IOException {
    assert (partitionWriters == null);
    if (!records.hasNext()) {
      return;
    }
    final SerializerInstance serInstance = serializer.newInstance();
    final long openStartTime = System.nanoTime();
    partitionWriters = new DiskBlockObjectWriter[numPartitions];
    for (int i = 0; i < numPartitions; i++) {
      final Tuple2<TempShuffleBlockId, File> tempShuffleBlockIdPlusFile =
          blockManager.diskBlockManager().createTempShuffleBlock();
      final File file = tempShuffleBlockIdPlusFile._2();
      final BlockId blockId = tempShuffleBlockIdPlusFile._1();
      partitionWriters[i] =
          blockManager
              .getDiskWriter(blockId, file, serInstance, fileBufferSize, writeMetrics)
              .open();
    }
    // Creating the file to write to and creating a disk writer both involve interacting with
    // the disk, and can take a long time in aggregate when we open many files, so should be
    // included in the shuffle write time.
    writeMetrics.incShuffleWriteTime(System.nanoTime() - openStartTime);

    while (records.hasNext()) {
      final Product2<K, V> record = records.next();
      final K key = record._1();
      partitionWriters[partitioner.getPartition(key)].write(key, record._2());
    }

    for (DiskBlockObjectWriter writer : partitionWriters) {
      writer.commitAndClose();
    }
  }
  @Override
  public long[] writePartitionedFile(BlockId blockId, TaskContext context, File outputFile)
      throws IOException {
    // Track location of the partition starts in the output file
    final long[] lengths = new long[numPartitions];
    if (partitionWriters == null) {
      // We were passed an empty iterator
      return lengths;
    }

    final FileOutputStream out = new FileOutputStream(outputFile, true);
    final long writeStartTime = System.nanoTime();
    boolean threwException = true;
    try {
      for (int i = 0; i < numPartitions; i++) {
        final FileInputStream in = new FileInputStream(partitionWriters[i].fileSegment().file());
        boolean copyThrewException = true;
        try {
          lengths[i] = Utils.copyStream(in, out, false, transferToEnabled);
          copyThrewException = false;
        } finally {
          Closeables.close(in, copyThrewException);
        }
        if (!partitionWriters[i].fileSegment().file().delete()) {
          logger.error("Unable to delete file for partition {}", i);
        }
      }
      threwException = false;
    } finally {
      Closeables.close(out, threwException);
      writeMetrics.incShuffleWriteTime(System.nanoTime() - writeStartTime);
    }
    partitionWriters = null;
    return lengths;
  }
  /**
   * Merges spill files by using NIO's transferTo to concatenate spill partitions' bytes. This is
   * only safe when the IO compression codec and serializer support concatenation of serialized
   * streams.
   *
   * @return the partition lengths in the merged file.
   */
  private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) throws IOException {
    assert (spills.length >= 2);
    final int numPartitions = partitioner.numPartitions();
    final long[] partitionLengths = new long[numPartitions];
    final FileChannel[] spillInputChannels = new FileChannel[spills.length];
    final long[] spillInputChannelPositions = new long[spills.length];
    FileChannel mergedFileOutputChannel = null;

    boolean threwException = true;
    try {
      for (int i = 0; i < spills.length; i++) {
        spillInputChannels[i] = new FileInputStream(spills[i].file).getChannel();
      }
      // This file needs to opened in append mode in order to work around a Linux kernel bug that
      // affects transferTo; see SPARK-3948 for more details.
      mergedFileOutputChannel = new FileOutputStream(outputFile, true).getChannel();

      long bytesWrittenToMergedFile = 0;
      for (int partition = 0; partition < numPartitions; partition++) {
        for (int i = 0; i < spills.length; i++) {
          final long partitionLengthInSpill = spills[i].partitionLengths[partition];
          long bytesToTransfer = partitionLengthInSpill;
          final FileChannel spillInputChannel = spillInputChannels[i];
          final long writeStartTime = System.nanoTime();
          while (bytesToTransfer > 0) {
            final long actualBytesTransferred =
                spillInputChannel.transferTo(
                    spillInputChannelPositions[i], bytesToTransfer, mergedFileOutputChannel);
            spillInputChannelPositions[i] += actualBytesTransferred;
            bytesToTransfer -= actualBytesTransferred;
          }
          writeMetrics.incShuffleWriteTime(System.nanoTime() - writeStartTime);
          bytesWrittenToMergedFile += partitionLengthInSpill;
          partitionLengths[partition] += partitionLengthInSpill;
        }
      }
      // Check the position after transferTo loop to see if it is in the right position and raise an
      // exception if it is incorrect. The position will not be increased to the expected length
      // after calling transferTo in kernel version 2.6.32. This issue is described at
      // https://bugs.openjdk.java.net/browse/JDK-7052359 and SPARK-3948.
      if (mergedFileOutputChannel.position() != bytesWrittenToMergedFile) {
        throw new IOException(
            "Current position "
                + mergedFileOutputChannel.position()
                + " does not equal expected "
                + "position "
                + bytesWrittenToMergedFile
                + " after transferTo. Please check your kernel"
                + " version to see if it is 2.6.32, as there is a kernel bug which will lead to "
                + "unexpected behavior when using transferTo. You can set spark.file.transferTo=false "
                + "to disable this NIO feature.");
      }
      threwException = false;
    } finally {
      // To avoid masking exceptions that caused us to prematurely enter the finally block, only
      // throw exceptions during cleanup if threwException == false.
      for (int i = 0; i < spills.length; i++) {
        assert (spillInputChannelPositions[i] == spills[i].file.length());
        Closeables.close(spillInputChannels[i], threwException);
      }
      Closeables.close(mergedFileOutputChannel, threwException);
    }
    return partitionLengths;
  }