Example #1
 @Override
 public int compareTo(Object obj) {
   if (obj instanceof CompressAwarePath) {
     CompressAwarePath compPath = (CompressAwarePath) obj;
     if (this.compressedSize < compPath.getCompressedSize()) {
       return -1;
     } else if (this.getCompressedSize() > compPath.getCompressedSize()) {
       return 1;
     }
     // Not returning 0 here so that objects with the same size (but
     // different paths) are still added to the TreeSet.
   }
   return super.compareTo(obj);
 }
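
The non-zero tie-break matters when CompressAwarePath objects are stored in a TreeSet: returning 0 would make the set treat two distinct paths of equal compressed size as duplicates and silently drop one. A minimal, self-contained sketch of the same idea (SizedPath and its fields are hypothetical stand-ins for CompressAwarePath, not part of the source above):

 import java.util.TreeSet;

 class SizedPath implements Comparable<SizedPath> {
   final String path;          // stand-in for the underlying Path
   final long compressedSize;  // stand-in for getCompressedSize()

   SizedPath(String path, long compressedSize) {
     this.path = path;
     this.compressedSize = compressedSize;
   }

   @Override
   public int compareTo(SizedPath other) {
     if (this.compressedSize < other.compressedSize) {
       return -1;
     } else if (this.compressedSize > other.compressedSize) {
       return 1;
     }
     // fall back to path ordering so equal-sized, distinct paths survive
     return this.path.compareTo(other.path);
   }

   public static void main(String[] args) {
     TreeSet<SizedPath> set = new TreeSet<SizedPath>();
     set.add(new SizedPath("/out/map_0.out", 100));
     set.add(new SizedPath("/out/map_1.out", 100)); // same size, different path
     System.out.println(set.size()); // prints 2; returning 0 would print 1
   }
 }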
Example #2
  private RawKeyValueIterator finalMerge(
      JobConf job,
      FileSystem fs,
      List<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
      List<CompressAwarePath> onDiskMapOutputs)
      throws IOException {
    LOG.info(
        "finalMerge called with "
            + inMemoryMapOutputs.size()
            + " in-memory map-outputs and "
            + onDiskMapOutputs.size()
            + " on-disk map-outputs");

    final float maxRedPer = job.getFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0f);
    if (maxRedPer > 1.0 || maxRedPer < 0.0) {
      throw new IOException(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT + ": " + maxRedPer);
    }
    int maxInMemReduce =
        (int) Math.min(Runtime.getRuntime().maxMemory() * maxRedPer, Integer.MAX_VALUE);
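    // Illustration (numbers assumed): with a 1 GB reducer heap and
    // REDUCE_INPUT_BUFFER_PERCENT = 0.25, about 256 MB of merged map output
    // may stay in memory through the reduce; Math.min caps the value at
    // Integer.MAX_VALUE since the result is narrowed to an int.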

    // merge config params
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
    boolean keepInputs = job.getKeepFailedTaskFiles();
    final Path tmpDir = new Path(reduceId.toString());
    final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();

    // segments required to vacate memory
    List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
    long inMemToDiskBytes = 0;
    boolean mergePhaseFinished = false;
    if (inMemoryMapOutputs.size() > 0) {
      TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
      inMemToDiskBytes =
          createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
      final int numMemDiskSegments = memDiskSegments.size();
      if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {

        // If we reach here, we have fewer than io.sort.factor disk segments,
        // and that count will grow by exactly 1 (the result of merging the
        // in-memory segments). Since the total would still be
        // <= io.sort.factor, we will not do any more intermediate merges;
        // the merge of all these disk segments will be fed directly to the
        // reduce method.
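        // Illustration (numbers assumed): with io.sort.factor = 10 and 6
        // on-disk segments, the in-memory merge brings the total to 7, so a
        // single 7-way merge can feed the reduce directly.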

        mergePhaseFinished = true;
        // must spill to disk, but can't retain in-mem for intermediate merge
        final Path outputPath =
            mapOutputFile
                .getInputFileForWrite(mapId, inMemToDiskBytes)
                .suffix(Task.MERGED_OUTPUT_PREFIX);
        final RawKeyValueIterator rIter =
            Merger.merge(
                job,
                fs,
                keyClass,
                valueClass,
                memDiskSegments,
                numMemDiskSegments,
                tmpDir,
                comparator,
                reporter,
                spilledRecordsCounter,
                null,
                mergePhase);
        Writer<K, V> writer =
            new Writer<K, V>(job, fs, outputPath, keyClass, valueClass, codec, null);
        try {
          Merger.writeFile(rIter, writer, reporter, job);
          writer.close();
          // add to the list of final disk outputs
          onDiskMapOutputs.add(
              new CompressAwarePath(
                  outputPath, writer.getRawLength(), writer.getCompressedLength()));
          writer = null;
        } catch (IOException e) {
          // best-effort cleanup of the partial merge output before rethrowing;
          // outputPath is final and assigned above, so it cannot be null here
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // ignore; the original exception is the one worth propagating
          }
          throw e;
        } finally {
          if (null != writer) {
            writer.close();
          }
        }
        LOG.info(
            "Merged "
                + numMemDiskSegments
                + " segments, "
                + inMemToDiskBytes
                + " bytes to disk to satisfy "
                + "reduce memory limit");
        inMemToDiskBytes = 0;
        memDiskSegments.clear();
      } else if (inMemToDiskBytes != 0) {
        LOG.info(
            "Keeping "
                + numMemDiskSegments
                + " segments, "
                + inMemToDiskBytes
                + " bytes in memory for "
                + "intermediate, on-disk merge");
      }
    }

    // segments on disk
    List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
    long onDiskBytes = inMemToDiskBytes;
    long rawBytes = inMemToDiskBytes;
    CompressAwarePath[] onDisk =
        onDiskMapOutputs.toArray(new CompressAwarePath[onDiskMapOutputs.size()]);
    for (CompressAwarePath file : onDisk) {
      long fileLength = fs.getFileStatus(file).getLen();
      onDiskBytes += fileLength;
      // fall back to the on-disk length when no raw-length metadata exists
      rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;

      LOG.debug("Disk file: " + file + " Length is " + fileLength);
      diskSegments.add(
          new Segment<K, V>(
              job,
              fs,
              file,
              codec,
              keepInputs,
              (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX)
                  ? null
                  : mergedMapOutputsCounter),
              file.getRawDataLength()));
    }
    LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
    // sort ascending by segment length so the smallest segments are merged
    // first, minimizing the bytes re-read during intermediate merge passes
    Collections.sort(
        diskSegments,
        new Comparator<Segment<K, V>>() {
          @Override
          public int compare(Segment<K, V> o1, Segment<K, V> o2) {
            if (o1.getLength() == o2.getLength()) {
              return 0;
            }
            return o1.getLength() < o2.getLength() ? -1 : 1;
          }
        });

    // build the final list of segments to feed the reduce: remaining
    // in-memory segments plus the disk-backed merge output
    List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
    long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
    LOG.info(
        "Merging "
            + finalSegments.size()
            + " segments, "
            + inMemBytes
            + " bytes from memory into reduce");
    if (0 != onDiskBytes) {
      final int numInMemSegments = memDiskSegments.size();
      diskSegments.addAll(0, memDiskSegments);
      memDiskSegments.clear();
      // Pass mergePhase only if there are going to be intermediate
      // merges. See the comment where mergePhaseFinished is set.
      Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
      RawKeyValueIterator diskMerge =
          Merger.merge(
              job,
              fs,
              keyClass,
              valueClass,
              codec,
              diskSegments,
              ioSortFactor,
              numInMemSegments,
              tmpDir,
              comparator,
              reporter,
              false,
              spilledRecordsCounter,
              null,
              thisPhase);
      diskSegments.clear();
      if (0 == finalSegments.size()) {
        return diskMerge;
      }
      finalSegments.add(
          new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
    }
    return Merger.merge(
        job,
        fs,
        keyClass,
        valueClass,
        finalSegments,
        finalSegments.size(),
        tmpDir,
        comparator,
        reporter,
        spilledRecordsCounter,
        null,
        null);
  }
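
For context, the returned RawKeyValueIterator exposes the merged stream as raw serialized bytes. A minimal sketch of how a caller might drain it, assuming the surrounding setup (job, fs, and the two output lists) is already in place; next(), getKey(), getValue(), and close() are the standard org.apache.hadoop.mapred.RawKeyValueIterator methods, and getKey()/getValue() return org.apache.hadoop.io.DataInputBuffer:

  RawKeyValueIterator rIter = finalMerge(job, fs, inMemoryMapOutputs, onDiskMapOutputs);
  try {
    while (rIter.next()) {
      DataInputBuffer key = rIter.getKey();     // raw serialized key bytes
      DataInputBuffer value = rIter.getValue(); // raw serialized value bytes
      // deserialize with the job's configured (de)serializers before use
    }
  } finally {
    rIter.close();
  }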