public void initialize(TezOutputContext outputContext, Configuration conf, int numOutputs)
      throws IOException {
    this.outputContext = outputContext;
    this.conf = conf;
    this.partitions = numOutputs;

    rfs = ((LocalFileSystem) FileSystem.getLocal(this.conf)).getRaw();

    // sorter
    sorter =
        ReflectionUtils.newInstance(
            this.conf.getClass(
                TezJobConfig.TEZ_RUNTIME_INTERNAL_SORTER_CLASS,
                QuickSort.class,
                IndexedSorter.class),
            this.conf);

    comparator = ConfigUtils.getIntermediateOutputKeyComparator(this.conf);

    // k/v serialization
    keyClass = ConfigUtils.getIntermediateOutputKeyClass(this.conf);
    valClass = ConfigUtils.getIntermediateOutputValueClass(this.conf);
    serializationFactory = new SerializationFactory(this.conf);
    keySerializer = serializationFactory.getSerializer(keyClass);
    valSerializer = serializationFactory.getSerializer(valClass);

    //    counters
    mapOutputByteCounter = outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_BYTES);
    mapOutputRecordCounter =
        outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_RECORDS);
    fileOutputByteCounter =
        outputContext.getCounters().findCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);
    spilledRecordsCounter = outputContext.getCounters().findCounter(TaskCounter.SPILLED_RECORDS);
    // compression
    if (ConfigUtils.shouldCompressIntermediateOutput(this.conf)) {
      Class<? extends CompressionCodec> codecClass =
          ConfigUtils.getIntermediateOutputCompressorClass(this.conf, DefaultCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, this.conf);
    } else {
      codec = null;
    }

    // Task outputs
    mapOutputFile = TezRuntimeUtils.instantiateTaskOutputManager(conf, outputContext);

    LOG.info(
        "Instantiating Partitioner: ["
            + conf.get(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS)
            + "]");
    this.conf.setInt(TezJobConfig.TEZ_RUNTIME_NUM_EXPECTED_PARTITIONS, this.partitions);
    this.partitioner = TezRuntimeUtils.instantiatePartitioner(this.conf);
    this.combiner = TezRuntimeUtils.instantiateCombiner(this.conf, outputContext);
  }
Beispiel #2
0
  private void mergeParts() throws IOException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final String taskIdentifier = outputContext.getUniqueIdentifier();

    for (int i = 0; i < numSpills; i++) {
      filename[i] = mapOutputFile.getSpillFile(i);
      finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    if (numSpills == 1) { // the spill is the final output
      sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0]));
      if (indexCacheList.size() == 0) {
        sameVolRename(
            mapOutputFile.getSpillIndexFile(0),
            mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]));
      } else {
        indexCacheList
            .get(0)
            .writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]), conf);
      }
      return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
      Path indexFileName = mapOutputFile.getSpillIndexFile(i);
      indexCacheList.add(new TezSpillRecord(indexFileName, conf));
    }

    // make correction in the length to include the sequence file header
    // lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
      // TODO Change event generation to say there is no data rather than generating a dummy file
      // create dummy files

      TezSpillRecord sr = new TezSpillRecord(partitions);
      try {
        for (int i = 0; i < partitions; i++) {
          long segmentStart = finalOut.getPos();
          Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
          writer.close();

          TezIndexRecord rec =
              new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
          // Covers the case of multiple spills.
          outputBytesWithOverheadCounter.increment(writer.getRawLength());
          sr.putIndex(rec, i);
        }
        sr.writeToFile(finalIndexFile, conf);
      } finally {
        finalOut.close();
      }
      return;
    } else {
      final TezSpillRecord spillRec = new TezSpillRecord(partitions);
      for (int parts = 0; parts < partitions; parts++) {
        // create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
          TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

          Segment s =
              new Segment(
                  conf,
                  rfs,
                  filename[i],
                  indexRecord.getStartOffset(),
                  indexRecord.getPartLength(),
                  codec,
                  ifileReadAhead,
                  ifileReadAheadLength,
                  ifileBufferSize,
                  true);
          segmentList.add(i, s);

          if (LOG.isDebugEnabled()) {
            LOG.debug(
                "TaskIdentifier="
                    + taskIdentifier
                    + " Partition="
                    + parts
                    + "Spill ="
                    + i
                    + "("
                    + indexRecord.getStartOffset()
                    + ","
                    + indexRecord.getRawLength()
                    + ", "
                    + indexRecord.getPartLength()
                    + ")");
          }
        }

        int mergeFactor =
            this.conf.getInt(
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        // merge
        TezRawKeyValueIterator kvIter =
            TezMerger.merge(
                conf,
                rfs,
                keyClass,
                valClass,
                codec,
                segmentList,
                mergeFactor,
                new Path(taskIdentifier),
                (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf),
                nullProgressable,
                sortSegments,
                true,
                null,
                spilledRecordsCounter,
                additionalSpillBytesRead,
                null); // Not using any Progress in TezMerger. Should just work.

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer writer =
            new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
        if (combiner == null || numSpills < minSpillsForCombine) {
          TezMerger.writeFile(
              kvIter,
              writer,
              nullProgressable,
              TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
        } else {
          runCombineProcessor(kvIter, writer);
        }
        writer.close();

        // record offsets
        final TezIndexRecord rec =
            new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
        spillRec.putIndex(rec, parts);
      }
      spillRec.writeToFile(finalIndexFile, conf);
      finalOut.close();
      for (int i = 0; i < numSpills; i++) {
        rfs.delete(filename[i], true);
      }
    }
  }
  public Fetcher(
      Configuration job,
      ShuffleScheduler scheduler,
      MergeManager merger,
      ShuffleClientMetrics metrics,
      Shuffle shuffle,
      SecretKey jobTokenSecret,
      TezInputContext inputContext)
      throws IOException {
    this.job = job;
    this.scheduler = scheduler;
    this.merger = merger;
    this.metrics = metrics;
    this.shuffle = shuffle;
    this.id = ++nextId;
    this.jobTokenSecret = jobTokenSecret;
    ioErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.IO_ERROR.toString());
    wrongLengthErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_LENGTH.toString());
    badIdErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.BAD_ID.toString());
    wrongMapErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_MAP.toString());
    connectionErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.CONNECTION.toString());
    wrongReduceErrs =
        inputContext
            .getCounters()
            .findCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_REDUCE.toString());

    if (ConfigUtils.isIntermediateInputCompressed(job)) {
      Class<? extends CompressionCodec> codecClass =
          ConfigUtils.getIntermediateInputCompressorClass(job, DefaultCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, job);
      decompressor = CodecPool.getDecompressor(codec);
    } else {
      codec = null;
      decompressor = null;
    }

    this.connectionTimeout =
        job.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT,
            TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_STALLED_COPY_TIMEOUT);
    this.readTimeout =
        job.getInt(
            TezJobConfig.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT,
            TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT);

    setName("fetcher#" + id);
    setDaemon(true);

    synchronized (Fetcher.class) {
      sslShuffle =
          job.getBoolean(
              TezJobConfig.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL,
              TezJobConfig.DEFAULT_TEZ_RUNTIME_SHUFFLE_ENABLE_SSL);
      if (sslShuffle && sslFactory == null) {
        sslFactory = new SSLFactory(SSLFactory.Mode.CLIENT, job);
        try {
          sslFactory.init();
        } catch (Exception ex) {
          sslFactory.destroy();
          throw new RuntimeException(ex);
        }
      }
    }
  }