Example #1
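This appears to be the mergeParts() method of Hadoop MapReduce's MapTask.MapOutputBuffer: once the map task finishes, it merges the per-spill output files into a single file.out plus a file.out.index recording where each reduce partition's segment starts and how long it is.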
    private void mergeParts() throws IOException {
      // get the approximate size of the final output/index files
      long finalOutFileSize = 0;
      long finalIndexFileSize = 0;
      Path[] filename = new Path[numSpills];
      Path[] indexFileName = new Path[numSpills];
      FileSystem localFs = FileSystem.getLocal(job);

      for (int i = 0; i < numSpills; i++) {
        filename[i] = mapOutputFile.getSpillFile(getTaskID(), i);
        indexFileName[i] = mapOutputFile.getSpillIndexFile(getTaskID(), i);
        finalOutFileSize += localFs.getFileStatus(filename[i]).getLen();
      }

      if (numSpills == 1) { // the spill is the final output
        localFs.rename(filename[0], new Path(filename[0].getParent(), "file.out"));
        localFs.rename(indexFileName[0], new Path(indexFileName[0].getParent(), "file.out.index"));
        return;
      }
      // make correction in the length to include the sequence file header
      // lengths for each partition
      finalOutFileSize += partitions * APPROX_HEADER_LENGTH;

      finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;

      Path finalOutputFile = mapOutputFile.getOutputFileForWrite(getTaskID(), finalOutFileSize);
      Path finalIndexFile =
          mapOutputFile.getOutputIndexFileForWrite(getTaskID(), finalIndexFileSize);

      // The output stream for the final single output file
      FSDataOutputStream finalOut = localFs.create(finalOutputFile, true, 4096);

      // The final index file output stream
      FSDataOutputStream finalIndexOut = localFs.create(finalIndexFile, true, 4096);
      if (numSpills == 0) {
        // create dummy files
        for (int i = 0; i < partitions; i++) {
          long segmentStart = finalOut.getPos();
          Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
          writer.close();
          writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
        }
        finalOut.close();
        finalIndexOut.close();
        return;
      }
      {
        for (int parts = 0; parts < partitions; parts++) {
          // create the segments to be merged
          List<Segment<K, V>> segmentList = new ArrayList<Segment<K, V>>(numSpills);
          for (int i = 0; i < numSpills; i++) {
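            // each spill's index file holds three longs per partition: the segment's
            // start offset, its raw (uncompressed) length, and its on-disk length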
            FSDataInputStream indexIn = localFs.open(indexFileName[i]);
            indexIn.seek(parts * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            long segmentOffset = indexIn.readLong();
            long rawSegmentLength = indexIn.readLong();
            long segmentLength = indexIn.readLong();
            indexIn.close();
            Segment<K, V> s =
                new Segment<K, V>(
                    job, localFs, filename[i], segmentOffset, segmentLength, codec, true);
            segmentList.add(i, s);

            if (LOG.isDebugEnabled()) {
              LOG.debug(
                  "Index: ("
                      + indexFileName[i]
                      + ", "
                      + segmentOffset
                      + ", "
                      + rawSegmentLength
                      + ", "
                      + segmentLength
                      + ")");
            }
          }

          // merge
          @SuppressWarnings("unchecked")
          RawKeyValueIterator kvIter =
              Merger.merge(
                  job,
                  localFs,
                  keyClass,
                  valClass,
                  segmentList,
                  job.getInt("io.sort.factor", 100),
                  new Path(getTaskID().toString()),
                  job.getOutputKeyComparator(),
                  reporter);

          // write merged output to disk
          long segmentStart = finalOut.getPos();
          Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
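          // write the merged stream as-is when there is no combiner, when the job
          // limits combining to a single pass, or when there are too few spills to
          // make another combine pass worthwhile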
          if (null == combinerClass
              || job.getCombineOnceOnly()
              || numSpills < minSpillsForCombine) {
            Merger.writeFile(kvIter, writer, reporter, job);
          } else {
            combineCollector.setWriter(writer);
            combineAndSpill(kvIter, combineInputCounter);
          }

          // close
          writer.close();

          // write index record
          writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
        }
        finalOut.close();
        finalIndexOut.close();
        // cleanup
        for (int i = 0; i < numSpills; i++) {
          localFs.delete(filename[i], true);
          localFs.delete(indexFileName[i], true);
        }
      }
    }
Example #2
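This looks like the constructor of the same MapOutputBuffer class: it reads the io.sort.* settings, splits the in-memory sort buffer between serialized key/value data and record accounting, and wires up the serializers, counters, optional compression codec, and optional combiner.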
 @SuppressWarnings("unchecked")
 public MapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job, Reporter reporter)
     throws IOException {
   this.job = job;
   this.reporter = reporter;
   localFs = FileSystem.getLocal(job);
   partitions = job.getNumReduceTasks();
   partitioner = (Partitioner) ReflectionUtils.newInstance(job.getPartitionerClass(), job);
   // sanity checks
   final float spillper = job.getFloat("io.sort.spill.percent", (float) 0.8);
   final float recper = job.getFloat("io.sort.record.percent", (float) 0.05);
   final int sortmb = job.getInt("io.sort.mb", 100);
   if (spillper > (float) 1.0 || spillper < (float) 0.0) {
     throw new IOException("Invalid \"io.sort.spill.percent\": " + spillper);
   }
   if (recper > (float) 1.0 || recper < (float) 0.01) {
     throw new IOException("Invalid \"io.sort.record.percent\": " + recper);
   }
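   // io.sort.mb must fit in 11 bits (at most 2047 MB) so that "sortmb << 20" below
   // stays within a positive int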
   if ((sortmb & 0x7FF) != sortmb) {
     throw new IOException("Invalid \"io.sort.mb\": " + sortmb);
   }
   sorter =
       (IndexedSorter)
           ReflectionUtils.newInstance(job.getClass("map.sort.class", QuickSort.class), job);
   LOG.info("io.sort.mb = " + sortmb);
   // buffers and accounting
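   // convert io.sort.mb to bytes, then carve out io.sort.record.percent of it for
   // record accounting (kvoffsets/kvindices); the remainder (kvbuffer) holds the
   // serialized key/value bytes, and the soft limits below trigger spills to disk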
   int maxMemUsage = sortmb << 20;
   int recordCapacity = (int) (maxMemUsage * recper);
   recordCapacity -= recordCapacity % RECSIZE;
   kvbuffer = new byte[maxMemUsage - recordCapacity];
   bufvoid = kvbuffer.length;
   recordCapacity /= RECSIZE;
   kvoffsets = new int[recordCapacity];
   kvindices = new int[recordCapacity * ACCTSIZE];
   softBufferLimit = (int) (kvbuffer.length * spillper);
   softRecordLimit = (int) (kvoffsets.length * spillper);
   LOG.info("data buffer = " + softBufferLimit + "/" + kvbuffer.length);
   LOG.info("record buffer = " + softRecordLimit + "/" + kvoffsets.length);
   // k/v serialization
   comparator = job.getOutputKeyComparator();
   keyClass = (Class<K>) job.getMapOutputKeyClass();
   valClass = (Class<V>) job.getMapOutputValueClass();
   serializationFactory = new SerializationFactory(job);
   keySerializer = serializationFactory.getSerializer(keyClass);
   keySerializer.open(bb);
   valSerializer = serializationFactory.getSerializer(valClass);
   valSerializer.open(bb);
   // counters
   Counters counters = getCounters();
   mapOutputByteCounter = counters.findCounter(MAP_OUTPUT_BYTES);
   mapOutputRecordCounter = counters.findCounter(MAP_OUTPUT_RECORDS);
   combineInputCounter = counters.findCounter(COMBINE_INPUT_RECORDS);
   combineOutputCounter = counters.findCounter(COMBINE_OUTPUT_RECORDS);
   // compression
   if (job.getCompressMapOutput()) {
     Class<? extends CompressionCodec> codecClass =
         job.getMapOutputCompressorClass(DefaultCodec.class);
     codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job);
   }
   // combiner
   combinerClass = job.getCombinerClass();
   combineCollector =
       (null != combinerClass) ? new CombineOutputCollector(combineOutputCounter) : null;
   minSpillsForCombine = job.getInt("min.num.spills.for.combine", 3);
 }