Example #1
    /**
     * Handles the degenerate case where serialization fails to fit in the in-memory buffer, so we
     * must spill the record from collect directly to a spill file. Consider this "losing".
     */
    @SuppressWarnings("unchecked")
    private void spillSingleRecord(final K key, final V value) throws IOException {
      long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
      FSDataOutputStream out = null;
      FSDataOutputStream indexOut = null;
      final int partition = partitioner.getPartition(key, value, partitions);
      try {
        // create spill file
        Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(), numSpills, size);
        out = localFs.create(filename);
        // create spill index
        Path indexFilename =
            mapOutputFile.getSpillIndexFileForWrite(
                getTaskID(), numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        indexOut = localFs.create(indexFilename);
        // normally the combiner is skipped for a single record; in
        // combine-once-only mode it must still run exactly once, here
        for (int i = 0; i < partitions; ++i) {
          IFile.Writer<K, V> writer = null;
          try {
            long segmentStart = out.getPos();
            // open a writer for this partition's segment
            writer = new IFile.Writer<K, V>(job, out, keyClass, valClass, codec);

            if (i == partition) {
              if (job.getCombineOnceOnly()) {
                Reducer combiner = (Reducer) ReflectionUtils.newInstance(combinerClass, job);
                combineCollector.setWriter(writer);
                combiner.reduce(
                    key,
                    new Iterator<V>() {
                      private boolean done = false;

                      public boolean hasNext() {
                        return !done;
                      }

                      public V next() {
                        if (done) throw new NoSuchElementException();
                        done = true;
                        return value;
                      }

                      public void remove() {
                        throw new UnsupportedOperationException();
                      }
                    },
                    combineCollector,
                    reporter);
              } else {
                final long recordStart = out.getPos();
                writer.append(key, value);
                // Note that our map byte count will not be accurate with
                // compression
                mapOutputByteCounter.increment(out.getPos() - recordStart);
              }
            }
            writer.close();

            // index record
            writeIndexRecord(indexOut, out, segmentStart, writer);
          } catch (IOException e) {
            if (null != writer) writer.close();
            throw e;
          }
        }
        ++numSpills;
      } finally {
        if (out != null) out.close();
        if (indexOut != null) indexOut.close();
      }
    }
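The anonymous Iterator above exists only to hand the combiner's reduce call exactly one value. As a minimal reusable sketch of the same pattern (SingleValueIterator is a hypothetical helper, not part of the Hadoop API):

import java.util.Iterator;
import java.util.NoSuchElementException;

/** Yields exactly one value, then reports exhaustion. */
final class SingleValueIterator<T> implements Iterator<T> {
  private final T value;
  private boolean done = false;

  SingleValueIterator(T value) {
    this.value = value;
  }

  public boolean hasNext() {
    return !done;
  }

  public T next() {
    if (done) throw new NoSuchElementException();
    done = true; // the single value is consumed on the first call
    return value;
  }

  public void remove() {
    throw new UnsupportedOperationException();
  }
}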
Example #2
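    /**
     * Merges the per-spill files into a single output file and index file.
     * With a single spill the files are simply renamed; with no spills,
     * empty segments are written so every partition has an index entry.
     */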
    private void mergeParts() throws IOException {
      // get the approximate size of the final output/index files
      long finalOutFileSize = 0;
      long finalIndexFileSize = 0;
      Path[] filename = new Path[numSpills];
      Path[] indexFileName = new Path[numSpills];
      FileSystem localFs = FileSystem.getLocal(job);

      for (int i = 0; i < numSpills; i++) {
        filename[i] = mapOutputFile.getSpillFile(getTaskID(), i);
        indexFileName[i] = mapOutputFile.getSpillIndexFile(getTaskID(), i);
        finalOutFileSize += localFs.getFileStatus(filename[i]).getLen();
      }

      if (numSpills == 1) { // the spill is the final output
        localFs.rename(filename[0], new Path(filename[0].getParent(), "file.out"));
        localFs.rename(indexFileName[0], new Path(indexFileName[0].getParent(), "file.out.index"));
        return;
      }
      // make correction in the length to include the IFile header
      // written for each partition
      finalOutFileSize += partitions * APPROX_HEADER_LENGTH;

      finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;

      Path finalOutputFile = mapOutputFile.getOutputFileForWrite(getTaskID(), finalOutFileSize);
      Path finalIndexFile =
          mapOutputFile.getOutputIndexFileForWrite(getTaskID(), finalIndexFileSize);

      // The output stream for the final single output file
      FSDataOutputStream finalOut = localFs.create(finalOutputFile, true, 4096);

      // The final index file output stream
      FSDataOutputStream finalIndexOut = localFs.create(finalIndexFile, true, 4096);
      if (numSpills == 0) {
        // create dummy files
        for (int i = 0; i < partitions; i++) {
          long segmentStart = finalOut.getPos();
          Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
          writer.close();
          writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
        }
        finalOut.close();
        finalIndexOut.close();
        return;
      }
      {
        for (int parts = 0; parts < partitions; parts++) {
          // create the segments to be merged
          List<Segment<K, V>> segmentList = new ArrayList<Segment<K, V>>(numSpills);
          for (int i = 0; i < numSpills; i++) {
            FSDataInputStream indexIn = localFs.open(indexFileName[i]);
            indexIn.seek(parts * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            long segmentOffset = indexIn.readLong();
            long rawSegmentLength = indexIn.readLong();
            long segmentLength = indexIn.readLong();
            indexIn.close();
            Segment<K, V> s =
                new Segment<K, V>(
                    job, localFs, filename[i], segmentOffset, segmentLength, codec, true);
            segmentList.add(i, s);

            if (LOG.isDebugEnabled()) {
              LOG.debug(
                  "Index: ("
                      + indexFileName[i]
                      + ", "
                      + segmentOffset
                      + ", "
                      + rawSegmentLength
                      + ", "
                      + segmentLength
                      + ")");
            }
          }

          // merge
          @SuppressWarnings("unchecked")
          RawKeyValueIterator kvIter =
              Merger.merge(
                  job,
                  localFs,
                  keyClass,
                  valClass,
                  segmentList,
                  job.getInt("io.sort.factor", 100),
                  new Path(getTaskID().toString()),
                  job.getOutputKeyComparator(),
                  reporter);

          // write merged output to disk
          long segmentStart = finalOut.getPos();
          Writer<K, V> writer = new Writer<K, V>(job, finalOut, keyClass, valClass, codec);
          if (null == combinerClass
              || job.getCombineOnceOnly()
              || numSpills < minSpillsForCombine) {
            Merger.writeFile(kvIter, writer, reporter, job);
          } else {
            combineCollector.setWriter(writer);
            combineAndSpill(kvIter, combineInputCounter);
          }

          // close
          writer.close();

          // write index record
          writeIndexRecord(finalIndexOut, finalOut, segmentStart, writer);
        }
        finalOut.close();
        finalIndexOut.close();
        // cleanup
        for (int i = 0; i < numSpills; i++) {
          localFs.delete(filename[i], true);
          localFs.delete(indexFileName[i], true);
        }
      }
    }
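Each index record read above is a fixed-width triple of longs, <offset, raw-length, compressed-length>, located by seeking to partition * MAP_OUTPUT_INDEX_RECORD_LENGTH. A minimal sketch of that read using the Hadoop FileSystem API (IndexRecord and its read method are hypothetical names, not part of Hadoop):

import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** One spill-index entry: where a partition's segment lives in the spill file. */
final class IndexRecord {
  final long segmentOffset; // byte offset of the segment in the spill file
  final long rawLength;     // uncompressed length of the segment
  final long partLength;    // on-disk (possibly compressed) length

  IndexRecord(long segmentOffset, long rawLength, long partLength) {
    this.segmentOffset = segmentOffset;
    this.rawLength = rawLength;
    this.partLength = partLength;
  }

  /** Seeks to the given partition's record and reads the three longs. */
  static IndexRecord read(FileSystem fs, Path indexFile, int partition,
      int recordLength) throws IOException {
    FSDataInputStream in = fs.open(indexFile);
    try {
      in.seek((long) partition * recordLength);
      return new IndexRecord(in.readLong(), in.readLong(), in.readLong());
    } finally {
      in.close();
    }
  }
}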
Example #3
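    /**
     * Sorts the in-memory records and writes one segment per partition to a
     * new spill file, running the combiner per partition when configured.
     */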
    private void sortAndSpill() throws IOException {
      // approximate the length of the output file to be the length of the
      // buffer + header lengths for the partitions
      long size =
          (bufend >= bufstart ? bufend - bufstart : (bufvoid - bufend) + bufstart)
              + partitions * APPROX_HEADER_LENGTH;
      FSDataOutputStream out = null;
      FSDataOutputStream indexOut = null;
      try {
        // create spill file
        Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(), numSpills, size);
        out = localFs.create(filename);
        // create spill index
        Path indexFilename =
            mapOutputFile.getSpillIndexFileForWrite(
                getTaskID(), numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        indexOut = localFs.create(indexFilename);
        final int endPosition = (kvend > kvstart) ? kvend : kvoffsets.length + kvend;
        sorter.sort(MapOutputBuffer.this, kvstart, endPosition, reporter);
        int spindex = kvstart;
        InMemValBytes value = new InMemValBytes();
        for (int i = 0; i < partitions; ++i) {
          IFile.Writer<K, V> writer = null;
          try {
            long segmentStart = out.getPos();
            writer = new Writer<K, V>(job, out, keyClass, valClass, codec);
            if (null == combinerClass) {
              // spill directly
              DataInputBuffer key = new DataInputBuffer();
              while (spindex < endPosition
                  && kvindices[kvoffsets[spindex % kvoffsets.length] + PARTITION] == i) {
                final int kvoff = kvoffsets[spindex % kvoffsets.length];
                getVBytesForOffset(kvoff, value);
                key.reset(
                    kvbuffer,
                    kvindices[kvoff + KEYSTART],
                    (kvindices[kvoff + VALSTART] - kvindices[kvoff + KEYSTART]));
                writer.append(key, value);
                ++spindex;
              }
            } else {
              int spstart = spindex;
              while (spindex < endPosition
                  && kvindices[kvoffsets[spindex % kvoffsets.length] + PARTITION] == i) {
                ++spindex;
              }
              // Note: we would like to avoid the combiner if we have fewer
              // than some threshold of records for a partition
              if (spstart != spindex) {
                combineCollector.setWriter(writer);
                RawKeyValueIterator kvIter = new MRResultIterator(spstart, spindex);
                combineAndSpill(kvIter, combineInputCounter);
              }
            }

            // close the writer
            writer.close();

            // write the index as <offset, raw-length, compressed-length>
            writeIndexRecord(indexOut, out, segmentStart, writer);
            writer = null;
          } finally {
            if (null != writer) writer.close();
          }
        }
        LOG.info("Finished spill " + numSpills);
        ++numSpills;
      } finally {
        if (out != null) out.close();
        if (indexOut != null) indexOut.close();
      }
    }
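The endPosition computation above unrolls the circular kvoffsets buffer: when kvend has wrapped past kvstart, one full lap is added to the logical end, and entries are then addressed modulo the array length (spindex % kvoffsets.length). A minimal standalone sketch of that arithmetic, assuming the same wraparound convention:

/** Demonstrates the circular-buffer unrolling used by sortAndSpill. */
final class CircularRangeDemo {
  /** Logical end of the [kvstart, kvend) record range when the buffer may have wrapped. */
  static int endPosition(int kvstart, int kvend, int length) {
    return (kvend > kvstart) ? kvend : length + kvend;
  }

  public static void main(String[] args) {
    int length = 8, kvstart = 5, kvend = 2;
    int end = endPosition(kvstart, kvend, length); // 10: one lap past kvend
    for (int spindex = kvstart; spindex < end; ++spindex) {
      // prints physical slots 5, 6, 7, 0, 1
      System.out.println(spindex % length);
    }
  }
}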