 @Override
 public boolean seekToNewSource(long targetPos) throws IOException {
   // Report the checksum failure, then try to move both the data stream and
   // the checksum stream to a new source before re-reading.
   long sumsPos = getChecksumFilePos(targetPos);
   fs.reportChecksumFailure(file, datas, targetPos, sums, sumsPos);
   boolean newDataSource = datas.seekToNewSource(targetPos);
   return sums.seekToNewSource(sumsPos) || newDataSource;
 }
 public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file) throws IOException {
   this(
       fs,
       file,
       fs.getConf()
           .getInt(
               LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_KEY,
               LocalFileSystemConfigKeys.LOCAL_FS_STREAM_BUFFER_SIZE_DEFAULT));
 }
 public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file, int bufferSize)
     throws IOException {
   super(file, fs.getFileStatus(file).getReplication());
   this.datas = fs.getRawFileSystem().open(file, bufferSize);
   this.fs = fs;
   Path sumFile = fs.getChecksumFile(file);
   try {
     int sumBufferSize = fs.getSumBufferSize(fs.getBytesPerSum(), bufferSize);
     sums = fs.getRawFileSystem().open(sumFile, sumBufferSize);

     byte[] version = new byte[CHECKSUM_VERSION.length];
     sums.readFully(version);
     if (!Arrays.equals(version, CHECKSUM_VERSION))
       throw new IOException("Not a checksum file: " + sumFile);
     this.bytesPerSum = sums.readInt();
     set(fs.verifyChecksum, new CRC32(), bytesPerSum, 4);
   } catch (FileNotFoundException e) { // quietly ignore
     set(fs.verifyChecksum, null, 1, 0);
   } catch (IOException e) { // loudly ignore
     LOG.warn(
         "Problem opening checksum file: "
             + file
             + ".  Ignoring exception: "
             + StringUtils.stringifyException(e));
     set(fs.verifyChecksum, null, 1, 0);
   }
 }
 public ChecksumFSOutputSummer(
     ChecksumFileSystem fs,
     Path file,
     boolean overwrite,
     int bufferSize,
     short replication,
     long blockSize,
     Progressable progress)
     throws IOException {
   super(new CRC32(), fs.getBytesPerSum(), 4);
   int bytesPerSum = fs.getBytesPerSum();
   this.datas =
       fs.getRawFileSystem()
           .create(file, overwrite, bufferSize, replication, blockSize, progress);
   int sumBufferSize = fs.getSumBufferSize(bytesPerSum, bufferSize);
   this.sums =
       fs.getRawFileSystem()
           .create(fs.getChecksumFile(file), true, sumBufferSize, replication, blockSize);
   sums.write(CHECKSUM_VERSION, 0, CHECKSUM_VERSION.length);
   sums.writeInt(bytesPerSum);
 }
 /* Return the file length */
 private long getFileLength() throws IOException {
   if (fileLen == -1L) {
     fileLen = fs.getContentSummary(file).getLength();
   }
   return fileLen;
 }
 public ChecksumFSInputChecker(ChecksumFileSystem fs, Path file) throws IOException {
   this(fs, file, fs.getConf().getInt("io.file.buffer.size", 4096));
 }
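The pieces above come together whenever a file is read or written through a ChecksumFileSystem. Below is a minimal usage sketch, not taken from the Hadoop sources: the class name, path, and written string are made up, while LocalFileSystem is the stock ChecksumFileSystem returned by FileSystem.getLocal. Creating the file runs ChecksumFSOutputSummer and produces the hidden sidecar .crc file; opening it runs ChecksumFSInputChecker, which verifies each chunk unless verification is switched off.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

public class ChecksumRoundTrip {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    LocalFileSystem fs = FileSystem.getLocal(conf); // a ChecksumFileSystem
    Path file = new Path("/tmp/checksum-demo.txt"); // hypothetical path

    // Writing goes through ChecksumFSOutputSummer and also creates the
    // hidden checksum file (.checksum-demo.txt.crc) next to the data.
    try (FSDataOutputStream out = fs.create(file, true)) {
      out.writeUTF("hello checksums");
    }

    // Reading goes through ChecksumFSInputChecker; corrupted data would
    // surface as a ChecksumException instead of being returned silently.
    fs.setVerifyChecksum(true);
    try (FSDataInputStream in = fs.open(file)) {
      System.out.println(in.readUTF());
    }
  }
}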
Example #7
    @Override
    public void merge(List<CompressAwarePath> inputs) throws IOException {
      // sanity check
      if (inputs == null || inputs.isEmpty()) {
        LOG.info("No ondisk files to merge...");
        return;
      }

      long approxOutputSize = 0;
      int bytesPerSum = jobConf.getInt("io.bytes.per.checksum", 512);

      LOG.info(
          "OnDiskMerger: We have " + inputs.size() + " map outputs on disk. Triggering merge...");

      // 1. Prepare the list of files to be merged.
      for (CompressAwarePath file : inputs) {
        approxOutputSize += localFS.getFileStatus(file).getLen();
      }

      // add the checksum length
      approxOutputSize += ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

      // 2. Start the on-disk merge process
      Path outputPath =
          localDirAllocator
              .getLocalPathForWrite(inputs.get(0).toString(), approxOutputSize, jobConf)
              .suffix(Task.MERGED_OUTPUT_PREFIX);
      Writer<K, V> writer =
          new Writer<K, V>(
              jobConf,
              rfs,
              outputPath,
              (Class<K>) jobConf.getMapOutputKeyClass(),
              (Class<V>) jobConf.getMapOutputValueClass(),
              codec,
              null);
      RawKeyValueIterator iter = null;
      CompressAwarePath compressAwarePath;
      Path tmpDir = new Path(reduceId.toString());
      try {
        iter =
            Merger.merge(
                jobConf,
                rfs,
                (Class<K>) jobConf.getMapOutputKeyClass(),
                (Class<V>) jobConf.getMapOutputValueClass(),
                codec,
                inputs.toArray(new Path[inputs.size()]),
                true,
                ioSortFactor,
                tmpDir,
                (RawComparator<K>) jobConf.getOutputKeyComparator(),
                reporter,
                spilledRecordsCounter,
                null,
                mergedMapOutputsCounter,
                null);

        Merger.writeFile(iter, writer, reporter, jobConf);
        writer.close();
        compressAwarePath =
            new CompressAwarePath(outputPath, writer.getRawLength(), writer.getCompressedLength());
      } catch (IOException e) {
        localFS.delete(outputPath, true);
        throw e;
      }

      closeOnDiskFile(compressAwarePath);

      LOG.info(
          reduceId
              + " Finished merging "
              + inputs.size()
              + " map output files on disk of total-size "
              + approxOutputSize
              + "."
              + " Local output file is "
              + outputPath
              + " of size "
              + localFS.getFileStatus(outputPath).getLen());
    }
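The ChecksumFileSystem.getChecksumLength call in the sizing step above accounts for the sidecar checksum data the merged output will need: roughly one 4-byte CRC per bytesPerSum bytes of payload, plus the small header ChecksumFSOutputSummer writes (the CHECKSUM_VERSION magic and the bytesPerSum int). A back-of-the-envelope sketch of that arithmetic, as a hypothetical helper rather than the Hadoop implementation:

// Rough estimate only; mirrors what the getChecksumLength call above accounts for.
static long approxChecksumBytes(long dataSize, int bytesPerSum) {
  long chunks = (dataSize + bytesPerSum - 1) / bytesPerSum; // ceiling division
  return chunks * 4L                        // one CRC-32 word per chunk
      + CHECKSUM_VERSION.length + 4;        // header: version magic + bytesPerSum int
}

// With the 512-byte default, 1 GiB of map output needs about 8 MiB of checksums:
// (1 << 30) / 512 = 2,097,152 chunks * 4 bytes = 8,388,608 bytes, plus the header.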
Example #8
    RawKeyValueIterator merge(
        Class<K> keyClass, Class<V> valueClass, int factor, int inMem, Path tmpDir)
        throws IOException {
      LOG.info("Merging " + segments.size() + " sorted segments");

      // create the MergeStreams from the sorted map created in the constructor
      // and dump the final output to a file
      int numSegments = segments.size();
      int origFactor = factor;
      int passNo = 1;
      do {
        // get the factor for this pass of merge. We assume in-memory segments
        // are the first entries in the segment list and that the pass factor
        // doesn't apply to them
        factor = getPassFactor(factor, passNo, numSegments - inMem);
        if (1 == passNo) {
          factor += inMem;
        }
        List<Segment<K, V>> segmentsToMerge = new ArrayList<Segment<K, V>>();
        int segmentsConsidered = 0;
        int numSegmentsToConsider = factor;
        while (true) {
          // extract the smallest 'factor' number of segments
          // Call cleanup on the empty segments (no key/value data)
          List<Segment<K, V>> mStream = getSegmentDescriptors(numSegmentsToConsider);
          for (Segment<K, V> segment : mStream) {
            // Initialize the segment at the last possible moment;
            // this helps in ensuring we don't use buffers until we need them
            segment.init();
            long startPos = segment.getPosition();
            boolean hasNext = segment.next();
            long endPos = segment.getPosition();
            totalBytesProcessed += endPos - startPos;
            mergeProgress.set(totalBytesProcessed * progPerByte);
            if (hasNext) {
              segmentsToMerge.add(segment);
              segmentsConsidered++;
            } else {
              segment.close();
              numSegments--; // we ignore this segment for the merge
            }
          }
          // if we have the desired number of segments
          // or looked at all available segments, we break
          if (segmentsConsidered == factor || segments.size() == 0) {
            break;
          }

          numSegmentsToConsider = factor - segmentsConsidered;
        }

        // feed the streams to the priority queue
        initialize(segmentsToMerge.size());
        clear();
        for (Segment<K, V> segment : segmentsToMerge) {
          put(segment);
        }

        // If fewer segments remain than the merge factor, just return this
        // iterator; otherwise do another single-level merge pass.
        if (numSegments <= factor) {
          // calculate the length of the remaining segments. Required for
          // calculating the merge progress
          long totalBytes = 0;
          for (int i = 0; i < segmentsToMerge.size(); i++) {
            totalBytes += segmentsToMerge.get(i).getLength();
          }
          if (totalBytes != 0) // being paranoid
            progPerByte = 1.0f / (float) totalBytes;

          if (totalBytes != 0)
            mergeProgress.set(totalBytesProcessed * progPerByte);
          else
            mergeProgress.set(1.0f); // Last pass and no segments left - we're done

          LOG.info(
              "Down to the last merge-pass, with "
                  + numSegments
                  + " segments left of total size: "
                  + totalBytes
                  + " bytes");
          return this;
        } else {
          LOG.info(
              "Merging "
                  + segmentsToMerge.size()
                  + " intermediate segments out of a total of "
                  + (segments.size() + segmentsToMerge.size()));

          // spread the creation of temp files across multiple disks, if
          // available, subject to the space constraints
          long approxOutputSize = 0;
          for (Segment<K, V> s : segmentsToMerge) {
            approxOutputSize +=
                s.getLength() + ChecksumFileSystem.getApproxChkSumLength(s.getLength());
          }
          Path tmpFilename = new Path(tmpDir, "intermediate").suffix("." + passNo);

          Path outputFile =
              lDirAlloc.getLocalPathForWrite(tmpFilename.toString(), approxOutputSize, conf);

          Writer<K, V> writer = new Writer<K, V>(conf, fs, outputFile, keyClass, valueClass, codec);
          writeFile(this, writer, reporter);
          writer.close();

          // we finished one single level merge; now clean up the priority
          // queue
          this.close();

          // Add the newly created segment to the list of segments to be merged
          Segment<K, V> tempSegment = new Segment<K, V>(conf, fs, outputFile, codec, false);
          segments.add(tempSegment);
          numSegments = segments.size();
          Collections.sort(segments, segmentComparator);

          passNo++;
        }
        // we are worried about only the first pass merge factor. So reset the
        // factor to what it originally was
        factor = origFactor;
      } while (true);
    }
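The getPassFactor call at the top of the loop applies the usual sizing trick for a balanced k-way external merge: only the first pass uses a reduced factor, chosen so that every later pass (including the final one) can merge a full ioSortFactor worth of segments. The sketch below illustrates that standard computation; it is not copied from Hadoop's Merger and the helper name is made up.

// Each merge of k segments removes k - 1 entries from the segment list, so to
// finish with exactly one segment using full-width passes of 'factor', the
// first pass only has to absorb the remainder of (numSegments - 1) mod (factor - 1).
static int firstPassFactor(int factor, int passNo, int numSegments) {
  if (passNo > 1 || numSegments <= factor || factor == 1) {
    return factor;                        // later passes always run full width
  }
  int mod = (numSegments - 1) % (factor - 1);
  return mod == 0 ? factor : mod + 1;     // merge just enough to balance the tree
}

// Example: 13 on-disk segments with factor 10 -> the first pass merges 4 of them,
// leaving 9 originals + 1 new segment = 10, which one final full-width pass finishes.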