Exemplo n.º 1
0
    RawKeyValueIterator merge(
        Class<K> keyClass, Class<V> valueClass, int factor, int inMem, Path tmpDir)
        throws IOException {
      LOG.info("Merging " + segments.size() + " sorted segments");

      // create the MergeStreams from the sorted map created in the constructor
      // and dump the final output to a file
      int numSegments = segments.size();
      int origFactor = factor;
      int passNo = 1;
      do {
        // get the factor for this pass of merge. We assume in-memory segments
        // are the first entries in the segment list and that the pass factor
        // doesn't apply to them
        factor = getPassFactor(factor, passNo, numSegments - inMem);
        if (1 == passNo) {
          factor += inMem;
        }
        List<Segment<K, V>> segmentsToMerge = new ArrayList<Segment<K, V>>();
        int segmentsConsidered = 0;
        int numSegmentsToConsider = factor;
        while (true) {
          // extract the smallest 'factor' number of segments
          // Call cleanup on the empty segments (no key/value data)
          List<Segment<K, V>> mStream = getSegmentDescriptors(numSegmentsToConsider);
          for (Segment<K, V> segment : mStream) {
            // Initialize the segment at the last possible moment;
            // this helps in ensuring we don't use buffers until we need them
            segment.init();
            long startPos = segment.getPosition();
            boolean hasNext = segment.next();
            long endPos = segment.getPosition();
            totalBytesProcessed += endPos - startPos;
            mergeProgress.set(totalBytesProcessed * progPerByte);
            if (hasNext) {
              segmentsToMerge.add(segment);
              segmentsConsidered++;
            } else {
              segment.close();
              numSegments--; // we ignore this segment for the merge
            }
          }
          // if we have the desired number of segments
          // or looked at all available segments, we break
          if (segmentsConsidered == factor || segments.size() == 0) {
            break;
          }

          numSegmentsToConsider = factor - segmentsConsidered;
        }

        // feed the streams to the priority queue
        initialize(segmentsToMerge.size());
        clear();
        for (Segment<K, V> segment : segmentsToMerge) {
          put(segment);
        }

        // if we have lesser number of segments remaining, then just return the
        // iterator, else do another single level merge
        if (numSegments <= factor) {
          // calculate the length of the remaining segments. Required for
          // calculating the merge progress
          long totalBytes = 0;
          for (int i = 0; i < segmentsToMerge.size(); i++) {
            totalBytes += segmentsToMerge.get(i).getLength();
          }
          if (totalBytes != 0) // being paranoid
          progPerByte = 1.0f / (float) totalBytes;

          if (totalBytes != 0) mergeProgress.set(totalBytesProcessed * progPerByte);
          else mergeProgress.set(1.0f); // Last pass and no segments left - we're done

          LOG.info(
              "Down to the last merge-pass, with "
                  + numSegments
                  + " segments left of total size: "
                  + totalBytes
                  + " bytes");
          return this;
        } else {
          LOG.info(
              "Merging "
                  + segmentsToMerge.size()
                  + " intermediate segments out of a total of "
                  + (segments.size() + segmentsToMerge.size()));

          // we want to spread the creation of temp files on multiple disks if
          // available under the space constraints
          long approxOutputSize = 0;
          for (Segment<K, V> s : segmentsToMerge) {
            approxOutputSize +=
                s.getLength() + ChecksumFileSystem.getApproxChkSumLength(s.getLength());
          }
          Path tmpFilename = new Path(tmpDir, "intermediate").suffix("." + passNo);

          Path outputFile =
              lDirAlloc.getLocalPathForWrite(tmpFilename.toString(), approxOutputSize, conf);

          Writer<K, V> writer = new Writer<K, V>(conf, fs, outputFile, keyClass, valueClass, codec);
          writeFile(this, writer, reporter);
          writer.close();

          // we finished one single level merge; now clean up the priority
          // queue
          this.close();

          // Add the newly create segment to the list of segments to be merged
          Segment<K, V> tempSegment = new Segment<K, V>(conf, fs, outputFile, codec, false);
          segments.add(tempSegment);
          numSegments = segments.size();
          Collections.sort(segments, segmentComparator);

          passNo++;
        }
        // we are worried about only the first pass merge factor. So reset the
        // factor to what it originally was
        factor = origFactor;
      } while (true);
    }