Пример #1
0
 /**
  * Reads the job parameters and loads the pruned ("rest") file into {@code pruned}.
  *
  * <p>When both the reducer partition id and the maximum key are available, only the key range
  * belonging to this reducer's stripe is requested; otherwise the whole file is requested.
  *
  * @param conf job configuration the parameters are read from
  */
 @Override
 public void configure(JobConf conf) {
   this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);

   final int partition = conf.getInt("mapred.task.partition", -1);
   final int maxKey = conf.getInt(PARAM_APS_MAXKEY, 0);
   final int stripeCount = conf.getInt(PARAM_APS_STRIPES, 1);
   final int reducersPerStripe = conf.getInt(PARAM_APS_REDUCER_PER_STRIPE, 1);

   final boolean stripeKnown = partition >= 0 && maxKey != 0;
   if (stripeKnown) {
     final int stripe = GenericKey.StripePartitioner.findStripe(partition, reducersPerStripe);
     final int firstKey = GenericKey.StripePartitioner.minKeyInStripe(stripe, stripeCount, maxKey);
     final int endKey =
         firstKey + GenericKey.StripePartitioner.numKeysInStripe(stripe, stripeCount, maxKey);
     // The range is half-open: firstKey is included, endKey is excluded, so the log shows
     // the last included key (endKey - 1).
     final String range = " (" + firstKey + "," + (endKey - 1) + ")";
     LOG.info("Reducer " + partition + " loading stripe " + stripe + " of " + stripeCount + range);
     haspruned = FileUtils.readRestFile(conf, pruned, firstKey, endKey);
   } else {
     LOG.error("Could not find stripe ID, reverting to whole rest file loading");
     LOG.debug("reducer = " + partition + "\t max = " + maxKey + "\t nstripes = " + stripeCount);
     // No usable stripe information: ask for the complete pruned file from the
     // DistributedCache.
     haspruned = FileUtils.readRestFile(conf, pruned);
   }

   if (haspruned) {
     LOG.info("Read " + pruned.size() + " entries from pruned file");
   } else {
     LOG.warn("No pruned file provided in DistributedCache");
   }
 }
Пример #2
0
 /**
  * Reads the threshold, creates the {@code MultipleOutputs} helper and loads the maxWi file.
  *
  * @param conf job configuration the parameters are read from
  * @throws AssertionError if {@code FileUtils.readMaxWiFile} reports that the file was not read
  */
 @Override
 public void configure(JobConf conf) {
   threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
   mos = new MultipleOutputs(conf);

   // The maxWeight_i file is expected in the DistributedCache; the task cannot proceed
   // without it.
   if (!FileUtils.readMaxWiFile(conf, maxWi)) {
     throw new AssertionError("Could not read maxWi file");
   }
 }
Пример #3
0
 /**
  * Stores the job configuration and initialises the URL normalizers, URL filters, scoring
  * filters and the scalar settings read from it.
  *
  * @param job job configuration; also kept in {@code jobConf}
  */
 public void configure(JobConf job) {
   this.jobConf = job;

   // Helper objects, created in the same order as before.
   this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
   final int defaultInterval = job.getInt("db.fetch.interval.default", 2592000);
   this.filters = new URLFilters(job);
   this.scfilters = new ScoringFilters(job);

   // Plain values.
   final float injectedScore = job.getFloat("db.score.injected", 1.0f);
   final long now = job.getLong("injector.current.time", System.currentTimeMillis());
   this.interval = defaultInterval;
   this.scoreInjected = injectedScore;
   this.curTime = now;
 }
Пример #4
0
 /**
  * Reads {@code PARAM_APS_THRESHOLD} from the job configuration into {@code threshold},
  * using {@code DEFAULT_THRESHOLD} when the parameter is not set.
  *
  * @param conf job configuration the parameter is read from
  */
 @Override
 public void configure(JobConf conf) {
   final float configured = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
   threshold = configured;
 }
Пример #5
0
  /**
   * Produces the single iterator that feeds the reduce, combining the given in-memory and
   * on-disk map outputs.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Validate {@code MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT} and derive from it the number
   *       of bytes passed to {@code createInMemorySegments} as the in-memory budget.
   *   <li>If there are in-memory outputs, turn some of them into segments. When the number of
   *       on-disk outputs is below {@code ioSortFactor}, those segments are merged and written to
   *       one new on-disk file (appended to {@code onDiskMapOutputs}); otherwise they are kept
   *       and later merged together with the disk segments.
   *   <li>Wrap every on-disk output in a segment and sort the segments by ascending length.
   *   <li>Turn the remaining in-memory outputs into the final segment list.
   *   <li>If there is anything on disk, merge the disk segments; return that merge directly when
   *       no in-memory segments remain, otherwise add it as one more final segment.
   *   <li>Merge the final segments and return the result.
   * </ol>
   *
   * @param job job configuration (buffer percent, key/value classes, comparator, keep-files flag)
   * @param fs file system used to read the on-disk outputs and to write the spill file
   * @param inMemoryMapOutputs map outputs currently held in memory
   * @param onDiskMapOutputs map outputs on disk; a merged spill file may be appended to this list
   * @return iterator over the merged key/value stream
   * @throws IOException if the configured buffer percent is outside {@code [0, 1]}, or if reading,
   *     writing or merging fails
   */
  private RawKeyValueIterator finalMerge(
      JobConf job,
      FileSystem fs,
      List<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
      List<CompressAwarePath> onDiskMapOutputs)
      throws IOException {
    LOG.info(
        "finalMerge called with "
            + inMemoryMapOutputs.size()
            + " in-memory map-outputs and "
            + onDiskMapOutputs.size()
            + " on-disk map-outputs");

    final float maxRedPer = job.getFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0f);
    if (maxRedPer > 1.0 || maxRedPer < 0.0) {
      // Same "Invalid value for KEY: VALUE" wording as the other configuration checks in this
      // class, so the offending key and value are readable in the task log.
      throw new IOException(
          "Invalid value for " + MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT + ": " + maxRedPer);
    }
    // Fraction of the JVM's max heap, clamped so that it fits in an int.
    int maxInMemReduce =
        (int) Math.min(Runtime.getRuntime().maxMemory() * maxRedPer, Integer.MAX_VALUE);

    // merge config params
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
    boolean keepInputs = job.getKeepFailedTaskFiles();
    final Path tmpDir = new Path(reduceId.toString());
    final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();

    // segments required to vacate memory
    List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
    long inMemToDiskBytes = 0;
    boolean mergePhaseFinished = false;
    if (!inMemoryMapOutputs.isEmpty()) {
      // The id of the first in-memory output is used to name the spill file below.
      TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
      // NOTE(review): presumably moves outputs into memDiskSegments until only maxInMemReduce
      // bytes remain in memory and returns the bytes moved - confirm against
      // createInMemorySegments.
      inMemToDiskBytes =
          createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
      final int numMemDiskSegments = memDiskSegments.size();
      if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {

        // If we reach here, it implies that we have less than io.sort.factor
        // disk segments and this will be incremented by 1 (result of the
        // memory segments merge). Since this total would still be
        // <= io.sort.factor, we will not do any more intermediate merges,
        // the merge of all these disk segments would be directly fed to the
        // reduce method

        mergePhaseFinished = true;
        // must spill to disk, but can't retain in-mem for intermediate merge
        final Path outputPath =
            mapOutputFile
                .getInputFileForWrite(mapId, inMemToDiskBytes)
                .suffix(Task.MERGED_OUTPUT_PREFIX);
        final RawKeyValueIterator rIter =
            Merger.merge(
                job,
                fs,
                keyClass,
                valueClass,
                memDiskSegments,
                numMemDiskSegments,
                tmpDir,
                comparator,
                reporter,
                spilledRecordsCounter,
                null,
                mergePhase);
        Writer<K, V> writer =
            new Writer<K, V>(job, fs, outputPath, keyClass, valueClass, codec, null);
        try {
          Merger.writeFile(rIter, writer, reporter, job);
          // Close before reading the lengths recorded on the writer.
          writer.close();
          // add to list of final disk outputs.
          onDiskMapOutputs.add(
              new CompressAwarePath(
                  outputPath, writer.getRawLength(), writer.getCompressedLength()));
          // Mark as closed so the finally block does not close it a second time.
          writer = null;
        } catch (IOException e) {
          if (null != outputPath) {
            try {
              // Best-effort cleanup of the partial spill file; the original failure is what
              // gets reported.
              fs.delete(outputPath, true);
            } catch (IOException ie) {
              // NOTHING
            }
          }
          throw e;
        } finally {
          if (null != writer) {
            writer.close();
          }
        }
        LOG.info(
            "Merged "
                + numMemDiskSegments
                + " segments, "
                + inMemToDiskBytes
                + " bytes to disk to satisfy "
                + "reduce memory limit");
        // Those bytes are now accounted for through the new on-disk file.
        inMemToDiskBytes = 0;
        memDiskSegments.clear();
      } else if (inMemToDiskBytes != 0) {
        LOG.info(
            "Keeping "
                + numMemDiskSegments
                + " segments, "
                + inMemToDiskBytes
                + " bytes in memory for "
                + "intermediate, on-disk merge");
      }
    }

    // segments on disk
    List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
    long onDiskBytes = inMemToDiskBytes;
    long rawBytes = inMemToDiskBytes;
    CompressAwarePath[] onDisk =
        onDiskMapOutputs.toArray(new CompressAwarePath[onDiskMapOutputs.size()]);
    for (CompressAwarePath file : onDisk) {
      long fileLength = fs.getFileStatus(file).getLen();
      onDiskBytes += fileLength;
      // Fall back to the on-disk length when no positive raw length was recorded.
      rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;

      LOG.debug("Disk file: " + file + " Length is " + fileLength);
      // Files carrying the merged-output suffix get no mergedMapOutputsCounter.
      diskSegments.add(
          new Segment<K, V>(
              job,
              fs,
              file,
              codec,
              keepInputs,
              (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX)
                  ? null
                  : mergedMapOutputsCounter),
              file.getRawDataLength()));
    }
    LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
    // Order the disk segments by ascending length.
    Collections.sort(
        diskSegments,
        new Comparator<Segment<K, V>>() {
          public int compare(Segment<K, V> o1, Segment<K, V> o2) {
            if (o1.getLength() == o2.getLength()) {
              return 0;
            }
            return o1.getLength() < o2.getLength() ? -1 : 1;
          }
        });

    // build final list of segments from merged backed by disk + in-mem
    List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
    long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
    LOG.info(
        "Merging "
            + finalSegments.size()
            + " segments, "
            + inMemBytes
            + " bytes from memory into reduce");
    if (0 != onDiskBytes) {
      // Segments kept in memory for the intermediate merge go in front of the disk segments.
      final int numInMemSegments = memDiskSegments.size();
      diskSegments.addAll(0, memDiskSegments);
      memDiskSegments.clear();
      // Pass mergePhase only if there is a going to be intermediate
      // merges. See comment where mergePhaseFinished is being set
      Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
      RawKeyValueIterator diskMerge =
          Merger.merge(
              job,
              fs,
              keyClass,
              valueClass,
              codec,
              diskSegments,
              ioSortFactor,
              numInMemSegments,
              tmpDir,
              comparator,
              reporter,
              false,
              spilledRecordsCounter,
              null,
              thisPhase);
      diskSegments.clear();
      if (finalSegments.isEmpty()) {
        // Nothing left in memory: the disk merge is the final result.
        return diskMerge;
      }
      // Expose the disk merge as one more segment of the final merge.
      finalSegments.add(
          new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
    }
    return Merger.merge(
        job,
        fs,
        keyClass,
        valueClass,
        finalSegments,
        finalSegments.size(),
        tmpDir,
        comparator,
        reporter,
        spilledRecordsCounter,
        null,
        null);
  }
Пример #6
0
  /**
   * Creates the merge manager, derives its memory limits and merge thresholds from the job
   * configuration, and starts the mergers.
   *
   * <p>All fields, including {@code mergePhase}, are assigned before any merger is started,
   * because the mergers are handed a reference to this instance.
   *
   * @param reduceId id of the reduce task attempt
   * @param jobConf job configuration the limits and thresholds are read from
   * @param localFS local file system; must be a {@code LocalFileSystem}, since its raw file system
   *     is extracted with a cast
   * @param localDirAllocator allocator for local directories
   * @param reporter progress reporter
   * @param codec compression codec stored for use with the map outputs
   * @param combinerClass combiner class
   * @param combineCollector collector for combiner output
   * @param spilledRecordsCounter counter stored in the field of the same name
   * @param reduceCombineInputCounter counter stored in the field of the same name
   * @param mergedMapOutputsCounter counter stored in the field of the same name
   * @param exceptionReporter stored in the field of the same name
   * @param mergePhase progress object for the merge phase
   * @param mapOutputFile map output file helper; {@code setConf(jobConf)} is called on it
   * @throws IllegalArgumentException if {@code MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT} is
   *     outside {@code [0, 1]} or {@code MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT} is outside
   *     {@code (0, 1]}
   * @throws RuntimeException if the resulting single-shuffle limit is not below the merge
   *     threshold
   */
  public MergeManagerImpl(
      TaskAttemptID reduceId,
      JobConf jobConf,
      FileSystem localFS,
      LocalDirAllocator localDirAllocator,
      Reporter reporter,
      CompressionCodec codec,
      Class<? extends Reducer> combinerClass,
      CombineOutputCollector<K, V> combineCollector,
      Counters.Counter spilledRecordsCounter,
      Counters.Counter reduceCombineInputCounter,
      Counters.Counter mergedMapOutputsCounter,
      ExceptionReporter exceptionReporter,
      Progress mergePhase,
      MapOutputFile mapOutputFile) {
    this.reduceId = reduceId;
    this.jobConf = jobConf;
    this.localDirAllocator = localDirAllocator;
    this.exceptionReporter = exceptionReporter;

    this.reporter = reporter;
    this.codec = codec;
    this.combinerClass = combinerClass;
    this.combineCollector = combineCollector;
    this.reduceCombineInputCounter = reduceCombineInputCounter;
    this.spilledRecordsCounter = spilledRecordsCounter;
    this.mergedMapOutputsCounter = mergedMapOutputsCounter;
    this.mapOutputFile = mapOutputFile;
    this.mapOutputFile.setConf(jobConf);

    this.localFS = localFS;
    this.rfs = ((LocalFileSystem) localFS).getRaw();

    final float maxInMemCopyUse = jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f);
    if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
      throw new IllegalArgumentException(
          "Invalid value for " + MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT + ": " + maxInMemCopyUse);
    }

    // Allow unit tests to fix Runtime memory
    this.memoryLimit =
        (long)
            (jobConf.getLong(
                    MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES,
                    Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE))
                * maxInMemCopyUse);

    this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100);

    final float singleShuffleMemoryLimitPercent =
        jobConf.getFloat(
            MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT);
    if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
      throw new IllegalArgumentException(
          "Invalid value for "
              + MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT
              + ": "
              + singleShuffleMemoryLimitPercent);
    }

    usedMemory = 0L;
    commitMemory = 0L;
    this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent);
    // The mem-to-mem threshold defaults to the sort factor read above.
    this.memToMemMergeOutputsThreshold =
        jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor);
    this.mergeThreshold =
        (long) (this.memoryLimit * jobConf.getFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f));
    LOG.info(
        "MergerManager: memoryLimit="
            + memoryLimit
            + ", "
            + "maxSingleShuffleLimit="
            + maxSingleShuffleLimit
            + ", "
            + "mergeThreshold="
            + mergeThreshold
            + ", "
            + "ioSortFactor="
            + ioSortFactor
            + ", "
            + "memToMemMergeOutputsThreshold="
            + memToMemMergeOutputsThreshold);

    if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
      throw new RuntimeException(
          "Invalid configuration: "
              + "maxSingleShuffleLimit should be less than mergeThreshold. "
              + "maxSingleShuffleLimit: "
              + this.maxSingleShuffleLimit
              + ", mergeThreshold: "
              + this.mergeThreshold);
    }

    // Finish initialising this object before any merger that holds a reference to it is
    // started, so none of them can observe an unset mergePhase.
    this.mergePhase = mergePhase;

    boolean allowMemToMemMerge = jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
    if (allowMemToMemMerge) {
      this.memToMemMerger =
          new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
      this.memToMemMerger.start();
    } else {
      this.memToMemMerger = null;
    }

    this.inMemoryMerger = createInMemoryMerger();
    this.inMemoryMerger.start();

    this.onDiskMerger = new OnDiskMerger(this);
    this.onDiskMerger.start();
  }
Пример #7
0
 /**
  * Reads the injector settings from the job configuration into the corresponding fields.
  *
  * @param job job configuration the settings are read from
  */
 public void configure(JobConf job) {
   // Behaviour flags; both are off unless set in the configuration.
   final boolean overwriteFlag = job.getBoolean("db.injector.overwrite", false);
   final boolean updateFlag = job.getBoolean("db.injector.update", false);

   // Values applied to injected entries.
   final int defaultInterval = job.getInt("db.fetch.interval.default", 2592000);
   final float injectedScore = job.getFloat("db.score.injected", 1.0f);

   this.interval = defaultInterval;
   this.scoreInjected = injectedScore;
   this.overwrite = overwriteFlag;
   this.update = updateFlag;
 }
Пример #8
0
 /**
  * Reads {@code max.doc.length} from the job configuration into {@code maxDocLength},
  * using 500 when the property is not set.
  *
  * @param job job configuration the property is read from
  */
 public void configure(JobConf job) {
   final float configuredMax = job.getFloat("max.doc.length", 500);
   maxDocLength = configuredMax;
 }