@Override
public void configure(JobConf conf) {
  this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
  int reducerID = conf.getInt("mapred.task.partition", -1);
  int max = conf.getInt(PARAM_APS_MAXKEY, 0);
  int nstripes = conf.getInt(PARAM_APS_STRIPES, 1);
  int spread = conf.getInt(PARAM_APS_REDUCER_PER_STRIPE, 1);
  if (reducerID < 0 || max == 0) {
    LOG.error("Could not find stripe ID, reverting to whole rest file loading");
    LOG.debug("reducer = " + reducerID + "\t max = " + max + "\t nstripes = " + nstripes);
    // open the pruned part file in the DistributedCache
    haspruned = FileUtils.readRestFile(conf, pruned);
  } else {
    int stripe = GenericKey.StripePartitioner.findStripe(reducerID, spread);
    int from = GenericKey.StripePartitioner.minKeyInStripe(stripe, nstripes, max);
    int to = from + GenericKey.StripePartitioner.numKeysInStripe(stripe, nstripes, max);
    // read from 'from' included, to 'to' excluded
    LOG.info("Reducer " + reducerID + " loading stripe " + stripe + " of " + nstripes
        + " (" + from + "," + (to - 1) + ")");
    haspruned = FileUtils.readRestFile(conf, pruned, from, to);
  }
  if (!haspruned) {
    LOG.warn("No pruned file provided in DistributedCache");
  } else {
    LOG.info("Read " + pruned.size() + " entries from pruned file");
  }
}
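// Hedged sketch (not from the original source): one way a driver *might* populate the
// stripe parameters that the configure() method above reads back. The job class, the
// concrete values, and the cache path are assumptions for illustration only; the real
// project presumably registers the pruned file through its own FileUtils/driver code.
JobConf job = new JobConf(SimilarityJob.class);   // hypothetical job class
job.setFloat(PARAM_APS_THRESHOLD, 0.7f);          // similarity threshold read via getFloat above
job.setInt(PARAM_APS_MAXKEY, 1_000_000);          // largest key id, used to size the stripes
job.setInt(PARAM_APS_STRIPES, 16);                // number of stripes
job.setInt(PARAM_APS_REDUCER_PER_STRIPE, 4);      // reducers serving each stripe
// pruned "rest" file that each reducer loads (in whole or per stripe) in configure()
DistributedCache.addCacheFile(new Path("/cache/pruned.part").toUri(), job);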
@Override
public void configure(JobConf conf) {
  threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
  mos = new MultipleOutputs(conf);
  // open the maxWeight_i file in the DistributedCache
  boolean succeeded = FileUtils.readMaxWiFile(conf, maxWi);
  if (!succeeded) {
    throw new AssertionError("Could not read maxWi file");
  }
}
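// Hedged sketch (assumption, not from the original source): before the configure() above
// runs, a driver would typically register the named outputs that MultipleOutputs serves
// and push the maxWeight_i file into the DistributedCache. The output name "pruned",
// the key/value classes, and the cache path are hypothetical.
JobConf job = new JobConf();
job.setFloat(PARAM_APS_THRESHOLD, 0.7f);
MultipleOutputs.addNamedOutput(job, "pruned", SequenceFileOutputFormat.class,
    IntWritable.class, Text.class);               // hypothetical key/value classes
DistributedCache.addCacheFile(new Path("/cache/maxWi").toUri(), job);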
public void configure(JobConf job) {
  this.jobConf = job;
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  filters = new URLFilters(jobConf);
  scfilters = new ScoringFilters(jobConf);
  scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
  curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}
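// Hedged sketch (assumption): the properties read above come from the Nutch configuration;
// a caller could override them on the JobConf before submitting the inject job. The values
// are illustrative, and NutchConfiguration.create() is assumed as the usual entry point.
JobConf job = new JobConf(NutchConfiguration.create());
job.setInt("db.fetch.interval.default", 30 * 24 * 60 * 60);  // refetch interval in seconds (30 days = 2592000)
job.setFloat("db.score.injected", 1.0f);                     // initial score given to injected URLs
job.setLong("injector.current.time", System.currentTimeMillis());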
@Override
public void configure(JobConf conf) {
  this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD);
}
private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs,
    List<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
    List<CompressAwarePath> onDiskMapOutputs) throws IOException {
  LOG.info("finalMerge called with " + inMemoryMapOutputs.size()
      + " in-memory map-outputs and " + onDiskMapOutputs.size() + " on-disk map-outputs");

  final float maxRedPer = job.getFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0f);
  if (maxRedPer > 1.0 || maxRedPer < 0.0) {
    throw new IOException(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT + maxRedPer);
  }
  int maxInMemReduce =
      (int) Math.min(Runtime.getRuntime().maxMemory() * maxRedPer, Integer.MAX_VALUE);

  // merge config params
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
  boolean keepInputs = job.getKeepFailedTaskFiles();
  final Path tmpDir = new Path(reduceId.toString());
  final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();

  // segments required to vacate memory
  List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
  long inMemToDiskBytes = 0;
  boolean mergePhaseFinished = false;
  if (inMemoryMapOutputs.size() > 0) {
    TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
    inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
    final int numMemDiskSegments = memDiskSegments.size();
    if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
      // If we reach here, it implies that we have less than io.sort.factor
      // disk segments and this will be incremented by 1 (result of the
      // memory segments merge). Since this total would still be
      // <= io.sort.factor, we will not do any more intermediate merges,
      // the merge of all these disk segments would be directly fed to the
      // reduce method
      mergePhaseFinished = true;
      // must spill to disk, but can't retain in-mem for intermediate merge
      final Path outputPath = mapOutputFile.getInputFileForWrite(mapId, inMemToDiskBytes)
          .suffix(Task.MERGED_OUTPUT_PREFIX);
      final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass, valueClass,
          memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter,
          spilledRecordsCounter, null, mergePhase);
      Writer<K, V> writer =
          new Writer<K, V>(job, fs, outputPath, keyClass, valueClass, codec, null);
      try {
        Merger.writeFile(rIter, writer, reporter, job);
        writer.close();
        // add to list of final disk outputs
        onDiskMapOutputs.add(new CompressAwarePath(outputPath, writer.getRawLength(),
            writer.getCompressedLength()));
        writer = null;
      } catch (IOException e) {
        if (null != outputPath) {
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // NOTHING
          }
        }
        throw e;
      } finally {
        if (null != writer) {
          writer.close();
        }
      }
      LOG.info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes
          + " bytes to disk to satisfy reduce memory limit");
      inMemToDiskBytes = 0;
      memDiskSegments.clear();
    } else if (inMemToDiskBytes != 0) {
      LOG.info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes
          + " bytes in memory for intermediate, on-disk merge");
    }
  }

  // segments on disk
  List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
  long onDiskBytes = inMemToDiskBytes;
  long rawBytes = inMemToDiskBytes;
  CompressAwarePath[] onDisk =
      onDiskMapOutputs.toArray(new CompressAwarePath[onDiskMapOutputs.size()]);
  for (CompressAwarePath file : onDisk) {
    long fileLength = fs.getFileStatus(file).getLen();
    onDiskBytes += fileLength;
    rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;
    LOG.debug("Disk file: " + file + " Length is " + fileLength);
    diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs,
        (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX)
            ? null : mergedMapOutputsCounter),
        file.getRawDataLength()));
  }
  LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
  Collections.sort(diskSegments, new Comparator<Segment<K, V>>() {
    public int compare(Segment<K, V> o1, Segment<K, V> o2) {
      if (o1.getLength() == o2.getLength()) {
        return 0;
      }
      return o1.getLength() < o2.getLength() ? -1 : 1;
    }
  });

  // build final list of segments from merged backed by disk + in-mem
  List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
  long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
  LOG.info("Merging " + finalSegments.size() + " segments, " + inMemBytes
      + " bytes from memory into reduce");
  if (0 != onDiskBytes) {
    final int numInMemSegments = memDiskSegments.size();
    diskSegments.addAll(0, memDiskSegments);
    memDiskSegments.clear();
    // Pass mergePhase only if there is going to be an intermediate
    // merge. See comment where mergePhaseFinished is being set
    Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
    RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass, codec,
        diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, reporter,
        false, spilledRecordsCounter, null, thisPhase);
    diskSegments.clear();
    if (0 == finalSegments.size()) {
      return diskMerge;
    }
    finalSegments.add(
        new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
  }
  return Merger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(),
      tmpDir, comparator, reporter, spilledRecordsCounter, null, null);
}
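// Hedged sketch (assumption, not part of the Hadoop source): how a user job might tune
// the knob that finalMerge() reads. Keeping map outputs in memory through the reduce
// phase is only safe when the reduce function itself needs little heap.
JobConf job = new JobConf();
job.setFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0.7f);  // allow ~70% of the heap to retain map outputs fed to reduce()
// finalMerge() then computes, roughly:
//   maxInMemReduce = min(Runtime.getRuntime().maxMemory() * 0.7, Integer.MAX_VALUE)
// and only spills the in-memory segments that do not fit under that budget.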
public MergeManagerImpl(TaskAttemptID reduceId, JobConf jobConf, FileSystem localFS,
    LocalDirAllocator localDirAllocator, Reporter reporter, CompressionCodec codec,
    Class<? extends Reducer> combinerClass, CombineOutputCollector<K, V> combineCollector,
    Counters.Counter spilledRecordsCounter, Counters.Counter reduceCombineInputCounter,
    Counters.Counter mergedMapOutputsCounter, ExceptionReporter exceptionReporter,
    Progress mergePhase, MapOutputFile mapOutputFile) {
  this.reduceId = reduceId;
  this.jobConf = jobConf;
  this.localDirAllocator = localDirAllocator;
  this.exceptionReporter = exceptionReporter;
  this.reporter = reporter;
  this.codec = codec;
  this.combinerClass = combinerClass;
  this.combineCollector = combineCollector;
  this.reduceCombineInputCounter = reduceCombineInputCounter;
  this.spilledRecordsCounter = spilledRecordsCounter;
  this.mergedMapOutputsCounter = mergedMapOutputsCounter;
  this.mapOutputFile = mapOutputFile;
  this.mapOutputFile.setConf(jobConf);
  this.localFS = localFS;
  this.rfs = ((LocalFileSystem) localFS).getRaw();

  final float maxInMemCopyUse =
      jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f);
  if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
    throw new IllegalArgumentException("Invalid value for "
        + MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT + ": " + maxInMemCopyUse);
  }

  // Allow unit tests to fix Runtime memory
  this.memoryLimit = (long) (jobConf.getLong(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES,
      Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE)) * maxInMemCopyUse);

  this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100);

  final float singleShuffleMemoryLimitPercent = jobConf.getFloat(
      MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT);
  if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
    throw new IllegalArgumentException("Invalid value for "
        + MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT + ": " + singleShuffleMemoryLimitPercent);
  }

  usedMemory = 0L;
  commitMemory = 0L;
  this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent);
  this.memToMemMergeOutputsThreshold =
      jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor);
  this.mergeThreshold = (long) (this.memoryLimit
      * jobConf.getFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f));
  LOG.info("MergerManager: memoryLimit=" + memoryLimit + ", "
      + "maxSingleShuffleLimit=" + maxSingleShuffleLimit + ", "
      + "mergeThreshold=" + mergeThreshold + ", "
      + "ioSortFactor=" + ioSortFactor + ", "
      + "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);

  if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
    throw new RuntimeException("Invalid configuration: "
        + "maxSingleShuffleLimit should be less than mergeThreshold; "
        + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
        + ", mergeThreshold: " + this.mergeThreshold);
  }

  boolean allowMemToMemMerge = jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
  if (allowMemToMemMerge) {
    this.memToMemMerger =
        new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
    this.memToMemMerger.start();
  } else {
    this.memToMemMerger = null;
  }

  this.inMemoryMerger = createInMemoryMerger();
  this.inMemoryMerger.start();

  this.onDiskMerger = new OnDiskMerger(this);
  this.onDiskMerger.start();

  this.mergePhase = mergePhase;
}
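// Hedged sketch (assumption): an illustration of how the shuffle-memory knobs read by the
// constructor above relate to each other. The values are examples only; with a ~1 GB
// reducer heap and the example settings below, roughly:
//   memoryLimit           = 1 GB * 0.90 (shuffle input buffer percent)  ~= 920 MB
//   maxSingleShuffleLimit = memoryLimit * 0.25 (single shuffle limit)   ~= 230 MB
//   mergeThreshold        = memoryLimit * 0.90 (shuffle merge percent)  ~= 828 MB
// The constructor rejects configurations where maxSingleShuffleLimit >= mergeThreshold.
JobConf job = new JobConf();
job.setFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f);
job.setFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, 0.25f);
job.setFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f);
job.setInt(MRJobConfig.IO_SORT_FACTOR, 100);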
public void configure(JobConf job) {
  interval = job.getInt("db.fetch.interval.default", 2592000);
  scoreInjected = job.getFloat("db.score.injected", 1.0f);
  overwrite = job.getBoolean("db.injector.overwrite", false);
  update = job.getBoolean("db.injector.update", false);
}
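// Hedged sketch (assumption): setting the injector flags read above from a driver.
// The property names match the snippet; the values and their effect described in the
// comments are illustrative, not a statement of the injector's exact semantics.
JobConf job = new JobConf();
job.setBoolean("db.injector.overwrite", true);   // overwrite existing entries for re-injected URLs
job.setBoolean("db.injector.update", false);     // do not merge injected metadata into existing entries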
public void configure(JobConf job) {
  maxDocLength = job.getFloat("max.doc.length", 500);
}