/**
 * Generate the requested number of file splits, with the filename set to the filename of the
 * output file.
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Decide how many input splits to create.
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  // If a property is not set, getInt/getLong return the supplied default.
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
  }
  long totalBytesToWrite =
      job.getLong(
          "test.randomwrite.total_bytes",
          numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
  }
  System.out.println("numMaps-------" + numMaps);
  InputSplit[] result = new InputSplit[numMaps];
  Path outDir = FileOutputFormat.getOutputPath(job);
  for (int i = 0; i < result.length; ++i) {
    result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null);
  }
  return result;
}
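/*
 * A minimal sketch of the split-count arithmetic above, independent of Hadoop.
 * The task-tracker count (4) is an assumption for illustration; the other
 * values are the defaults read by getSplits().
 */
public class SplitCountSketch {
  public static void main(String[] args) {
    int numMapsPerHost = 10;               // test.randomwriter.maps_per_host default
    long numBytesToWritePerMap = 1L << 30; // 1 GiB, test.randomwrite.bytes_per_map default
    int taskTrackers = 4;                  // assumed cluster size
    long totalBytesToWrite = numMapsPerHost * numBytesToWritePerMap * taskTrackers;
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    System.out.println(numMaps + " dummy splits"); // prints "40 dummy splits"
  }
}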
/**
 * This is the main routine for launching a distributed random write job. It runs 10 maps/node
 * and each node writes 1 gig of data to a DFS file. The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }
  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  // If a property is not set, getInt/getLong return the supplied default.
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite =
      job.getLong(
          "test.randomwrite.total_bytes",
          numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }
  // Advisory only: setNumMapTasks is a hint; the InputFormat decides the real split count.
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println(
      "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
  return 0;
}
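/*
 * Hedged usage sketch: a conventional Tool driver for the run() method above.
 * It assumes the enclosing class is RandomWriter (as suggested by
 * job.setJarByClass above) and that it extends Configured and implements Tool.
 */
public static void main(String[] args) throws Exception {
  int res = ToolRunner.run(new Configuration(), new RandomWriter(), args);
  System.exit(res);
}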
/** Save the values out of the configuration that we need to write the data. */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 20);
  minKeySize = job.getInt("test.randomwrite.min_key", 100);
  keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
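/*
 * Hedged sketch: overriding the key/value size tunables read by configure()
 * before submitting the job. The property names come from the method above;
 * the sizes themselves are arbitrary illustrative choices.
 */
JobConf job = new JobConf();
job.setLong("test.randomwrite.bytes_per_map", 20L * 1024 * 1024); // 20 MiB per map
job.setInt("test.randomwrite.min_key", 10);
job.setInt("test.randomwrite.max_key", 100);     // keys of 10..100 bytes
job.setInt("test.randomwrite.min_value", 0);
job.setInt("test.randomwrite.max_value", 10000); // values of 0..10000 bytes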
public void configure(JobConf job) {
  this.jobConf = job;
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  interval = jobConf.getInt("db.fetch.interval.default", 2592000); // 30 days, in seconds
  filters = new URLFilters(jobConf);
  scfilters = new ScoringFilters(jobConf);
  scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
  curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}
/**
 * Calculate how many maps to run. The number of maps is the cumulative size of the copy
 * divided by (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the command line), capped
 * at (distcp.max.map.tasks, default MAX_MAPS_PER_NODE * nodes in the cluster) and floored
 * at 1.
 *
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 */
private static void setMapCount(long totalBytes, JobConf job) throws IOException {
  int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  numMaps =
      Math.min(
          numMaps,
          job.getInt(
              MAX_MAPS_LABEL,
              MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers()));
  job.setNumMapTasks(Math.max(numMaps, 1));
}
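/*
 * Hedged worked example of the arithmetic above; the concrete values are
 * assumptions for illustration, not the class's actual constants:
 *   totalBytes = 10 GiB, bytes-per-map = 256 MiB  ->  numMaps = 40
 *   cap = 20 maps/node * 5 nodes = 100            ->  min(40, 100) = 40
 *   setNumMapTasks(max(40, 1))                    ->  40 map tasks
 */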
/**
 * Produce splits such that each is no greater than the quotient of the total size and the
 * number of splits requested.
 *
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException(
        "Invalid metadata: #files("
            + cnfiles
            + ") total_size("
            + cbsize
            + ") listuri("
            + srcfilelist
            + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePair value = new FilePair();
  final long targetsize = cbsize / numSplits;
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // If adding this file would push the current split past the target size,
      // cut the split here and start this file in the next split.
      if (acc + key.get() > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += key.get();
    }
  } finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
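/*
 * Hedged trace of the greedy cut above, with made-up numbers:
 * targetsize = 100, per-file sizes (key.get()) = 60, 30, 50, 40.
 *   +60 -> acc = 60
 *   +30 -> acc = 90
 *   +50 -> 90 + 50 > 100: cut a split covering files 1-2, then acc = 50
 *   +40 -> acc = 90; loop ends, the remainder (files 3-4) becomes the final split
 */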
/**
 * Use the input splits to take samples of the input and generate sample keys. By default reads
 * 100,000 keys from 10 locations in the input, sorts them and picks N-1 keys to generate N
 * equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    RecordReader<Text, Text> reader =
        inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i + 1) * recordsPerSample <= records) {
        break;
      }
    }
    reader.close(); // release the reader before moving to the next sample location
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
      SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for (Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
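/*
 * Hedged usage sketch: sample the input and write the partition file before
 * submitting a TeraSort-style job. The file location is a hypothetical choice;
 * a total-order partitioner would then read partFile to route each key to the
 * reducer that owns its range.
 */
JobConf conf = new JobConf();
conf.setNumReduceTasks(32); // N partitions -> N-1 sampled cut keys
Path partitionFile = new Path("/tmp/_partition.lst"); // assumed location
writePartitionFile(conf, partitionFile);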
private void getWriter(Reporter reporter, ObjectInspector inspector, int bucket)
    throws IOException {
  if (writer == null) {
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(jobConf);
    options
        .inspector(inspector)
        .writingBase(jobConf.getBoolean(IS_MAJOR, false))
        .isCompressed(jobConf.getBoolean(IS_COMPRESSED, false))
        .tableProperties(new StringableMap(jobConf.get(TABLE_PROPS)).toProperties())
        .reporter(reporter)
        .minimumTransactionId(jobConf.getLong(MIN_TXN, Long.MAX_VALUE))
        .maximumTransactionId(jobConf.getLong(MAX_TXN, Long.MIN_VALUE))
        .bucket(bucket)
        // setting statementId == -1 makes compacted delta files use the
        // delta_xxxx_yyyy format
        .statementId(-1);

    // Instantiate the underlying output format
    @SuppressWarnings("unchecked") // since there is no way to parametrize instance of Class
    AcidOutputFormat<WritableComparable, V> aof =
        instantiate(AcidOutputFormat.class, jobConf.get(OUTPUT_FORMAT_CLASS_NAME));

    writer = aof.getRawRecordWriter(new Path(jobConf.get(TMP_LOCATION)), options);
  }
}
private void getOptions(JobConf job) {
  pages = job.getLong("pages", 0);
  slotpages = job.getLong("slotpages", 0);
  delim = job.get("delimiter");
}
private void getOptions(JobConf job) {
  pages = job.getLong("pages", 0);
  slotpages = job.getLong("slotpages", 0);
}
public MergeManagerImpl(
    TaskAttemptID reduceId,
    JobConf jobConf,
    FileSystem localFS,
    LocalDirAllocator localDirAllocator,
    Reporter reporter,
    CompressionCodec codec,
    Class<? extends Reducer> combinerClass,
    CombineOutputCollector<K, V> combineCollector,
    Counters.Counter spilledRecordsCounter,
    Counters.Counter reduceCombineInputCounter,
    Counters.Counter mergedMapOutputsCounter,
    ExceptionReporter exceptionReporter,
    Progress mergePhase,
    MapOutputFile mapOutputFile) {
  this.reduceId = reduceId;
  this.jobConf = jobConf;
  this.localDirAllocator = localDirAllocator;
  this.exceptionReporter = exceptionReporter;
  this.reporter = reporter;
  this.codec = codec;
  this.combinerClass = combinerClass;
  this.combineCollector = combineCollector;
  this.reduceCombineInputCounter = reduceCombineInputCounter;
  this.spilledRecordsCounter = spilledRecordsCounter;
  this.mergedMapOutputsCounter = mergedMapOutputsCounter;
  this.mapOutputFile = mapOutputFile;
  this.mapOutputFile.setConf(jobConf);
  this.localFS = localFS;
  this.rfs = ((LocalFileSystem) localFS).getRaw();

  final float maxInMemCopyUse =
      jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f);
  if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) {
    throw new IllegalArgumentException(
        "Invalid value for "
            + MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT
            + ": "
            + maxInMemCopyUse);
  }

  // Allow unit tests to fix Runtime memory
  this.memoryLimit =
      (long)
          (jobConf.getLong(
                  MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES,
                  Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE))
              * maxInMemCopyUse);

  this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100);

  final float singleShuffleMemoryLimitPercent =
      jobConf.getFloat(
          MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT);
  if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) {
    throw new IllegalArgumentException(
        "Invalid value for "
            + MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT
            + ": "
            + singleShuffleMemoryLimitPercent);
  }

  usedMemory = 0L;
  commitMemory = 0L;
  this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent);
  this.memToMemMergeOutputsThreshold =
      jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor);
  this.mergeThreshold =
      (long) (this.memoryLimit * jobConf.getFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f));
  LOG.info(
      "MergeManager: memoryLimit="
          + memoryLimit
          + ", maxSingleShuffleLimit="
          + maxSingleShuffleLimit
          + ", mergeThreshold="
          + mergeThreshold
          + ", ioSortFactor="
          + ioSortFactor
          + ", memToMemMergeOutputsThreshold="
          + memToMemMergeOutputsThreshold);

  if (this.maxSingleShuffleLimit >= this.mergeThreshold) {
    throw new RuntimeException(
        "Invalid configuration: maxSingleShuffleLimit should be less than mergeThreshold"
            + " (maxSingleShuffleLimit: "
            + this.maxSingleShuffleLimit
            + ", mergeThreshold: "
            + this.mergeThreshold
            + ")");
  }

  boolean allowMemToMemMerge = jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
  if (allowMemToMemMerge) {
    this.memToMemMerger =
        new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold);
    this.memToMemMerger.start();
  } else {
    this.memToMemMerger = null;
  }

  this.inMemoryMerger = createInMemoryMerger();
  this.inMemoryMerger.start();

  this.onDiskMerger = new OnDiskMerger(this);
  this.onDiskMerger.start();

  this.mergePhase = mergePhase;
}
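/*
 * Hedged arithmetic with the defaults read above, assuming a 1 GiB reducer heap
 * and DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT = 0.25 (an assumption; the constant's
 * real value lives elsewhere in the class):
 *   memoryLimit           = 1024 MiB * 0.90       ~ 922 MiB
 *   maxSingleShuffleLimit = memoryLimit * 0.25    ~ 230 MiB  (larger map outputs go to disk)
 *   mergeThreshold        = memoryLimit * 0.90    ~ 829 MiB  (trigger for the in-memory merge)
 * The constructor's sanity check holds: 230 MiB < 829 MiB.
 */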
@Override
public void configure(JobConf job) {
  super.configure(job);
  this.job = job;
  this.maxNumOfValuesPerGroup = job.getLong("datajoin.maxNumOfValuesPerGroup", 100);
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException, MessageException {
  long minSizeNode = 0;
  long minSizeRack = 0;
  long maxSize = 0;

  // The values specified by setXxxSplitSize() take precedence over the
  // values that might have been specified in the config.
  if (minSplitSizeNode != 0) {
    minSizeNode = minSplitSizeNode;
  } else {
    minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
  }
  if (minSplitSizeRack != 0) {
    minSizeRack = minSplitSizeRack;
  } else {
    minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
  }
  if (maxSplitSize != 0) {
    maxSize = maxSplitSize;
  } else {
    maxSize = job.getLong("mapred.max.split.size", 0);
  }
  if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
    throw new IOException(
        "Minimum split size per node "
            + minSizeNode
            + " cannot be larger than maximum split size "
            + maxSize);
  }
  if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
    throw new IOException(
        "Minimum split size per rack "
            + minSizeRack
            + " cannot be larger than maximum split size "
            + maxSize);
  }
  if (minSizeRack != 0 && minSizeNode > minSizeRack) {
    throw new IOException(
        "Minimum split size per node "
            + minSizeNode
            + " cannot be smaller than minimum split size per rack "
            + minSizeRack);
  }

  // all the files in the input set
  Path[] paths = FileUtil.stat2Paths(listStatus(job));
  List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
  if (paths.length == 0) {
    return splits.toArray(new CombineFileSplit[splits.size()]);
  }

  // In one single iteration, process all the paths in a single pool.
  // Processing one pool at a time ensures that a split contains paths
  // from a single pool only.
  for (MultiPathFilter onepool : pools) {
    ArrayList<Path> myPaths = new ArrayList<Path>();

    // Pick one input path. If it matches all the filters in a pool,
    // add it to the output set.
    for (int i = 0; i < paths.length; i++) {
      if (paths[i] == null) { // already processed
        continue;
      }
      FileSystem fs = paths[i].getFileSystem(job);
      Path p = new Path(paths[i].toUri().getPath());
      if (onepool.accept(p)) {
        myPaths.add(paths[i]); // add it to my output set
        paths[i] = null; // already processed
      }
    }
    // create splits for all files in this pool
    getMoreSplits(
        job,
        myPaths.toArray(new Path[myPaths.size()]),
        maxSize,
        minSizeNode,
        minSizeRack,
        splits);
  }

  // Finally, process all paths that do not belong to any pool.
  ArrayList<Path> myPaths = new ArrayList<Path>();
  for (int i = 0; i < paths.length; i++) {
    if (paths[i] == null) { // already processed
      continue;
    }
    myPaths.add(paths[i]);
  }
  // create splits for all files that are not in any pool
  getMoreSplits(
      job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

  // free up the rackToNodes map
  rackToNodes.clear();
  return splits.toArray(new CombineFileSplit[splits.size()]);
}
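/*
 * Hedged sketch: the three knobs validated above, set so the required ordering
 * (per-node <= per-rack <= max) holds. The byte values are arbitrary
 * illustrative choices.
 */
JobConf job = new JobConf();
job.setLong("mapred.min.split.size.per.node", 32L * 1024 * 1024); // 32 MiB
job.setLong("mapred.min.split.size.per.rack", 64L * 1024 * 1024); // 64 MiB
job.setLong("mapred.max.split.size", 256L * 1024 * 1024);         // 256 MiB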
@Override // Mapper
public void configure(JobConf conf) {
  super.configure(conf);
  skipSize = conf.getLong("test.io.skip.size", 0);
}