/** * Generate the requested number of file splits, with the filename set to the filename of the * output file. */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { /** 设置输入分片的个数* */ JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; } System.out.println("numMaps-------" + numMaps); InputSplit[] result = new InputSplit[numMaps]; Path outDir = FileOutputFormat.getOutputPath(job); for (int i = 0; i < result.length; ++i) { result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null); } return result; }
/** * This method call when injected into a class will modify the output path, only if output is into * HDFS * * @param job Job whose output path need to be changed */ public static void modifyOutputPath(JobConf job) { Path path = org.apache.hadoop.mapred.FileOutputFormat.getOutputPath(job); if (path == null) { throw new IllegalArgumentException("Job Output path is null, expecting not null path value"); } StringBuilder out = new StringBuilder(path.toString()); out.append(SEPARATOR_UNDERSCORE).append(System.currentTimeMillis()); org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, new Path(out.toString())); }
public RecordWriter<WritableComparable, Writable> getRecordWriter( final FileSystem fs, JobConf job, String name, final Progressable progress) throws IOException { final Path segmentDumpFile = new Path(FileOutputFormat.getOutputPath(job), name); // Get the old copy out of the way if (fs.exists(segmentDumpFile)) fs.delete(segmentDumpFile, true); final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile)); return new RecordWriter<WritableComparable, Writable>() { public synchronized void write(WritableComparable key, Writable value) throws IOException { printStream.println(value); } public synchronized void close(Reporter reporter) throws IOException { printStream.close(); } }; }
public void configure(JobConf job) { _jobConf = job; crawlerCount = job.getInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, CrawlEnvironment.CRAWLERS.length); partitionNumber = job.getInt("mapred.task.partition", -1); try { FileSystem fs = FileSystem.get(job); Path workPath = FileOutputFormat.getOutputPath(job); debugURLStream = fs.create(new Path(workPath, "debugURLS-" + NUMBER_FORMAT.format(partitionNumber))); urlDebugURLWriter = new OutputStreamWriter(debugURLStream, Charset.forName("UTF-8")); _emittedURLSFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw new RuntimeException(e); } }
static void checkRecords( Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput) throws IOException { JobConf jobConf = new JobConf(defaults, RecordChecker.class); jobConf.setJobName("sortvalidate-record-checker"); jobConf.setInputFormat(SequenceFileInputFormat.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setOutputKeyClass(BytesWritable.class); jobConf.setOutputValueClass(IntWritable.class); jobConf.setMapperClass(Map.class); jobConf.setReducerClass(Reduce.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); if (noMaps == -1) { noMaps = cluster.getTaskTrackers() * jobConf.getInt("test.sortvalidate.maps_per_host", 10); } if (noReduces == -1) { noReduces = (int) (cluster.getMaxReduceTasks() * 0.9); String sortReduces = jobConf.get("test.sortvalidate.reduces_per_host"); if (sortReduces != null) { noReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces); } } jobConf.setNumMapTasks(noMaps); jobConf.setNumReduceTasks(noReduces); FileInputFormat.setInputPaths(jobConf, sortInput); FileInputFormat.addInputPath(jobConf, sortOutput); Path outputPath = new Path("/tmp/sortvalidate/recordchecker"); FileSystem fs = FileSystem.get(defaults); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(jobConf, outputPath); // Uncomment to run locally in a single process // job_conf.set("mapred.job.tracker", "local"); Path[] inputPaths = FileInputFormat.getInputPaths(jobConf); System.out.println( "\nSortValidator.RecordChecker: Running on " + cluster.getTaskTrackers() + " nodes to validate sort from " + inputPaths[0] + ", " + inputPaths[1] + " into " + FileOutputFormat.getOutputPath(jobConf) + " with " + noReduces + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println( "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); }
static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput) throws IOException { FileSystem inputfs = sortInput.getFileSystem(defaults); FileSystem outputfs = sortOutput.getFileSystem(defaults); FileSystem defaultfs = FileSystem.get(defaults); JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class); jobConf.setJobName("sortvalidate-recordstats-checker"); int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length; jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks); int noSortInputpaths = inputfs.listStatus(sortInput).length; jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); jobConf.setOutputKeyClass(IntWritable.class); jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class); jobConf.setMapperClass(Map.class); jobConf.setCombinerClass(Reduce.class); jobConf.setReducerClass(Reduce.class); jobConf.setNumMapTasks(noSortReduceTasks); jobConf.setNumReduceTasks(1); FileInputFormat.setInputPaths(jobConf, sortInput); FileInputFormat.addInputPath(jobConf, sortOutput); Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker"); if (defaultfs.exists(outputPath)) { defaultfs.delete(outputPath, true); } FileOutputFormat.setOutputPath(jobConf, outputPath); // Uncomment to run locally in a single process // job_conf.set("mapred.job.tracker", "local"); Path[] inputPaths = FileInputFormat.getInputPaths(jobConf); System.out.println( "\nSortValidator.RecordStatsChecker: Validate sort " + "from " + inputPaths[0] + " (" + noSortInputpaths + " files), " + inputPaths[1] + " (" + noSortReduceTasks + " files) into " + FileOutputFormat.getOutputPath(jobConf) + " with 1 reducer."); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println( "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); // Check to ensure that the statistics of the // framework's sort-input and sort-output match SequenceFile.Reader stats = new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults); IntWritable k1 = new IntWritable(); IntWritable k2 = new IntWritable(); RecordStatsWritable v1 = new RecordStatsWritable(); RecordStatsWritable v2 = new RecordStatsWritable(); if (!stats.next(k1, v1)) { throw new IOException("Failed to read record #1 from reduce's output"); } if (!stats.next(k2, v2)) { throw new IOException("Failed to read record #2 from reduce's output"); } if ((v1.getBytes() != v2.getBytes()) || (v1.getRecords() != v2.getRecords()) || v1.getChecksum() != v2.getChecksum()) { throw new IOException( "(" + v1.getBytes() + ", " + v1.getRecords() + ", " + v1.getChecksum() + ") v/s (" + v2.getBytes() + ", " + v2.getRecords() + ", " + v2.getChecksum() + ")"); } }