/**
 * Reads the per-map byte budget and the key/value bounds from the job configuration.
 *
 * @param job job configuration holding the {@code RandomTextWriter} settings
 */
public void configure(JobConf job) {
  // Default byte budget when RandomTextWriter.BYTES_PER_MAP is not configured (2^30).
  final long defaultBytesPerMap = 1 * 1024 * 1024 * 1024;
  bytesToWrite = job.getLong(RandomTextWriter.BYTES_PER_MAP, defaultBytesPerMap);

  // Key bounds default to 5 and 10.
  keymin = job.getInt(RandomTextWriter.MIN_KEY, 5);
  keymax = job.getInt(RandomTextWriter.MAX_KEY, 10);

  // Value bounds default to 5 and 10.
  valmin = job.getInt(RandomTextWriter.MIN_VALUE, 5);
  valmax = job.getInt(RandomTextWriter.MAX_VALUE, 10);
}
public void configure(JobConf conf) { numberOfCenters = Integer.valueOf(conf.get("numberOfCenters")); centersDirectory = conf.get("centersReadDirectory"); try { Configuration c = new Configuration(); FileSystem fs = FileSystem.get(c); for (int index = 0; index < numberOfCenters; ++index) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c); LongWritable key = new LongWritable(); Point value = new Point(); reader.next(key, value); Point center = (Point) value; centers.add(center); reader.close(); } } catch (IOException e) { // do nothing // I hope this doesn't happen System.out.println("well, damn."); e.printStackTrace(); } }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf( "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature"); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MaxTemperatureMapper.class); conf.setCombinerClass(MaxTemperatureReducer.class); conf.setReducerClass(MaxTemperatureReducer.class); // vv MaxTemperatureDriverV6 conf.setProfileEnabled(true); conf.setProfileParams( "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s"); conf.setProfileTaskRange(true, "0-2"); // ^^ MaxTemperatureDriverV6 JobClient.runJob(conf); return 0; }
private void testMapFileOutputCommitterInternal(int version) throws Exception { JobConf conf = new JobConf(); FileOutputFormat.setOutputPath(conf, outDir); conf.set(JobContext.TASK_ATTEMPT_ID, attempt); conf.setInt( org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, version); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(); // setup committer.setupJob(jContext); committer.setupTask(tContext); // write output MapFileOutputFormat theOutputFormat = new MapFileOutputFormat(); RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(null, conf, partFile, null); writeMapFileOutput(theRecordWriter, tContext); // do commit if (committer.needsTaskCommit(tContext)) { committer.commitTask(tContext); } committer.commitJob(jContext); // validate output validateMapFileOutputContent(FileSystem.get(conf), outDir); FileUtil.fullyDelete(new File(outDir.toString())); }
/**
 * Set the max number of attempts before we declare a TIP as "failed". The limit is taken from
 * the map setting for map tasks and from the reduce setting otherwise.
 */
private void setMaxTaskAttempts() {
  this.maxTaskAttempts =
      isMapTask() ? conf.getMaxMapAttempts() : conf.getMaxReduceAttempts();
}
/** * Start simulated task trackers based on topology. * * @param clusterStory the cluster topology. * @param jobConf configuration object. * @param now time stamp when the simulator is started, {@link SimulatorTaskTracker}s are started * uniformly randomly spread in [now,now+startDuration). * @return time stamp by which the entire cluster is booted up and all task trackers are sending * hearbeats in their steady rate. */ long startTaskTrackers(ClusterStory cluster, JobConf jobConf, long now) { /** port assigned to TTs, incremented by 1 for each TT */ int port = 10000; int numTaskTrackers = 0; Random random = new Random(RandomSeedGenerator.getSeed("forStartTaskTrackers()", masterRandomSeed)); final int startDuration = jobConf.getInt("mumak.cluster.startup.duration", DEFAULT_CLUSTER_STARTUP_DURATION); for (MachineNode node : cluster.getMachines()) { jobConf.set("mumak.tasktracker.host.name", node.getName()); jobConf.set( "mumak.tasktracker.tracker.name", "tracker_" + node.getName() + ":localhost/127.0.0.1:" + port); long subRandomSeed = RandomSeedGenerator.getSeed("forTaskTracker" + numTaskTrackers, masterRandomSeed); jobConf.setLong("mumak.tasktracker.random.seed", subRandomSeed); numTaskTrackers++; port++; SimulatorTaskTracker tt = new SimulatorTaskTracker(jt, jobConf); long firstHeartbeat = now + random.nextInt(startDuration); queue.addAll(tt.init(firstHeartbeat)); } // In startDuration + heartbeat interval of the full cluster time each // TT is started up and told on its 2nd heartbeat to beat at a rate // corresponding to the steady state of the cluster long clusterSteady = now + startDuration + jt.getNextHeartbeatInterval(); return clusterSteady; }
private void testAbortInternal(int version) throws IOException, InterruptedException { JobConf conf = new JobConf(); FileOutputFormat.setOutputPath(conf, outDir); conf.set(JobContext.TASK_ATTEMPT_ID, attempt); conf.setInt( org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, version); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(); // do setup committer.setupJob(jContext); committer.setupTask(tContext); // write output TextOutputFormat theOutputFormat = new TextOutputFormat(); RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(null, conf, partFile, null); writeOutput(theRecordWriter, tContext); // do abort committer.abortTask(tContext); File out = new File(outDir.toUri().getPath()); Path workPath = committer.getWorkPath(tContext, outDir); File wp = new File(workPath.toUri().getPath()); File expectedFile = new File(wp, partFile); assertFalse("task temp dir still exists", expectedFile.exists()); committer.abortJob(jContext, JobStatus.State.FAILED); expectedFile = new File(out, FileOutputCommitter.TEMP_DIR_NAME); assertFalse("job temp dir still exists", expectedFile.exists()); assertEquals("Output directory not empty", 0, out.listFiles().length); FileUtil.fullyDelete(out); }
/**
 * Configures and runs the "UFO count" Avro job.
 *
 * <p>Any existing output path is deleted recursively before the job is submitted. The input
 * schema is read from the {@code ufo.avsc} class-path resource located relative to this class.
 *
 * @param args command-line arguments; after generic-option parsing exactly two must remain: the
 *     input path and the output path. Otherwise usage is printed and the JVM exits with status 2.
 * @return {@code 0} once the job has run
 * @throws Exception if the schema resource is missing or unreadable, or the job fails
 */
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("UFO count");

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: avro UFO counter <in> <out>");
    System.exit(2);
  }

  FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  FileOutputFormat.setOutputPath(conf, outputPath);
  // Explicit recursive delete instead of the deprecated single-argument overload.
  outputPath.getFileSystem(conf).delete(outputPath, true);

  // The schema stream is opened here, so it is closed here once parsing is done.
  java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
  if (schemaStream == null) {
    throw new java.io.IOException("Schema resource ufo.avsc not found on the classpath");
  }
  Schema inputSchema;
  try {
    inputSchema = Schema.parse(schemaStream);
  } finally {
    schemaStream.close();
  }

  AvroJob.setInputSchema(conf, inputSchema);
  AvroJob.setMapOutputSchema(
      conf,
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
  AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);

  AvroJob.setMapperClass(conf, AvroRecordMapper.class);
  AvroJob.setReducerClass(conf, AvroRecordReducer.class);
  conf.setInputFormat(AvroInputFormat.class);

  JobClient.runJob(conf);
  return 0;
}
static List<String> getClassPaths( JobConf conf, File workDir, TaskDistributedCacheManager taskDistributedCacheManager) throws IOException { // Accumulates class paths for child. List<String> classPaths = new ArrayList<String>(); boolean userClassesTakesPrecedence = conf.userClassesTakesPrecedence(); if (!userClassesTakesPrecedence) { // start with same classpath as parent process appendSystemClasspaths(classPaths); } // include the user specified classpath appendJobJarClasspaths(conf.getJar(), classPaths); // Distributed cache paths if (taskDistributedCacheManager != null) classPaths.addAll(taskDistributedCacheManager.getClassPaths()); // Include the working dir too classPaths.add(workDir.toString()); if (userClassesTakesPrecedence) { // parent process's classpath is added last appendSystemClasspaths(classPaths); } return classPaths; }
// Mostly for setting up the symlinks. Note that when we setup the distributed // cache, we didn't create the symlinks. This is done on a per task basis // by the currently executing task. public static void setupWorkDir(JobConf conf) throws IOException { File workDir = new File(".").getAbsoluteFile(); FileUtil.fullyDelete(workDir); if (DistributedCache.getSymlink(conf)) { URI[] archives = DistributedCache.getCacheArchives(conf); URI[] files = DistributedCache.getCacheFiles(conf); Path[] localArchives = DistributedCache.getLocalCacheArchives(conf); Path[] localFiles = DistributedCache.getLocalCacheFiles(conf); if (archives != null) { for (int i = 0; i < archives.length; i++) { String link = archives[i].getFragment(); if (link != null) { link = workDir.toString() + Path.SEPARATOR + link; File flink = new File(link); if (!flink.exists()) { FileUtil.symLink(localArchives[i].toString(), link); } } } } if (files != null) { for (int i = 0; i < files.length; i++) { String link = files[i].getFragment(); if (link != null) { link = workDir.toString() + Path.SEPARATOR + link; File flink = new File(link); if (!flink.exists()) { FileUtil.symLink(localFiles[i].toString(), link); } } } } } File jobCacheDir = null; if (conf.getJar() != null) { jobCacheDir = new File(new Path(conf.getJar()).getParent().toString()); } // create symlinks for all the files in job cache dir in current // workingdir for streaming try { DistributedCache.createAllSymlink(conf, jobCacheDir, workDir); } catch (IOException ie) { // Do not exit even if symlinks have not been created. LOG.warn(StringUtils.stringifyException(ie)); } // add java.io.tmpdir given by mapred.child.tmp String tmp = conf.get("mapred.child.tmp", "./tmp"); Path tmpDir = new Path(tmp); // if temp directory path is not absolute // prepend it with workDir. 
if (!tmpDir.isAbsolute()) { tmpDir = new Path(workDir.toString(), tmp); FileSystem localFs = FileSystem.getLocal(conf); if (!localFs.mkdirs(tmpDir) && !localFs.getFileStatus(tmpDir).isDir()) { throw new IOException("Mkdirs failed to create " + tmpDir.toString()); } } }
public void configure(JobConf conf) { /* * It reads all the configurations and distributed cache from outside. */ // Read number of nodes in input layer and output layer from configuration inputNumdims = conf.get("numdims"); inputNumhid = conf.get("numhid"); // Read the weights from distributed cache Path[] pathwaysFiles = new Path[0]; try { pathwaysFiles = DistributedCache.getLocalCacheFiles(conf); for (Path path : pathwaysFiles) { /* * this loop reads all the distributed cache files * In fact, the driver program ensures that there is only one distributed cache file */ BufferedReader fis = new BufferedReader(new FileReader(path.toString())); weightline = fis.readLine(); } } catch (Exception e) { e.printStackTrace(); } }
@Override public int run(String[] args) throws Exception { JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args); if (conf == null) { return -1; } conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); conf.setPartitionerClass(TotalOrderPartitioner.class); InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10); Path input = FileInputFormat.getInputPaths(conf)[0]; input = input.makeQualified(input.getFileSystem(conf)); Path partitionFile = new Path(input, "_partitions"); TotalOrderPartitioner.setPartitionFile(conf, partitionFile); InputSampler.writePartitionFile(conf, sampler); // Add to DistributedCache URI partitionUri = new URI(partitionFile.toString() + "#_partitions"); DistributedCache.addCacheFile(partitionUri, conf); DistributedCache.createSymlink(conf); JobClient.runJob(conf); return 0; }
/** * Driver to copy srcPath to destPath depending on required protocol. * * @param args arguments */ static void copy(final Configuration conf, final Arguments args) throws IOException { LOG.info("srcPaths=" + args.srcs); LOG.info("destPath=" + args.dst); checkSrcPath(conf, args.srcs); JobConf job = createJobConf(conf); if (args.preservedAttributes != null) { job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes); } if (args.mapredSslConf != null) { job.set("dfs.https.client.keystore.resource", args.mapredSslConf); } // Initialize the mapper try { setup(conf, job, args); JobClient.runJob(job); finalize(conf, job, args.dst, args.preservedAttributes); } finally { // delete tmp fullyDelete(job.get(TMP_DIR_LABEL), job); // delete jobDirectory fullyDelete(job.get(JOB_DIR_LABEL), job); } }
private void testMapOnlyNoOutputInternal(int version) throws Exception { JobConf conf = new JobConf(); // This is not set on purpose. FileOutputFormat.setOutputPath(conf, outDir); conf.set(JobContext.TASK_ATTEMPT_ID, attempt); conf.setInt( org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter .FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, version); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(); // setup committer.setupJob(jContext); committer.setupTask(tContext); if (committer.needsTaskCommit(tContext)) { // do commit committer.commitTask(tContext); } committer.commitJob(jContext); // validate output FileUtil.fullyDelete(new File(outDir.toString())); }
public Job(JobID jobid, String jobSubmitDir) throws IOException { this.systemJobDir = new Path(jobSubmitDir); this.systemJobFile = new Path(systemJobDir, "job.xml"); this.id = jobid; this.localFs = FileSystem.getLocal(conf); this.localJobDir = localFs.makeQualified(conf.getLocalPath(jobDir)); this.localJobFile = new Path(this.localJobDir, id + ".xml"); // Manage the distributed cache. If there are files to be copied, // this will trigger localFile to be re-written again. this.trackerDistributedCacheManager = new TrackerDistributedCacheManager(conf, taskController); this.taskDistributedCacheManager = trackerDistributedCacheManager.newTaskDistributedCacheManager(jobid, conf); taskDistributedCacheManager.setupCache(conf, "archive", "archive"); if (DistributedCache.getSymlink(conf)) { // This is not supported largely because, // for a Child subprocess, the cwd in LocalJobRunner // is not a fresh slate, but rather the user's working directory. // This is further complicated because the logic in // setupWorkDir only creates symlinks if there's a jarfile // in the configuration. LOG.warn("LocalJobRunner does not support " + "symlinking into current working dir."); } // Setup the symlinks for the distributed cache. TaskRunner.setupWorkDir(conf, new File(localJobDir.toUri()).getAbsoluteFile()); // Write out configuration file. Instead of copying it from // systemJobFile, we re-write it, since setup(), above, may have // updated it. OutputStream out = localFs.create(localJobFile); try { conf.writeXml(out); } finally { out.close(); } this.job = new JobConf(localJobFile); // Job (the current object) is a Thread, so we wrap its class loader. 
if (!taskDistributedCacheManager.getClassPaths().isEmpty()) { setContextClassLoader(taskDistributedCacheManager.makeClassLoader(getContextClassLoader())); } profile = new JobProfile( job.getUser(), id, systemJobFile.toString(), "http://localhost:8080/", job.getJobName()); status = new JobStatus(id, 0.0f, 0.0f, JobStatus.RUNNING); jobs.put(id, this); this.start(); }
/**
 * Starts a {@code MiniMRCluster} on the local file system with job-retirement settings written
 * into the supplied configuration.
 *
 * <p>The retired-jobs cache size is set to 1 and the retire interval, retire check and
 * complete-user-jobs maximum are all set to 0.
 *
 * @param conf configuration to adjust and hand to the cluster; it is modified in place
 * @param numTrackers number of task trackers to start
 * @return the started cluster
 * @throws IOException if the cluster cannot be started
 */
private MiniMRCluster startCluster(JobConf conf, int numTrackers) throws IOException {
  conf.setLong("mapred.job.tracker.retiredjobs.cache.size", 1);
  conf.setLong("mapred.jobtracker.retirejob.interval", 0);
  conf.setLong("mapred.jobtracker.retirejob.check", 0);
  // Previously a getLong() whose result was discarded, so this setting was never applied.
  conf.setLong("mapred.jobtracker.completeuserjobs.maximum", 0);
  return new MiniMRCluster(0, 0, numTrackers, "file:///", 1, null, null, null, conf, 0);
}
/**
 * Captures the job configuration and builds the helpers and settings derived from it.
 *
 * <p>Defaults when a key is absent: fetch interval 2592000, injected score 1.0, and the current
 * wall-clock time for {@code injector.current.time}.
 *
 * @param job the job configuration; also kept in {@code jobConf}
 */
public void configure(JobConf job) {
  jobConf = job;

  // URL normalizers use the inject scope.
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  interval = job.getInt("db.fetch.interval.default", 2592000);

  // URL and scoring filters.
  filters = new URLFilters(job);
  scfilters = new ScoringFilters(job);
  scoreInjected = job.getFloat("db.score.injected", 1.0f);

  final long fallbackTime = System.currentTimeMillis();
  curTime = job.getLong("injector.current.time", fallbackTime);
}
@Test(timeout = 20000) public void testWarnCommandOpts() throws Exception { Logger logger = Logger.getLogger(YARNRunner.class); ByteArrayOutputStream bout = new ByteArrayOutputStream(); Layout layout = new SimpleLayout(); Appender appender = new WriterAppender(layout, bout); logger.addAppender(appender); JobConf jobConf = new JobConf(); jobConf.set( MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS, "-Djava.net.preferIPv4Stack=true -Djava.library.path=foo"); jobConf.set(MRJobConfig.MR_AM_COMMAND_OPTS, "-Xmx1024m -Djava.library.path=bar"); YARNRunner yarnRunner = new YARNRunner(jobConf); File jobxml = new File(testWorkDir, MRJobConfig.JOB_CONF_FILE); OutputStream out = new FileOutputStream(jobxml); conf.writeXml(out); out.close(); File jobsplit = new File(testWorkDir, MRJobConfig.JOB_SPLIT); out = new FileOutputStream(jobsplit); out.close(); File jobsplitmetainfo = new File(testWorkDir, MRJobConfig.JOB_SPLIT_METAINFO); out = new FileOutputStream(jobsplitmetainfo); out.close(); File appTokens = new File(testWorkDir, MRJobConfig.APPLICATION_TOKENS_FILE); out = new FileOutputStream(appTokens); out.close(); @SuppressWarnings("unused") ApplicationSubmissionContext submissionContext = yarnRunner.createApplicationSubmissionContext( jobConf, testWorkDir.toString(), new Credentials()); String logMsg = bout.toString(); assertTrue( logMsg.contains( "WARN - Usage of -Djava.library.path in " + "yarn.app.mapreduce.am.admin-command-opts can cause programs to no " + "longer function if hadoop native libraries are used. These values " + "should be set as part of the LD_LIBRARY_PATH in the app master JVM " + "env using yarn.app.mapreduce.am.admin.user.env config settings.")); assertTrue( logMsg.contains( "WARN - Usage of -Djava.library.path in " + "yarn.app.mapreduce.am.command-opts can cause programs to no longer " + "function if hadoop native libraries are used. 
These values should " + "be set as part of the LD_LIBRARY_PATH in the app master JVM env " + "using yarn.app.mapreduce.am.env config settings.")); }
/**
 * Creates a {@code FakeJobTracker} driven by a fresh {@code FakeClock} and starts its
 * expire-trackers thread.
 *
 * <p>The tracker uses port 0 for both its IPC and HTTP addresses, a tracker expiry interval of
 * 1000 and a maximum of one tracker blacklist.
 *
 * @throws Exception if the job tracker cannot be created or started
 */
@Override
protected void setUp() throws Exception {
  JobConf trackerConf = new JobConf();
  trackerConf.set(JTConfig.JT_IPC_ADDRESS, "localhost:0");
  trackerConf.set(JTConfig.JT_HTTP_ADDRESS, "0.0.0.0:0");
  trackerConf.setLong(JTConfig.JT_TRACKER_EXPIRY_INTERVAL, 1000);
  trackerConf.set(JTConfig.JT_MAX_TRACKER_BLACKLISTS, "1");

  clock = new FakeClock();
  jobTracker = new FakeJobTracker(trackerConf, clock, trackers);
  jobTracker.startExpireTrackersThread();
}
/**
 * Calculate how many maps to run and store the result in the job.
 *
 * <p>The wanted number of maps is {@code totalBytes} divided by the configured bytes per map
 * ({@code BYTES_PER_MAP_LABEL}, default {@code BYTES_PER_MAP}). It is capped by the configured
 * maximum ({@code MAX_MAPS_LABEL}, default {@code MAX_MAPS_PER_NODE} times the number of task
 * trackers reported by the cluster) and is never less than 1.
 *
 * <p>The quotient is clamped while still a {@code long}, so a very large {@code totalBytes}
 * cannot wrap around when narrowed to {@code int}.
 *
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure; its number of map tasks is set
 * @throws IOException if the cluster status cannot be obtained
 */
private static void setMapCount(long totalBytes, JobConf job) throws IOException {
  final long bytesPerMap = job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);
  final long wantedMaps = totalBytes / bytesPerMap;

  final int maxMaps =
      job.getInt(
          MAX_MAPS_LABEL,
          MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers());

  // Safe narrowing: the minimum can never exceed maxMaps, which is already an int.
  final int numMaps = (int) Math.min(wantedMaps, (long) maxMaps);
  job.setNumMapTasks(Math.max(numMaps, 1));
}
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + 
sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/**
 * Sets up JVM metrics and the task-tracker metrics record, tagging the record with the session
 * id from the tracker's job configuration, and registers this instance as an updater on the
 * "mapred" metrics context.
 *
 * @param t the task tracker whose metrics are reported
 */
public TaskTrackerMetricsInst(TaskTracker t) {
  super(t);
  final String sessionId = tt.getJobConf().getSessionId();

  // Initiate Java VM Metrics
  JvmMetrics.init("TaskTracker", sessionId);

  // Create a record for Task Tracker metrics
  final MetricsContext mapredContext = MetricsUtil.getContext("mapred");
  // guaranteed never null
  metricsRecord = MetricsUtil.createRecord(mapredContext, "tasktracker");
  metricsRecord.setTag("sessionId", sessionId);

  mapredContext.registerUpdater(this);
}
/**
 * Configure a job given argv.
 *
 * <p>Every recognized option consumes the argument that follows it. Empty argv, an option
 * without a following value, an unknown option, or a non-integer where an integer is expected
 * all print usage and return the result of {@code 0 == printUsage()}.
 *
 * @param argv command-line arguments as option/value pairs
 * @param job job to configure
 * @return {@code true} when all arguments were applied
 * @throws IOException wrapping any other exception raised while applying an option (for
 *     example a class that cannot be loaded)
 */
public static boolean parseArgs(String[] argv, JobConf job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for (int i = 0; i < argv.length; ++i) {
    final String option = argv[i];
    if (i + 1 == argv.length) {
      System.out.println("ERROR: Required parameter missing from " + option);
      return 0 == printUsage();
    }
    try {
      if ("-m".equals(option)) {
        job.setNumMapTasks(Integer.parseInt(argv[++i]));
      } else if ("-r".equals(option)) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(option)) {
        job.setInputFormat(Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(option)) {
        job.setOutputFormat(Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(option)) {
        job.setOutputKeyClass(Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(option)) {
        job.setOutputValueClass(Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(option)) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(option)) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.REDUCE_PRESERVE_PERCENT,
            argv[++i]);
      } else if ("-outdir".equals(option)) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(option)) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(option)) {
        job.setClass(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormat(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + option);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      // i was already advanced past the option, so argv[i] is the offending value.
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      IOException wrapped = new IOException();
      wrapped.initCause(e);
      throw wrapped;
    }
  }
  return true;
}
/**
 * Returns the memory limit configured for the given task attempt, in bytes.
 *
 * <p>The limit is read in megabytes from the map or reduce memory property of the task's job
 * configuration, depending on the attempt type, falling back to
 * {@code TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT}.
 *
 * @param tid the task attempt to look up
 * @return the configured limit multiplied by 1024 * 1024
 */
private long getTaskMemoryLimit(TaskAttemptID tid) {
  final JobConf taskConf;
  // The task table is read while holding the task tracker's monitor.
  synchronized (this.taskTracker) {
    taskConf = this.taskTracker.tasks.get(tid).getJobConf();
  }

  final String memoryKey =
      tid.isMap()
          ? JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY
          : JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY;
  final long limitMb = taskConf.getInt(memoryKey, TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT);

  return limitMb * 1024 * 1024L;
}
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
static JobConf configureJobConf( JobConf conf, String namenode, int jobTrackerPort, int jobTrackerInfoPort, UserGroupInformation ugi) { JobConf result = new JobConf(conf); FileSystem.setDefaultUri(result, namenode); result.set("mapred.job.tracker", "localhost:" + jobTrackerPort); result.set("mapred.job.tracker.http.address", "127.0.0.1:" + jobTrackerInfoPort); // for debugging have all task output sent to the test output JobClient.setTaskOutputFilter(result, JobClient.TaskStatusFilter.ALL); return result; }
public static void runJob(String[] args) { JobConf conf = new JobConf(CassandraBulkLoader.class); if (args.length >= 4) { conf.setNumReduceTasks(new Integer(args[3])); } try { // We store the cassandra storage-conf.xml on the HDFS cluster DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf); } catch (URISyntaxException e) { throw new RuntimeException(e); } conf.setInputFormat(KeyValueTextInputFormat.class); conf.setJobName("CassandraBulkLoader_v2"); conf.setMapperClass(Map.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[1])); FileOutputFormat.setOutputPath(conf, new Path(args[2])); try { JobClient.runJob(conf); } catch (IOException e) { throw new RuntimeException(e); } }
public void testFailAbort() throws IOException { JobConf job = new JobConf(); job.set(FileSystem.FS_DEFAULT_NAME_KEY, "faildel:///"); job.setClass("fs.faildel.impl", FakeFileSystem.class, FileSystem.class); setConfForFileOutputCommitter(job); JobContext jContext = new JobContextImpl(job, taskID.getJobID()); TaskAttemptContext tContext = new TaskAttemptContextImpl(job, taskID); FileOutputCommitter committer = new FileOutputCommitter(); FileOutputFormat.setWorkOutputPath(job, committer.getTempTaskOutputPath(tContext)); // do setup committer.setupJob(jContext); committer.setupTask(tContext); String file = "test.txt"; // A reporter that does nothing Reporter reporter = Reporter.NULL; // write output FileSystem localFs = new FakeFileSystem(); TextOutputFormat theOutputFormat = new TextOutputFormat(); RecordWriter theRecordWriter = theOutputFormat.getRecordWriter(localFs, job, file, reporter); writeOutput(theRecordWriter, reporter); // do abort Throwable th = null; try { committer.abortTask(tContext); } catch (IOException ie) { th = ie; } assertNotNull(th); assertTrue(th instanceof IOException); assertTrue(th.getMessage().contains("fake delete failed")); File jobTmpDir = new File(new Path(outDir, FileOutputCommitter.TEMP_DIR_NAME).toString()); File taskTmpDir = new File(jobTmpDir, "_" + taskID); File expectedFile = new File(taskTmpDir, file); assertTrue(expectedFile + " does not exists", expectedFile.exists()); th = null; try { committer.abortJob(jContext, JobStatus.State.FAILED); } catch (IOException ie) { th = ie; } assertNotNull(th); assertTrue(th instanceof IOException); assertTrue(th.getMessage().contains("fake delete failed")); assertTrue("job temp dir does not exists", jobTmpDir.exists()); }
static JobConf configureJobConf( JobConf conf, String namenode, int jobTrackerPort, int jobTrackerInfoPort, UserGroupInformation ugi) { JobConf result = new JobConf(conf); FileSystem.setDefaultUri(result, namenode); result.set(MRConfig.FRAMEWORK_NAME, MRConfig.CLASSIC_FRAMEWORK_NAME); result.set(JTConfig.JT_IPC_ADDRESS, "localhost:" + jobTrackerPort); result.set(JTConfig.JT_HTTP_ADDRESS, "127.0.0.1:" + jobTrackerInfoPort); // for debugging have all task output sent to the test output JobClient.setTaskOutputFilter(result, JobClient.TaskStatusFilter.ALL); return result; }
/**
 * Applies preserved attributes to the destination entries listed in the job's destination
 * directory list.
 *
 * <p>Nothing is done when no attributes were requested, or when the requested set contains
 * none of USER, GROUP and PERMISSION. Otherwise every record of the list file is read and
 * {@code updatePermissions} is called for the record's output path resolved against
 * {@code destPath}.
 *
 * @param conf configuration used to resolve the destination file system
 * @param jobconf job configuration holding the destination directory list location
 * @param destPath root destination path
 * @param presevedAttributes encoded attribute set, or {@code null} for none
 * @throws IOException if the list cannot be read or a status lookup/update fails
 */
private static void finalize(
    Configuration conf, JobConf jobconf, final Path destPath, String presevedAttributes)
    throws IOException {
  if (presevedAttributes == null) {
    return;
  }

  final EnumSet<FileAttribute> preserved = FileAttribute.parse(presevedAttributes);
  final boolean touchesOwnershipOrMode =
      preserved.contains(FileAttribute.USER)
          || preserved.contains(FileAttribute.GROUP)
          || preserved.contains(FileAttribute.PERMISSION);
  if (!touchesOwnershipOrMode) {
    return;
  }

  final FileSystem destFs = destPath.getFileSystem(conf);
  final Path dirList = new Path(jobconf.get(DST_DIR_LIST_LABEL));

  SequenceFile.Reader listReader = null;
  try {
    listReader = new SequenceFile.Reader(dirList.getFileSystem(jobconf), dirList, jobconf);
    Text destText = new Text();
    FilePair entry = new FilePair();
    while (listReader.next(destText, entry)) {
      Path absoluteDest = new Path(destPath, entry.output);
      updatePermissions(entry.input, destFs.getFileStatus(absoluteDest), preserved, destFs);
    }
  } finally {
    checkAndClose(listReader);
  }
}