public void configure(JobConf job) {
  bytesToWrite = job.getLong(RandomTextWriter.BYTES_PER_MAP,
                             1 * 1024 * 1024 * 1024);
  keymin = job.getInt(RandomTextWriter.MIN_KEY, 5);
  keymax = job.getInt(RandomTextWriter.MAX_KEY, 10);
  valmin = job.getInt(RandomTextWriter.MIN_VALUE, 5);
  valmax = job.getInt(RandomTextWriter.MAX_VALUE, 10);
}
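// A minimal sketch (not from the original source) of how a mapper might
// consume the bounds configured above: pick a key length uniformly in
// [keymin, keymax] and a value length in [valmin, valmax]. The RANDOM field
// and randomLength() helper are illustrative assumptions.
private static final java.util.Random RANDOM = new java.util.Random();

private static int randomLength(int min, int max) {
  // nextInt(bound) is exclusive at the top, so add 1 to make max inclusive
  return min + RANDOM.nextInt(max - min + 1);
}
// e.g. int keyLen = randomLength(keymin, keymax);
//      int valLen = randomLength(valmin, valmax);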
private MiniMRCluster startCluster(JobConf conf, int numTrackers) throws IOException {
  conf.setLong("mapred.job.tracker.retiredjobs.cache.size", 1);
  conf.setLong("mapred.jobtracker.retirejob.interval", 0);
  conf.setLong("mapred.jobtracker.retirejob.check", 0);
  // setLong, not getLong: the original call discarded its result and left
  // the property unset
  conf.setLong("mapred.jobtracker.completeuserjobs.maximum", 0);
  return new MiniMRCluster(0, 0, numTrackers, "file:///", 1, null, null, null, conf, 0);
}
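// A minimal usage sketch, assuming a JUnit-style test: start the mini
// cluster, derive a job configuration from it, and always shut it down.
// The testRetire name is illustrative.
public void testRetire() throws IOException {
  MiniMRCluster mr = startCluster(new JobConf(), 1);
  try {
    JobConf jobConf = mr.createJobConf();  // conf wired to the mini cluster
    // ... submit jobs against jobConf and assert on retirement behavior ...
  } finally {
    mr.shutdown();
  }
}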
public void configure(JobConf job) {
  this.jobConf = job;
  urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  interval = jobConf.getInt("db.fetch.interval.default", 2592000); // 30 days, in seconds
  filters = new URLFilters(jobConf);
  scfilters = new ScoringFilters(jobConf);
  scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
  curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}
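// A minimal sketch, assuming callers tune the injector through the same
// property names read above; the concrete values are illustrative.
JobConf conf = new JobConf();
conf.setInt("db.fetch.interval.default", 7 * 24 * 3600); // re-fetch weekly
conf.setFloat("db.score.injected", 2.0f);                // boost injected URLs
conf.setLong("injector.current.time", System.currentTimeMillis());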
/**
 * Calculate how many maps to run. The number of maps is the cumulative size
 * of the copy divided by distcp.bytes.per.map (default BYTES_PER_MAP, or -m
 * on the command line), capped at distcp.max.map.tasks (default
 * MAX_MAPS_PER_NODE * nodes in the cluster), with a floor of one map.
 *
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 */
private static void setMapCount(long totalBytes, JobConf job) throws IOException {
  int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  numMaps = Math.min(numMaps,
      job.getInt(MAX_MAPS_LABEL,
          MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers()));
  job.setNumMapTasks(Math.max(numMaps, 1));
}
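// A self-contained sketch of the bounding arithmetic above, with
// illustrative numbers (not from the original source): 10 GB to copy,
// 256 MB per map, at most 20 maps.
long totalBytes = 10L * 1024 * 1024 * 1024;
long bytesPerMap = 256L * 1024 * 1024;
int maxMaps = 20;
int numMaps = (int) (totalBytes / bytesPerMap);  // 40 maps by size alone
numMaps = Math.min(numMaps, maxMaps);            // capped at 20
numMaps = Math.max(numMaps, 1);                  // never schedule zero maps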
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 *
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles
        + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePair value = new FilePair();
  final long targetsize = cbsize / numSplits;
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // if adding this file would push the split past the target size,
      // cut the split here and start the next file in a new split.
      if (acc + key.get() > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += key.get();
    }
  } finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
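// A minimal sketch of the greedy packing the loop above performs, using an
// in-memory array of file sizes instead of a SequenceFile; all numbers are
// illustrative. A split closes as soon as adding the next file would exceed
// the target size.
long[] fileSizes = {300, 200, 600, 100, 400};  // bytes per file
long target = 500;                             // cbsize / numSplits
long acc = 0;
int splitCount = 1;
for (long size : fileSizes) {
  if (acc + size > target && acc != 0) {
    splitCount++;  // cut the split; this file starts the next one
    acc = 0;
  }
  acc += size;
}
// the sizes above yield 3 splits: {300, 200}, {600}, {100, 400}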
/** When no input dir is specified, generate random data. */
protected static void confRandom(JobConf job) throws IOException {
  // from RandomWriter
  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(RandomMapOutput.class);

  final ClusterStatus cluster = new JobClient(job).getClusterStatus();
  int numMapsPerHost = job.getInt(RandomTextWriter.MAPS_PER_HOST, 10);
  long numBytesToWritePerMap =
      job.getLong(RandomTextWriter.BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    throw new IOException("Cannot have " + RandomTextWriter.BYTES_PER_MAP + " set to 0");
  }
  long totalBytesToWrite = job.getLong(RandomTextWriter.TOTAL_BYTES,
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong(RandomTextWriter.BYTES_PER_MAP, totalBytesToWrite);
  }
  job.setNumMapTasks(numMaps);
}
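// A worked example of the sizing arithmetic above, with illustrative
// numbers: 10 maps per host, 1 GB per map, and 4 task trackers.
int numMapsPerHost = 10;
long numBytesToWritePerMap = 1L * 1024 * 1024 * 1024;            // 1 GB
int taskTrackers = 4;
long totalBytesToWrite =
    numMapsPerHost * numBytesToWritePerMap * taskTrackers;       // 40 GB
int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); // 40 maps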
/**
 * Get the desired maximum length of task's logs.
 *
 * @param conf the job to look in
 * @return the number of bytes to cap the log files at
 */
public static long getTaskLogLength(JobConf conf) {
  return conf.getLong("mapred.userlog.limit.kb", 100) * 1024;
}
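// A minimal usage sketch: the property is expressed in kilobytes and the
// method converts it to bytes, so the default of 100 yields 102400 bytes.
JobConf conf = new JobConf();
conf.setLong("mapred.userlog.limit.kb", 512);
long capBytes = getTaskLogLength(conf);  // 512 * 1024 = 524288 bytes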
public CoronaJobHistory(JobConf conf, JobID jobId, String logPath) {
  try {
    this.conf = conf;
    this.jobId = jobId;
    if (logPath == null) {
      logPath = "file:///"
          + new File(System.getProperty("hadoop.log.dir", "/tmp")).getAbsolutePath()
          + File.separator + "history";
    }
    logDir = new Path(logPath);
    logDirFs = logDir.getFileSystem(conf);

    if (!logDirFs.exists(logDir)) {
      LOG.info("Creating history folder at " + logDir);
      if (!logDirFs.mkdirs(logDir, new FsPermission(HISTORY_DIR_PERMISSION))) {
        throw new IOException("Mkdirs failed to create " + logDir.toString());
      }
    }
    conf.set("hadoop.job.history.location", logDir.toString());
    disableHistory = false;

    // set the job history block size (default is 3MB)
    jobHistoryBlockSize =
        conf.getLong("mapred.jobtracker.job.history.block.size", 3 * 1024 * 1024);

    doneDir = new Path(logDir, "done");
    doneDirFs = logDirFs;
    if (!doneDirFs.exists(doneDir)) {
      LOG.info("Creating DONE folder at " + doneDir);
      if (!doneDirFs.mkdirs(doneDir, new FsPermission(HISTORY_DIR_PERMISSION))) {
        throw new IOException("Mkdirs failed to create " + doneDir);
      }
    }

    String logFileName =
        encodeJobHistoryFileName(CoronaJobHistoryFilesManager.getHistoryFilename(jobId));
    logFile = new Path(logDir, logFileName);
    doneFile = new Path(doneDir, logFileName);

    // initialize the file manager
    conf.setInt("mapred.jobtracker.historythreads.maximum", 1);
    fileManager = new CoronaJobHistoryFilesManager(
        conf,
        new JobHistoryObserver() {
          public void historyFileCopied(JobID jobid, String historyFile) {}
        },
        logDir);
    fileManager.setDoneDir(doneDir);

    // legacy API: the file manager must be started with two separate calls
    fileManager.start();
    fileManager.startIOExecutor();
  } catch (IOException e) {
    LOG.error("Failed to initialize JobHistory log file", e);
    disableHistory = true;
  }
}
/**
 * Initiate components in the simulation. The JobConf is created separately
 * and passed to init().
 *
 * @param jobConf the configuration for the JobTracker
 * @throws InterruptedException
 * @throws IOException if trace or topology files cannot be opened
 */
@SuppressWarnings("deprecation")
void init(JobConf jobConf) throws InterruptedException, IOException {
  FileSystem lfs = FileSystem.getLocal(getConf());
  Path logPath = new Path(System.getProperty("hadoop.log.dir")).makeQualified(lfs);
  jobConf.set("mapred.system.dir", logPath.toString());
  jobConf.set("hadoop.job.history.location", new Path(logPath, "history").toString());

  // start time for virtual clock
  // possible improvement: set the default to something more meaningful
  // based on the first job
  long now = getTimeProperty(jobConf, "mumak.start.time", System.currentTimeMillis());

  jt = SimulatorJobTracker.startTracker(jobConf, now, this);
  jt.offerService();
  masterRandomSeed = jobConf.getLong("mumak.random.seed", System.nanoTime());

  // max Map/Reduce tasks per node
  int maxMaps = getConf().getInt("mapred.tasktracker.map.tasks.maximum",
      SimulatorTaskTracker.DEFAULT_MAP_SLOTS);
  int maxReduces = getConf().getInt("mapred.tasktracker.reduce.tasks.maximum",
      SimulatorTaskTracker.DEFAULT_REDUCE_SLOTS);

  MachineNode defaultNode = new MachineNode.Builder("default", 2)
      .setMapSlots(maxMaps)
      .setReduceSlots(maxReduces)
      .build();

  LoggedNetworkTopology topology =
      new ClusterTopologyReader(new Path(topologyFile), jobConf).get();
  // Setting the static mapping before removing numeric IP hosts.
  setStaticMapping(topology);
  if (getConf().getBoolean("mumak.topology.filter-numeric-ips", true)) {
    removeIpHosts(topology);
  }
  ZombieCluster cluster = new ZombieCluster(topology, defaultNode);

  // create TTs based on topology.json
  long firstJobStartTime = startTaskTrackers(cluster, jobConf, now);

  long subRandomSeed =
      RandomSeedGenerator.getSeed("forSimulatorJobStoryProducer", masterRandomSeed);
  JobStoryProducer jobStoryProducer = new SimulatorJobStoryProducer(
      new Path(traceFile), cluster, firstJobStartTime, jobConf, subRandomSeed);

  final SimulatorJobSubmissionPolicy submissionPolicy =
      SimulatorJobSubmissionPolicy.getPolicy(jobConf);

  jc = new SimulatorJobClient(jt, jobStoryProducer, submissionPolicy);
  queue.addAll(jc.init(firstJobStartTime));

  // if the task scheduler is CapacityTaskScheduler, start the
  // JobInitialization threads too (constant-first equals avoids an NPE
  // when the property is unset)
  if (CapacityTaskScheduler.class.getName()
      .equals(jobConf.get("mapred.jobtracker.taskScheduler"))) {
    LOG.info("CapacityScheduler used: starting simulatorThreads");
    startSimulatorThreadsCapSched(now);
  }
  terminateTime = getTimeProperty(jobConf, "mumak.terminate.time", Long.MAX_VALUE);
}
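// A minimal sketch of a driver pinning the simulation inputs before init();
// mumak.random.seed is read with getLong above, and this assumes the
// mumak.start.time property likewise accepts a plain millisecond value.
JobConf jobConf = new JobConf();
jobConf.setLong("mumak.random.seed", 42L);  // reproducible simulation runs
jobConf.setLong("mumak.start.time", 0L);    // assumed: millis for the virtual clock
init(jobConf);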