private MiniMRCluster startCluster(JobConf conf, int numTrackers) throws IOException { conf.setLong("mapred.job.tracker.retiredjobs.cache.size", 1); conf.setLong("mapred.jobtracker.retirejob.interval", 0); conf.setLong("mapred.jobtracker.retirejob.check", 0); conf.getLong("mapred.jobtracker.completeuserjobs.maximum", 0); return new MiniMRCluster(0, 0, numTrackers, "file:///", 1, null, null, null, conf, 0); }
@Override @SuppressWarnings("unchecked") public void run(final JobConf job, final TaskUmbilicalProtocol umbilical) throws IOException { final Reporter reporter = getReporter(umbilical); // start thread that will handle communication with parent startCommunicationThread(umbilical); int numReduceTasks = conf.getNumReduceTasks(); LOG.info("numReduceTasks: " + numReduceTasks); MapOutputCollector collector = null; if (numReduceTasks > 0) { collector = new MapOutputBuffer(umbilical, job, reporter); } else { collector = new DirectMapOutputCollector(umbilical, job, reporter); } // reinstantiate the split try { instantiatedSplit = (InputSplit) ReflectionUtils.newInstance(job.getClassByName(splitClass), job); } catch (ClassNotFoundException exp) { IOException wrap = new IOException("Split class " + splitClass + " not found"); wrap.initCause(exp); throw wrap; } DataInputBuffer splitBuffer = new DataInputBuffer(); splitBuffer.reset(split.get(), 0, split.getSize()); instantiatedSplit.readFields(splitBuffer); // if it is a file split, we can give more details if (instantiatedSplit instanceof FileSplit) { FileSplit fileSplit = (FileSplit) instantiatedSplit; job.set("map.input.file", fileSplit.getPath().toString()); job.setLong("map.input.start", fileSplit.getStart()); job.setLong("map.input.length", fileSplit.getLength()); } RecordReader rawIn = // open input job.getInputFormat().getRecordReader(instantiatedSplit, job, reporter); RecordReader in = new TrackedRecordReader(rawIn, getCounters()); MapRunnable runner = (MapRunnable) ReflectionUtils.newInstance(job.getMapRunnerClass(), job); try { runner.run(in, collector, reporter); collector.flush(); } finally { // close in.close(); // close input collector.close(); } done(umbilical); }
/** * Start simulated task trackers based on topology. * * @param clusterStory the cluster topology. * @param jobConf configuration object. * @param now time stamp when the simulator is started, {@link SimulatorTaskTracker}s are started * uniformly randomly spread in [now,now+startDuration). * @return time stamp by which the entire cluster is booted up and all task trackers are sending * hearbeats in their steady rate. */ long startTaskTrackers(ClusterStory cluster, JobConf jobConf, long now) { /** port assigned to TTs, incremented by 1 for each TT */ int port = 10000; int numTaskTrackers = 0; Random random = new Random(RandomSeedGenerator.getSeed("forStartTaskTrackers()", masterRandomSeed)); final int startDuration = jobConf.getInt("mumak.cluster.startup.duration", DEFAULT_CLUSTER_STARTUP_DURATION); for (MachineNode node : cluster.getMachines()) { jobConf.set("mumak.tasktracker.host.name", node.getName()); jobConf.set( "mumak.tasktracker.tracker.name", "tracker_" + node.getName() + ":localhost/127.0.0.1:" + port); long subRandomSeed = RandomSeedGenerator.getSeed("forTaskTracker" + numTaskTrackers, masterRandomSeed); jobConf.setLong("mumak.tasktracker.random.seed", subRandomSeed); numTaskTrackers++; port++; SimulatorTaskTracker tt = new SimulatorTaskTracker(jt, jobConf); long firstHeartbeat = now + random.nextInt(startDuration); queue.addAll(tt.init(firstHeartbeat)); } // In startDuration + heartbeat interval of the full cluster time each // TT is started up and told on its 2nd heartbeat to beat at a rate // corresponding to the steady state of the cluster long clusterSteady = now + startDuration + jt.getNextHeartbeatInterval(); return clusterSteady; }
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/** * Creates the configuration for mumak simulation. This is kept modular mostly for testing * purposes. so that the standard configuration can be modified before passing it to the init() * function. * * @return JobConf: the configuration for the SimulatorJobTracker */ JobConf createMumakConf() { JobConf jobConf = new JobConf(getConf()); jobConf.setClass( "topology.node.switch.mapping.impl", StaticMapping.class, DNSToSwitchMapping.class); jobConf.set("fs.default.name", "file:///"); jobConf.set("mapred.job.tracker", "localhost:8012"); jobConf.setInt("mapred.jobtracker.job.history.block.size", 512); jobConf.setInt("mapred.jobtracker.job.history.buffer.size", 512); jobConf.setLong("mapred.tasktracker.expiry.interval", 5000); jobConf.setInt("mapred.reduce.copy.backoff", 4); jobConf.setLong("mapred.job.reuse.jvm.num.tasks", -1); jobConf.setUser("mumak"); jobConf.set( "mapred.system.dir", jobConf.get("hadoop.log.dir", "/tmp/hadoop-" + jobConf.getUser()) + "/mapred/system"); return jobConf; }
@Override protected void setUp() throws Exception { JobConf conf = new JobConf(); conf.set(JTConfig.JT_IPC_ADDRESS, "localhost:0"); conf.set(JTConfig.JT_HTTP_ADDRESS, "0.0.0.0:0"); conf.setLong(JTConfig.JT_TRACKER_EXPIRY_INTERVAL, 1000); conf.set(JTConfig.JT_MAX_TRACKER_BLACKLISTS, "1"); jobTracker = new FakeJobTracker(conf, (clock = new FakeClock()), trackers); jobTracker.startExpireTrackersThread(); }
/** When no input dir is specified, generate random data. */ protected static void confRandom(JobConf job) throws IOException { // from RandomWriter job.setInputFormat(RandomInputFormat.class); job.setMapperClass(RandomMapOutput.class); final ClusterStatus cluster = new JobClient(job).getClusterStatus(); int numMapsPerHost = job.getInt(RandomTextWriter.MAPS_PER_HOST, 10); long numBytesToWritePerMap = job.getLong(RandomTextWriter.BYTES_PER_MAP, 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { throw new IOException("Cannot have " + RandomTextWriter.BYTES_PER_MAP + " set to 0"); } long totalBytesToWrite = job.getLong( RandomTextWriter.TOTAL_BYTES, numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong(RandomTextWriter.BYTES_PER_MAP, totalBytesToWrite); } job.setNumMapTasks(numMaps); }
/** * Initialize DFSCopyFileMapper specific job-configuration. * * @param conf : The dfs/mapred configuration. * @param jobConf : The handle to the jobConf object to be initialized. * @param args Arguments */ private static void setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException { jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString()); // set boolean values final boolean update = args.flags.contains(Options.UPDATE); final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE); jobConf.setBoolean(Options.UPDATE.propertyname, update); jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite); jobConf.setBoolean( Options.IGNORE_READ_FAILURES.propertyname, args.flags.contains(Options.IGNORE_READ_FAILURES)); jobConf.setBoolean( Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS)); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobConf); Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId); jobConf.set(JOB_DIR_LABEL, jobDirectory.toString()); FileSystem dstfs = args.dst.getFileSystem(conf); boolean dstExists = dstfs.exists(args.dst); boolean dstIsDir = false; if (dstExists) { dstIsDir = dstfs.getFileStatus(args.dst).isDir(); } // default logPath Path logPath = args.log; if (logPath == null) { String filename = "_distcp_logs_" + randomId; if (!dstExists || !dstIsDir) { Path parent = args.dst.getParent(); if (!dstfs.exists(parent)) { dstfs.mkdirs(parent); } logPath = new Path(parent, filename); } else { logPath = new Path(args.dst, filename); } } FileOutputFormat.setOutputPath(jobConf, logPath); // create src list, dst list FileSystem jobfs = jobDirectory.getFileSystem(jobConf); Path srcfilelist = new Path(jobDirectory, "_distcp_src_files"); jobConf.set(SRC_LIST_LABEL, srcfilelist.toString()); SequenceFile.Writer src_writer = SequenceFile.createWriter( jobfs, jobConf, srcfilelist, LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE); Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files"); SequenceFile.Writer dst_writer = SequenceFile.createWriter( jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE); Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs"); jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString()); SequenceFile.Writer dir_writer = SequenceFile.createWriter( jobfs, jobConf, dstdirlist, Text.class, FilePair.class, SequenceFile.CompressionType.NONE); // handle the case where the destination directory doesn't exist // and we've only a single src directory OR we're updating/overwriting // the contents of the destination directory. final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite; int srcCount = 0, cnsyncf = 0, dirsyn = 0; long fileCount = 0L, byteCount = 0L, cbsyncs = 0L; try { for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext(); ) { final Path src = srcItr.next(); FileSystem srcfs = src.getFileSystem(conf); FileStatus srcfilestat = srcfs.getFileStatus(src); Path root = special && srcfilestat.isDir() ? src : src.getParent(); if (srcfilestat.isDir()) { ++srcCount; } Stack<FileStatus> pathstack = new Stack<FileStatus>(); for (pathstack.push(srcfilestat); !pathstack.empty(); ) { FileStatus cur = pathstack.pop(); FileStatus[] children = srcfs.listStatus(cur.getPath()); for (int i = 0; i < children.length; i++) { boolean skipfile = false; final FileStatus child = children[i]; final String dst = makeRelative(root, child.getPath()); ++srcCount; if (child.isDir()) { pathstack.push(child); } else { // skip file if the src and the dst files are the same. skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst)); // skip file if it exceed file limit or size limit skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit; if (!skipfile) { ++fileCount; byteCount += child.getLen(); if (LOG.isTraceEnabled()) { LOG.trace("adding file " + child.getPath()); } ++cnsyncf; cbsyncs += child.getLen(); if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) { src_writer.sync(); dst_writer.sync(); cnsyncf = 0; cbsyncs = 0L; } } } if (!skipfile) { src_writer.append( new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst)); } dst_writer.append(new Text(dst), new Text(child.getPath().toString())); } if (cur.isDir()) { String dst = makeRelative(root, cur.getPath()); dir_writer.append(new Text(dst), new FilePair(cur, dst)); if (++dirsyn > SYNC_FILE_MAX) { dirsyn = 0; dir_writer.sync(); } } } } } finally { checkAndClose(src_writer); checkAndClose(dst_writer); checkAndClose(dir_writer); } FileStatus dststatus = null; try { dststatus = dstfs.getFileStatus(args.dst); } catch (FileNotFoundException fnfe) { LOG.info(args.dst + " does not exist."); } // create dest path dir if copying > 1 file if (dststatus == null) { if (srcCount > 1 && !dstfs.mkdirs(args.dst)) { throw new IOException("Failed to create" + args.dst); } } final Path sorted = new Path(jobDirectory, "_distcp_sorted"); checkDuplication(jobfs, dstfilelist, sorted, conf); if (dststatus != null && args.flags.contains(Options.DELETE)) { deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf); } Path tmpDir = new Path( (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst, "_distcp_tmp_" + randomId); jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString()); LOG.info("srcCount=" + srcCount); jobConf.setInt(SRC_COUNT_LABEL, srcCount); jobConf.setLong(TOTAL_SIZE_LABEL, byteCount); setMapCount(byteCount, jobConf); }