public Job getJob(Configuration conf) throws IOException {
    Job job = new Job(conf, "pivoting");
    job.setJarByClass(PivotingReducer.class);
    // identity mapper; the pivoting work happens in the reducer
    job.setMapperClass(Mapper.class);
    job.setReducerClass(PivotingReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(MapWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(RuleWritable.SourcePartitioner.class);

    // read the collected rules from the work directory
    FileInputFormat.setInputPaths(job, new Path(conf.get("thrax.work-dir") + "collected"));
    int maxSplitSize = conf.getInt("thrax.max-split-size", 0);
    if (maxSplitSize != 0)
        FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);

    int numReducers = conf.getInt("thrax.reducers", 4);
    job.setNumReduceTasks(numReducers);

    // write compressed SequenceFile output to the "pivoted" directory
    FileOutputFormat.setOutputPath(job, new Path(conf.get("thrax.work-dir") + "pivoted"));
    FileOutputFormat.setCompressOutput(job, true);
    return job;
}
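// A minimal driver sketch (an assumption, not part of the original source) showing how the
// Job returned by getJob(Configuration) above could be submitted synchronously. The
// "thrax.work-dir" value and the reducer count below are hypothetical; note that getJob()
// appends "collected"/"pivoted" directly, so the configured directory should end with a
// path separator.
Configuration conf = new Configuration();
conf.set("thrax.work-dir", "thrax-run/");   // hypothetical work directory
conf.setInt("thrax.reducers", 8);           // optional; getJob() defaults to 4
Job pivotJob = getJob(conf);
System.exit(pivotJob.waitForCompletion(true) ? 0 : 1);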
/** The main entry point if this class is called as a {@link Tool}. */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // retrieve our paths from the configuration
    Path inputPath = new Path(conf.get(Util.CONF_LOGDATA_PATH));
    Path outputPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));

    final int numCores = conf.getInt(Util.CONF_NUM_CORES, Util.DEFAULT_NUM_CORES);
    final int numNodes = conf.getInt(Util.CONF_NUM_NODES, Util.DEFAULT_NUM_NODES);
    NUM_OF_REDUCE_TASKS = numCores * numNodes;

    // set the job name
    String jobName = Util.JOB_NAME + " [" + CachingTool.ACTION + "] {logdata="
        + inputPath.getName() + ", session=" + conf.get(Util.CONF_SESSION_DURATION) + "}";
    Util.showStatus("Running " + jobName);

    conf.set("hadoop.job.ugi", Util.HADOOP_USER);
    conf.set("mapred.child.java.opts", "-Xmx1500M -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode");
    conf.set("mapred.task.timeout", "1800000");
    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    FileSystem fs = FileSystem.get(conf);
    Job job = new Job(conf, jobName);

    // set the number of reduce tasks
    job.setNumReduceTasks(NUM_OF_REDUCE_TASKS);

    // set mapper, reducer, partitioner and grouping comparator
    job.setJarByClass(CachingTool.class);
    job.setMapperClass(CachingMapper.class);
    job.setReducerClass(CachingReducer.class);
    // grouping comparator used for the secondary sort
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);
    job.setPartitionerClass(TextPair.FirstPartitioner.class);
    job.setOutputKeyClass(TextPair.class);
    job.setOutputValueClass(Text.class);

    // set input and output format
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setMaxInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);
    FileInputFormat.setMinInputSplitSize(job, Util.DATASET_MB_SPLIT * 25);

    // add input path subdirectories if there are any
    ArrayList<Path> inputPaths = Util.getInputDirectories(fs, inputPath);
    int pathsAdded = 0;
    if (inputPaths.size() > 0) {
        for (Path p : inputPaths) {
            if (!p.getName().contains(".") && !p.getName().contains("_")) {
                Util.showStatus("Adding input path " + p);
                FileInputFormat.addInputPath(job, p);
                pathsAdded++;
            }
        }
    }
    if (pathsAdded == 0) {
        Util.showStatus("Adding input path " + inputPath);
        FileInputFormat.addInputPath(job, inputPath);
    }

    // clear the output dir
    fs.delete(outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)), true);
    FileOutputFormat.setOutputPath(job,
        outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // run the job and wait for it to complete
    boolean b = job.waitForCompletion(true);

    // NOTE! The counters will be written HERE
    // retrieve the counters
    Counter numNewInCache = job.getCounters().findCounter(CachingReducer.CacheCounter.NEW_TO_CACHE);
    Counter numRenewCache = job.getCounters().findCounter(CachingReducer.CacheCounter.RENEW_CACHE);
    Counter numUsedFromCache = job.getCounters().findCounter(CachingReducer.CacheCounter.USED_FROM_CACHE);

    // write the counters to the metadata file
    Path headerPath = new Path(conf.get(Util.CONF_CACHING_SIMULATOR_PATH));
    FSDataOutputStream out = fs.create(headerPath.suffix("/" + DataSetHeader.SIMULATE_CACHING_METADATA_FILE));
    PrintWriter w = new PrintWriter(out);
    // the sum of all counters equals the total number of queries in the log file
    w.println("hostnametypeAddedToCache=" + numNewInCache.getValue());
    w.println("queriesAddedAgainToCache=" + numRenewCache.getValue());
    w.println("queriesAnsweredFromCache=" + numUsedFromCache.getValue());
    w.close();
    out.close();

    // delete all empty output files
    Util.deleteEmptyFiles(fs, outputPath.suffix("/" + conf.get(Util.CONF_SESSION_DURATION)));

    // return 0 on success, non-zero on failure, per the Tool contract
    return b ? 0 : 1;
}
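// A minimal sketch (an assumption, not taken from the original source) of how run() above is
// typically invoked: through Hadoop's ToolRunner, which parses the generic command-line
// options, installs the resulting Configuration on the Tool, and then calls run(). It assumes
// CachingTool has a no-argument constructor.
public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new CachingTool(), args);
    System.exit(exitCode);
}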