public int run(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>");
    // ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  // Create a JobConf using the processed <code>conf</code>
  final JobConf jobConf = new JobConf(getConf(), getClass());

  // Specify various job-specific parameters
  jobConf.setJobName(MapreduceStringFinder.class.getSimpleName());

  // set the input format
  jobConf.setInputFormat(Individuals.class);

  // set the output key and value classes
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(BooleanWritable.class);

  // set the mapper class and the number of map tasks
  jobConf.setMapperClass(CIMapper.class);
  jobConf.setNumMapTasks(3);

  // set the reducer class
  jobConf.setReducerClass(CIReducer.class);

  // set up input/output directories
  final String dataset = args[0];
  FileInputFormat.setInputPaths(jobConf, new Path(dataset));
  FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
  final int pop = Integer.parseInt(args[2]);

  // based on the configuration, make this job multithreaded
  if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) {
    jobConf.setMapRunnerClass(MultithreadedMapRunner.class);
    jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
  }

  // for computation-intensive data, do not allow the job to fail if the task tracker
  // does not respond with a heartbeat message before the timeout value
  final int timeout = 9000000;
  jobConf.setInt("mapred.task.timeout", timeout);

  // make the parameters available to the mapper before it is called
  jobConf.setInt("popsize", pop);
  jobConf.setStrings("dataset", dataset);

  // int map = jobConf.getNumMapTasks();
  // System.out.println("Number of Maps" + map);

  // start the map/reduce job
  System.out.println("Starting Job");

  // get the start time for this job
  final long startTime = System.currentTimeMillis();

  // submit the job, then poll for progress until the job is complete
  JobClient.runJob(jobConf);

  // get the end time and compute the duration of this job
  final long endTime = System.currentTimeMillis();
  final double duration = (endTime - startTime) / 1000.0;
  System.out.println("Job Finished in " + duration + " seconds");

  return 0;
}
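// A minimal sketch of a mapper compatible with the job configuration above, using the old
// org.apache.hadoop.mapred API. The real CIMapper belongs to the surrounding project and is
// not shown here; the input key/value types (LongWritable/Text) are assumptions, since they
// depend on the Individuals input format. Only the output types (Text, BooleanWritable) and
// the "popsize"/"dataset" parameters are fixed by the JobConf built in run().
public static class SketchMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, BooleanWritable> {

  private int popsize;
  private String dataset;

  @Override
  public void configure(JobConf job) {
    // read back the parameters published via jobConf.setInt/setStrings in run()
    popsize = job.getInt("popsize", 0);
    dataset = job.get("dataset");
  }

  @Override
  public void map(LongWritable key, Text value,
      OutputCollector<Text, BooleanWritable> output, Reporter reporter) throws IOException {
    // hypothetical predicate; the real evaluation logic lives in CIMapper
    boolean matches = value.toString().contains(dataset);
    output.collect(value, new BooleanWritable(matches));
  }
}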
/** Sets the {@link MapRunner} implementation for the job and uses it to locate the job jar. */
public JobBuilder mapRunner(Class<? extends MapRunner> mapRunner) throws IOException {
  _jobConf.setMapRunnerClass(mapRunner);
  _jobConf.setJarByClass(mapRunner);
  return this;
}
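// A minimal usage sketch for the builder method above, written as comments because the
// surrounding JobBuilder class is only partially shown. MyMapRunner stands in for a
// project-specific MapRunner subclass and the JobBuilder constructor is assumed; neither
// appears in this snippet. Note that setJarByClass(mapRunner) resolves the job jar from the
// runner class, so the argument should be a class packaged in the job's own jar (a stock
// Hadoop class such as MultithreadedMapRunner would point the jar at the Hadoop libraries
// instead).
//
//   JobBuilder builder = new JobBuilder(conf);   // hypothetical constructor
//   builder.mapRunner(MyMapRunner.class);        // sets the runner and the job jar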
public static void main(String[] args) {
  String accessKey = args[0];
  String secretKey = args[1];

  String paths[] = {
    // "2008/06",
    // "2008/07",
    // "2008/08",
    // "2008/09",
    // "2008/10",
    // "2008/11",
    "2009"
  };

  for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {
    LOG.info("Processing Path:" + paths[pathIndex]);

    JobConf job = new JobConf(S3GetMetdataJob.class);

    Path tempDir = new Path(
        job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
    System.out.println("Output Path is:" + tempDir);

    job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

    // set up S3 source properties
    JetS3tARCSource.setMaxRetries(job, 1);
    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
    ARCSplitCalculator.setFilesPerSplit(job, 25);
    // set up arc reader properties
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and set up arc source for ARCInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // and set up input format ...
    job.setInputFormat(ARCInputFormat.class);
    // set map runner ...
    job.setMapRunnerClass(S3GetMetdataJob.class);
    // set up reducer (identity in this case ...)
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    FileOutputFormat.setOutputPath(job, tempDir);
    // map output types
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlURLMetadata.class);
    // reduce output types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlURLMetadata.class);
    // double the number of reducers ...
    // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

    // run the job ...
    try {
      LOG.info("Starting Job:" + job.getJobName());
      JobClient.runJob(job);
      LOG.info("Finished Job:" + job.getJobName());

      Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
      LOG.info("Copying Job Output to:" + finalPath);

      FileSystem fs = FileSystem.get(job);
      try {
        fs.mkdirs(finalPath.getParent());
        fs.rename(tempDir, finalPath);
        LOG.info("Copied Job Output to:" + finalPath);
      } finally {
        // fs.close();
      }
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      e.printStackTrace();
    }
  }
}
@SuppressWarnings("static-access") @Override public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("tmp output directory") .create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path") .hasArg() .withDescription("index file") .create(INDEX_FILE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } JobConf conf = new JobConf(getConf(), BuildWikipediaForwardIndex.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - index file: " + indexFile); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName( String.format( "BuildWikipediaForwardIndex[%s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already fs.delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.findCounter(Blocks.Total).getCounter(); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF("edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex"); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } return 0; }