public int run(String[] args) throws Exception { if (args.length != 5) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int mapTasks = Integer.parseInt(args[2]); int reduceTasks = Integer.parseInt(args[3]); String stoplistPath = args[4]; sLogger.info("Tool: AFormatter"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - number of mappers: " + mapTasks); sLogger.info(" - number of reducers: " + reduceTasks); JobConf conf = new JobConf(AFormatterWG.class); conf.setJobName("Authority Formatter -- Web Graph"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); // conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(HITSNode.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setCompressMapOutput(true); conf.setSpeculativeExecution(false); // InputSampler.Sampler<IntWritable, Text> sampler = new // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10); // InputSampler.writePartitionFile(conf, sampler); // conf.setPartitionerClass(TotalOrderPartitioner.class); conf.setMapperClass(AFormatMapperIMC.class); conf.setCombinerClass(AFormatReducer.class); conf.setReducerClass(AFormatReducer.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); Path stopList = new Path(stoplistPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); sLogger.info("Starting job"); DistributedCache.addCacheFile(stopList.toUri(), conf); JobClient.runJob(conf); sLogger.info( "Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
/** * This method call when injected into a class will modify the output path, only if output is into * HDFS * * @param job Job whose output path need to be changed */ public static void modifyOutputPath(JobConf job) { Path path = org.apache.hadoop.mapred.FileOutputFormat.getOutputPath(job); if (path == null) { throw new IllegalArgumentException("Job Output path is null, expecting not null path value"); } StringBuilder out = new StringBuilder(path.toString()); out.append(SEPARATOR_UNDERSCORE).append(System.currentTimeMillis()); org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, new Path(out.toString())); }
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage(); return 1; } JobConf job = new JobConf(getConf(), MultiFileWordCount.class); job.setJobName("MultiFileWordCount"); // set the InputFormat of the job to our InputFormat job.setInputFormat(MyInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(IntWritable.class); // use the defined mapper job.setMapperClass(MapClass.class); // use the WordCount Reducer job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); JobClient.runJob(job); return 0; }
public static void htableFile() throws Exception { Job job = new Job(conf, "ExampleSummaryToFile"); job.setJarByClass(HbaseMR.class); // class that contains mapper Scan scan = new Scan(); scan.setCaching(500); // 1 is the default in Scan, which will be bad for // MapReduce jobs scan.setCacheBlocks(false); // don't set to true for MR jobs // set other scan attrs TableMapReduceUtil.initTableMapperJob( "sourceTable", // input table scan, // Scan instance to control CF and attribute selection MyMapper.class, // mapper class Text.class, // mapper output key IntWritable.class, // mapper output value job); job.setReducerClass(MyReducer4.class); // reducer class job.setNumReduceTasks(1); // at least one, adjust as required FileOutputFormat.setOutputPath(new JobConf(conf), new Path("/tmp/mr/mySummaryFile")); // adjust // directories // as // required boolean b = job.waitForCompletion(true); if (!b) { throw new IOException("error with job!"); } }
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("UFO count"); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: avro UFO counter <in> <out>"); System.exit(2); } FileInputFormat.addInputPath(conf, new Path(otherArgs[0])); Path outputPath = new Path(otherArgs[1]); FileOutputFormat.setOutputPath(conf, outputPath); outputPath.getFileSystem(conf).delete(outputPath); Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc")); AvroJob.setInputSchema(conf, input_schema); AvroJob.setMapOutputSchema( conf, Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG))); AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA); AvroJob.setMapperClass(conf, AvroRecordMapper.class); AvroJob.setReducerClass(conf, AvroRecordReducer.class); conf.setInputFormat(AvroInputFormat.class); JobClient.runJob(conf); return 0; }
/** * Run the job * * @param params The Job parameters containing the gramSize, input output folders, defaultCat, * encoding */ public static void runJob(Parameters params) throws IOException { Configurable client = new JobClient(); JobConf conf = new JobConf(BayesClassifierDriver.class); conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath")); conf.setOutputKeyClass(StringTuple.class); conf.setOutputValueClass(DoubleWritable.class); FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath"))); Path outPath = new Path(params.get("testDirPath") + "-output"); FileOutputFormat.setOutputPath(conf, outPath); conf.setInputFormat(KeyValueTextInputFormat.class); conf.setMapperClass(BayesClassifierMapper.class); conf.setCombinerClass(BayesClassifierReducer.class); conf.setReducerClass(BayesClassifierReducer.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.set( "io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); HadoopUtil.overwriteOutput(outPath); conf.set("bayes.parameters", params.toString()); client.setConf(conf); JobClient.runJob(conf); Path outputFiles = new Path(outPath, "part*"); FileSystem dfs = FileSystem.get(outPath.toUri(), conf); ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params); log.info("{}", matrix.summarize()); }
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: Compressible"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); // JobConf conf = new JobConf(Stats.class); JobConf conf = new JobConf(Compressible.class); conf.setJobName("Compressible " + inputPath); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CompressibleMapper.class); conf.setReducerClass(CompressibleReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
public static void main(String[] args) throws Exception { String dir1 = "/user/miyuru/wcout"; String dir2 = "/user/miyuru/notinverts"; // We first delete the temporary directories if they exist on the HDFS FileSystem fs1 = FileSystem.get(new JobConf()); if (fs1.exists(new Path(dir2))) { fs1.delete(new Path(dir2), true); } JobConf conf = new JobConf(); conf.setNumMapTasks(96); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(LongWritable.class); conf.setMapperClass(TokenizerMapper.class); conf.setReducerClass(IntSumReducer.class); conf.setCombinerClass(IntSumReducer.class); conf.setInputFormat(NLinesInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(dir1)); FileOutputFormat.setOutputPath(conf, new Path(dir2)); Job job = new Job(conf, "NotInFinder"); job.setJarByClass(WordCount.class); // job.setMapperClass(TokenizerMapper.class); // job.setCombinerClass(IntSumReducer.class); // job.setReducerClass(IntSumReducer.class); // job.setOutputKeyClass(LongWritable.class); // job.setOutputValueClass(LongWritable.class); job.setSortComparatorClass(SortComparator.class); job.waitForCompletion(true); }
/** * {@inheritDoc} * * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) */ @Override public int run(String[] args) throws Exception { JobConf configuration = new JobConf(getConf(), WordCountExtended.class); configuration.setJobName(JOB_NAME); configuration.setOutputKeyClass(Text.class); configuration.setOutputValueClass(IntWritable.class); configuration.setMapperClass(Map.class); configuration.setCombinerClass(Reduce.class); configuration.setReducerClass(Reduce.class); configuration.setInputFormat(TextInputFormat.class); configuration.setOutputFormat(TextOutputFormat.class); List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { if (JOB_SKIP_ARGUMENT.equals(args[i])) { DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration); configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true); } else { otherArgs.add(args[i]); } } FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0))); FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1))); JobClient.runJob(configuration); return 0; }
public static void main(String[] args) throws Exception { String input = "hdfs://centos:9000/access.log.10"; String output = "hdfs://centos:9000/out_kpitime"; JobConf conf = new JobConf(KPITime.class); conf.setJobName("KPITime"); // conf.addResource("classpath:/hadoop/core-site.xml"); // conf.addResource("classpath:/hadoop/hdfs-site.xml"); // conf.addResource("classpath:/hadoop/mapred-site.xml"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(KPITimeMapper.class); conf.setCombinerClass(KPITimeReducer.class); conf.setReducerClass(KPITimeReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }
public void runParseTest( String fieldTerminator, String lineTerminator, String encloser, String escape, boolean encloseRequired) throws IOException { ClassLoader prevClassLoader = null; String[] argv = getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired); runImport(argv); try { String tableClassName = getTableName(); argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired); SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true); CompilationManager compileMgr = new CompilationManager(opts); String jarFileName = compileMgr.getJarFilename(); // Make sure the user's class is loaded into our address space. prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName); JobConf job = new JobConf(); job.setJar(jarFileName); // Tell the job what class we're testing. job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName); // use local mode in the same JVM. ConfigurationHelper.setJobtrackerAddr(job, "local"); if (!BaseSqoopTestCase.isOnPhysicalCluster()) { job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS); } String warehouseDir = getWarehouseDir(); Path warehousePath = new Path(warehouseDir); Path inputPath = new Path(warehousePath, getTableName()); Path outputPath = new Path(warehousePath, getTableName() + "-out"); job.setMapperClass(ReparseMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); JobClient.runJob(job); } catch (InvalidOptionsException ioe) { fail(ioe.toString()); } catch (ParseException pe) { fail(pe.toString()); } finally { if (null != prevClassLoader) { ClassLoaderStack.setCurrentClassLoader(prevClassLoader); } } }
public static void run(Map<String, String> path) throws IOException { JobConf conf = Recommend.config(); String input = path.get("Step2Input"); String output = path.get("Step2Output"); HdfsDAO hdfs = new HdfsDAO(Recommend.HDFS, conf); hdfs.rmr(output); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Step2_UserVectorToCooccurrenceMapper.class); // conf.setCombinerClass(Step2_UserVectorToConoccurrenceReducer.class); // conf.setReducerClass(Step2_UserVectorToConoccurrenceReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); RunningJob job = JobClient.runJob(conf); while (!job.isComplete()) { job.waitForCompletion(); } }
public static void main(String[] args) throws Exception { String input = "hdfs://192.168.0.110:9000/input/access.log"; String output = "hdfs://192.168.0.110:9000/user/hdfs/pv"; JobConf conf = new JobConf(KPIPV.class); conf.setJobName("KPIPV"); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(KPIPVMapper.class); conf.setCombinerClass(KPIPVReducer.class); conf.setReducerClass(KPIPVReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: BuildGraph"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(BuildGraph.class); conf.setJobName("BuildGraph " + inputPath + " " + ContrailConfig.K); ContrailConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(BuildGraphMapper.class); conf.setReducerClass(BuildGraphReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
/** * Configure the job * * @param conf Job to configure * @param rules classification rules to evaluate * @param target label value to evaluate the rules for * @param inpath input path (the dataset) * @param outpath output <code>Path</code> * @param split DatasetSplit used to separate training and testing input */ private static void configureJob( JobConf conf, List<? extends Rule> rules, int target, Path inpath, Path outpath, DatasetSplit split) { split.storeJobParameters(conf); FileInputFormat.setInputPaths(conf, inpath); FileOutputFormat.setOutputPath(conf, outpath); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(CDFitness.class); conf.setMapperClass(CDMapper.class); conf.setCombinerClass(CDReducer.class); conf.setReducerClass(CDReducer.class); conf.setInputFormat(DatasetTextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); // store the parameters conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules)); conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet())); conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target); }
/** * set up input file which has the list of input files. * * @return boolean * @throws IOException */ private boolean setup() throws IOException { estimateSavings(); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobconf); Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId); LOG.info(JOB_DIR_LABEL + "=" + jobdir); jobconf.set(JOB_DIR_LABEL, jobdir.toString()); Path log = new Path(jobdir, "_logs"); // The control file should have small size blocks. This helps // in spreading out the load from mappers that will be spawned. jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE); FileOutputFormat.setOutputPath(jobconf, log); LOG.info("log=" + log); // create operation list FileSystem fs = jobdir.getFileSystem(jobconf); Path opList = new Path(jobdir, "_" + OP_LIST_LABEL); jobconf.set(OP_LIST_LABEL, opList.toString()); int opCount = 0, synCount = 0; SequenceFile.Writer opWriter = null; try { opWriter = SequenceFile.createWriter( fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE); for (RaidPolicyPathPair p : raidPolicyPathPairList) { // If a large set of files are Raided for the first time, files // in the same directory that tend to have the same size will end up // with the same map. This shuffle mixes things up, allowing a better // mix of files. java.util.Collections.shuffle(p.srcPaths); for (FileStatus st : p.srcPaths) { opWriter.append(new Text(st.getPath().toString()), p.policy); opCount++; if (++synCount > SYNC_FILE_MAX) { opWriter.sync(); synCount = 0; } } } } finally { if (opWriter != null) { opWriter.close(); } fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file } raidPolicyPathPairList.clear(); jobconf.setInt(OP_COUNT_LABEL, opCount); LOG.info("Number of files=" + opCount); jobconf.setNumMapTasks( getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers())); LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks()); return opCount != 0; }
public RunningJob run(String inputPath, String outputPath) throws Exception { JobConf conf = new JobConf(BuildIndex.class); conf.setJobName("BuildIndex"); FileInputFormat.addInputPath(conf, new Path(inputPath)); // multiple path FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(LongWritable.class); conf.setMapOutputValueClass(LongWritable.class); conf.set("delim", delim); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(LongWritable.class); conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo); conf.setMapperClass(BuildIndexMapper.class); conf.setNumReduceTasks(1); conf.setReducerClass(BuildIndexReducer.class); conf.setInputFormat(TextInputFormat.class); // conf.setInputFormat(CustomInputFormat.class); // FileOutputFormat.setCompressOutput(conf,true); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
public static void runJob(String[] args) { JobConf conf = new JobConf(CassandraBulkLoader.class); if (args.length >= 4) { conf.setNumReduceTasks(new Integer(args[3])); } try { // We store the cassandra storage-conf.xml on the HDFS cluster DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf); } catch (URISyntaxException e) { throw new RuntimeException(e); } conf.setInputFormat(KeyValueTextInputFormat.class); conf.setJobName("CassandraBulkLoader_v2"); conf.setMapperClass(Map.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[1])); FileOutputFormat.setOutputPath(conf, new Path(args[2])); try { JobClient.runJob(conf); } catch (IOException e) { throw new RuntimeException(e); } }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf( "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature"); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MaxTemperatureMapper.class); conf.setCombinerClass(MaxTemperatureReducer.class); conf.setReducerClass(MaxTemperatureReducer.class); // vv MaxTemperatureDriverV6 conf.setProfileEnabled(true); conf.setProfileParams( "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s"); conf.setProfileTaskRange(true, "0-2"); // ^^ MaxTemperatureDriverV6 JobClient.runJob(conf); return 0; }
public static void runSortJob(String... args) throws Exception { Path input = new Path(args[0]); Path output = new Path(args[1]); JobConf job = new JobConf(); job.setNumReduceTasks(2); job.setInputFormat(KeyValueTextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(SampleJob.class); output.getFileSystem(job).delete(output, true); JobClient jc = new JobClient(job); JobClient.setTaskOutputFilter(job, JobClient.TaskStatusFilter.ALL); RunningJob rj = jc.submitJob(job); try { if (!jc.monitorAndPrintJob(job, rj)) { System.out.println("Job Failed: " + rj.getFailureInfo()); throw new IOException("Job failed!"); } } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } }
/** * Generate the requested number of file splits, with the filename set to the filename of the * output file. */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { /** 设置输入分片的个数* */ JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; } System.out.println("numMaps-------" + numMaps); InputSplit[] result = new InputSplit[numMaps]; Path outDir = FileOutputFormat.getOutputPath(job); for (int i = 0; i < result.length; ++i) { result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null); } return result; }
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/** * The main driver for word count map/reduce program. Invoke this method to submit the map/reduce * job. * * @throws IOException When there is communication problems with the job tracker. */ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), WordCountSeqOutput.class); conf.setJobName("wordcount_seqOF"); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are counts (ints) // conf.setOutputValueClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(Combiner.class); conf.setReducerClass(Reduce.class); conf.setOutputFormat(SequenceFileOutputFormat.class); // // compress Mapper output // conf.setCompressMapOutput(true); // conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class); // compress final output conf.set("mapred.output.compress", conf.get("mapred.output.compress", "true")); conf.set("mapred.output.compression.type", conf.get("mapred.output.compression.type", "BLOCK")); conf.set( "mapred.output.compression.codec", conf.get("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec")); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { conf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("-r".equals(args[i])) { conf.setNumReduceTasks(Integer.parseInt(args[++i])); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println( "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); return printUsage(); } FileInputFormat.setInputPaths(conf, other_args.get(0)); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); JobClient.runJob(conf); return 0; }
/** * This is the main routine for launching a distributed random write job. It runs 10 maps/node and * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything. * * @throws IOException */ public int run(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: writer <out-dir>"); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path outDir = new Path(args[0]); JobConf job = new JobConf(getConf()); job.setJarByClass(RandomWriter.class); job.setJobName("random-writer"); FileOutputFormat.setOutputPath(job, outDir); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(BytesWritable.class); job.setInputFormat(RandomInputFormat.class); job.setMapperClass(Map.class); job.setReducerClass(IdentityReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); return -2; } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite); } job.setNumMapTasks(numMaps); /** 建议型的 * */ System.out.println("Running " + numMaps + " maps."); // reducer NONE job.setNumReduceTasks(0); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println( "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
/* (non-Javadoc) * @see org.apache.hadoop.chukwa.analysis.HiTune.AnalysisProcessor#run() */ @Override public void run() { // TODO Auto-generated method stub long timestamp = System.currentTimeMillis(); JobConf conf = new JobConf(this.conf, InstrumentDataflow.class); try { conf.setJobName(this.getClass().getSimpleName() + timestamp); conf.setInputFormat(MultiSequenceFileInputFormat.class); conf.setMapperClass(InstrumentDataflow.MapClass.class); conf.setReducerClass(InstrumentDataflow.ReduceClass.class); conf.setOutputKeyClass(Text.class); Class<? extends WritableComparable> outputKeyClass = Class.forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass)) .asSubclass(WritableComparable.class); Class<? extends Writable> outputValueClass = Class.forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass)) .asSubclass(Writable.class); conf.setMapOutputKeyClass(outputKeyClass); conf.setMapOutputValueClass(outputValueClass); conf.setOutputValueClass(TextArrayWritable.class); conf.setOutputFormat(CSVFileOutputFormat.class); String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/" + conf.get(AnalysisProcessorConfiguration.reportfile); String temp_outputPaths = getTempOutputDir(outputPaths); if (this.inputfiles != null) { log.debug("inputPaths:" + inputfiles); FileInputFormat.setInputPaths(conf, inputfiles); FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths)); // FileInputFormat.setInputPathFilter(conf, evtFileFilter.class); // conf.setNumReduceTasks(1); try { JobClient.runJob(conf); moveResults(conf, outputPaths, temp_outputPaths); } catch (IOException e) { // TODO Auto-generated catch block log.warn("For " + getOutputFileName() + " :JOB fails!"); log.warn(e); e.printStackTrace(); this.MOVE_DONE = false; } } else { log.warn("For " + getOutputFileName() + " :No input path!"); } } catch (Exception e) { log.warn("Job preparation failure!"); log.warn(e); e.printStackTrace(); } }
/** * @param process * @param tap * @param conf */ @Override public void sinkConfInit( FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) { conf.setOutputFormat(MongoOutputFormat.class); MongoConfigUtil.setOutputURI(conf, this.mongoUri); FileOutputFormat.setOutputPath(conf, getPath()); }
/** Test compressible {@link GridmixRecord}. */ @Test public void testCompressibleGridmixRecord() throws IOException { JobConf conf = new JobConf(); CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true); CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true); FileSystem lfs = FileSystem.getLocal(conf); int dataSize = 1024 * 1024 * 10; // 10 MB float ratio = 0.357F; // define the test's root temp directory Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp")) .makeQualified(lfs.getUri(), lfs.getWorkingDirectory()); Path tempDir = new Path(rootTempDir, "TestPossiblyCompressibleGridmixRecord"); lfs.delete(tempDir, true); // define a compressible GridmixRecord GridmixRecord record = new GridmixRecord(dataSize, 0); record.setCompressibility(true, ratio); // enable compression conf.setClass(FileOutputFormat.COMPRESS_CODEC, GzipCodec.class, CompressionCodec.class); org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true); // write the record to a file Path recordFile = new Path(tempDir, "record"); OutputStream outStream = CompressionEmulationUtil.getPossiblyCompressedOutputStream(recordFile, conf); DataOutputStream out = new DataOutputStream(outStream); record.write(out); out.close(); outStream.close(); // open the compressed stream for reading Path actualRecordFile = recordFile.suffix(".gz"); InputStream in = CompressionEmulationUtil.getPossiblyDecompressedInputStream(actualRecordFile, conf, 0); // get the compressed file size long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen(); GridmixRecord recordRead = new GridmixRecord(); recordRead.readFields(new DataInputStream(in)); assertEquals( "Record size mismatch in a compressible GridmixRecord", dataSize, recordRead.getSize()); assertTrue( "Failed to generate a compressible GridmixRecord", recordRead.getSize() > compressedFileSize); // check if the record can generate data with the desired compression ratio float seenRatio = ((float) compressedFileSize) / dataSize; assertEquals( CompressionEmulationUtil.standardizeCompressionRatio(ratio), CompressionEmulationUtil.standardizeCompressionRatio(seenRatio), 1.0D); }
private void createPageRankLinksDirectly() throws IOException, URISyntaxException { log.info("Creating PageRank links", null); JobConf job = new JobConf(PagerankData.class); String jobname = "Create pagerank links"; Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME); job.setJobName(jobname); setPageRankLinksOptions(job); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); // job.setMapOutputKeyClass(LongWritable.class); // job.setMapOutputValueClass(Text.class); job.setNumReduceTasks(0); FileInputFormat.setInputPaths(job, dummy.getPath()); job.setInputFormat(NLineInputFormat.class); job.setMapperClass(DummyToPageRankLinksMapper.class); if (options.isSequenceOut()) { job.setOutputFormat(SequenceFileOutputFormat.class); } else { job.setOutputFormat(TextOutputFormat.class); } if (null != options.getCodecClass()) { job.set("mapred.output.compression.type", "BLOCK"); job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK"); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass()); } FileOutputFormat.setOutputPath(job, fout); log.info("Running Job: " + jobname); log.info("Dummy file " + dummy.getPath() + " as input"); log.info("Edges file " + fout + " as output"); JobClient.runJob(job); log.info("Finished Running Job: " + jobname); }
/** Runs this tool. */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String mappingFile = args[2]; LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName()); LOG.info(" - input: " + inputPath); LOG.info(" - output dir: " + outputPath); LOG.info(" - docno mapping file: " + mappingFile); JobConf conf = new JobConf(getConf(), DemoCountTrecDocuments.class); conf.setJobName(DemoCountTrecDocuments.class.getSimpleName()); conf.setNumReduceTasks(0); // Pass in the class name as a String; this is makes the mapper general in being able to load // any collection of Indexable objects that has docid/docno mapping specified by a DocnoMapping // object. conf.set("DocnoMappingClass", TrecDocnoMapping.class.getCanonicalName()); // Put the mapping file in the distributed cache so each map worker will have it. DistributedCache.addCacheFile(new URI(mappingFile), conf); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(TrecDocumentInputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
public static int main(String[] args) throws Exception { int i; String outPath; int numMaps = 0, numReds = 0; List<String> other_args = new ArrayList<String>(); for (i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { numMaps = Integer.parseInt(args[++i]); } else if ("-r".equals(args[i])) { numReds = Integer.parseInt(args[++i]); } else { other_args.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); printUsage(); // exits } } // Make sure there are exactly 2 parameters left. if (other_args.size() != 2) { System.out.println( "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2."); printUsage(); } Date startTime = new Date(); System.out.println("Job started: " + startTime); Date startIteration; Date endIteration; JobConf conf = new JobConf(Kmeans.class); conf.setJobName("kmeans"); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(ClusterWritable.class); conf.setMapperClass(MapClass.class); conf.setReducerClass(Reduce.class); conf.setNumMapTasks(numMaps); conf.setNumReduceTasks(numReds); FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); outPath = new String(other_args.get(1)); FileOutputFormat.setOutputPath(conf, new Path(outPath)); startIteration = new Date(); JobClient.runJob(conf); endIteration = new Date(); System.out.println( "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds."); return 0; }