public void inject(Path crawlDb, Path urlDir) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: starting at " + sdf.format(start));
    LOG.info("Injector: crawlDb: " + crawlDb);
    LOG.info("Injector: urlDir: " + urlDir);
  }

  Path tempDir =
      new Path(
          getConf().get("mapred.temp.dir", ".")
              + "/inject-temp-"
              + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  // map text input file to a <url,CrawlDatum> file
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Converting injected urls to crawl db entries.");
  }
  JobConf sortJob = new NutchJob(getConf());
  sortJob.setJobName("inject " + urlDir);
  FileInputFormat.addInputPath(sortJob, urlDir);
  sortJob.setMapperClass(InjectMapper.class);

  FileOutputFormat.setOutputPath(sortJob, tempDir);
  sortJob.setOutputFormat(SequenceFileOutputFormat.class);
  sortJob.setOutputKeyClass(Text.class);
  sortJob.setOutputValueClass(CrawlDatum.class);
  sortJob.setLong("injector.current.time", System.currentTimeMillis());
  RunningJob mapJob = JobClient.runJob(sortJob);

  long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
  long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
  LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
  LOG.info(
      "Injector: total number of urls injected after normalization and filtering: "
          + urlsInjected);

  // merge with existing crawl db
  if (LOG.isInfoEnabled()) {
    LOG.info("Injector: Merging injected urls into crawl db.");
  }
  JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
  FileInputFormat.addInputPath(mergeJob, tempDir);
  mergeJob.setReducerClass(InjectReducer.class);
  JobClient.runJob(mergeJob);
  CrawlDb.install(mergeJob, crawlDb);

  // clean up
  FileSystem fs = FileSystem.get(getConf());
  fs.delete(tempDir, true);

  long end = System.currentTimeMillis();
  LOG.info(
      "Injector: finished at "
          + sdf.format(end)
          + ", elapsed: "
          + TimingUtil.elapsedTime(start, end));
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.printf(
        "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
    ToolRunner.printGenericCommandUsage(System.err);
    return -1;
  }

  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("Max temperature");

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(MaxTemperatureMapper.class);
  conf.setCombinerClass(MaxTemperatureReducer.class);
  conf.setReducerClass(MaxTemperatureReducer.class);

  // vv MaxTemperatureDriverV6
  conf.setProfileEnabled(true);
  conf.setProfileParams(
      "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
  conf.setProfileTaskRange(true, "0-2");
  // ^^ MaxTemperatureDriverV6

  JobClient.runJob(conf);
  return 0;
}
public static void runJob(String[] args) {
  JobConf conf = new JobConf(CassandraBulkLoader.class);

  if (args.length >= 4) {
    conf.setNumReduceTasks(Integer.parseInt(args[3]));
  }

  try {
    // We store the cassandra storage-conf.xml on the HDFS cluster
    DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
  } catch (URISyntaxException e) {
    throw new RuntimeException(e);
  }
  conf.setInputFormat(KeyValueTextInputFormat.class);
  conf.setJobName("CassandraBulkLoader_v2");
  conf.setMapperClass(Map.class);
  conf.setReducerClass(Reduce.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, new Path(args[1]));
  FileOutputFormat.setOutputPath(conf, new Path(args[2]));
  try {
    JobClient.runJob(conf);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/** Test getSplits */
@Test
@SuppressWarnings("unchecked")
public void testSplits() throws IOException {
  JobConf job = new JobConf(defaultConf);
  localFs.delete(workDir, true);
  writeFile(
      localFs,
      new Path(workDir, "test.txt"),
      "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  FileInputFormat.setInputPaths(job, workDir);
  CombineFileInputFormat format =
      new CombineFileInputFormat() {
        @Override
        public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
          return new CombineFileRecordReader(
              job, (CombineFileSplit) split, reporter, CombineFileRecordReader.class);
        }
      };
  final int SIZE_SPLITS = 1;
  LOG.info("Trying to getSplits with splits = " + SIZE_SPLITS);
  InputSplit[] splits = format.getSplits(job, SIZE_SPLITS);
  LOG.info("Got getSplits = " + splits.length);
  assertEquals("splits == " + SIZE_SPLITS, SIZE_SPLITS, splits.length);
}
private String runJob() throws Exception {
  OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
  Writer wr = new OutputStreamWriter(os);
  wr.write("hello1\n");
  wr.write("hello2\n");
  wr.write("hello3\n");
  wr.close();

  JobConf conf = createJobConf();
  conf.setJobName("mr");
  conf.setJobPriority(JobPriority.HIGH);

  conf.setInputFormat(TextInputFormat.class);

  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setOutputFormat(TextOutputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
  conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

  FileInputFormat.setInputPaths(conf, getInputDir());
  FileOutputFormat.setOutputPath(conf, getOutputDir());

  return JobClient.runJob(conf).getID().toString();
}
public static void runSortJob(String... args) throws Exception {
  Path input = new Path(args[0]);
  Path output = new Path(args[1]);

  JobConf job = new JobConf();
  job.setNumReduceTasks(2);

  job.setInputFormat(KeyValueTextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setJarByClass(SampleJob.class);

  output.getFileSystem(job).delete(output, true);

  JobClient jc = new JobClient(job);
  JobClient.setTaskOutputFilter(job, JobClient.TaskStatusFilter.ALL);
  RunningJob rj = jc.submitJob(job);
  try {
    if (!jc.monitorAndPrintJob(job, rj)) {
      System.out.println("Job Failed: " + rj.getFailureInfo());
      throw new IOException("Job failed!");
    }
  } catch (InterruptedException ie) {
    Thread.currentThread().interrupt();
  }
}
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("UFO count");

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: avro UFO counter <in> <out>");
    System.exit(2);
  }

  FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  FileOutputFormat.setOutputPath(conf, outputPath);
  outputPath.getFileSystem(conf).delete(outputPath, true);

  Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc"));
  AvroJob.setInputSchema(conf, input_schema);
  AvroJob.setMapOutputSchema(
      conf,
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
  AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
  AvroJob.setMapperClass(conf, AvroRecordMapper.class);
  AvroJob.setReducerClass(conf, AvroRecordReducer.class);
  conf.setInputFormat(AvroInputFormat.class);

  JobClient.runJob(conf);
  return 0;
}
@Override
public int run(String[] args) throws Exception {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(conf, true);
  SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

  conf.setPartitionerClass(TotalOrderPartitioner.class);

  InputSampler.Sampler<IntWritable, Text> sampler =
      new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

  Path input = FileInputFormat.getInputPaths(conf)[0];
  input = input.makeQualified(input.getFileSystem(conf));

  Path partitionFile = new Path(input, "_partitions");
  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  InputSampler.writePartitionFile(conf, sampler);

  // Add to DistributedCache
  URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
  DistributedCache.addCacheFile(partitionUri, conf);
  DistributedCache.createSymlink(conf);

  JobClient.runJob(conf);
  return 0;
}
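// The total-order-sort driver above relies on a JobBuilder.parseInputAndOutput helper that is
// not shown in this section. The sketch below is an assumption about what such a helper could
// look like, based only on how the driver uses it (returns null after printing usage when the
// <input> <output> arguments are missing); it is not the original implementation.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobBuilder {
  // Hypothetical helper: validates the two positional arguments and returns a configured
  // JobConf, or null when the arguments are wrong so the caller can exit with an error code.
  public static JobConf parseInputAndOutput(Tool tool, Configuration conf, String[] args) {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [genericOptions] <input> <output>\n", tool.getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return null;
    }
    JobConf jobConf = new JobConf(conf, tool.getClass());
    FileInputFormat.addInputPath(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    return jobConf;
  }
}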
/** Test using the gzip codec for reading */
@Test
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(
      localFs,
      new Path(workDir, "part1.txt.gz"),
      gzip,
      "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
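// Several of the TextInputFormat tests in this section (testSplits, testGzip, testGzipEmpty)
// call writeFile and readSplit helpers that are defined elsewhere in the test class. The
// following is a minimal sketch of what such helpers could look like; the exact signatures are
// assumptions inferred from how the tests invoke them, not the original helpers.
private static void writeFile(FileSystem fs, Path name, String contents) throws IOException {
  writeFile(fs, name, null, contents);
}

private static void writeFile(
    FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
  OutputStream stm = fs.create(name);
  if (codec != null) {
    stm = codec.createOutputStream(stm); // wrap the stream so the bytes are written compressed
  }
  stm.write(contents.getBytes("UTF-8"));
  stm.close();
}

private static List<Text> readSplit(TextInputFormat format, InputSplit split, JobConf job)
    throws IOException {
  List<Text> result = new ArrayList<Text>();
  RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  while (reader.next(key, value)) {
    result.add(value);
    value = reader.createValue(); // use a fresh Text so earlier list entries are not overwritten
  }
  reader.close();
  return result;
}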
public void testComplexNameWithRegex() throws Exception {
  OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
  Writer wr = new OutputStreamWriter(os);
  wr.write("b a\n");
  wr.close();

  JobConf conf = createJobConf();
  conf.setJobName("name \\Evalue]");

  conf.setInputFormat(TextInputFormat.class);

  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(IdentityMapper.class);

  FileInputFormat.setInputPaths(conf, getInputDir());
  FileOutputFormat.setOutputPath(conf, getOutputDir());

  JobClient.runJob(conf);

  Path[] outputFiles =
      FileUtil.stat2Paths(getFileSystem().listStatus(getOutputDir(), new OutputLogFilter()));
  assertEquals(1, outputFiles.length);
  InputStream is = getFileSystem().open(outputFiles[0]);
  BufferedReader reader = new BufferedReader(new InputStreamReader(is));
  assertEquals("0\tb a", reader.readLine());
  assertNull(reader.readLine());
  reader.close();
}
private static IntWritable deduceInputFile(JobConf job) {
  Path[] inputPaths = FileInputFormat.getInputPaths(job);
  Path inputFile = new Path(job.get("map.input.file"));

  // value == one for sort-input; value == two for sort-output
  return (inputFile.getParent().equals(inputPaths[0])) ? sortInput : sortOutput;
}
public static void main(String[] args) throws Exception {
  int i;
  String outPath;
  int numMaps = 0, numReds = 0;

  List<String> other_args = new ArrayList<String>();
  for (i = 0; i < args.length; ++i) {
    try {
      if ("-m".equals(args[i])) {
        numMaps = Integer.parseInt(args[++i]);
      } else if ("-r".equals(args[i])) {
        numReds = Integer.parseInt(args[++i]);
      } else {
        other_args.add(args[i]);
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + args[i]);
      printUsage();
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      printUsage(); // exits
    }
  }
  // Make sure there are exactly 2 parameters left.
  if (other_args.size() != 2) {
    System.out.println(
        "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
    printUsage();
  }

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  Date startIteration;
  Date endIteration;

  JobConf conf = new JobConf(Kmeans.class);
  conf.setJobName("kmeans");
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(ClusterWritable.class);
  conf.setMapperClass(MapClass.class);
  conf.setReducerClass(Reduce.class);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);
  FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
  outPath = other_args.get(1);
  FileOutputFormat.setOutputPath(conf, new Path(outPath));

  startIteration = new Date();
  JobClient.runJob(conf);
  endIteration = new Date();
  System.out.println(
      "The iteration took "
          + (endIteration.getTime() - startIteration.getTime()) / 1000
          + " seconds.");
}
public static void main(String[] args) throws IOException {
  if (args.length < 2) {
    System.out.println("args not right!");
    return;
  }

  JobConf conf = new JobConf(IpCount1.class);

  // set output key class
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  // set mapper & reducer class
  conf.setMapperClass(IpCounterMapper.class);
  conf.setCombinerClass(IpCounterReducer.class);
  conf.setReducerClass(IpCounterReducer.class);

  // set format
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  String inputDir = args[0];
  String outputDir = args[1];

  // FileInputFormat.setInputPaths(conf, "/user/hadoop/rongxin/locationinput/");
  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));

  boolean flag = JobClient.runJob(conf).isSuccessful();
}
/** Configure a job given argv. */
public static boolean parseArgs(String[] argv, JobConf job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for (int i = 0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " + argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-m".equals(argv[i])) {
        job.setNumMapTasks(Integer.parseInt(argv[++i]));
      } else if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormat(Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormat(Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.set(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.REDUCE_PRESERVE_PERCENT,
            argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.setClass(
            org.apache.hadoop.mapreduce.GenericMRLoadGenerator.INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormat(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException) new IOException().initCause(e);
    }
  }
  return true;
}
/**
 * @param process
 * @param tap
 * @param conf
 */
@Override
public void sourceConfInit(
    FlowProcess<JobConf> process, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {
  MongoConfigUtil.setReadSplitsFromShards(conf, true);
  MongoConfigUtil.setInputURI(conf, this.mongoUri);
  FileInputFormat.setInputPaths(conf, this.getIdentifier());
  conf.setInputFormat(MongoInputFormat.class);

  // TODO: MongoConfigUtil.setFields(conf, fieldsBson);
  // if (!this.query.isEmpty()) MongoConfigUtil.setQuery(conf, this.query);
  // TODO: MongoConfigUtil.setFields(conf, fields);
}
private static void runIOTest(Class<? extends Mapper> mapperClass, Path outputDir)
    throws IOException {
  JobConf job = new JobConf(fsConfig, DFSCIOTest.class);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(mapperClass);
  job.setReducerClass(AccumulatingReducer.class);

  FileOutputFormat.setOutputPath(job, outputDir);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(Add1.class);
  conf.setJobName("sumar1");

  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
/** Test using the gzip codec and an empty input file */
@Test
public void testGzipEmpty() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "empty.gz"), gzip, "");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals(
      "Compressed files of length 0 are not returned from FileInputFormat.getSplits().",
      1,
      splits.length);
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("Compressed empty file length == 0", 0, results.size());
}
// Main function
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(ProcessUnits.class);
  conf.setJobName("max_electricityunits");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(EE_Mapper.class);
  conf.setCombinerClass(EE_Reducer.class);
  conf.setReducerClass(EE_Reducer.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
public static void main(String[] args) throws IOException {
  JobConf job = new JobConf(WordCountJob.class);
  job.setJobName("Word Count Example");

  FileInputFormat.setInputPaths(job, args[0]);
  job.setInputFormat(TextInputFormat.class);

  job.setMapperClass(MapTask.class);
  job.setCombinerClass(ReduceTask.class);
  job.setReducerClass(ReduceTask.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setOutputFormat(TextOutputFormat.class);

  FileOutputFormat.setOutputPath(job, new Path(args[1]));

  JobClient.runJob(job);
}
public static void seekTest(FileSystem fs, boolean fastCheck) throws Exception {
  fs.delete(READ_DIR, true);

  JobConf job = new JobConf(conf, TestFileSystem.class);
  job.setBoolean("fs.test.fastCheck", fastCheck);

  FileInputFormat.setInputPaths(job, CONTROL_DIR);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(SeekMapper.class);
  job.setReducerClass(LongSumReducer.class);

  FileOutputFormat.setOutputPath(job, READ_DIR);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);
  job.setNumReduceTasks(1);
  JobClient.runJob(job);
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(WordCount.class);
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
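// The word-count driver above wires up Map and Reduce classes that are not shown in this
// section. The sketch below is the conventional old-API (org.apache.hadoop.mapred) word-count
// mapper and reducer, offered as an assumed illustration of what those classes typically look
// like; it is not necessarily the original author's implementation.
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

public static class Map extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  public void map(
      LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
      throws IOException {
    // Split each input line into tokens and emit <word, 1> for every token.
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      output.collect(word, ONE);
    }
  }
}

public static class Reduce extends MapReduceBase
    implements Reducer<Text, IntWritable, Text, IntWritable> {
  public void reduce(
      Text key,
      Iterator<IntWritable> values,
      OutputCollector<Text, IntWritable> output,
      Reporter reporter)
      throws IOException {
    // Sum the counts for each word; the same class also serves as the combiner in the driver.
    int sum = 0;
    while (values.hasNext()) {
      sum += values.next().get();
    }
    output.collect(key, new IntWritable(sum));
  }
}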
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), XiangLi1_exercise3.class);
  conf.setJobName("xiangli1_exercise3");
  conf.setNumReduceTasks(0);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(Map.class);
  // conf.setCombinerClass(Reduce.class);
  // conf.setReducerClass(Reduce.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));
  JobClient.runJob(conf);
  return 0;
}
public static void main(String[] args) throws IOException {
  if (args.length != 2) {
    System.err.println("Usage: OldMaxTemperature <input path> <output path>");
    System.exit(-1);
  }

  /*[*/ JobConf conf = new JobConf(OldMaxTemperature.class); /*]*/
  /*[*/ conf /*]*/.setJobName("Max temperature");

  FileInputFormat.addInputPath(/*[*/ conf /*]*/, new Path(args[0]));
  FileOutputFormat.setOutputPath(/*[*/ conf /*]*/, new Path(args[1]));

  /*[*/ conf /*]*/.setMapperClass(OldMaxTemperatureMapper.class);
  /*[*/ conf /*]*/.setReducerClass(OldMaxTemperatureReducer.class);

  /*[*/ conf /*]*/.setOutputKeyClass(Text.class);
  /*[*/ conf /*]*/.setOutputValueClass(IntWritable.class);

  /*[*/ JobClient.runJob(conf); /*]*/
}
@Override
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  FileSystem fs = FileSystem.get(conf);
  JobConf job = new JobConf(conf);
  job.setJarByClass(Jacobi.class);

  fs.delete(new Path("curX"), true);

  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(job, new Path("preX"));
  FileOutputFormat.setOutputPath(job, new Path("curX"));

  JobClient.runJob(job);
  return 1;
}
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), Sqrt2.class);
  conf.setJobName("sqrt2");
  conf.setOutputKeyClass(DoubleWritable.class);
  conf.setOutputValueClass(DoubleWritable.class);
  conf.setMapperClass(Map.class);
  /*conf.setCombinerClass(Reduce.class);*/
  conf.setReducerClass(Reduce.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
  return 0;
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(Main.class);
  conf.setJobName("feels-analysis");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(TheOutputClass.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);
  conf.setNumReduceTasks(1);
  conf.setInputFormat(CSVTextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  // TODO: determine whether we need extra output
  MultipleOutputs.addMultiNamedOutput(
      conf, SECOND_OUTPUT, TextOutputFormat.class, Text.class, TheOutputClass.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(NeighborSearch.class);
  conf.setJobName("star searching");

  conf.setOutputKeyClass(BlockIDWritable.class);
  conf.setOutputValueClass(PairWritable.class);

  conf.setMapperClass(Map.class);
  // conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);
  // conf.setPartitionerClass(BlockPartitioner.class);
  // conf.setFloat("mapred.reduce.slowstart.completed.maps", (float) 1.0);

  conf.setInputFormat(StarInputFormat.class);
  conf.setOutputFormat(StarOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
@Test
public void testFormat() throws Exception {
  JobConf job = new JobConf(defaultConf);
  Path file = new Path(workDir, "test.txt");

  // A reporter that does nothing
  Reporter reporter = Reporter.NULL;

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    LOG.debug("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
      LOG.debug("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      LOG.debug("splitting: got = " + splits.length);
      if (length == 0) {
        assertEquals(
            "Files of length 0 are not returned from FileInputFormat.getSplits().",
            1,
            splits.length);
        assertEquals("Empty file length == 0", 0, splits[0].getLength());
      }

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);
        RecordReader<LongWritable, Text> reader =
            format.getRecordReader(splits[j], job, reporter);
        try {
          int count = 0;
          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);
            if (bits.get(v)) {
              LOG.warn(
                  "conflict with " + v + " in split " + j + " at position " + reader.getPos());
            }
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }
          LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
@Test
public void testSplitableCodecs() throws IOException {
  JobConf conf = new JobConf(defaultConf);
  int seed = new Random().nextInt();

  // Create the codec
  CompressionCodec codec = null;
  try {
    codec =
        (CompressionCodec)
            ReflectionUtils.newInstance(
                conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException("Illegal codec!");
  }
  Path file = new Path(workDir, "test" + codec.getDefaultExtension());

  // A reporter that does nothing
  Reporter reporter = Reporter.NULL;
  LOG.info("seed = " + seed);
  Random random = new Random(seed);
  FileSystem localFs = FileSystem.getLocal(conf);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(conf, workDir);

  final int MAX_LENGTH = 500000;

  // for a variety of lengths
  for (int length = MAX_LENGTH / 2;
      length < MAX_LENGTH;
      length += random.nextInt(MAX_LENGTH / 4) + 1) {
    LOG.info("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(conf);
    LongWritable key = new LongWritable();
    Text value = new Text();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
      LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(conf, numSplits);
      LOG.info("splitting: got = " + splits.length);

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        LOG.debug("split[" + j + "]= " + splits[j]);
        RecordReader<LongWritable, Text> reader =
            format.getRecordReader(splits[j], conf, reporter);
        try {
          int counter = 0;
          while (reader.next(key, value)) {
            int v = Integer.parseInt(value.toString());
            LOG.debug("read " + v);
            if (bits.get(v)) {
              LOG.warn(
                  "conflict with " + v + " in split " + j + " at position " + reader.getPos());
            }
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            counter++;
          }
          if (counter > 0) {
            LOG.info("splits[" + j + "]=" + splits[j] + " count=" + counter);
          } else {
            LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + counter);
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}