/**
 * Configures and runs the "UFO count" Avro MapReduce job.
 *
 * <p>Generic Hadoop options are stripped from {@code args}; exactly two arguments must remain:
 * the input path and the output path. An existing output path is deleted before the job is
 * submitted. The input schema is read from the {@code ufo.avsc} classpath resource.
 *
 * @param args generic Hadoop options followed by {@code <in> <out>}
 * @return 0 after the job has been run, or 2 when the remaining argument count is not two
 * @throws Exception if the {@code ufo.avsc} resource is missing or unreadable, or if any of the
 *     Hadoop/Avro calls made here throws
 */
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("UFO count");

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: avro UFO counter <in> <out>");
    // Report the usage error through the return value rather than System.exit(), so the
    // caller decides what to do with the status code and the JVM is not killed from here.
    return 2;
  }

  FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  FileOutputFormat.setOutputPath(conf, outputPath);
  // Clear any previous output. The explicit two-argument form replaces the deprecated
  // one-argument delete(Path).
  outputPath.getFileSystem(conf).delete(outputPath, true);

  // Load the input schema from the classpath, failing with a clear message when the resource
  // is absent, and always releasing the stream.
  java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
  if (schemaStream == null) {
    throw new IllegalStateException(
        "Schema resource ufo.avsc not found relative to " + getClass().getName());
  }
  Schema input_schema;
  try {
    input_schema = new Schema.Parser().parse(schemaStream);
  } finally {
    schemaStream.close();
  }

  AvroJob.setInputSchema(conf, input_schema);
  AvroJob.setMapOutputSchema(
      conf,
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
  AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);

  AvroJob.setMapperClass(conf, AvroRecordMapper.class);
  AvroJob.setReducerClass(conf, AvroRecordReducer.class);
  conf.setInputFormat(AvroInputFormat.class);

  JobClient.runJob(conf);
  return 0;
}
public class AvroMR extends Configured implements Tool { public static final Schema PAIR_SCHEMA = Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)); public static final Schema OUTPUT_SCHEMA = ReflectData.get().getSchema(UFORecord.class); @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("UFO count"); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: avro UFO counter <in> <out>"); System.exit(2); } FileInputFormat.addInputPath(conf, new Path(otherArgs[0])); Path outputPath = new Path(otherArgs[1]); FileOutputFormat.setOutputPath(conf, outputPath); outputPath.getFileSystem(conf).delete(outputPath); Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc")); AvroJob.setInputSchema(conf, input_schema); AvroJob.setMapOutputSchema( conf, Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG))); AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA); AvroJob.setMapperClass(conf, AvroRecordMapper.class); AvroJob.setReducerClass(conf, AvroRecordReducer.class); conf.setInputFormat(AvroInputFormat.class); JobClient.runJob(conf); return 0; } public static class AvroRecordMapper extends AvroMapper<GenericRecord, Pair<Utf8, Long>> { @Override public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter) throws IOException { Pair<Utf8, Long> p = new Pair<Utf8, Long>(PAIR_SCHEMA); Utf8 shape = (Utf8) in.get("shape"); if (shape != null) { p.set(shape, 1L); collector.collect(p); } } } public static class AvroRecordReducer extends AvroReducer<Utf8, Long, GenericRecord> { public void reduce( Utf8 key, Iterable<Long> values, AvroCollector<GenericRecord> collector, Reporter reporter) throws IOException { long sum = 0; for (Long val : values) { sum += val; } GenericRecord value = new GenericData.Record(OUTPUT_SCHEMA); 
value.put("shape", key); value.put("count", sum); collector.collect(value); } } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new AvroMR(), args); System.exit(res); } }
/**
 * Runs the page-summary job for the current iteration.
 *
 * <p>When the step is already finished, only the unforwarded counts are loaded and 0 is
 * returned. Otherwise the step is reset and a job is configured: iteration 0 reads
 * {@code <page>} elements through {@code XmlInputFormat} with {@code InitialMapper}; later
 * iterations read the {@code pageSummary_<n-1>} directory under the working directory with
 * {@code SubsequentMapper}.
 *
 * @param args arguments handed to {@code DumpExtractor2.configureJob}
 * @return 0 when the step was already finished or the job state is {@code SUCCEEDED}
 * @throws UncompletedStepException if the job ends in any state other than {@code SUCCEEDED}
 * @throws IOException propagated from the calls made while configuring or running the job
 */
@Override
public int run(String[] args) throws UncompletedStepException, IOException {
  // Nothing to execute for a completed step; just restore its counts.
  if (isFinished()) {
    loadUnforwardedCounts();
    return 0;
  }
  reset();

  JobConf jobConf = new JobConf(PageSummaryStep.class);
  DumpExtractor2.configureJob(jobConf, args);

  final int iteration = getIteration();
  jobConf.setJobName("WM: page summary (" + iteration + ")");

  if (iteration != 0) {
    // Later passes consume the Avro pairs written by the previous pass.
    AvroJob.setMapperClass(jobConf, SubsequentMapper.class);
    AvroJob.setInputSchema(
        jobConf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));
    String previousOutput =
        getWorkingDir() + Path.SEPARATOR + "pageSummary_" + (iteration - 1);
    FileInputFormat.setInputPaths(jobConf, previousOutput);
  } else {
    // First pass reads the raw dump, split on <page> ... </page>.
    jobConf.setMapperClass(InitialMapper.class);
    jobConf.setOutputKeyClass(AvroKey.class);
    jobConf.setOutputValueClass(AvroValue.class);

    jobConf.setInputFormat(XmlInputFormat.class);
    jobConf.set(XmlInputFormat.START_TAG_KEY, "<page>");
    jobConf.set(XmlInputFormat.END_TAG_KEY, "</page>");
    FileInputFormat.setInputPaths(jobConf, jobConf.get(DumpExtractor.KEY_INPUT_FILE));

    Path sentenceModel = new Path(jobConf.get(DumpExtractor.KEY_SENTENCE_MODEL));
    DistributedCache.addCacheFile(sentenceModel.toUri(), jobConf);
  }

  // Cache files shared by every iteration, registered in the same order as before.
  Path siteInfo =
      new Path(jobConf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/" + DumpExtractor.OUTPUT_SITEINFO);
  DistributedCache.addCacheFile(siteInfo.toUri(), jobConf);
  Path langFile = new Path(jobConf.get(DumpExtractor.KEY_LANG_FILE));
  DistributedCache.addCacheFile(langFile.toUri(), jobConf);

  AvroJob.setCombinerClass(jobConf, Combiner.class);
  AvroJob.setReducerClass(jobConf, Reducer.class);
  AvroJob.setOutputSchema(
      jobConf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

  FileOutputFormat.setOutputPath(jobConf, getDir());

  RunningJob job = JobClient.runJob(jobConf);
  if (job.getJobState() != JobStatus.SUCCEEDED) {
    throw new UncompletedStepException();
  }
  finish(job);
  return 0;
}