/**
 * Emits a {@code (shape, 1)} pair for every input record whose "shape" field is set.
 *
 * <p>Records with a null "shape" are skipped and produce no output.
 *
 * @param in the input record; its "shape" field is read and cast to {@link Utf8}
 * @param collector receives one pair per record that has a non-null shape
 * @param reporter unused
 * @throws IOException if the collector fails to accept the pair
 */
@Override
public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter)
    throws IOException {
  Utf8 shape = (Utf8) in.get("shape");
  if (shape == null) {
    // Nothing to count; return before allocating an output pair.
    return;
  }
  // Allocate the pair only when something is actually emitted.
  Pair<Utf8, Long> p = new Pair<Utf8, Long>(PAIR_SCHEMA);
  p.set(shape, 1L);
  collector.collect(p);
}
/**
 * Configures and runs the "UFO count" Avro MapReduce job.
 *
 * <p>Expects exactly two remaining arguments after generic Hadoop options are parsed: the input
 * path and the output path. Any existing output path is deleted before the job is submitted.
 *
 * @param args command-line arguments, possibly including generic Hadoop options
 * @return 0 once the job has been run, or 2 when the argument count is wrong
 * @throws Exception if the schema resource is missing or unreadable, or the job fails
 */
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setJobName("UFO count");

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: avro UFO counter <in> <out>");
    // Report the failure through the Tool exit code instead of killing the JVM here.
    return 2;
  }

  FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
  Path outputPath = new Path(otherArgs[1]);
  FileOutputFormat.setOutputPath(conf, outputPath);
  // Explicitly recursive delete; the single-argument overload is deprecated.
  outputPath.getFileSystem(conf).delete(outputPath, true);

  // Load the input schema from the classpath, failing clearly if it is absent and
  // closing the stream once parsing is done.
  java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
  if (schemaStream == null) {
    throw new IOException("Schema resource ufo.avsc not found for " + getClass().getName());
  }
  Schema input_schema;
  try {
    input_schema = new Schema.Parser().parse(schemaStream);
  } finally {
    schemaStream.close();
  }

  AvroJob.setInputSchema(conf, input_schema);
  AvroJob.setMapOutputSchema(
      conf,
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));
  AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
  AvroJob.setMapperClass(conf, AvroRecordMapper.class);
  AvroJob.setReducerClass(conf, AvroRecordReducer.class);
  conf.setInputFormat(AvroInputFormat.class);

  JobClient.runJob(conf);
  return 0;
}
@Override public void setConf(org.apache.hadoop.conf.Configuration conf) { if (conf == null) return; // you first get a null configuration - ignore that String mos = conf.get(AvroJob.MAP_OUTPUT_SCHEMA); Schema schema = Schema.parse(mos); pair = new Pair<Object, Object>(schema); Schema keySchema = Pair.getKeySchema(schema); final List<Field> fields = keySchema.getFields(); final GenericRecord key = new GenericData.Record(keySchema); projector = new Projector(key, fields); }
@Test @SuppressWarnings("rawtypes") public void testTableOf() throws Exception { AvroType at = Avros.tableOf(Avros.strings(), Avros.strings()); Pair<String, String> j = Pair.of("a", "b"); org.apache.avro.mapred.Pair w = new org.apache.avro.mapred.Pair(at.getSchema()); w.put(0, new Utf8("a")); w.put(1, new Utf8("b")); // TODO update this after resolving the o.a.a.m.Pair.equals issue initialize(at); assertEquals(j, at.getInputMapFn().map(w)); org.apache.avro.mapred.Pair converted = (org.apache.avro.mapred.Pair) at.getOutputMapFn().map(j); assertEquals(w.key(), converted.key()); assertEquals(w.value(), converted.value()); }
/**
 * Emits the input datum keyed by its projection.
 *
 * <p>The shared {@code pair} instance is overwritten with (projected key, original datum) and
 * handed to the collector.
 *
 * @param in the datum to project and forward
 * @param collector receives the populated pair
 * @param reporter unused
 * @throws IOException if the collector fails
 */
public void map(Object in, AvroCollector<Pair<Object, Object>> collector, Reporter reporter)
    throws IOException {
  pair.set(projector.project(in), in);
  collector.collect(pair);
}
public class AvroMR extends Configured implements Tool { public static final Schema PAIR_SCHEMA = Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)); public static final Schema OUTPUT_SCHEMA = ReflectData.get().getSchema(UFORecord.class); @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("UFO count"); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: avro UFO counter <in> <out>"); System.exit(2); } FileInputFormat.addInputPath(conf, new Path(otherArgs[0])); Path outputPath = new Path(otherArgs[1]); FileOutputFormat.setOutputPath(conf, outputPath); outputPath.getFileSystem(conf).delete(outputPath); Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc")); AvroJob.setInputSchema(conf, input_schema); AvroJob.setMapOutputSchema( conf, Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG))); AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA); AvroJob.setMapperClass(conf, AvroRecordMapper.class); AvroJob.setReducerClass(conf, AvroRecordReducer.class); conf.setInputFormat(AvroInputFormat.class); JobClient.runJob(conf); return 0; } public static class AvroRecordMapper extends AvroMapper<GenericRecord, Pair<Utf8, Long>> { @Override public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter) throws IOException { Pair<Utf8, Long> p = new Pair<Utf8, Long>(PAIR_SCHEMA); Utf8 shape = (Utf8) in.get("shape"); if (shape != null) { p.set(shape, 1L); collector.collect(p); } } } public static class AvroRecordReducer extends AvroReducer<Utf8, Long, GenericRecord> { public void reduce( Utf8 key, Iterable<Long> values, AvroCollector<GenericRecord> collector, Reporter reporter) throws IOException { long sum = 0; for (Long val : values) { sum += val; } GenericRecord value = new GenericData.Record(OUTPUT_SCHEMA); 
value.put("shape", key); value.put("count", sum); collector.collect(value); } } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new AvroMR(), args); System.exit(res); } }
/**
 * Runs one iteration of the page summary step.
 *
 * <p>If the step is already finished, the unforwarded counts are loaded and nothing is run.
 * Otherwise the step is reset and a job is configured: iteration 0 reads {@code <page>} XML
 * elements with {@code InitialMapper}; later iterations read the previous iteration's
 * "pageSummary_N" Avro output with {@code SubsequentMapper}.
 *
 * @param args arguments passed through to {@code DumpExtractor2.configureJob}
 * @return 0 when the step was already finished or the job succeeded
 * @throws UncompletedStepException if the job finished in any state other than SUCCEEDED
 * @throws IOException propagated from job configuration or submission
 */
@Override
public int run(String[] args) throws UncompletedStepException, IOException {
  if (isFinished()) {
    loadUnforwardedCounts();
    return 0;
  }
  reset();

  JobConf job = new JobConf(PageSummaryStep.class);
  DumpExtractor2.configureJob(job, args);
  job.setJobName("WM: page summary (" + getIteration() + ")");

  boolean firstIteration = getIteration() == 0;
  if (firstIteration) {
    // First pass: parse raw <page> elements from the XML dump.
    job.setMapperClass(InitialMapper.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(AvroValue.class);

    job.setInputFormat(XmlInputFormat.class);
    job.set(XmlInputFormat.START_TAG_KEY, "<page>");
    job.set(XmlInputFormat.END_TAG_KEY, "</page>");
    FileInputFormat.setInputPaths(job, job.get(DumpExtractor.KEY_INPUT_FILE));

    DistributedCache.addCacheFile(
        new Path(job.get(DumpExtractor.KEY_SENTENCE_MODEL)).toUri(), job);
  } else {
    // Later passes: consume the Avro pairs written by the previous iteration.
    AvroJob.setMapperClass(job, SubsequentMapper.class);
    AvroJob.setInputSchema(
        job, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));
    FileInputFormat.setInputPaths(
        job, getWorkingDir() + Path.SEPARATOR + "pageSummary_" + (getIteration() - 1));
  }

  // Side files added to the distributed cache for every iteration.
  DistributedCache.addCacheFile(
      new Path(job.get(DumpExtractor.KEY_OUTPUT_DIR) + "/" + DumpExtractor.OUTPUT_SITEINFO)
          .toUri(),
      job);
  DistributedCache.addCacheFile(new Path(job.get(DumpExtractor.KEY_LANG_FILE)).toUri(), job);

  AvroJob.setCombinerClass(job, Combiner.class);
  AvroJob.setReducerClass(job, Reducer.class);
  AvroJob.setOutputSchema(
      job, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));
  FileOutputFormat.setOutputPath(job, getDir());

  RunningJob completed = JobClient.runJob(job);
  if (completed.getJobState() != JobStatus.SUCCEEDED) {
    throw new UncompletedStepException();
  }
  finish(completed);
  return 0;
}