/**
 * Emits a {@code (shape, 1)} pair for each input record whose "shape" field is non-null.
 * Records with a null "shape" produce no output.
 *
 * @param in the input record; its "shape" field is read and cast to {@link Utf8}
 * @param collector receives the emitted pair
 * @param reporter unused here
 * @throws IOException declared by the mapper contract
 */
 @Override
 public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter)
     throws IOException {
   final Pair<Utf8, Long> shapeCount = new Pair<Utf8, Long>(PAIR_SCHEMA);
   final Utf8 shapeName = (Utf8) in.get("shape");
   if (shapeName == null) {
     // Nothing to count for this record.
     return;
   }
   shapeCount.set(shapeName, 1L);
   collector.collect(shapeCount);
 }
  /**
   * Configures and runs the "UFO count" Avro MapReduce job.
   *
   * <p>After generic Hadoop options are stripped, exactly two arguments are expected: the input
   * path and the output path. The output path is deleted (recursively) before the job is
   * submitted. The input schema is read from the {@code ufo.avsc} classpath resource.
   *
   * @param args command-line arguments, possibly including generic Hadoop options
   * @return 0 after the job has been run, or 2 when the argument count is wrong
   * @throws Exception propagated from schema loading, file-system access or job execution
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      // Hand the failure code back to the caller rather than terminating the JVM from inside
      // a Tool; the launcher decides what to do with it.
      return 2;
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    // Explicit recursive delete; the single-argument overload is deprecated.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    // Load the input schema, failing with a clear message if the resource is missing and
    // always releasing the stream afterwards.
    java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
    if (schemaStream == null) {
      throw new java.io.FileNotFoundException("Schema resource ufo.avsc not found on classpath");
    }
    Schema inputSchema;
    try {
      inputSchema = Schema.parse(schemaStream);
    } finally {
      schemaStream.close();
    }
    AvroJob.setInputSchema(conf, inputSchema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
 /**
  * Initializes the reusable output pair and the key projector from the map output schema stored
  * in the configuration under {@code AvroJob.MAP_OUTPUT_SCHEMA}.
  *
  * @param conf the job configuration; a {@code null} value is ignored
  */
 @Override
 public void setConf(org.apache.hadoop.conf.Configuration conf) {
   // The first call arrives with a null configuration - there is nothing to set up yet.
   if (conf != null) {
     final Schema mapOutputSchema = Schema.parse(conf.get(AvroJob.MAP_OUTPUT_SCHEMA));
     pair = new Pair<Object, Object>(mapOutputSchema);

     // The projector is built from an empty record of the key schema plus that schema's fields.
     final Schema keySchema = Pair.getKeySchema(mapOutputSchema);
     final List<Field> keyFields = keySchema.getFields();
     final GenericRecord keyRecord = new GenericData.Record(keySchema);
     projector = new Projector(keyRecord, keyFields);
   }
 }
Ejemplo n.º 4
0
 /**
  * Checks that the table type built from two string types maps an Avro mapred pair of
  * ("a", "b") to {@code Pair.of("a", "b")}, and that the reverse mapping yields an Avro pair
  * with the same key and value. Key and value are compared separately (see the TODO below).
  */
 @Test
 @SuppressWarnings("rawtypes")
 public void testTableOf() throws Exception {
   AvroType tableType = Avros.tableOf(Avros.strings(), Avros.strings());
   Pair<String, String> expectedPair = Pair.of("a", "b");

   org.apache.avro.mapred.Pair avroPair = new org.apache.avro.mapred.Pair(tableType.getSchema());
   avroPair.put(0, new Utf8("a"));
   avroPair.put(1, new Utf8("b"));

   // TODO update this after resolving the o.a.a.m.Pair.equals issue
   initialize(tableType);
   assertEquals(expectedPair, tableType.getInputMapFn().map(avroPair));

   Object mappedBack = tableType.getOutputMapFn().map(expectedPair);
   org.apache.avro.mapred.Pair converted = (org.apache.avro.mapred.Pair) mappedBack;
   assertEquals(avroPair.key(), converted.key());
   assertEquals(avroPair.value(), converted.value());
 }
 /**
  * Emits the input datum keyed by its projection: the projector derives the key from
  * {@code in}, and the shared {@code pair} instance is filled with (key, in) and collected.
  *
  * @param in the input datum, used both as the projection source and as the pair value
  * @param collector receives the populated pair
  * @param reporter unused here
  * @throws IOException declared by the mapper contract
  */
 public void map(Object in, AvroCollector<Pair<Object, Object>> collector, Reporter reporter)
     throws IOException {
   pair.set(projector.project(in), in);
   collector.collect(pair);
 }
/**
 * MapReduce job over Avro input that counts records per "shape" value.
 *
 * <p>The mapper emits {@code (shape, 1)} pairs, and the reducer sums them into an output record
 * with "shape" and "count" fields, using the schema reflected from {@code UFORecord}.
 */
public class AvroMR extends Configured implements Tool {
  /** Schema of the intermediate (string key, long value) pairs emitted by the mapper. */
  public static final Schema PAIR_SCHEMA =
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG));

  /** Schema of the job output records, obtained by reflection from {@code UFORecord}. */
  public static final Schema OUTPUT_SCHEMA = ReflectData.get().getSchema(UFORecord.class);

  /**
   * Configures and runs the "UFO count" job.
   *
   * <p>After generic Hadoop options are stripped, exactly two arguments are expected: the input
   * path and the output path. The output path is deleted (recursively) before the job is
   * submitted.
   *
   * @param args command-line arguments, possibly including generic Hadoop options
   * @return 0 after the job has been run, or 2 when the argument count is wrong
   * @throws Exception propagated from schema loading, file-system access or job execution
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      // Return the failure code (main passes it to System.exit) instead of terminating the
      // JVM from inside the Tool.
      return 2;
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    // Explicit recursive delete; the single-argument overload is deprecated.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    AvroJob.setInputSchema(conf, loadInputSchema());
    // Reuse the shared constant so the mapper's pairs and the job configuration cannot drift.
    AvroJob.setMapOutputSchema(conf, PAIR_SCHEMA);

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }

  /**
   * Parses the input schema from the {@code ufo.avsc} classpath resource, closing the stream
   * afterwards.
   *
   * @return the parsed input schema
   * @throws IOException if the resource is missing or cannot be read
   */
  private Schema loadInputSchema() throws IOException {
    java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
    if (schemaStream == null) {
      throw new java.io.FileNotFoundException("Schema resource ufo.avsc not found on classpath");
    }
    try {
      return Schema.parse(schemaStream);
    } finally {
      schemaStream.close();
    }
  }

  /** Mapper that emits {@code (shape, 1)} for every record with a non-null "shape" field. */
  public static class AvroRecordMapper extends AvroMapper<GenericRecord, Pair<Utf8, Long>> {
    /**
     * Emits one pair per input record that has a shape; records without one are skipped.
     *
     * @param in the input record; its "shape" field is read and cast to {@link Utf8}
     * @param collector receives the emitted pair
     * @param reporter unused here
     * @throws IOException declared by the mapper contract
     */
    @Override
    public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter)
        throws IOException {
      Pair<Utf8, Long> p = new Pair<Utf8, Long>(PAIR_SCHEMA);
      Utf8 shape = (Utf8) in.get("shape");
      if (shape != null) {
        p.set(shape, 1L);
        collector.collect(p);
      }
    }
  }

  /** Reducer that sums the counts for a shape and emits a single output record. */
  public static class AvroRecordReducer extends AvroReducer<Utf8, Long, GenericRecord> {
    /**
     * Sums all values for {@code key} and collects a record with "shape" and "count" fields.
     *
     * @param key the shape being reduced
     * @param values the partial counts for that shape
     * @param collector receives the aggregated record
     * @param reporter unused here
     * @throws IOException declared by the reducer contract
     */
    @Override
    public void reduce(
        Utf8 key, Iterable<Long> values, AvroCollector<GenericRecord> collector, Reporter reporter)
        throws IOException {
      long sum = 0;
      for (Long val : values) {
        sum += val;
      }

      GenericRecord value = new GenericData.Record(OUTPUT_SCHEMA);
      value.put("shape", key);
      value.put("count", sum);

      collector.collect(value);
    }
  }

  /**
   * Command-line entry point: runs the tool through {@code ToolRunner} and exits with its
   * return code.
   *
   * @param args command-line arguments forwarded to {@link #run(String[])}
   * @throws Exception propagated from the tool
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new AvroMR(), args);
    System.exit(res);
  }
}
Ejemplo n.º 7
0
  /**
   * Runs one iteration of the page-summary job.
   *
   * <p>If the step is already finished, only the unforwarded counts are loaded. Otherwise the
   * step is reset and a job is configured: iteration 0 reads {@code <page>} XML elements with
   * {@code InitialMapper}, while later iterations read the previous iteration's Avro output
   * with {@code SubsequentMapper}.
   *
   * @param args arguments forwarded to {@code DumpExtractor2.configureJob}
   * @return 0 when the step was already finished or the job succeeded
   * @throws UncompletedStepException if the job ran but did not reach the SUCCEEDED state
   * @throws IOException declared for job configuration and execution
   */
  @Override
  public int run(String[] args) throws UncompletedStepException, IOException {

    if (isFinished()) {
      loadUnforwardedCounts();
      return 0;
    }
    reset();

    JobConf conf = new JobConf(PageSummaryStep.class);
    DumpExtractor2.configureJob(conf, args);

    conf.setJobName("WM: page summary (" + getIteration() + ")");

    if (getIteration() != 0) {
      // Later iterations consume the (PageKey, PageDetail) pairs written by the previous one.
      AvroJob.setMapperClass(conf, SubsequentMapper.class);
      AvroJob.setInputSchema(
          conf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

      String previousOutput =
          getWorkingDir() + Path.SEPARATOR + "pageSummary_" + (getIteration() - 1);
      FileInputFormat.setInputPaths(conf, previousOutput);
    } else {
      // First iteration: read <page> elements from the configured input file.
      conf.setMapperClass(InitialMapper.class);

      conf.setOutputKeyClass(AvroKey.class);
      conf.setOutputValueClass(AvroValue.class);

      conf.setInputFormat(XmlInputFormat.class);
      conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
      conf.set(XmlInputFormat.END_TAG_KEY, "</page>");

      String inputFile = conf.get(DumpExtractor.KEY_INPUT_FILE);
      FileInputFormat.setInputPaths(conf, inputFile);

      Path sentenceModel = new Path(conf.get(DumpExtractor.KEY_SENTENCE_MODEL));
      DistributedCache.addCacheFile(sentenceModel.toUri(), conf);
    }

    // Cache files are registered in the same order as before: site info first, then language file.
    Path siteInfo =
        new Path(conf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/" + DumpExtractor.OUTPUT_SITEINFO);
    DistributedCache.addCacheFile(siteInfo.toUri(), conf);

    Path languageFile = new Path(conf.get(DumpExtractor.KEY_LANG_FILE));
    DistributedCache.addCacheFile(languageFile.toUri(), conf);

    AvroJob.setCombinerClass(conf, Combiner.class);
    AvroJob.setReducerClass(conf, Reducer.class);
    AvroJob.setOutputSchema(
        conf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

    FileOutputFormat.setOutputPath(conf, getDir());

    RunningJob runningJob = JobClient.runJob(conf);

    if (runningJob.getJobState() != JobStatus.SUCCEEDED) {
      throw new UncompletedStepException();
    }

    finish(runningJob);
    return 0;
  }