/**
   * Configures and runs the "UFO count" Avro MapReduce job.
   *
   * <p>Exactly two arguments must remain once {@link GenericOptionsParser} has consumed the
   * generic Hadoop options: the input path and the output path. The output path is deleted
   * before the job is submitted.
   *
   * @param args command-line arguments
   * @return 0 once the job has been run, or 2 if the remaining arguments are not exactly
   *     {@code <in> <out>}
   * @throws Exception if the {@code ufo.avsc} resource is missing, or if configuring or running
   *     the job fails
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      // Report the usage error through the return value rather than System.exit, so a caller
      // that embeds this Tool is not terminated along with it.
      return 2;
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    // Explicit recursive delete; the one-argument delete(Path) overload is deprecated.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    // Load the input schema from the classpath, always releasing the stream afterwards.
    java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
    if (schemaStream == null) {
      throw new IllegalStateException("Schema resource ufo.avsc not found on the classpath");
    }
    Schema inputSchema;
    try {
      inputSchema = Schema.parse(schemaStream);
    } finally {
      schemaStream.close();
    }
    AvroJob.setInputSchema(conf, inputSchema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
/**
 * Hadoop {@link Tool} that counts records per {@code shape} field using the Avro MapReduce API.
 *
 * <p>The mapper emits a {@code (shape, 1)} pair for every input record that has a non-null
 * {@code shape}; the reducer sums the ones for each shape and writes a record carrying the
 * {@code shape} and its {@code count}.
 */
public class AvroMR extends Configured implements Tool {
  /** Schema of the intermediate (string, long) pairs exchanged between mapper and reducer. */
  public static final Schema PAIR_SCHEMA =
      Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG));

  /**
   * Schema of the job output, derived by reflection from {@code UFORecord}.
   * NOTE(review): the reducer writes fields "shape" and "count", so UFORecord is presumably
   * declared with those two fields - confirm against its definition.
   */
  public static final Schema OUTPUT_SCHEMA = ReflectData.get().getSchema(UFORecord.class);

  /**
   * Configures and runs the "UFO count" job.
   *
   * <p>Exactly two arguments must remain once {@link GenericOptionsParser} has consumed the
   * generic Hadoop options: the input path and the output path. The output path is deleted
   * before the job is submitted.
   *
   * @param args command-line arguments
   * @return 0 once the job has been run, or 2 if the remaining arguments are not exactly
   *     {@code <in> <out>}
   * @throws Exception if the {@code ufo.avsc} resource is missing, or if configuring or running
   *     the job fails
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      // Report the usage error through the return value rather than System.exit, so a caller
      // that embeds this Tool is not terminated along with it; main() still exits with it.
      return 2;
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    // Explicit recursive delete; the one-argument delete(Path) overload is deprecated.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    AvroJob.setInputSchema(conf, loadInputSchema());
    // Reuse the shared constant so the mapper's pairs and the job configuration cannot drift.
    AvroJob.setMapOutputSchema(conf, PAIR_SCHEMA);

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }

  /**
   * Parses the input schema from the {@code ufo.avsc} classpath resource, closing the stream.
   *
   * @return the parsed input schema
   * @throws IOException if the resource cannot be read or closed
   * @throws IllegalStateException if the resource is not on the classpath
   */
  private Schema loadInputSchema() throws IOException {
    java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
    if (schemaStream == null) {
      throw new IllegalStateException("Schema resource ufo.avsc not found on the classpath");
    }
    try {
      return Schema.parse(schemaStream);
    } finally {
      schemaStream.close();
    }
  }

  /** Emits a {@code (shape, 1)} pair for each input record whose {@code shape} is non-null. */
  public static class AvroRecordMapper extends AvroMapper<GenericRecord, Pair<Utf8, Long>> {
    /**
     * Maps one input record to at most one output pair.
     *
     * @param in the input record; its {@code shape} field is read and cast to {@link Utf8}
     * @param collector receives the {@code (shape, 1)} pair
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(GenericRecord in, AvroCollector<Pair<Utf8, Long>> collector, Reporter reporter)
        throws IOException {
      Utf8 shape = (Utf8) in.get("shape");
      // Records without a shape contribute nothing to the counts.
      if (shape == null) {
        return;
      }
      Pair<Utf8, Long> p = new Pair<Utf8, Long>(PAIR_SCHEMA);
      p.set(shape, 1L);
      collector.collect(p);
    }
  }

  /** Sums the per-shape counts and emits one output record per shape. */
  public static class AvroRecordReducer extends AvroReducer<Utf8, Long, GenericRecord> {
    /**
     * Adds up all values for one shape and collects a record with fields {@code shape} and
     * {@code count}.
     *
     * @param key the shape
     * @param values the partial counts for that shape
     * @param collector receives the output record
     * @param reporter unused
     * @throws IOException if the collector fails
     */
    @Override
    public void reduce(
        Utf8 key, Iterable<Long> values, AvroCollector<GenericRecord> collector, Reporter reporter)
        throws IOException {
      long sum = 0;
      for (long val : values) {
        sum += val;
      }

      GenericRecord value = new GenericData.Record(OUTPUT_SCHEMA);
      value.put("shape", key);
      value.put("count", sum);

      collector.collect(value);
    }
  }

  /**
   * Command-line entry point: runs the tool through {@link ToolRunner} and exits with its status.
   *
   * @param args command-line arguments forwarded to {@link #run(String[])}
   * @throws Exception if the tool fails
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new AvroMR(), args);
    System.exit(res);
  }
}
Пример #3
0
  /**
   * Runs one iteration of the page summary job.
   *
   * <p>If the step is already finished, only the unforwarded counts are loaded and 0 is
   * returned. Otherwise the step is reset and a job is configured: iteration 0 uses
   * {@code InitialMapper} over XML input delimited by {@code <page>} / {@code </page>} tags,
   * while later iterations use {@code SubsequentMapper} over the {@code pageSummary_<n-1>}
   * directory under the working directory. The job writes to {@code getDir()}.
   *
   * @param args arguments handed to {@code DumpExtractor2.configureJob}
   * @return 0 when the step was already finished or the job succeeded
   * @throws UncompletedStepException if the job ends in any state other than SUCCEEDED
   * @throws IOException declared for failures while configuring or running the job
   */
  @Override
  public int run(String[] args) throws UncompletedStepException, IOException {

    // Nothing to execute for a step that already completed; just restore its counts.
    if (isFinished()) {
      loadUnforwardedCounts();
      return 0;
    }
    reset();

    JobConf conf = new JobConf(PageSummaryStep.class);
    DumpExtractor2.configureJob(conf, args);
    conf.setJobName("WM: page summary (" + getIteration() + ")");

    if (getIteration() != 0) {
      // Later iterations read the Avro pairs stored under the previous iteration's directory.
      AvroJob.setMapperClass(conf, SubsequentMapper.class);
      AvroJob.setInputSchema(
          conf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

      String previousOutput =
          getWorkingDir() + Path.SEPARATOR + "pageSummary_" + (getIteration() - 1);
      FileInputFormat.setInputPaths(conf, previousOutput);
    } else {
      // The first iteration reads the XML input, split on <page> ... </page>.
      conf.setMapperClass(InitialMapper.class);
      conf.setOutputKeyClass(AvroKey.class);
      conf.setOutputValueClass(AvroValue.class);

      conf.setInputFormat(XmlInputFormat.class);
      conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
      conf.set(XmlInputFormat.END_TAG_KEY, "</page>");

      String inputFile = conf.get(DumpExtractor.KEY_INPUT_FILE);
      FileInputFormat.setInputPaths(conf, inputFile);

      Path sentenceModel = new Path(conf.get(DumpExtractor.KEY_SENTENCE_MODEL));
      DistributedCache.addCacheFile(sentenceModel.toUri(), conf);
    }

    // Cache files registered for every iteration: the site info output and the language file.
    String siteInfoLocation =
        conf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/" + DumpExtractor.OUTPUT_SITEINFO;
    DistributedCache.addCacheFile(new Path(siteInfoLocation).toUri(), conf);

    Path languageFile = new Path(conf.get(DumpExtractor.KEY_LANG_FILE));
    DistributedCache.addCacheFile(languageFile.toUri(), conf);

    AvroJob.setCombinerClass(conf, Combiner.class);
    AvroJob.setReducerClass(conf, Reducer.class);
    AvroJob.setOutputSchema(
        conf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

    FileOutputFormat.setOutputPath(conf, getDir());

    RunningJob job = JobClient.runJob(conf);
    if (job.getJobState() != JobStatus.SUCCEEDED) {
      throw new UncompletedStepException();
    }

    finish(job);
    return 0;
  }