@Override
  @SuppressWarnings({"unchecked", "deprecation"})
  public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
    Job job = new Job(conf);
    Format format = dataset.getDescriptor().getFormat();

    if (setInputPaths(jobContext, job)) {
      if (Formats.AVRO.equals(format)) {
        AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
        AvroKeyInputFormat<E> delegate = new AvroKeyInputFormat<E>();
        return delegate.getSplits(jobContext);
      } else if (Formats.PARQUET.equals(format)) {
        // TODO: use later version of parquet (with https://github.com/Parquet/parquet-mr/pull/282)
        // so we can set the schema correctly
        // AvroParquetInputFormat.setReadSchema(job, view.getDescriptor().getSchema());
        AvroParquetInputFormat delegate = new AvroParquetInputFormat();
        return delegate.getSplits(jobContext);
      } else if (Formats.JSON.equals(format)) {
        return new JSONInputFormat().getSplits(jobContext);
      } else if (Formats.CSV.equals(format)) {
        // this generates an unchecked cast exception?
        return new CSVInputFormat().getSplits(jobContext);
      } else if (Formats.INPUTFORMAT.equals(format)) {
        return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
            .getSplits(jobContext);
      } else {
        throw new UnsupportedOperationException("Not a supported format: " + format);
      }
    } else {
      return ImmutableList.of();
    }
  }
Esempio n. 2
0
  public int run(String[] args) throws Exception {
    Path inputPath = new Path("weblog_entries.txt");
    Path outputPath = new Path("output");
    Schema schema = ReflectData.get().getSchema(WeblogRecord.class);
    Configuration conf = getConf();

    Job weblogJob = Job.getInstance(conf);
    weblogJob.setJobName("Avro Writer");
    weblogJob.setJarByClass(getClass());

    weblogJob.setNumReduceTasks(0);
    weblogJob.setMapperClass(WeblogMapper_Ex_5.class);
    weblogJob.setMapOutputKeyClass(AvroWrapper.class);
    weblogJob.setMapOutputValueClass(NullWritable.class);

    weblogJob.setInputFormatClass(TextInputFormat.class);

    AvroJob.setOutputKeySchema(weblogJob, schema);

    FileInputFormat.setInputPaths(weblogJob, inputPath);
    FileOutputFormat.setOutputPath(weblogJob, outputPath);

    if (weblogJob.waitForCompletion(true)) {
      return 0;
    }
    return 1;
  }
Esempio n. 3
0
  private void initialiseInput(final Job job, final AddElementsFromHdfs operation)
      throws IOException {
    if (null == avroSchemaFilePath) {
      throw new IllegalArgumentException("Avro schema file path has not been set");
    }

    final Schema schema = new Parser().parse(new File(avroSchemaFilePath));
    AvroJob.setInputKeySchema(job, schema);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroKeyInputFormat.addInputPath(job, operation.getInputPath());
  }
Esempio n. 4
0
  public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s <comma separated paths> <output path>\n", this.getClass().getName());
      return -1;
    }

    Job job = Job.getInstance();
    job.setJobName("PasmJoin");
    job.setJarByClass(PsamXY.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(AvroValue.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(PsamXYMapper.class);
    job.setReducerClass(PsamXYReducer.class);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);

    FileInputFormat.setInputPaths(job, args[0]);
    Path output = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, output);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    AvroJob.setOutputKeySchema(job, outputSchema);
    AvroJob.setMapOutputValueSchema(job, outputSchema);

    // DistributedCache.addCacheFile(new Path("BM_TERM_TYPE_DMT.avro").toUri(),
    // job.getConfiguration());

    job.setNumReduceTasks(1);
    job.submit();

    job.waitForCompletion(true);
    return 0;
  }
  @Test
  public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file as the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader =
        new SpecificDatumReader<GenericRecord>(
            AvroKeyValue.getSchema(
                Schema.create(Schema.Type.STRING),
                Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader =
        new DataFileReader<GenericRecord>(avroFile, datumReader);
    assertTrue(avroFileReader.hasNext());

    AvroKeyValue<CharSequence, List<Integer>> appleRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord =
        new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
  }