@Override @SuppressWarnings({"unchecked", "deprecation"}) public List<InputSplit> getSplits(JobContext jobContext) throws IOException { Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); Job job = new Job(conf); Format format = dataset.getDescriptor().getFormat(); if (setInputPaths(jobContext, job)) { if (Formats.AVRO.equals(format)) { AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema()); AvroKeyInputFormat<E> delegate = new AvroKeyInputFormat<E>(); return delegate.getSplits(jobContext); } else if (Formats.PARQUET.equals(format)) { // TODO: use later version of parquet (with https://github.com/Parquet/parquet-mr/pull/282) // so we can set the schema correctly // AvroParquetInputFormat.setReadSchema(job, view.getDescriptor().getSchema()); AvroParquetInputFormat delegate = new AvroParquetInputFormat(); return delegate.getSplits(jobContext); } else if (Formats.JSON.equals(format)) { return new JSONInputFormat().getSplits(jobContext); } else if (Formats.CSV.equals(format)) { // this generates an unchecked cast exception? return new CSVInputFormat().getSplits(jobContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor()) .getSplits(jobContext); } else { throw new UnsupportedOperationException("Not a supported format: " + format); } } else { return ImmutableList.of(); } }
/**
 * Configures the job's input side: parses the Avro schema from the file at
 * {@code avroSchemaFilePath}, registers it as the input key schema, selects
 * {@code AvroKeyInputFormat} as the input format and adds the operation's
 * input path.
 *
 * @param job       the job to configure
 * @param operation the operation supplying the input path
 * @throws IOException              if the schema file cannot be parsed or the
 *                                  input path cannot be added
 * @throws IllegalArgumentException if no Avro schema file path has been set
 */
private void initialiseInput(final Job job, final AddElementsFromHdfs operation) throws IOException {
    if (avroSchemaFilePath == null) {
        throw new IllegalArgumentException("Avro schema file path has not been set");
    }

    final File schemaFile = new File(avroSchemaFilePath);
    final Schema inputKeySchema = new Parser().parse(schemaFile);

    AvroJob.setInputKeySchema(job, inputKeySchema);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    AvroKeyInputFormat.addInputPath(job, operation.getInputPath());
}
/**
 * Runs a complete map/reduce job that reads Avro key/value input through
 * {@code CombineAvroKeyValueInputFormat} and verifies the records written to
 * the resulting Avro container file.
 *
 * <p>The expected output is three records, in order: "apple" with values
 * 1, 2 and 3; "banana" with values 1 and 2; "carrot" with value 1.
 *
 * @throws ClassNotFoundException if the job cannot load one of its classes
 * @throws IOException            if the input or output files cannot be accessed
 * @throws InterruptedException   if waiting for job completion is interrupted
 */
@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file has the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(
        AvroKeyValue.getSchema(
            Schema.create(Schema.Type.STRING),
            Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader =
        new DataFileReader<GenericRecord>(avroFile, datumReader);

    // Close the reader even when an assertion fails, so a failing run does not
    // leave the output file open.
    try {
        assertTrue(avroFileReader.hasNext());
        AvroKeyValue<CharSequence, List<Integer>> appleRecord =
            new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
        assertNotNull(appleRecord.get());
        assertEquals("apple", appleRecord.getKey().toString());
        List<Integer> appleDocs = appleRecord.getValue();
        assertEquals(3, appleDocs.size());
        assertTrue(appleDocs.contains(1));
        assertTrue(appleDocs.contains(2));
        assertTrue(appleDocs.contains(3));

        assertTrue(avroFileReader.hasNext());
        AvroKeyValue<CharSequence, List<Integer>> bananaRecord =
            new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
        assertNotNull(bananaRecord.get());
        assertEquals("banana", bananaRecord.getKey().toString());
        List<Integer> bananaDocs = bananaRecord.getValue();
        assertEquals(2, bananaDocs.size());
        assertTrue(bananaDocs.contains(1));
        assertTrue(bananaDocs.contains(2));

        assertTrue(avroFileReader.hasNext());
        AvroKeyValue<CharSequence, List<Integer>> carrotRecord =
            new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
        // Same non-null check as the apple and banana records above.
        assertNotNull(carrotRecord.get());
        assertEquals("carrot", carrotRecord.getKey().toString());
        List<Integer> carrotDocs = carrotRecord.getValue();
        assertEquals(1, carrotDocs.size());
        assertTrue(carrotDocs.contains(1));

        assertFalse(avroFileReader.hasNext());
    } finally {
        avroFileReader.close();
    }
}