@Override
@SuppressWarnings({"unchecked", "deprecation"})
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  Job job = new Job(conf);
  Format format = dataset.getDescriptor().getFormat();
  if (setInputPaths(jobContext, job)) {
    if (Formats.AVRO.equals(format)) {
      AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
      AvroKeyInputFormat<E> delegate = new AvroKeyInputFormat<E>();
      return delegate.getSplits(jobContext);
    } else if (Formats.PARQUET.equals(format)) {
      // TODO: use a later version of parquet (with https://github.com/Parquet/parquet-mr/pull/282)
      // so we can set the schema correctly
      // AvroParquetInputFormat.setReadSchema(job, view.getDescriptor().getSchema());
      AvroParquetInputFormat delegate = new AvroParquetInputFormat();
      return delegate.getSplits(jobContext);
    } else if (Formats.JSON.equals(format)) {
      return new JSONInputFormat().getSplits(jobContext);
    } else if (Formats.CSV.equals(format)) {
      // this generates an unchecked cast exception?
      return new CSVInputFormat().getSplits(jobContext);
    } else if (Formats.INPUTFORMAT.equals(format)) {
      return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
          .getSplits(jobContext);
    } else {
      throw new UnsupportedOperationException("Not a supported format: " + format);
    }
  } else {
    return ImmutableList.of();
  }
}
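The AVRO branch above is the common path: register the reader schema with AvroJob.setInputKeySchema, then delegate split computation to AvroKeyInputFormat. A minimal standalone sketch of that wiring follows; the class name, method, and input path are illustrative assumptions, not part of the snippet.

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class AvroInputSketch {
  // Hypothetical driver fragment: configure a job to read Avro container files
  // with an explicit reader schema, as the AVRO branch above does internally.
  static Job configure(Configuration conf, Schema readerSchema) throws Exception {
    Job job = Job.getInstance(conf);
    AvroJob.setInputKeySchema(job, readerSchema);      // mapper will receive AvroKey<T>
    job.setInputFormatClass(AvroKeyInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/data/events")); // illustrative path
    return job;
  }
}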
public int run(String[] args) throws Exception {
  Path inputPath = new Path("weblog_entries.txt");
  Path outputPath = new Path("output");
  Schema schema = ReflectData.get().getSchema(WeblogRecord.class);
  Configuration conf = getConf();
  Job weblogJob = Job.getInstance(conf);
  weblogJob.setJobName("Avro Writer");
  weblogJob.setJarByClass(getClass());
  weblogJob.setNumReduceTasks(0);
  weblogJob.setMapperClass(WeblogMapper_Ex_5.class);
  weblogJob.setMapOutputKeyClass(AvroWrapper.class);
  weblogJob.setMapOutputValueClass(NullWritable.class);
  weblogJob.setInputFormatClass(TextInputFormat.class);
  AvroJob.setOutputKeySchema(weblogJob, schema);
  FileInputFormat.setInputPaths(weblogJob, inputPath);
  FileOutputFormat.setOutputPath(weblogJob, outputPath);
  if (weblogJob.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
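WeblogMapper_Ex_5 is not shown. Given the job wiring (map-only, AvroWrapper keys, reflect schema), a plausible shape is the hedged sketch below; the tab-separated layout and the setUrl/setIp setters are invented for illustration, since the real WeblogRecord fields do not appear in the snippet.

import java.io.IOException;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hedged sketch: parse each text line into a WeblogRecord and emit it as an Avro key.
public class WeblogMapperSketch
    extends Mapper<LongWritable, Text, AvroWrapper<WeblogRecord>, NullWritable> {

  private final WeblogRecord record = new WeblogRecord();

  @Override
  protected void map(LongWritable offset, Text line, Context context)
      throws IOException, InterruptedException {
    // Assumed tab-separated layout; the actual WeblogRecord fields are not shown above.
    String[] fields = line.toString().split("\t");
    record.setUrl(fields[0]); // hypothetical setter
    record.setIp(fields[1]);  // hypothetical setter
    context.write(new AvroWrapper<WeblogRecord>(record), NullWritable.get());
  }
}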
private void initialiseInput(final Job job, final AddElementsFromHdfs operation) throws IOException {
  if (null == avroSchemaFilePath) {
    throw new IllegalArgumentException("Avro schema file path has not been set");
  }
  final Schema schema = new Parser().parse(new File(avroSchemaFilePath));
  AvroJob.setInputKeySchema(job, schema);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  AvroKeyInputFormat.addInputPath(job, operation.getInputPath());
}
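With AvroKeyInputFormat configured this way, the downstream mapper receives each Avro record as an AvroKey with a NullWritable value. A minimal sketch of that consumer side; the conversion of records into elements belongs to the surrounding codebase and is elided, and the output types and "id" field name are assumptions.

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch: AvroKeyInputFormat delivers one AvroKey<GenericRecord> per record,
// paired with NullWritable values.
public class AvroElementMapperSketch
    extends Mapper<AvroKey<GenericRecord>, NullWritable, Text, NullWritable> {

  @Override
  protected void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
      throws IOException, InterruptedException {
    GenericRecord record = key.datum();
    // Real code would build a domain object from the record here; this just
    // forwards one field for illustration ("id" is an assumed field name).
    context.write(new Text(String.valueOf(record.get("id"))), NullWritable.get());
  }
}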
public int run(String[] args) throws Exception {
  Configuration conf = new Configuration();
  if (args.length != 2) {
    System.err.printf("Usage: %s <comma separated paths> <output path>\n",
        this.getClass().getName());
    return -1;
  }
  Job job = Job.getInstance(conf);
  job.setJobName("PasmJoin");
  job.setJarByClass(PsamXY.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(AvroValue.class);
  job.setOutputKeyClass(AvroKey.class);
  job.setOutputValueClass(NullWritable.class);
  job.setMapperClass(PsamXYMapper.class);
  job.setReducerClass(PsamXYReducer.class);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  FileInputFormat.setInputPaths(job, args[0]);
  Path output = new Path(args[1]);
  FileOutputFormat.setOutputPath(job, output);
  // Remove any stale output from a previous run.
  FileSystem fs = FileSystem.get(conf);
  fs.delete(output, true);
  AvroJob.setOutputKeySchema(job, outputSchema);
  AvroJob.setMapOutputValueSchema(job, outputSchema);
  // DistributedCache.addCacheFile(new Path("BM_TERM_TYPE_DMT.avro").toUri(),
  //     job.getConfiguration());
  job.setNumReduceTasks(1);
  return job.waitForCompletion(true) ? 0 : 1;
}
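outputSchema, PsamXYMapper, and PsamXYReducer are defined elsewhere. Given the declared job types (Text/AvroValue in, AvroKey/NullWritable out), the reducer would have roughly the following shape; this is a hedged sketch with the actual join logic elided.

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hedged sketch matching the job wiring above; the real PsamXYReducer
// presumably joins the grouped records rather than passing them through.
public class PsamXYReducerSketch
    extends Reducer<Text, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable> {

  @Override
  protected void reduce(Text key, Iterable<AvroValue<GenericRecord>> values, Context context)
      throws IOException, InterruptedException {
    for (AvroValue<GenericRecord> value : values) {
      context.write(new AvroKey<GenericRecord>(value.datum()), NullWritable.get());
    }
  }
}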
@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
  // Create a test input file.
  File inputFile = createInputFile();

  // Configure the job input.
  Job job = new Job();
  FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
  job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
  AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
  AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

  // Configure a mapper.
  job.setMapperClass(IndexMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);

  // Configure a reducer.
  job.setReducerClass(IndexReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(AvroValue.class);
  AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

  // Configure the output format.
  job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
  Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
  FileOutputFormat.setOutputPath(job, outputPath);

  // Run the job.
  assertTrue(job.waitForCompletion(true));

  // Verify that the output Avro container file has the expected data.
  File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
  DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(
      AvroKeyValue.getSchema(
          Schema.create(Schema.Type.STRING),
          Schema.createArray(Schema.create(Schema.Type.INT))));
  DataFileReader<GenericRecord> avroFileReader =
      new DataFileReader<GenericRecord>(avroFile, datumReader);

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> appleRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertNotNull(appleRecord.get());
  assertEquals("apple", appleRecord.getKey().toString());
  List<Integer> appleDocs = appleRecord.getValue();
  assertEquals(3, appleDocs.size());
  assertTrue(appleDocs.contains(1));
  assertTrue(appleDocs.contains(2));
  assertTrue(appleDocs.contains(3));

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> bananaRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertNotNull(bananaRecord.get());
  assertEquals("banana", bananaRecord.getKey().toString());
  List<Integer> bananaDocs = bananaRecord.getValue();
  assertEquals(2, bananaDocs.size());
  assertTrue(bananaDocs.contains(1));
  assertTrue(bananaDocs.contains(2));

  assertTrue(avroFileReader.hasNext());
  AvroKeyValue<CharSequence, List<Integer>> carrotRecord =
      new AvroKeyValue<CharSequence, List<Integer>>(avroFileReader.next());
  assertEquals("carrot", carrotRecord.getKey().toString());
  List<Integer> carrotDocs = carrotRecord.getValue();
  assertEquals(1, carrotDocs.size());
  assertTrue(carrotDocs.contains(1));

  assertFalse(avroFileReader.hasNext());
  avroFileReader.close();
}
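IndexMapper and IndexReducer are defined elsewhere in the test suite. Given the configured schemas (int doc-id keys and string line values in; word keys and int-array values out), they would look roughly like this hedged sketch; the whitespace tokenization is an assumption.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch: emit (word, docid) for every word in a document line.
public class IndexMapperSketch
    extends Mapper<AvroKey<Integer>, AvroValue<CharSequence>, Text, IntWritable> {
  @Override
  protected void map(AvroKey<Integer> docid, AvroValue<CharSequence> body, Context context)
      throws IOException, InterruptedException {
    for (String token : body.datum().toString().split(" ")) {
      context.write(new Text(token), new IntWritable(docid.datum()));
    }
  }
}

// Sketch: collect the document ids for each word into an Avro array value,
// matching the array-of-int output value schema set on the job.
class IndexReducerSketch
    extends Reducer<Text, IntWritable, Text, AvroValue<List<Integer>>> {
  @Override
  protected void reduce(Text word, Iterable<IntWritable> docids, Context context)
      throws IOException, InterruptedException {
    List<Integer> hits = new ArrayList<Integer>();
    for (IntWritable docid : docids) {
      hits.add(docid.get());
    }
    context.write(word, new AvroValue<List<Integer>>(hits));
  }
}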