public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset,
                                    Configuration conf) {
  this.dataset = dataset;
  LOG.debug("Dataset: {}", dataset);

  Format format = dataset.getDescriptor().getFormat();
  boolean isSpecific = SpecificRecord.class.isAssignableFrom(dataset.getType());

  if (Formats.AVRO.equals(format)) {
    setModel.invoke(conf,
        DataModelUtil.getDataModelForType(dataset.getType()).getClass());
    // Use the reader's schema type if provided.
    if (isSpecific) {
      conf.set(AVRO_SCHEMA_INPUT_KEY,
          SpecificData.get().getSchema(dataset.getType()).toString());
    }
  } else if (Formats.PARQUET.equals(format)) {
    // Use the reader's schema type if provided.
    if (isSpecific) {
      AvroReadSupport.setAvroReadSchema(conf,
          SpecificData.get().getSchema(dataset.getType()));
    }
  }
}
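// Illustrative sketch (not part of the class above): for a SpecificRecord
// type, the reader schema comes from the generated class via SpecificData.
// "MyEvent" is a hypothetical Avro-generated class used only for illustration.
private static void sketchReaderSchema(Job job) {
  Schema readerSchema = SpecificData.get().getSchema(MyEvent.class);
  // equivalent in effect to the conf.set(AVRO_SCHEMA_INPUT_KEY, ...) call above
  AvroJob.setInputKeySchema(job, readerSchema);
}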
@Override
@SuppressWarnings({"unchecked", "deprecation"})
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  Job job = new Job(conf);
  Format format = dataset.getDescriptor().getFormat();

  if (setInputPaths(jobContext, job)) {
    if (Formats.AVRO.equals(format)) {
      AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
      AvroKeyInputFormat<E> delegate = new AvroKeyInputFormat<E>();
      return delegate.getSplits(jobContext);
    } else if (Formats.PARQUET.equals(format)) {
      // TODO: use a later version of parquet (with
      // https://github.com/Parquet/parquet-mr/pull/282) so we can set the
      // schema correctly:
      // AvroParquetInputFormat.setReadSchema(job, view.getDescriptor().getSchema());
      AvroParquetInputFormat delegate = new AvroParquetInputFormat();
      return delegate.getSplits(jobContext);
    } else if (Formats.JSON.equals(format)) {
      return new JSONInputFormat().getSplits(jobContext);
    } else if (Formats.CSV.equals(format)) {
      // this produces an unchecked cast warning (suppressed above)
      return new CSVInputFormat().getSplits(jobContext);
    } else if (Formats.INPUTFORMAT.equals(format)) {
      return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
          .getSplits(jobContext);
    } else {
      throw new UnsupportedOperationException("Not a supported format: " + format);
    }
  } else {
    return ImmutableList.of();
  }
}
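// Illustrative sketch, assuming Kite's public MapReduce entry point
// (DatasetKeyInputFormat) is what ultimately drives the format dispatch in
// getSplits() above; the dataset URI is a placeholder.
private static Job sketchReadJob(Configuration conf) throws IOException {
  Job job = Job.getInstance(conf, "read-kite-dataset");
  DatasetKeyInputFormat.configure(job)
      .readFrom("dataset:hdfs:/tmp/data/events");
  return job;
}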
@SuppressWarnings("unchecked") private RecordReader<E, Void> createUnfilteredRecordReader( InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { Format format = dataset.getDescriptor().getFormat(); if (Formats.AVRO.equals(format)) { return new AvroKeyReaderWrapper(new AvroKeyInputFormat<E>()); } else if (Formats.PARQUET.equals(format)) { return new ValueReaderWrapper(new AvroParquetInputFormat()); } else if (Formats.JSON.equals(format)) { JSONInputFormat<E> delegate = new JSONInputFormat<E>(); delegate.setDescriptor(dataset.getDescriptor()); delegate.setType(dataset.getType()); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.CSV.equals(format)) { CSVInputFormat<E> delegate = new CSVInputFormat<E>(); delegate.setDescriptor(dataset.getDescriptor()); delegate.setType(dataset.getType()); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newRecordReader(dataset.getDescriptor()); } else { throw new UnsupportedOperationException("Not a supported format: " + format); } }
@VisibleForTesting
@SuppressWarnings("unchecked")
<E> FileAppender<E> newAppender(Path temp) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return (FileAppender<E>) new DurableParquetAppender(
          fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
    } else {
      return (FileAppender<E>) new ParquetAppender(
          fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
    }
  } else if (Formats.AVRO.equals(format)) {
    return new AvroAppender<E>(
        fs, temp, descriptor.getSchema(), descriptor.getCompressionType());
  } else if (Formats.CSV.equals(format) &&
      DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
    return new CSVAppender<E>(fs, temp, descriptor);
  } else {
    this.state = ReaderWriterState.ERROR;
    throw new UnknownFormatException("Unknown format " + descriptor);
  }
}
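// Illustrative sketch: the CSV branch above only fires when the descriptor
// opts in through FileSystemProperties.ALLOW_CSV_PROP. "user.avsc" is a
// placeholder schema resource, and the "true" value assumes that is the
// literal DescriptorUtil.isEnabled checks for.
private static DatasetDescriptor sketchCsvDescriptor() throws IOException {
  return new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.CSV)
      .property(FileSystemProperties.ALLOW_CSV_PROP, "true")
      .build();
}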
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
                                         DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(fs, path, descriptor);
    } else {
      return new FileSystemWriter<E>(fs, path, descriptor);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(fs, path, descriptor);
  } else {
    return new FileSystemWriter<E>(fs, path, descriptor);
  }
}
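// Illustrative sketch: explicitly disabling NON_DURABLE_PARQUET_PROP is what
// steers the Parquet branch above to the durable IncrementalWriter; the
// "false" value assumes that is the literal DescriptorUtil.isDisabled checks
// for.
private static DatasetDescriptor sketchDurableParquetDescriptor(Schema schema) {
  return new DatasetDescriptor.Builder()
      .schema(schema)
      .format(Formats.PARQUET)
      .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
      .build();
}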
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
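// Illustrative sketch: the partitioned writers above apply when the dataset
// carries a partition strategy, e.g. hashing a hypothetical "id" field into
// 16 buckets.
private static DatasetDescriptor sketchPartitionedDescriptor(Schema schema) {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .hash("id", 16)
      .build();
  return new DatasetDescriptor.Builder()
      .schema(schema)
      .format(Formats.AVRO)
      .partitionStrategy(strategy)
      .build();
}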
@SuppressWarnings("unchecked") private <E> FileAppender<E> newAppender(Path temp) { Format format = descriptor.getFormat(); if (Formats.PARQUET.equals(format)) { // by default, guarantee durability with the more costly writer if (DescriptorUtil.isEnabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) { return (FileAppender<E>) new ParquetAppender(fs, temp, descriptor.getSchema(), conf, true); } else { return (FileAppender<E>) new DurableParquetAppender(fs, temp, descriptor.getSchema(), conf, true); } } else if (Formats.AVRO.equals(format)) { return new AvroAppender<E>(fs, temp, descriptor.getSchema(), true); } else if (Formats.CSV.equals(format) && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) { return new CSVAppender<E>(fs, temp, descriptor); } else { this.state = ReaderWriterState.ERROR; throw new DatasetWriterException("Unknown format " + descriptor); } }