Example #1
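Constructor for FileSystemViewKeyInputFormat: it records the dataset and primes the Hadoop Configuration with a reader schema. For Avro it registers the dataset's data model and, when the entity type is a SpecificRecord, stores the schema JSON under AVRO_SCHEMA_INPUT_KEY; for Parquet it hands the schema to AvroReadSupport.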
  public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset, Configuration conf) {
    this.dataset = dataset;
    LOG.debug("Dataset: {}", dataset);

    Format format = dataset.getDescriptor().getFormat();

    boolean isSpecific = SpecificRecord.class.isAssignableFrom(dataset.getType());

    if (Formats.AVRO.equals(format)) {
      setModel.invoke(conf, DataModelUtil.getDataModelForType(dataset.getType()).getClass());

      // Use the reader's schema type if provided.
      if (isSpecific) {
        conf.set(AVRO_SCHEMA_INPUT_KEY, SpecificData.get().getSchema(dataset.getType()).toString());
      }
    } else if (Formats.PARQUET.equals(format)) {
      // Use the reader's schema type if provided.
      if (isSpecific) {
        AvroReadSupport.setAvroReadSchema(conf, SpecificData.get().getSchema(dataset.getType()));
      }
    }
  }
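As a side note, here is a minimal runnable sketch (not Kite code) of the Avro call the isSpecific branch relies on: SpecificData.get().getSchema(...) maps a Java type to an Avro Schema whose JSON is then stored in the Configuration. Integer stands in for the Avro-generated record class a real dataset would supply, and the literal key name is an assumption based on the constant AVRO_SCHEMA_INPUT_KEY.

  import org.apache.avro.Schema;
  import org.apache.avro.specific.SpecificData;
  import org.apache.hadoop.conf.Configuration;

  public class ReaderSchemaSketch {
    public static void main(String[] args) {
      // In real use this is the Avro-generated SpecificRecord class,
      // not Integer; Integer keeps the sketch self-contained.
      Schema schema = SpecificData.get().getSchema(Integer.class);
      Configuration conf = new Configuration();
      conf.set("avro.schema.input.key", schema.toString()); // key name assumed
      System.out.println(conf.get("avro.schema.input.key")); // prints "int"
    }
  }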
Example #2
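Computes input splits by delegating to the InputFormat that matches the dataset's format (Avro, Parquet, JSON, CSV, or a user-supplied INPUTFORMAT), and returns an empty list when no input paths are set.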
  @Override
  @SuppressWarnings({"unchecked", "deprecation"})
  public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
    Job job = new Job(conf);
    Format format = dataset.getDescriptor().getFormat();

    if (setInputPaths(jobContext, job)) {
      if (Formats.AVRO.equals(format)) {
        AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
        AvroKeyInputFormat<E> delegate = new AvroKeyInputFormat<E>();
        return delegate.getSplits(jobContext);
      } else if (Formats.PARQUET.equals(format)) {
        // TODO: use later version of parquet (with https://github.com/Parquet/parquet-mr/pull/282)
        // so we can set the schema correctly
        // AvroParquetInputFormat.setReadSchema(job, view.getDescriptor().getSchema());
        AvroParquetInputFormat delegate = new AvroParquetInputFormat();
        return delegate.getSplits(jobContext);
      } else if (Formats.JSON.equals(format)) {
        return new JSONInputFormat().getSplits(jobContext);
      } else if (Formats.CSV.equals(format)) {
        // this generates an unchecked cast warning, suppressed by the annotation above
        return new CSVInputFormat().getSplits(jobContext);
      } else if (Formats.INPUTFORMAT.equals(format)) {
        return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
            .getSplits(jobContext);
      } else {
        throw new UnsupportedOperationException("Not a supported format: " + format);
      }
    } else {
      return ImmutableList.of();
    }
  }
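For comparison, a hedged sketch of what the AVRO branch boils down to, using only public Avro and Hadoop MapReduce APIs rather than Kite's wrappers; Job.getInstance replaces the deprecated new Job(conf) that the @SuppressWarnings above papers over.

  import java.io.IOException;
  import org.apache.avro.Schema;
  import org.apache.avro.mapreduce.AvroJob;
  import org.apache.avro.mapreduce.AvroKeyInputFormat;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

  public class AvroSplitsSketch {
    // Configure a job so AvroKeyInputFormat can compute splits for one path.
    public static Job configure(Configuration conf, Schema readerSchema, Path input)
        throws IOException {
      Job job = Job.getInstance(conf); // non-deprecated form of new Job(conf)
      FileInputFormat.addInputPath(job, input);
      AvroJob.setInputKeySchema(job, readerSchema);
      job.setInputFormatClass(AvroKeyInputFormat.class);
      return job;
    }
  }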
Example #3
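Builds an unfiltered RecordReader for the dataset's format: the Avro and Parquet readers are wrapped, while the JSON and CSV delegates are first configured with the descriptor and entity type.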
 @SuppressWarnings("unchecked")
 private RecordReader<E, Void> createUnfilteredRecordReader(
     InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
     throws IOException, InterruptedException {
   Format format = dataset.getDescriptor().getFormat();
   if (Formats.AVRO.equals(format)) {
     return new AvroKeyReaderWrapper(new AvroKeyInputFormat<E>());
   } else if (Formats.PARQUET.equals(format)) {
     return new ValueReaderWrapper(new AvroParquetInputFormat());
   } else if (Formats.JSON.equals(format)) {
     JSONInputFormat<E> delegate = new JSONInputFormat<E>();
     delegate.setDescriptor(dataset.getDescriptor());
     delegate.setType(dataset.getType());
     return delegate.createRecordReader(inputSplit, taskAttemptContext);
   } else if (Formats.CSV.equals(format)) {
     CSVInputFormat<E> delegate = new CSVInputFormat<E>();
     delegate.setDescriptor(dataset.getDescriptor());
     delegate.setType(dataset.getType());
     return delegate.createRecordReader(inputSplit, taskAttemptContext);
   } else if (Formats.INPUTFORMAT.equals(format)) {
     return InputFormatUtil.newRecordReader(dataset.getDescriptor());
   } else {
     throw new UnsupportedOperationException("Not a supported format: " + format);
   }
 }
Example #4
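Appender factory keyed on the descriptor's format. Parquet durability is toggled by a descriptor property, CSV must be explicitly allowed, and any other format moves the writer into the ERROR state.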
 @VisibleForTesting
 @SuppressWarnings("unchecked")
 <E> FileAppender<E> newAppender(Path temp) {
   Format format = descriptor.getFormat();
   if (Formats.PARQUET.equals(format)) {
     // by default, Parquet is not durable
     if (DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
       return (FileAppender<E>)
           new DurableParquetAppender(
               fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
     } else {
       return (FileAppender<E>)
           new ParquetAppender(
               fs, temp, descriptor.getSchema(), conf, descriptor.getCompressionType());
     }
   } else if (Formats.AVRO.equals(format)) {
     return new AvroAppender<E>(fs, temp, descriptor.getSchema(), descriptor.getCompressionType());
   } else if (Formats.CSV.equals(format)
       && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
     return new CSVAppender<E>(fs, temp, descriptor);
   } else {
     this.state = ReaderWriterState.ERROR;
      throw new UnknownFormatException("Unknown format: " + descriptor);
   }
 }
Example #5
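Chooses between an IncrementalWriter and a plain FileSystemWriter based on the format and the Parquet durability property.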
 static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path, DatasetDescriptor descriptor) {
   Format format = descriptor.getFormat();
   if (Formats.PARQUET.equals(format)) {
     // by default, Parquet is not durable
     if (DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
       return new IncrementalWriter<E>(fs, path, descriptor);
     } else {
       return new FileSystemWriter<E>(fs, path, descriptor);
     }
   } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
     return new IncrementalWriter<E>(fs, path, descriptor);
   } else {
     return new FileSystemWriter<E>(fs, path, descriptor);
   }
 }
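The durability toggle rides on Kite's DescriptorUtil.isDisabled/isEnabled helpers, which are not shown in these examples. A hypothetical stand-in capturing the semantics the branches imply (explicitly "false" vs. explicitly "true", with unset properties matching neither) might look like this:

  import java.util.Map;

  final class PropertyGate {
    // true only when the property is explicitly set to "false"
    static boolean isDisabled(String prop, Map<String, String> props) {
      return "false".equals(props.get(prop));
    }

    // true only when the property is explicitly set to "true"
    static boolean isEnabled(String prop, Map<String, String> props) {
      return "true".equals(props.get(prop));
    }
  }

Under these assumed semantics, an unset Parquet property selects the plain FileSystemWriter here and the non-durable writer in Example #6, while Example #7's appender defaults to the durable path.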
Example #6
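The same selection logic for partitioned writers: Avro, CSV, and durable Parquet get an incremental writer; everything else gets the non-durable variant.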
 static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
   DatasetDescriptor descriptor = view.getDataset().getDescriptor();
   Format format = descriptor.getFormat();
   if (Formats.PARQUET.equals(format)) {
     // by default, Parquet is not durable
     if (DescriptorUtil.isDisabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
       return new IncrementalPartitionedDatasetWriter<E>(view);
     } else {
       return new NonDurablePartitionedDatasetWriter<E>(view);
     }
   } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
     return new IncrementalPartitionedDatasetWriter<E>(view);
   } else {
     return new NonDurablePartitionedDatasetWriter<E>(view);
   }
 }
Example #7
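A variant of the appender factory from Example #4 in which Parquet writes are durable unless the non-durable property is explicitly enabled, and failures surface as a DatasetWriterException.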
 @SuppressWarnings("unchecked")
 private <E> FileAppender<E> newAppender(Path temp) {
   Format format = descriptor.getFormat();
   if (Formats.PARQUET.equals(format)) {
     // by default, guarantee durability with the more costly writer
     if (DescriptorUtil.isEnabled(FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
       return (FileAppender<E>) new ParquetAppender(fs, temp, descriptor.getSchema(), conf, true);
     } else {
       return (FileAppender<E>)
           new DurableParquetAppender(fs, temp, descriptor.getSchema(), conf, true);
     }
   } else if (Formats.AVRO.equals(format)) {
     return new AvroAppender<E>(fs, temp, descriptor.getSchema(), true);
   } else if (Formats.CSV.equals(format)
       && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
     return new CSVAppender<E>(fs, temp, descriptor);
   } else {
     this.state = ReaderWriterState.ERROR;
      throw new DatasetWriterException("Unknown format: " + descriptor);
   }
 }