Example #1
@Test
 public void testBufferSizeFor1000Col() throws IOException {
   ObjectInspector inspector;
   synchronized (TestOrcFile.class) {
     inspector =
         ObjectInspectorFactory.getReflectionObjectInspector(
             Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
   }
   int bufferSize = 128 * 1024;
   String columns = getRandomColumnNames(1000);
   // for testing only: set the column names manually in the conf
   conf.set(IOConstants.COLUMNS, columns);
   Writer writer =
       OrcFile.createWriter(
           testFilePath,
           OrcFile.writerOptions(conf)
               .inspector(inspector)
               .stripeSize(100000)
               .compress(CompressionKind.NONE)
               .bufferSize(bufferSize));
   final int newBufferSize;
   if (writer instanceof WriterImpl) {
     WriterImpl orcWriter = (WriterImpl) writer;
     newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
     assertEquals(bufferSize, newBufferSize);
   }
 }
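The getRandomColumnNames helper called above belongs to the test class and is not shown. A minimal, deterministic stand-in (the name format is an assumption, not the original helper) could be:

  // Hypothetical stand-in for the test helper: n unique, comma-separated column names.
  private static String getRandomColumnNames(int n) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < n; i++) {
      if (i > 0) {
        sb.append(',');
      }
      sb.append("col").append(i);
    }
    return sb.toString();
  }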
Example #2
  @Override
  public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getRawRecordWriter(
      Path path, Options options) throws IOException {
    final Path filename = AcidUtils.createFilename(path, options);
    final OrcFile.WriterOptions opts = OrcFile.writerOptions(options.getConfiguration());
    if (!options.isWritingBase()) {
      opts.bufferSize(OrcRecordUpdater.DELTA_BUFFER_SIZE)
          .stripeSize(OrcRecordUpdater.DELTA_STRIPE_SIZE)
          .blockPadding(false)
          .compress(CompressionKind.NONE)
          .rowIndexStride(0);
    }
    final OrcRecordUpdater.KeyIndexBuilder watcher = new OrcRecordUpdater.KeyIndexBuilder();
    opts.inspector(options.getInspector()).callback(watcher);
    final Writer writer = OrcFile.createWriter(filename, opts);
    return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {
      @Override
      public void write(Writable w) throws IOException {
        OrcStruct orc = (OrcStruct) w;
        watcher.addKey(
            ((IntWritable) orc.getFieldValue(OrcRecordUpdater.OPERATION)).get(),
            ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION)).get(),
            ((IntWritable) orc.getFieldValue(OrcRecordUpdater.BUCKET)).get(),
            ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ROW_ID)).get());
        writer.addRow(w);
      }

      @Override
      public void close(boolean abort) throws IOException {
        writer.close();
      }
    };
  }
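A caller typically reaches getRawRecordWriter through AcidOutputFormat.Options. A hedged sketch of such a call, where conf, inspector, and partitionPath are placeholder values and not part of the original example:

   // Illustrative call only; conf, inspector, and partitionPath are placeholders.
   AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
       .inspector(inspector)          // row ObjectInspector for the table
       .bucket(0)                     // which bucket file to write
       .writingBase(false)           // a delta file, so the delta settings above apply
       .minimumTransactionId(100)
       .maximumTransactionId(100);
   FileSinkOperator.RecordWriter rawWriter =
       new OrcOutputFormat().getRawRecordWriter(partitionPath, options);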
Example #3
  public int openFile() throws Exception {

    // open the ORC file and cache its metadata for later use
    m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));
    m_types = m_reader.getTypes();
    m_oi = (StructObjectInspector) m_reader.getObjectInspector();
    m_fields = m_oi.getAllStructFieldRefs();

    // position a row reader at the start of the file
    m_rr = m_reader.rows();

    return 0;
  }
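No matching close helper appears in the example; a minimal sketch, assuming only the row reader needs to be released, might be:

  // Hypothetical counterpart to openFile(): release the row reader.
  public void closeFile() throws Exception {
    if (m_rr != null) {
      m_rr.close();
      m_rr = null;
    }
  }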
Example #4
 @Override
 public RecordReader<NullWritable, OrcStruct> createRecordReader(
     InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
   FileSplit fileSplit = (FileSplit) inputSplit;
   Path path = fileSplit.getPath();
   Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
   return new OrcRecordReader(
       OrcFile.createReader(path, OrcFile.readerOptions(conf)),
        conf,
       fileSplit.getStart(),
       fileSplit.getLength());
 }
Example #5
 @Override
 public void write(NullWritable nullWritable, OrcSerdeRow row) throws IOException {
   if (writer == null) {
     options.inspector(row.getInspector());
     writer = OrcFile.createWriter(path, options);
   }
   writer.addRow(row.getRow());
 }
Example #6
 OrcRecordUpdater(Path path, AcidOutputFormat.Options options) throws IOException {
   this.options = options;
   this.bucket.set(options.getBucket());
   this.path = AcidUtils.createFilename(path, options);
   FileSystem fs = options.getFilesystem();
   if (fs == null) {
     fs = path.getFileSystem(options.getConfiguration());
   }
   this.fs = fs;
   try {
     FSDataOutputStream strm = fs.create(new Path(path, ACID_FORMAT), false);
     strm.writeInt(ORC_ACID_VERSION);
     strm.close();
   } catch (IOException ioe) {
     if (LOG.isDebugEnabled()) {
       LOG.debug("Failed to create " + path + "/" + ACID_FORMAT + " with " + ioe);
     }
   }
   if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
       && !options.isWritingBase()) {
     flushLengths = fs.create(getSideFile(this.path), true, 8, options.getReporter());
   } else {
     flushLengths = null;
   }
   OrcFile.WriterOptions writerOptions = null;
   if (options instanceof OrcOptions) {
     writerOptions = ((OrcOptions) options).getOrcOptions();
   }
   if (writerOptions == null) {
     writerOptions = OrcFile.writerOptions(options.getConfiguration());
   }
   writerOptions.fileSystem(fs).callback(indexBuilder);
   if (!options.isWritingBase()) {
     writerOptions.blockPadding(false);
     writerOptions.bufferSize(DELTA_BUFFER_SIZE);
     writerOptions.stripeSize(DELTA_STRIPE_SIZE);
   }
   writerOptions.inspector(createEventSchema(options.getInspector()));
   this.writer = OrcFile.createWriter(this.path, writerOptions);
   item = new OrcStruct(FIELDS);
   item.setFieldValue(OPERATION, operation);
   item.setFieldValue(CURRENT_TRANSACTION, currentTransaction);
   item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction);
   item.setFieldValue(BUCKET, bucket);
   item.setFieldValue(ROW_ID, rowId);
 }
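getSideFile is called above but not shown; in Hive's ACID layer it names the side file that tracks flushed lengths for an open delta. A one-line sketch, assuming the conventional "_flush_length" suffix:

  // Side file holding flushed lengths for an in-progress delta.
  // The "_flush_length" suffix matches Hive's convention; treat it as an assumption here.
  static Path getSideFile(Path main) {
    return new Path(main + "_flush_length");
  }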
Example #7
 @Override
 public void write(Writable row) throws IOException {
   OrcSerdeRow serdeRow = (OrcSerdeRow) row;
   if (writer == null) {
     options.inspector(serdeRow.getInspector());
     writer = OrcFile.createWriter(path, options);
   }
   writer.addRow(serdeRow.getRow());
 }
Example #8
 @Override
 public void close(boolean b) throws IOException {
   // if we haven't written any rows, we need to create a file with a
   // generic schema.
   if (writer == null) {
     // a row with no columns
     ObjectInspector inspector =
         ObjectInspectorFactory.getStandardStructObjectInspector(
             new ArrayList<String>(), new ArrayList<ObjectInspector>());
     options.inspector(inspector);
     writer = OrcFile.createWriter(path, options);
   }
   writer.close();
 }
Example #9
 @Test
 public void testBufferSizeFor1Col() throws IOException {
   ObjectInspector inspector;
   synchronized (TestOrcFile.class) {
     inspector =
         ObjectInspectorFactory.getReflectionObjectInspector(
             Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
   }
   int bufferSize = 128 * 1024;
   Writer writer =
       OrcFile.createWriter(
           testFilePath,
           OrcFile.writerOptions(conf)
               .inspector(inspector)
               .stripeSize(100000)
               .compress(CompressionKind.NONE)
               .bufferSize(bufferSize));
   final int newBufferSize;
   if (writer instanceof WriterImpl) {
     WriterImpl orcWriter = (WriterImpl) writer;
     newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
     assertEquals(bufferSize, newBufferSize);
   }
 }
Example #10
  @Override
  public FileSinkOperator.RecordWriter getHiveRecordWriter(
      JobConf conf,
      Path path,
      Class<? extends Writable> valueClass,
      boolean isCompressed,
      Properties tableProperties,
      Progressable reporter)
      throws IOException {
    OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
    if (tableProperties.containsKey(OrcFile.STRIPE_SIZE)) {
      options.stripeSize(Long.parseLong(tableProperties.getProperty(OrcFile.STRIPE_SIZE)));
    }

    if (tableProperties.containsKey(OrcFile.COMPRESSION)) {
      options.compress(CompressionKind.valueOf(tableProperties.getProperty(OrcFile.COMPRESSION)));
    }

    if (tableProperties.containsKey(OrcFile.COMPRESSION_BLOCK_SIZE)) {
      options.bufferSize(
          Integer.parseInt(tableProperties.getProperty(OrcFile.COMPRESSION_BLOCK_SIZE)));
    }

    if (tableProperties.containsKey(OrcFile.ROW_INDEX_STRIDE)) {
      options.rowIndexStride(
          Integer.parseInt(tableProperties.getProperty(OrcFile.ROW_INDEX_STRIDE)));
    }

    if (tableProperties.containsKey(OrcFile.ENABLE_INDEXES)) {
      if ("false".equals(tableProperties.getProperty(OrcFile.ENABLE_INDEXES))) {
        options.rowIndexStride(0);
      }
    }

    if (tableProperties.containsKey(OrcFile.BLOCK_PADDING)) {
      options.blockPadding(
          Boolean.parseBoolean(tableProperties.getProperty(OrcFile.BLOCK_PADDING)));
    }

    return new OrcRecordWriter(path, options, conf);
  }
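The table properties checked above normally come from the Hive table definition. A hedged sketch of calling this method directly, assuming the enclosing class is Hive's OrcOutputFormat; jobConf, outputPath, and reporter are placeholders:

   // Illustrative values only; the keys are the OrcFile constants checked above.
   Properties tableProperties = new Properties();
   tableProperties.setProperty(OrcFile.STRIPE_SIZE, "67108864");   // 64 MB stripes
   tableProperties.setProperty(OrcFile.COMPRESSION, "ZLIB");
   tableProperties.setProperty(OrcFile.ROW_INDEX_STRIDE, "10000");
   FileSinkOperator.RecordWriter writer =
       new OrcOutputFormat().getHiveRecordWriter(
           jobConf, outputPath, Text.class, false, tableProperties, reporter);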
Example #11
  private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) {
    OrcFile.WriterOptions result = OrcFile.writerOptions(props, conf);
    if (props != null) {
      final String columnNameProperty = props.getProperty(IOConstants.COLUMNS);
      final String columnTypeProperty = props.getProperty(IOConstants.COLUMNS_TYPES);
      if (columnNameProperty != null
          && !columnNameProperty.isEmpty()
          && columnTypeProperty != null
          && !columnTypeProperty.isEmpty()) {
        // the enclosing check already guarantees both properties are non-empty
        List<String> columnNames = Arrays.asList(columnNameProperty.split(","));
        List<TypeInfo> columnTypes =
            TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);

        TypeDescription schema = TypeDescription.createStruct();
        for (int i = 0; i < columnNames.size(); ++i) {
          schema.addField(columnNames.get(i), convertTypeInfo(columnTypes.get(i)));
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("ORC schema = " + schema);
        }
        result.setSchema(schema);
      }
    }
    return result;
  }
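convertTypeInfo maps a Hive TypeInfo to an ORC TypeDescription. The full mapping handles every category; a simplified sketch covering a few primitive kinds, for illustration only:

  // Simplified sketch of the TypeInfo -> TypeDescription mapping; the real
  // convertTypeInfo also covers byte/short/float, decimal, char/varchar, date,
  // timestamp, binary, and the nested list/map/struct/union categories.
  static TypeDescription convertTypeInfo(TypeInfo info) {
    if (info instanceof PrimitiveTypeInfo) {
      switch (((PrimitiveTypeInfo) info).getPrimitiveCategory()) {
        case BOOLEAN:
          return TypeDescription.createBoolean();
        case INT:
          return TypeDescription.createInt();
        case LONG:
          return TypeDescription.createLong();
        case DOUBLE:
          return TypeDescription.createDouble();
        case STRING:
          return TypeDescription.createString();
        default:
          throw new IllegalArgumentException("Unhandled primitive: " + info);
      }
    }
    throw new IllegalArgumentException("Unhandled category: " + info.getCategory());
  }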
Example #12
  // This method is just for experimentation.
  public void testRead() throws Exception {

    m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));

    System.out.println("Reader: " + m_reader);

    System.out.println("# Rows: " + m_reader.getNumberOfRows());
    m_types = m_reader.getTypes();
    System.out.println("# Types in the file: " + m_types.size());

    for (int i = 0; i < m_types.size(); i++) {
      System.out.println("Type " + i + ": " + m_types.get(i).getKind());
    }

    System.out.println("Compression: " + m_reader.getCompression());
    if (m_reader.getCompression() != CompressionKind.NONE) {
      System.out.println("Compression size: " + m_reader.getCompressionSize());
    }

    StructObjectInspector m_oi = (StructObjectInspector) m_reader.getObjectInspector();

    System.out.println("object inspector type category: " + m_oi.getCategory());
    System.out.println("object inspector type name    : " + m_oi.getTypeName());

    m_fields = m_oi.getAllStructFieldRefs();
    System.out.println("Number of columns in the table: " + m_fields.size());

    RecordReader m_rr = m_reader.rows();

    // Print the type info:
    for (int i = 0; i < m_fields.size(); i++) {
      System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName());
      ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector();
      System.out.println("Column " + i + " type category: " + lv_foi.getCategory());
      System.out.println("Column " + i + " type name: " + lv_foi.getTypeName());
      // Object lv_column_val = m_oi.getStructFieldData(lv_row, m_fields.get(i));
      // System.out.print("Column " + i + " value: " + lv_row.getFieldValue(i));
    }

    OrcStruct lv_row = null;
    Object lv_field_val = null;
    StringBuilder lv_row_string = new StringBuilder(1024);
    while (m_rr.hasNext()) {
      lv_row = (OrcStruct) m_rr.next(lv_row);
      lv_row_string.setLength(0);
      for (int i = 0; i < m_fields.size(); i++) {
        lv_field_val = lv_row.getFieldValue(i);
        if (lv_field_val != null) {
          lv_row_string.append(lv_field_val);
        }
        lv_row_string.append('|');
      }
      System.out.println(lv_row_string);
    }

    /*
     * Typecasting to the appropriate type based on the 'kind':
     * if (OrcProto.Type.Kind.INT == m_types.get(1).getKind()) {
     *   IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0);
     *   System.out.println("Column 1 value: " + lvf_1_val);
     * }
     */
  }
Example #13
  /**
   * Create a reader that merge sorts the ACID events together.
   *
   * @param conf the configuration
   * @param collapseEvents should the events on the same row be collapsed
   * @param reader the reader for the base file, or null if there is none
   * @param isOriginal is the base file a pre-acid file
   * @param bucket the bucket we are reading
   * @param validTxnList the list of transactions that are valid to read
   * @param options the options to read with
   * @param deltaDirectory the list of delta directories to include
   * @throws IOException
   */
  OrcRawRecordMerger(
      Configuration conf,
      boolean collapseEvents,
      Reader reader,
      boolean isOriginal,
      int bucket,
      ValidTxnList validTxnList,
      Reader.Options options,
      Path[] deltaDirectory)
      throws IOException {
    this.conf = conf;
    this.collapse = collapseEvents;
    this.offset = options.getOffset();
    this.length = options.getLength();
    this.validTxnList = validTxnList;
    // modify the options to reflect the event instead of the base row
    Reader.Options eventOptions = createEventOptions(options);
    if (reader == null) {
      baseReader = null;
    } else {

      // find the min/max based on the offset and length
      if (isOriginal) {
        discoverOriginalKeyBounds(reader, bucket, options);
      } else {
        discoverKeyBounds(reader, options);
      }
      LOG.info("min key = " + minKey + ", max key = " + maxKey);
      // use the min/max instead of the byte range
      ReaderPair pair;
      ReaderKey key = new ReaderKey();
      if (isOriginal) {
        options = options.clone();
        options.range(options.getOffset(), Long.MAX_VALUE);
        pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
      } else {
        pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions);
      }

      // if there is at least one record, put it in the map
      if (pair.nextRecord != null) {
        readers.put(key, pair);
      }
      baseReader = pair.recordReader;
    }

    // we always want to read all of the deltas
    eventOptions.range(0, Long.MAX_VALUE);
    // Turn off the sarg before pushing it to the delta.  We never want to push a sarg to a
    // delta, as it can produce wrong results (if the latest valid version of the record is
    // filtered out by the sarg) or ArrayIndexOutOfBounds errors (when the sarg is applied to
    // a delete record).
    eventOptions.searchArgument(null, null);
    if (deltaDirectory != null) {
      for (Path delta : deltaDirectory) {
        ReaderKey key = new ReaderKey();
        Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
        FileSystem fs = deltaFile.getFileSystem(conf);
        long length = getLastFlushLength(fs, deltaFile);
        if (fs.exists(deltaFile) && length != -1) {
          Reader deltaReader =
              OrcFile.createReader(deltaFile, OrcFile.readerOptions(conf).maxLength(length));
          ReaderPair deltaPair =
              new ReaderPair(key, deltaReader, bucket, minKey, maxKey, eventOptions);
          if (deltaPair.nextRecord != null) {
            readers.put(key, deltaPair);
          }
        }
      }
    }

    // get the first record
    Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
    if (entry == null) {
      columns = 0;
      primary = null;
    } else {
      primary = entry.getValue();
      if (readers.isEmpty()) {
        secondaryKey = null;
      } else {
        secondaryKey = readers.firstKey();
      }
      // get the number of columns in the user's rows
      columns = primary.getColumns();
    }
  }
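getLastFlushLength is referenced above but not shown. A sketch of its usual behavior, assuming the same side-file convention as OrcRecordUpdater:

  // Hedged sketch: return the last length recorded in the delta's side file.
  // A missing side file means the delta was fully committed, so the whole file
  // may be read (Long.MAX_VALUE); a present-but-empty side file yields -1,
  // which the caller above uses to skip the delta entirely.
  private static long getLastFlushLength(FileSystem fs, Path deltaFile) throws IOException {
    long result = Long.MAX_VALUE;
    try {
      FSDataInputStream stream = fs.open(OrcRecordUpdater.getSideFile(deltaFile));
      result = -1;
      while (stream.available() > 0) {
        result = stream.readLong();
      }
      stream.close();
    } catch (IOException ioe) {
      // side file missing or unreadable: keep whatever was recorded so far
    }
    return result;
  }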
Example #14
 @Override
 public RecordWriter<NullWritable, OrcSerdeRow> getRecordWriter(
     FileSystem fileSystem, JobConf conf, String name, Progressable reporter) throws IOException {
   return new OrcRecordWriter(
       new Path(getWorkOutputPath(conf), name), OrcFile.writerOptions(conf), conf);
 }