@Test
public void testBufferSizeFor1000Col() throws IOException {
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  int bufferSize = 128 * 1024;
  String columns = getRandomColumnNames(1000);
  // just for testing: manually set the column names
  conf.set(IOConstants.COLUMNS, columns);
  Writer writer = OrcFile.createWriter(
      testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .stripeSize(100000)
          .compress(CompressionKind.NONE)
          .bufferSize(bufferSize));
  final int newBufferSize;
  if (writer instanceof WriterImpl) {
    WriterImpl orcWriter = (WriterImpl) writer;
    newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
    assertEquals(bufferSize, newBufferSize);
  }
}
@Override
public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getRawRecordWriter(
    Path path, Options options) throws IOException {
  final Path filename = AcidUtils.createFilename(path, options);
  final OrcFile.WriterOptions opts = OrcFile.writerOptions(options.getConfiguration());
  if (!options.isWritingBase()) {
    opts.bufferSize(OrcRecordUpdater.DELTA_BUFFER_SIZE)
        .stripeSize(OrcRecordUpdater.DELTA_STRIPE_SIZE)
        .blockPadding(false)
        .compress(CompressionKind.NONE)
        .rowIndexStride(0);
  }
  final OrcRecordUpdater.KeyIndexBuilder watcher = new OrcRecordUpdater.KeyIndexBuilder();
  opts.inspector(options.getInspector()).callback(watcher);
  final Writer writer = OrcFile.createWriter(filename, opts);
  return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {
    @Override
    public void write(Writable w) throws IOException {
      OrcStruct orc = (OrcStruct) w;
      watcher.addKey(
          ((IntWritable) orc.getFieldValue(OrcRecordUpdater.OPERATION)).get(),
          ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION)).get(),
          ((IntWritable) orc.getFieldValue(OrcRecordUpdater.BUCKET)).get(),
          ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ROW_ID)).get());
      writer.addRow(w);
    }

    @Override
    public void close(boolean abort) throws IOException {
      writer.close();
    }
  };
}
public int openFile() throws Exception {
  m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));
  m_types = m_reader.getTypes();
  m_oi = (StructObjectInspector) m_reader.getObjectInspector();
  m_fields = m_oi.getAllStructFieldRefs();
  m_rr = m_reader.rows();
  return 0;
}
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(
    InputSplit inputSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
  return new OrcRecordReader(
      OrcFile.createReader(path, OrcFile.readerOptions(conf)),
      conf,
      fileSplit.getStart(),
      fileSplit.getLength());
}
@Override
public void write(NullWritable nullWritable, OrcSerdeRow row) throws IOException {
  if (writer == null) {
    // lazily create the writer so the ObjectInspector from the first row can be used
    options.inspector(row.getInspector());
    writer = OrcFile.createWriter(path, options);
  }
  writer.addRow(row.getRow());
}
OrcRecordUpdater(Path path, AcidOutputFormat.Options options) throws IOException {
  this.options = options;
  this.bucket.set(options.getBucket());
  this.path = AcidUtils.createFilename(path, options);
  FileSystem fs = options.getFilesystem();
  if (fs == null) {
    fs = path.getFileSystem(options.getConfiguration());
  }
  this.fs = fs;
  try {
    FSDataOutputStream strm = fs.create(new Path(path, ACID_FORMAT), false);
    strm.writeInt(ORC_ACID_VERSION);
    strm.close();
  } catch (IOException ioe) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Failed to create " + path + "/" + ACID_FORMAT + " with " + ioe);
    }
  }
  if (options.getMinimumTransactionId() != options.getMaximumTransactionId()
      && !options.isWritingBase()) {
    flushLengths = fs.create(getSideFile(this.path), true, 8, options.getReporter());
  } else {
    flushLengths = null;
  }
  OrcFile.WriterOptions writerOptions = null;
  if (options instanceof OrcOptions) {
    writerOptions = ((OrcOptions) options).getOrcOptions();
  }
  if (writerOptions == null) {
    writerOptions = OrcFile.writerOptions(options.getConfiguration());
  }
  writerOptions.fileSystem(fs).callback(indexBuilder);
  if (!options.isWritingBase()) {
    writerOptions.blockPadding(false);
    writerOptions.bufferSize(DELTA_BUFFER_SIZE);
    writerOptions.stripeSize(DELTA_STRIPE_SIZE);
  }
  writerOptions.inspector(createEventSchema(options.getInspector()));
  this.writer = OrcFile.createWriter(this.path, writerOptions);
  item = new OrcStruct(FIELDS);
  item.setFieldValue(OPERATION, operation);
  item.setFieldValue(CURRENT_TRANSACTION, currentTransaction);
  item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction);
  item.setFieldValue(BUCKET, bucket);
  item.setFieldValue(ROW_ID, rowId);
}
@Override
public void write(Writable row) throws IOException {
  OrcSerdeRow serdeRow = (OrcSerdeRow) row;
  if (writer == null) {
    options.inspector(serdeRow.getInspector());
    writer = OrcFile.createWriter(path, options);
  }
  writer.addRow(serdeRow.getRow());
}
@Override
public void close(boolean b) throws IOException {
  // if we haven't written any rows, we need to create a file with a
  // generic schema.
  if (writer == null) {
    // a row with no columns
    ObjectInspector inspector =
        ObjectInspectorFactory.getStandardStructObjectInspector(
            new ArrayList<String>(), new ArrayList<ObjectInspector>());
    options.inspector(inspector);
    writer = OrcFile.createWriter(path, options);
  }
  writer.close();
}
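// Hypothetical driver (not part of the source) illustrating the lazy-writer contract shared
// by the write() and close() methods above: the ORC writer is only created once the first
// row's ObjectInspector is available, and close() falls back to an empty struct schema, so
// a valid zero-row file is still produced. `path` and `conf` are assumed to be in scope.
OrcRecordWriter recordWriter =
    new OrcRecordWriter(path, OrcFile.writerOptions(conf), conf);
recordWriter.close(false); // no write() calls: a file with an empty schema is created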
@Test
public void testBufferSizeFor1Col() throws IOException {
  ObjectInspector inspector;
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(
        Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  int bufferSize = 128 * 1024;
  Writer writer = OrcFile.createWriter(
      testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .stripeSize(100000)
          .compress(CompressionKind.NONE)
          .bufferSize(bufferSize));
  final int newBufferSize;
  if (writer instanceof WriterImpl) {
    WriterImpl orcWriter = (WriterImpl) writer;
    newBufferSize = orcWriter.getEstimatedBufferSize(bufferSize);
    assertEquals(bufferSize, newBufferSize);
  }
}
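// A minimal sketch (hypothetical helper, not in the original tests) of the pattern shared by
// testBufferSizeFor1Col above and testBufferSizeFor1000Col earlier: build a writer with a
// requested buffer size, downcast to WriterImpl, and compare the estimate to the request.
private void assertBufferEstimate(ObjectInspector inspector, int requested) throws IOException {
  Writer writer = OrcFile.createWriter(
      testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .stripeSize(100000)
          .compress(CompressionKind.NONE)
          .bufferSize(requested));
  if (writer instanceof WriterImpl) {
    assertEquals(requested, ((WriterImpl) writer).getEstimatedBufferSize(requested));
  }
}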
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(
    JobConf conf,
    Path path,
    Class<? extends Writable> valueClass,
    boolean isCompressed,
    Properties tableProperties,
    Progressable reporter) throws IOException {
  OrcFile.WriterOptions options = OrcFile.writerOptions(conf);
  if (tableProperties.containsKey(OrcFile.STRIPE_SIZE)) {
    options.stripeSize(Long.parseLong(tableProperties.getProperty(OrcFile.STRIPE_SIZE)));
  }
  if (tableProperties.containsKey(OrcFile.COMPRESSION)) {
    options.compress(CompressionKind.valueOf(tableProperties.getProperty(OrcFile.COMPRESSION)));
  }
  if (tableProperties.containsKey(OrcFile.COMPRESSION_BLOCK_SIZE)) {
    options.bufferSize(
        Integer.parseInt(tableProperties.getProperty(OrcFile.COMPRESSION_BLOCK_SIZE)));
  }
  if (tableProperties.containsKey(OrcFile.ROW_INDEX_STRIDE)) {
    options.rowIndexStride(
        Integer.parseInt(tableProperties.getProperty(OrcFile.ROW_INDEX_STRIDE)));
  }
  if (tableProperties.containsKey(OrcFile.ENABLE_INDEXES)
      && "false".equals(tableProperties.getProperty(OrcFile.ENABLE_INDEXES))) {
    options.rowIndexStride(0);
  }
  if (tableProperties.containsKey(OrcFile.BLOCK_PADDING)) {
    options.blockPadding(
        Boolean.parseBoolean(tableProperties.getProperty(OrcFile.BLOCK_PADDING)));
  }
  return new OrcRecordWriter(path, options, conf);
}
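// Illustrative only: the table properties that getHiveRecordWriter above inspects, with
// hypothetical values. The keys are the OrcFile constants referenced in the method; each
// value is parsed with the matching Long/Integer/Boolean/valueOf call shown above.
Properties tableProperties = new Properties();
tableProperties.setProperty(OrcFile.STRIPE_SIZE, "67108864");           // long, bytes
tableProperties.setProperty(OrcFile.COMPRESSION, "ZLIB");               // CompressionKind name
tableProperties.setProperty(OrcFile.COMPRESSION_BLOCK_SIZE, "262144");  // int, bytes
tableProperties.setProperty(OrcFile.ROW_INDEX_STRIDE, "10000");         // int, rows
tableProperties.setProperty(OrcFile.ENABLE_INDEXES, "true");            // "false" disables
tableProperties.setProperty(OrcFile.BLOCK_PADDING, "true");             // boolean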
private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) {
  OrcFile.WriterOptions result = OrcFile.writerOptions(props, conf);
  if (props != null) {
    final String columnNameProperty = props.getProperty(IOConstants.COLUMNS);
    final String columnTypeProperty = props.getProperty(IOConstants.COLUMNS_TYPES);
    if (columnNameProperty != null && !columnNameProperty.isEmpty()
        && columnTypeProperty != null && !columnTypeProperty.isEmpty()) {
      List<String> columnNames;
      List<TypeInfo> columnTypes;
      // note: the outer guard already rejects empty strings, so these
      // empty-list branches are defensive only
      if (columnNameProperty.length() == 0) {
        columnNames = new ArrayList<String>();
      } else {
        columnNames = Arrays.asList(columnNameProperty.split(","));
      }
      if (columnTypeProperty.length() == 0) {
        columnTypes = new ArrayList<TypeInfo>();
      } else {
        columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
      }
      TypeDescription schema = TypeDescription.createStruct();
      for (int i = 0; i < columnNames.size(); ++i) {
        schema.addField(columnNames.get(i), convertTypeInfo(columnTypes.get(i)));
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("ORC schema = " + schema);
      }
      result.setSchema(schema);
    }
  }
  return result;
}
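// Illustrative only: for props with IOConstants.COLUMNS = "id,name" and
// IOConstants.COLUMNS_TYPES = "bigint:string", the loop in getOptions above builds a
// struct schema equivalent to:
TypeDescription schema = TypeDescription.createStruct()
    .addField("id", TypeDescription.createLong())
    .addField("name", TypeDescription.createString());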
// This method is just for experimentation.
public void testRead() throws Exception {
  m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf));
  System.out.println("Reader: " + m_reader);
  System.out.println("# Rows: " + m_reader.getNumberOfRows());
  m_types = m_reader.getTypes();
  System.out.println("# Types in the file: " + m_types.size());
  for (int i = 0; i < m_types.size(); i++) {
    System.out.println("Type " + i + ": " + m_types.get(i).getKind());
  }
  System.out.println("Compression: " + m_reader.getCompression());
  if (m_reader.getCompression() != CompressionKind.NONE) {
    System.out.println("Compression size: " + m_reader.getCompressionSize());
  }
  StructObjectInspector m_oi = (StructObjectInspector) m_reader.getObjectInspector();
  System.out.println("object inspector type category: " + m_oi.getCategory());
  System.out.println("object inspector type name    : " + m_oi.getTypeName());
  m_fields = m_oi.getAllStructFieldRefs();
  System.out.println("Number of columns in the table: " + m_fields.size());
  RecordReader m_rr = m_reader.rows();

  // Print the type info:
  for (int i = 0; i < m_fields.size(); i++) {
    System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName());
    ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector();
    System.out.println("Column " + i + " type category: " + lv_foi.getCategory());
    System.out.println("Column " + i + " type name: " + lv_foi.getTypeName());
    // Object lv_column_val = m_oi.getStructFieldData(lv_row, m_fields.get(i));
    // System.out.print("Column " + i + " value: " + lv_row.getFieldValue(i));
  }

  OrcStruct lv_row = null;
  Object lv_field_val = null;
  StringBuilder lv_row_string = new StringBuilder(1024);
  while (m_rr.hasNext()) {
    lv_row = (OrcStruct) m_rr.next(lv_row);
    lv_row_string.setLength(0);
    for (int i = 0; i < m_fields.size(); i++) {
      lv_field_val = lv_row.getFieldValue(i);
      if (lv_field_val != null) {
        lv_row_string.append(lv_field_val);
      }
      lv_row_string.append('|');
    }
    System.out.println(lv_row_string);
  }

  /*
   * Typecasting to the appropriate type based on the 'kind':
   * if (OrcProto.Type.Kind.INT == m_types.get(1).getKind()) {
   *   IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0);
   *   System.out.println("Column 1 value: " + lvf_1_val);
   * }
   */
}
/**
 * Create a reader that merge sorts the ACID events together.
 *
 * @param conf the configuration
 * @param collapseEvents should the events on the same row be collapsed
 * @param reader the reader for the base file, or null if there is no base
 * @param isOriginal is the base file a pre-acid file
 * @param bucket the bucket we are reading
 * @param validTxnList the list of valid transactions
 * @param options the options to read with
 * @param deltaDirectory the list of delta directories to include
 * @throws IOException
 */
OrcRawRecordMerger(Configuration conf,
                   boolean collapseEvents,
                   Reader reader,
                   boolean isOriginal,
                   int bucket,
                   ValidTxnList validTxnList,
                   Reader.Options options,
                   Path[] deltaDirectory) throws IOException {
  this.conf = conf;
  this.collapse = collapseEvents;
  this.offset = options.getOffset();
  this.length = options.getLength();
  this.validTxnList = validTxnList;
  // modify the options to reflect the event instead of the base row
  Reader.Options eventOptions = createEventOptions(options);
  if (reader == null) {
    baseReader = null;
  } else {
    // find the min/max based on the offset and length
    if (isOriginal) {
      discoverOriginalKeyBounds(reader, bucket, options);
    } else {
      discoverKeyBounds(reader, options);
    }
    LOG.info("min key = " + minKey + ", max key = " + maxKey);
    // use the min/max instead of the byte range
    ReaderPair pair;
    ReaderKey key = new ReaderKey();
    if (isOriginal) {
      options = options.clone();
      options.range(options.getOffset(), Long.MAX_VALUE);
      pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options);
    } else {
      pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions);
    }
    // if there is at least one record, put it in the map
    if (pair.nextRecord != null) {
      readers.put(key, pair);
    }
    baseReader = pair.recordReader;
  }
  // we always want to read all of the deltas
  eventOptions.range(0, Long.MAX_VALUE);
  // Turn off the sarg before pushing it to delta. We never want to push a sarg to a delta as
  // it can produce wrong results (if the latest valid version of the record is filtered out
  // by the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record).
  eventOptions.searchArgument(null, null);
  if (deltaDirectory != null) {
    for (Path delta : deltaDirectory) {
      ReaderKey key = new ReaderKey();
      Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
      FileSystem fs = deltaFile.getFileSystem(conf);
      long length = getLastFlushLength(fs, deltaFile);
      if (fs.exists(deltaFile) && length != -1) {
        Reader deltaReader =
            OrcFile.createReader(deltaFile, OrcFile.readerOptions(conf).maxLength(length));
        ReaderPair deltaPair =
            new ReaderPair(key, deltaReader, bucket, minKey, maxKey, eventOptions);
        if (deltaPair.nextRecord != null) {
          readers.put(key, deltaPair);
        }
      }
    }
  }
  // get the first record
  Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
  if (entry == null) {
    columns = 0;
    primary = null;
  } else {
    primary = entry.getValue();
    if (readers.isEmpty()) {
      secondaryKey = null;
    } else {
      secondaryKey = readers.firstKey();
    }
    // get the number of columns in the user's rows
    columns = primary.getColumns();
  }
}
@Override
public RecordWriter<NullWritable, OrcSerdeRow> getRecordWriter(
    FileSystem fileSystem, JobConf conf, String name, Progressable reporter)
    throws IOException {
  return new OrcRecordWriter(
      new Path(getWorkOutputPath(conf), name), OrcFile.writerOptions(conf), conf);
}