public int openFile() throws Exception { m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf)); m_types = m_reader.getTypes(); m_oi = (StructObjectInspector) m_reader.getObjectInspector(); m_fields = m_oi.getAllStructFieldRefs(); m_rr = m_reader.rows(); return 0; }
@Override public RecordReader<NullWritable, OrcStruct> createRecordReader( InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) inputSplit; Path path = fileSplit.getPath(); Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context); return new OrcRecordReader( OrcFile.createReader(path, OrcFile.readerOptions(conf)), ShimLoader.getHadoopShims().getConfiguration(context), fileSplit.getStart(), fileSplit.getLength()); }
// This method is just for experimentation. public void testRead() throws Exception { m_reader = OrcFile.createReader(m_file_path, OrcFile.readerOptions(m_conf)); System.out.println("Reader: " + m_reader); System.out.println("# Rows: " + m_reader.getNumberOfRows()); m_types = m_reader.getTypes(); System.out.println("# Types in the file: " + m_types.size()); for (int i = 0; i < m_types.size(); i++) { System.out.println("Type " + i + ": " + m_types.get(i).getKind()); } System.out.println("Compression: " + m_reader.getCompression()); if (m_reader.getCompression() != CompressionKind.NONE) { System.out.println("Compression size: " + m_reader.getCompressionSize()); } StructObjectInspector m_oi = (StructObjectInspector) m_reader.getObjectInspector(); System.out.println("object inspector type category: " + m_oi.getCategory()); System.out.println("object inspector type name : " + m_oi.getTypeName()); m_fields = m_oi.getAllStructFieldRefs(); System.out.println("Number of columns in the table: " + m_fields.size()); RecordReader m_rr = m_reader.rows(); // Print the type info: for (int i = 0; i < m_fields.size(); i++) { System.out.println("Column " + i + " name: " + m_fields.get(i).getFieldName()); ObjectInspector lv_foi = m_fields.get(i).getFieldObjectInspector(); System.out.println("Column " + i + " type category: " + lv_foi.getCategory()); System.out.println("Column " + i + " type name: " + lv_foi.getTypeName()); // Object lv_column_val = m_oi.getStructFieldData(lv_row, m_fields.get(i)); // System.out.print("Column " + i + " value: " + lv_row.getFieldValue(i)); } OrcStruct lv_row = null; Object lv_field_val = null; StringBuilder lv_row_string = new StringBuilder(1024); while (m_rr.hasNext()) { lv_row = (OrcStruct) m_rr.next(lv_row); lv_row_string.setLength(0); for (int i = 0; i < m_fields.size(); i++) { lv_field_val = lv_row.getFieldValue(i); if (lv_field_val != null) { lv_row_string.append(lv_field_val); } lv_row_string.append('|'); } System.out.println(lv_row_string); } /** * Typecasting to appropriate type based on the 'kind' if (OrcProto.Type.Kind.INT == * m_types.get(1).getKind()) { IntWritable lvf_1_val = (IntWritable) lv_row.getFieldValue(0); * System.out.println("Column 1 value: " + lvf_1_val); } */ }
/** * Create a reader that merge sorts the ACID events together. * * @param conf the configuration * @param collapseEvents should the events on the same row be collapsed * @param isOriginal is the base file a pre-acid file * @param bucket the bucket we are reading * @param options the options to read with * @param deltaDirectory the list of delta directories to include * @throws IOException */ OrcRawRecordMerger( Configuration conf, boolean collapseEvents, Reader reader, boolean isOriginal, int bucket, ValidTxnList validTxnList, Reader.Options options, Path[] deltaDirectory) throws IOException { this.conf = conf; this.collapse = collapseEvents; this.offset = options.getOffset(); this.length = options.getLength(); this.validTxnList = validTxnList; // modify the optins to reflect the event instead of the base row Reader.Options eventOptions = createEventOptions(options); if (reader == null) { baseReader = null; } else { // find the min/max based on the offset and length if (isOriginal) { discoverOriginalKeyBounds(reader, bucket, options); } else { discoverKeyBounds(reader, options); } LOG.info("min key = " + minKey + ", max key = " + maxKey); // use the min/max instead of the byte range ReaderPair pair; ReaderKey key = new ReaderKey(); if (isOriginal) { options = options.clone(); options.range(options.getOffset(), Long.MAX_VALUE); pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, options); } else { pair = new ReaderPair(key, reader, bucket, minKey, maxKey, eventOptions); } // if there is at least one record, put it in the map if (pair.nextRecord != null) { readers.put(key, pair); } baseReader = pair.recordReader; } // we always want to read all of the deltas eventOptions.range(0, Long.MAX_VALUE); // Turn off the sarg before pushing it to delta. We never want to push a sarg to a delta as // it can produce wrong results (if the latest valid version of the record is filtered out by // the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record) eventOptions.searchArgument(null, null); if (deltaDirectory != null) { for (Path delta : deltaDirectory) { ReaderKey key = new ReaderKey(); Path deltaFile = AcidUtils.createBucketFile(delta, bucket); FileSystem fs = deltaFile.getFileSystem(conf); long length = getLastFlushLength(fs, deltaFile); if (fs.exists(deltaFile) && length != -1) { Reader deltaReader = OrcFile.createReader(deltaFile, OrcFile.readerOptions(conf).maxLength(length)); ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, maxKey, eventOptions); if (deltaPair.nextRecord != null) { readers.put(key, deltaPair); } } } } // get the first record Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry(); if (entry == null) { columns = 0; primary = null; } else { primary = entry.getValue(); if (readers.isEmpty()) { secondaryKey = null; } else { secondaryKey = readers.firstKey(); } // get the number of columns in the user's rows columns = primary.getColumns(); } }