/** * reads {@link ColumnarMetadata} stored in an RCFile. * * @throws IOException if metadata is not stored or in case of any other error. */ public static ColumnarMetadata readMetadata(Configuration conf, Path rcfile) throws IOException { Metadata metadata = null; Configuration confCopy = new Configuration(conf); // set up conf to read all the columns ColumnProjectionUtils.setFullyReadColumns(confCopy); RCFile.Reader reader = new RCFile.Reader(rcfile.getFileSystem(confCopy), rcfile, confCopy); // ugly hack to get metadata. RCFile has to provide access to metata try { Field f = RCFile.Reader.class.getDeclaredField("metadata"); f.setAccessible(true); metadata = (Metadata) f.get(reader); } catch (Throwable t) { throw new IOException("Could not access metadata field in RCFile reader", t); } reader.close(); Text metadataKey = new Text(COLUMN_METADATA_PROTOBUF_KEY); if (metadata == null || metadata.get(metadataKey) == null) { throw new IOException("could not find ColumnarMetadata in " + rcfile); } return Protobufs.mergeFromText(ColumnarMetadata.newBuilder(), metadata.get(metadataKey)) .build(); }
/**
 * Reads every column of an RCFile and counts the rows.
 *
 * @param fs file system holding {@code file}
 * @param file RCFile to read
 * @param allColumnsNumber number of columns written to the file
 * @param chechCorrect when true, replay the deterministic random generator and
 *     compare each row read against the expected bytes
 * @return number of rows actually read
 * @throws IOException on any read error
 * @throws IllegalStateException when a row does not match the expected data
 */
public int performRCFileFullyReadColumnTest(
    FileSystem fs, Path file, int allColumnsNumber, boolean chechCorrect) throws IOException {

  byte[][] checkBytes = null;
  BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  if (chechCorrect) {
    // re-seed so the replayed rows match what the writer produced
    resetRandomGenerators();
    checkBytes = new byte[allColumnsNumber][];
  }

  int actualReadCount = 0;

  ColumnProjectionUtils.setReadAllColumns(conf);

  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  try {
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && checkRow.equals(cols);
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
  } finally {
    // original leaked the reader; close it on every exit path
    reader.close();
  }
  return actualReadCount;
}
/**
 * Reads only the first and last columns of an RCFile and counts the rows.
 *
 * @param fs file system holding {@code file}
 * @param file RCFile to read
 * @param allColumnsNumber number of columns written to the file
 * @param chechCorrect when true, replay the deterministic random generator and
 *     compare the first and last column of each row against the expected bytes
 * @return number of rows actually read
 * @throws IOException on any read error
 * @throws IllegalStateException when a row does not match the expected data
 */
public int performRCFileReadFirstAndLastColumnTest(
    FileSystem fs, Path file, int allColumnsNumber, boolean chechCorrect) throws IOException {

  byte[][] checkBytes = null;
  BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
  if (chechCorrect) {
    // re-seed so the replayed rows match what the writer produced
    resetRandomGenerators();
    checkBytes = new byte[allColumnsNumber][];
  }

  int actualReadCount = 0;

  // project only column 0 and the last column
  java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  readCols.add(Integer.valueOf(0));
  readCols.add(Integer.valueOf(allColumnsNumber - 1));
  ColumnProjectionUtils.appendReadColumns(conf, readCols);

  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  try {
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (chechCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && (checkRow.get(0).equals(cols.get(0)));
        ok = ok && checkRow.get(allColumnsNumber - 1).equals(cols.get(allColumnsNumber - 1));
      }
      if (!ok) {
        throw new IllegalStateException("Compare read and write error.");
      }
      actualReadCount++;
    }
  } finally {
    // original leaked the reader; close it on every exit path
    reader.close();
  }
  return actualReadCount;
}
/**
 * Initializes this fetch task: pushes column projections and filters for a
 * table-scan source into a fresh JobConf, then creates the FetchOperator and
 * initializes the source operator tree with its output inspector.
 *
 * <p>Side effects: sets {@code sink}, {@code fetch} and {@code totalRows}, and
 * resets the global {@code ExecMapper} done flag. Any checked exception is
 * wrapped in a {@link RuntimeException}.
 */
@Override
public void initialize(
    HiveConf conf, QueryPlan queryPlan, DriverContext ctx, CompilationOpContext opContext) {
  super.initialize(conf, queryPlan, ctx, opContext);
  work.initializeForFetch(opContext);

  try {
    // Create a file system handle
    JobConf job = new JobConf(conf);

    Operator<?> source = work.getSource();
    if (source instanceof TableScanOperator) {
      TableScanOperator ts = (TableScanOperator) source;
      // push down projections
      ColumnProjectionUtils.appendReadColumns(
          job, ts.getNeededColumnIDs(), ts.getNeededColumns());
      // push down filters
      HiveInputFormat.pushFilters(job, ts);
    }
    sink = work.getSink();
    fetch = new FetchOperator(work, job, source, getVirtualColumns(source));
    // the source operator tree consumes rows shaped by the fetch operator's inspector
    source.initialize(conf, new ObjectInspector[] {fetch.getOutputObjectInspector()});
    totalRows = 0;
    ExecMapper.setDone(false);

  } catch (Exception e) {
    // Bail out ungracefully - we should never hit
    // this here - but would have hit it in SemanticAnalyzer
    LOG.error(StringUtils.stringifyException(e));
    throw new RuntimeException(e);
  }
}
/**
 * Creates one FetchOperator per alias in the work's fetch plan (pushing
 * projections and filters into a cloned JobConf for each), then initializes
 * every forward operator with its fetch operator's output inspector.
 *
 * <p>Side effects: populates {@code fetchOperators} and the supplied
 * {@code fetchOpJobConfMap}. Runs as two ordered passes — all fetch operators
 * are created before any forward operator is initialized.
 *
 * @param fetchOpJobConfMap out-parameter mapping each created FetchOperator to
 *     the JobConf clone it was built with
 * @throws HiveException if operator initialization fails
 */
private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap)
    throws HiveException {
  for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
      work.getAliasToWork().entrySet()) {
    LOG.debug(
        "initializeOperators: "
            + entry.getKey()
            + ", children = "
            + entry.getValue().getChildOperators());
  }
  // this mapper operator is used to initialize all the operators
  for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
    if (entry.getValue() == null) {
      continue;
    }
    // clone the job conf so per-alias projections/filters do not interfere
    JobConf jobClone = new JobConf(job);

    TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
    // push down projections
    ColumnProjectionUtils.appendReadColumns(
        jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns());
    // push down filters
    HiveInputFormat.pushFilters(jobClone, ts);

    // create a fetch operator
    FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
    fetchOpJobConfMap.put(fetchOp, jobClone);
    fetchOperators.put(entry.getKey(), fetchOp);
    l4j.info("fetchoperator for " + entry.getKey() + " created");
  }
  // initialize all forward operator
  for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
    // get the forward op
    String alias = entry.getKey();
    Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);

    // put the exe context into all the operators
    forwardOp.passExecContext(execContext);
    // All the operators need to be initialized before process
    FetchOperator fetchOp = entry.getValue();
    JobConf jobConf = fetchOpJobConfMap.get(fetchOp);

    if (jobConf == null) {
      // fall back to the shared job conf when no clone was recorded
      jobConf = job;
    }
    // initialize the forward operator
    ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
    forwardOp.initialize(jobConf, new ObjectInspector[] {objectInspector});
    l4j.info("fetchoperator for " + entry.getKey() + " initialized");
  }
}
@Override @SuppressWarnings("unchecked") public void initialize(InputSplit split, TaskAttemptContext ctx) throws IOException, InterruptedException { // set up columns that needs to read from the RCFile. tDesc = TStructDescriptor.getInstance(typeRef.getRawClass()); thriftWritable = ThriftWritable.newInstance((Class<TBase<?, ?>>) typeRef.getRawClass()); final List<Field> tFields = tDesc.getFields(); FileSplit fsplit = (FileSplit) split; Path file = fsplit.getPath(); LOG.info( String.format( "reading %s from %s:%d:%d", typeRef.getRawClass().getName(), file.toString(), fsplit.getStart(), fsplit.getStart() + fsplit.getLength())); ColumnarMetadata storedInfo = RCFileUtil.readMetadata(ctx.getConfiguration(), file); // list of field numbers List<Integer> tFieldIds = Lists.transform( tFields, new Function<Field, Integer>() { public Integer apply(Field fd) { return Integer.valueOf(fd.getFieldId()); } }); columnsBeingRead = RCFileUtil.findColumnsToRead(ctx.getConfiguration(), tFieldIds, storedInfo); for (int idx : columnsBeingRead) { int fid = storedInfo.getFieldId(idx); if (fid >= 0) { knownRequiredFields.add(tFields.get(tFieldIds.indexOf(fid))); } else { readUnknownsColumn = true; } } ColumnProjectionUtils.setReadColumnIDs(ctx.getConfiguration(), columnsBeingRead); // finally! super.initialize(split, ctx); }
/**
 * Reads columns 2 and 3 of an RCFile, deserializes each row and checks the
 * projected fields against {@code expectedPartitalFieldsData}, then verifies
 * re-serialization matches {@code patialS}.
 *
 * @param fs file system holding {@code file}
 * @param count number of records (logging only)
 * @param file RCFile to read
 * @throws IOException on read errors
 * @throws SerDeException on (de)serialization errors
 */
private void partialReadTest(FileSystem fs, int count, Path file)
    throws IOException, SerDeException {
  LOG.debug("reading " + count + " records");
  long start = System.currentTimeMillis();

  java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
  readCols.add(Integer.valueOf(2));
  readCols.add(Integer.valueOf(3));
  ColumnProjectionUtils.setReadColumnIDs(conf, readCols);

  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  try {
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      cols.resetValid(8);
      Object row = serDe.deserialize(cols);

      StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
      List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
      assertEquals("Field size should be 8", 8, fieldRefs.size());

      // only the projected columns carry meaningful data
      for (int i : readCols) {
        Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
        Object standardWritableData =
            ObjectInspectorUtils.copyToStandardObject(
                fieldData,
                fieldRefs.get(i).getFieldObjectInspector(),
                ObjectInspectorCopyOption.WRITABLE);
        assertEquals("Field " + i, standardWritableData, expectedPartitalFieldsData[i]);
      }

      assertEquals(
          "Class of the serialized object should be BytesRefArrayWritable",
          BytesRefArrayWritable.class,
          serDe.getSerializedClass());
      BytesRefArrayWritable serializedBytes = (BytesRefArrayWritable) serDe.serialize(row, oi);
      assertEquals("Serialized data", patialS, serializedBytes);
    }
  } finally {
    // close on assertion failure / exception too, not only on success
    reader.close();
  }
  long cost = System.currentTimeMillis() - start;
  LOG.debug("reading fully costs:" + cost + " milliseconds");
}
public void fullyReadTest(FileSystem fs, int count, Path file) throws IOException, SerDeException { LOG.debug("reading " + count + " records"); long start = System.currentTimeMillis(); ColumnProjectionUtils.setFullyReadColumns(conf); RCFile.Reader reader = new RCFile.Reader(fs, file, conf); LongWritable rowID = new LongWritable(); int actualRead = 0; BytesRefArrayWritable cols = new BytesRefArrayWritable(); while (reader.next(rowID)) { reader.getCurrentRow(cols); cols.resetValid(8); Object row = serDe.deserialize(cols); StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector(); List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs(); assertEquals("Field size should be 8", 8, fieldRefs.size()); for (int i = 0; i < fieldRefs.size(); i++) { Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i)); Object standardWritableData = ObjectInspectorUtils.copyToStandardObject( fieldData, fieldRefs.get(i).getFieldObjectInspector(), ObjectInspectorCopyOption.WRITABLE); assertEquals("Field " + i, standardWritableData, expectedFieldsData[i]); } // Serialize assertEquals( "Class of the serialized object should be BytesRefArrayWritable", BytesRefArrayWritable.class, serDe.getSerializedClass()); BytesRefArrayWritable serializedText = (BytesRefArrayWritable) serDe.serialize(row, oi); assertEquals("Serialized data", s, serializedText); actualRead++; } reader.close(); assertEquals("Expect " + count + " rows, actual read " + actualRead, actualRead, count); long cost = System.currentTimeMillis() - start; LOG.debug("reading fully costs:" + cost + " milliseconds"); }