@Override @SuppressWarnings("unchecked") public void initialize(InputSplit split, TaskAttemptContext ctx) throws IOException, InterruptedException { // set up columns that needs to read from the RCFile. tDesc = TStructDescriptor.getInstance(typeRef.getRawClass()); thriftWritable = ThriftWritable.newInstance((Class<TBase<?, ?>>) typeRef.getRawClass()); final List<Field> tFields = tDesc.getFields(); FileSplit fsplit = (FileSplit) split; Path file = fsplit.getPath(); LOG.info( String.format( "reading %s from %s:%d:%d", typeRef.getRawClass().getName(), file.toString(), fsplit.getStart(), fsplit.getStart() + fsplit.getLength())); ColumnarMetadata storedInfo = RCFileUtil.readMetadata(ctx.getConfiguration(), file); // list of field numbers List<Integer> tFieldIds = Lists.transform( tFields, new Function<Field, Integer>() { public Integer apply(Field fd) { return Integer.valueOf(fd.getFieldId()); } }); columnsBeingRead = RCFileUtil.findColumnsToRead(ctx.getConfiguration(), tFieldIds, storedInfo); for (int idx : columnsBeingRead) { int fid = storedInfo.getFieldId(idx); if (fid >= 0) { knownRequiredFields.add(tFields.get(tFieldIds.indexOf(fid))); } else { readUnknownsColumn = true; } } ColumnProjectionUtils.setReadColumnIDs(ctx.getConfiguration(), columnsBeingRead); // finally! super.initialize(split, ctx); }
/** * Builds Thrift object from the raw bytes returned by RCFile reader. * * @throws TException */ @SuppressWarnings({"unchecked", "rawtypes"}) public TBase<?, ?> getCurrentThriftValue() throws IOException, InterruptedException, TException { BytesRefArrayWritable byteRefs = getCurrentBytesRefArrayWritable(); if (byteRefs == null) { return null; } TBase tObj = tDesc.newThriftObject(); for (int i = 0; i < knownRequiredFields.size(); i++) { BytesRefWritable buf = byteRefs.get(columnsBeingRead.get(i)); if (buf.getLength() > 0) { memTransport.reset(buf.getData(), buf.getStart(), buf.getLength()); Field field = knownRequiredFields.get(i); tObj.setFieldValue(field.getFieldIdEnum(), ThriftUtils.readFieldNoTag(tProto, field)); } // else no need to set default value since any default value // would have been serialized when this record was written. } // parse unknowns column if required if (readUnknownsColumn) { int last = columnsBeingRead.get(columnsBeingRead.size() - 1); BytesRefWritable buf = byteRefs.get(last); if (buf.getLength() > 0) { memTransport.reset(buf.getData(), buf.getStart(), buf.getLength()); tObj.read(tProto); } } return tObj; }