/**
 * Performs configuration setup and schema parsing.
 *
 * @param table_schema the table schema as "colname type, colname type", using Hive type names
 */
private void setup(String table_schema) {

    if (table_schema == null)
        throw new RuntimeException(
                "The table schema must be defined as colname type, colname type. All types are hive types");

    // create basic configuration for hdfs and hive
    conf = new Configuration();
    hiveConf = new HiveConf(conf, SessionState.class);

    // parse the table_schema string
    List<String> types = HiveRCSchemaUtil.parseSchemaTypes(table_schema);
    List<String> cols = HiveRCSchemaUtil.parseSchema(pcols, table_schema);

    List<FieldSchema> fieldSchemaList = new ArrayList<FieldSchema>(cols.size());

    for (int i = 0; i < cols.size(); i++) {
        fieldSchemaList.add(new FieldSchema(cols.get(i),
                HiveRCSchemaUtil.findPigDataType(types.get(i))));
    }

    pigSchema = new ResourceSchema(new Schema(fieldSchemaList));

    props = new Properties();

    // set the table schema properties for the ColumnarSerDe.
    // these properties are never changed by the columns-to-read filter,
    // because the ColumnarSerDe needs to know the
    // complete format of each record.
    props.setProperty(Constants.LIST_COLUMNS, HiveRCSchemaUtil.listToString(cols));
    props.setProperty(Constants.LIST_COLUMN_TYPES, HiveRCSchemaUtil.listToString(types));
}
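/*
 * Illustrative sketch, not part of the original source: the exception message
 * above documents the expected schema string, so a call would look like
 *
 *   setup("id int, name string, price double");
 *
 * The parsed column and type lists line up index-for-index, so the FieldSchema
 * list pairs "id" with the Pig type mapped from the Hive type "int", and so on.
 * The exact tokenizing behaviour of HiveRCSchemaUtil.parseSchema and
 * parseSchemaTypes is assumed here, not confirmed by this file.
 */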
/**
 * Reads only the columns that were requested in the constructor.
 *
 * @param struct ColumnarStruct
 * @param path Path
 * @return Tuple
 * @throws IOException
 */
private Tuple readColumnarTuple(ColumnarStruct struct, Path path) throws IOException {

    int[] columnIndexes = getRequiredColumns();

    // the partition keys, if any, will already be in the UDFContext here.
    String[] partitionKeys = getPartitionKeys(null, null);

    // only recompute the path partition key map if the path has changed
    if (currentPath == null || !currentPath.equals(path)) {
        currentPathPartitionKeyMap = (partitionKeys == null) ? null
                : pathPartitionerHelper.getPathPartitionKeyValues(path.toString());
        currentPath = path;
    }

    // if partitionColumns is null, this value stops the for loop below
    // from trying to add any partition columns that do not exist
    int partitionColumnStartIndex = Integer.MAX_VALUE;

    if (!(partitionColumns == null || partitionColumns.size() == 0)) {
        // partition columns are always appended to the schema fields.
        partitionColumnStartIndex = pigSchema.getFields().length;
    }

    // create a tuple sized to the number of requested columns
    Tuple t = tupleFactory.newTuple(columnIndexes.length);

    // read in all requested columns
    for (int i = 0; i < columnIndexes.length; i++) {

        int columnIndex = columnIndexes[i];

        if (columnIndex < partitionColumnStartIndex) {
            // a regular column: pull the value out of the ColumnarStruct
            Object obj = struct.getField(columnIndex);
            Object pigType = HiveRCSchemaUtil.extractPigTypeFromHiveType(obj);
            t.set(i, pigType);
        } else {
            // a partition column: its value comes from the file path, not the record.
            // this branch will only be executed if partitionColumns is not null
            String key = partitionKeys[columnIndex - partitionColumnStartIndex];
            Object value = currentPathPartitionKeyMap.get(key);
            t.set(i, value);
        }
    }

    return t;
}
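/*
 * Illustrative sketch, not part of the original source: assuming the loader was
 * constructed for columns "id,name" plus one partition column "daydate" (a
 * hypothetical, Hive-style key=value layout), an input path such as
 *
 *   /logs/daydate=2009-01-01/part-00000
 *
 * would give partitionColumnStartIndex == 2, so "id" and "name" are read from
 * the ColumnarStruct while the trailing "daydate" value "2009-01-01" is looked
 * up in currentPathPartitionKeyMap, yielding tuples like (1, foo, 2009-01-01).
 * That getPathPartitionKeyValues parses key=value path segments this way is an
 * assumption about PathPartitionHelper, not something this file confirms.
 */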