Example #1
  /**
   * Performs configuration setup and parses the table schema.
   *
   * @param table_schema a comma-separated list of "colname type" pairs, using Hive types
   */
  private void setup(String table_schema) {

    if (table_schema == null)
      throw new RuntimeException(
          "The table schema must be defined as colname type, colname type.  All types are hive types");

    // create basic configuration for hdfs and hive
    conf = new Configuration();
    hiveConf = new HiveConf(conf, SessionState.class);

    // parse the table_schema string
    List<String> types = HiveRCSchemaUtil.parseSchemaTypes(table_schema);
    List<String> cols = HiveRCSchemaUtil.parseSchema(pcols, table_schema);

    List<FieldSchema> fieldSchemaList = new ArrayList<FieldSchema>(cols.size());

    for (int i = 0; i < cols.size(); i++) {
      fieldSchemaList.add(
          new FieldSchema(cols.get(i), HiveRCSchemaUtil.findPigDataType(types.get(i))));
    }

    pigSchema = new ResourceSchema(new Schema(fieldSchemaList));

    props = new Properties();

    // set the table schema properties for the ColumnarSerDe.
    // These properties are never changed by the columns-to-read filter,
    // because the ColumnarSerDe needs to know the
    // complete format of each record.
    props.setProperty(Constants.LIST_COLUMNS, HiveRCSchemaUtil.listToString(cols));
    props.setProperty(Constants.LIST_COLUMN_TYPES, HiveRCSchemaUtil.listToString(types));
  }
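
  A quick usage sketch; the schema string below is hypothetical and only
  illustrates the "colname type, colname type" format that setup() expects:

  // Hypothetical call: each column is declared as "colname type",
  // comma-separated, using Hive type names.
  setup("name string, age int, salary double");
  // After this call, pigSchema describes the three columns and the
  // SerDe properties carry the full column and type lists.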
Example #2
  /**
   * Reads only the columns that were requested in the constructor.
   *
   * @param struct ColumnarStruct
   * @param path Path
   * @return Tuple
   * @throws IOException
   */
  private Tuple readColumnarTuple(ColumnarStruct struct, Path path) throws IOException {

    int[] columnIndexes = getRequiredColumns();
    // the partition keys, if any, will already be in the UDFContext here.
    String[] partitionKeys = getPartitionKeys(null, null);
    // only if the path has changed do we recompute the partition key values
    if (currentPath == null || !currentPath.equals(path)) {
      currentPathPartitionKeyMap =
          (partitionKeys == null)
              ? null
              : pathPartitionerHelper.getPathPartitionKeyValues(path.toString());
      currentPath = path;
    }

    // if partitionColumns is null, this value stops the for loop
    // below from trying to add any partition columns
    // that do not exist
    int partitionColumnStartIndex = Integer.MAX_VALUE;

    if (!(partitionColumns == null || partitionColumns.size() == 0)) {
      // partition columns are always appended to the schema fields.
      partitionColumnStartIndex = pigSchema.getFields().length;
    }

    // create a tuple with the previously determined number of columns
    Tuple t = tupleFactory.newTuple(columnIndexes.length);

    // read in all columns
    for (int i = 0; i < columnIndexes.length; i++) {
      int columnIndex = columnIndexes[i];

      if (columnIndex < partitionColumnStartIndex) {
        Object obj = struct.getField(columnIndex);
        Object pigType = HiveRCSchemaUtil.extractPigTypeFromHiveType(obj);

        t.set(i, pigType);

      } else {
        // read the partition columns
        // will only be executed if partitionColumns is not null
        String key = partitionKeys[columnIndex - partitionColumnStartIndex];
        Object value = currentPathPartitionKeyMap.get(key);
        t.set(i, value);
      }
    }

    return t;
  }
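
  For orientation, a minimal sketch of a LoadFunc-style getNext() that could
  drive readColumnarTuple(); the reader, serde, and currentSplitPath fields
  are assumptions for illustration, not part of the original class:

  // Hypothetical caller: deserializes each RCFile record into a
  // ColumnarStruct and hands it, with the split's path, to readColumnarTuple.
  public Tuple getNext() throws IOException {
    try {
      if (!reader.nextKeyValue()) // assumed Hadoop RecordReader field
        return null;
      Object record =
          serde.deserialize((Writable) reader.getCurrentValue()); // assumed ColumnarSerDe field
      return readColumnarTuple((ColumnarStruct) record, currentSplitPath); // assumed Path of the current split
    } catch (Exception e) {
      throw new IOException(e);
    }
  }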