Example #1
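Reads the ColumnarMetadata stored in an RCFile. A copy of the configuration is first set to read all columns via ColumnProjectionUtils.setFullyReadColumns, and the reader's private metadata field is then pulled out reflectively.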
  /**
   * Reads {@link ColumnarMetadata} stored in an RCFile.
   *
   * @throws IOException if the metadata is not stored in the file, or on any other error.
   */
  public static ColumnarMetadata readMetadata(Configuration conf, Path rcfile) throws IOException {

    Metadata metadata = null;

    Configuration confCopy = new Configuration(conf);
    // set up conf to read all the columns
    ColumnProjectionUtils.setFullyReadColumns(confCopy);

    RCFile.Reader reader = new RCFile.Reader(rcfile.getFileSystem(confCopy), rcfile, confCopy);

    // ugly hack to get metadata. RCFile should provide public access to its metadata.
    try {
      Field f = RCFile.Reader.class.getDeclaredField("metadata");
      f.setAccessible(true);
      metadata = (Metadata) f.get(reader);
    } catch (Throwable t) {
      throw new IOException("Could not access metadata field in RCFile reader", t);
    } finally {
      // close the reader even if the reflective access fails
      reader.close();
    }

    Text metadataKey = new Text(COLUMN_METADATA_PROTOBUF_KEY);

    if (metadata == null || metadata.get(metadataKey) == null) {
      throw new IOException("could not find ColumnarMetadata in " + rcfile);
    }

    return Protobufs.mergeFromText(ColumnarMetadata.newBuilder(), metadata.get(metadataKey))
        .build();
  }
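Example #2
Reads every column of an RCFile, using ColumnProjectionUtils.setReadAllColumns to mark all columns for reading, and optionally verifies each row against regenerated random data.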
  public int performRCFileFullyReadColumnTest(
      FileSystem fs, Path file, int allColumnsNumber, boolean checkCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (checkCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    ColumnProjectionUtils.setReadAllColumns(conf);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (checkCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && checkRow.equals(cols);
      }
      if (!ok) {
        throw new IllegalStateException("Row read back does not match the row written.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }
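Example #3
Reads only the first and last columns by appending their ids with ColumnProjectionUtils.appendReadColumns, then optionally verifies those two columns against regenerated random data.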
  public int performRCFileReadFirstAndLastColumnTest(
      FileSystem fs, Path file, int allColumnsNumber, boolean checkCorrect) throws IOException {

    byte[][] checkBytes = null;
    BytesRefArrayWritable checkRow = new BytesRefArrayWritable(allColumnsNumber);
    if (checkCorrect) {
      resetRandomGenerators();
      checkBytes = new byte[allColumnsNumber][];
    }

    int actualReadCount = 0;

    java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
    readCols.add(Integer.valueOf(0));
    readCols.add(Integer.valueOf(allColumnsNumber - 1));
    ColumnProjectionUtils.appendReadColumns(conf, readCols);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      boolean ok = true;
      if (checkCorrect) {
        nextRandomRow(checkBytes, checkRow);
        ok = ok && (checkRow.get(0).equals(cols.get(0)));
        ok = ok && checkRow.get(allColumnsNumber - 1).equals(cols.get(allColumnsNumber - 1));
      }
      if (!ok) {
        throw new IllegalStateException("Row read back does not match the row written.");
      }
      actualReadCount++;
    }
    return actualReadCount;
  }
Example #4
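In this fetch-task initialization, a TableScanOperator source has its needed column ids pushed down with ColumnProjectionUtils.appendReadColumns and its filters pushed down with HiveInputFormat.pushFilters before the FetchOperator is created.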
  @Override
  public void initialize(
      HiveConf conf, QueryPlan queryPlan, DriverContext ctx, CompilationOpContext opContext) {
    super.initialize(conf, queryPlan, ctx, opContext);
    work.initializeForFetch(opContext);

    try {
      // clone the configuration into a JobConf for this fetch
      JobConf job = new JobConf(conf);

      Operator<?> source = work.getSource();
      if (source instanceof TableScanOperator) {
        TableScanOperator ts = (TableScanOperator) source;
        // push down projections
        ColumnProjectionUtils.appendReadColumns(
            job, ts.getNeededColumnIDs(), ts.getNeededColumns());
        // push down filters
        HiveInputFormat.pushFilters(job, ts);
      }
      sink = work.getSink();
      fetch = new FetchOperator(work, job, source, getVirtualColumns(source));
      source.initialize(conf, new ObjectInspector[] {fetch.getOutputObjectInspector()});
      totalRows = 0;
      ExecMapper.setDone(false);

    } catch (Exception e) {
      // Bail out ungracefully. We should never hit this here;
      // it would have been caught earlier, in the SemanticAnalyzer.
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
  }
Example #5
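Creates one FetchOperator per alias, each with a cloned JobConf that has column projections and filters pushed down, then initializes every forward operator with the matching fetch operator's output ObjectInspector.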
  private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap)
      throws HiveException {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
        work.getAliasToWork().entrySet()) {
      LOG.debug(
          "initializeOperators: "
              + entry.getKey()
              + ", children = "
              + entry.getValue().getChildOperators());
    }
    // create one fetch operator per alias; these are later used to initialize all the operators
    for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
      if (entry.getValue() == null) {
        continue;
      }
      JobConf jobClone = new JobConf(job);

      TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
      // push down projections
      ColumnProjectionUtils.appendReadColumns(
          jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns());
      // push down filters
      HiveInputFormat.pushFilters(jobClone, ts);

      // create a fetch operator
      FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
      fetchOpJobConfMap.put(fetchOp, jobClone);
      fetchOperators.put(entry.getKey(), fetchOp);
      l4j.info("fetchoperator for " + entry.getKey() + " created");
    }
    // initialize all forward operators
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
      // get the forward op
      String alias = entry.getKey();
      Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);

      // put the execution context into all the operators
      forwardOp.passExecContext(execContext);
      // All the operators need to be initialized before process
      FetchOperator fetchOp = entry.getValue();
      JobConf jobConf = fetchOpJobConfMap.get(fetchOp);

      if (jobConf == null) {
        jobConf = job;
      }
      // initialize the forward operator
      ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
      forwardOp.initialize(jobConf, new ObjectInspector[] {objectInspector});
      l4j.info("fetchoperator for " + entry.getKey() + " initialized");
    }
  }
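Example #6
A record reader's initialize method for Thrift records stored in an RCFile: it reads the stored ColumnarMetadata (see Example #1), maps Thrift field ids to stored columns, and registers the resulting column ids via ColumnProjectionUtils.setReadColumnIDs.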
    @Override
    @SuppressWarnings("unchecked")
    public void initialize(InputSplit split, TaskAttemptContext ctx)
        throws IOException, InterruptedException {
      // set up the columns that need to be read from the RCFile.

      tDesc = TStructDescriptor.getInstance(typeRef.getRawClass());
      thriftWritable = ThriftWritable.newInstance((Class<TBase<?, ?>>) typeRef.getRawClass());
      final List<Field> tFields = tDesc.getFields();

      FileSplit fsplit = (FileSplit) split;
      Path file = fsplit.getPath();

      LOG.info(
          String.format(
              "reading %s from %s:%d:%d",
              typeRef.getRawClass().getName(),
              file.toString(),
              fsplit.getStart(),
              fsplit.getStart() + fsplit.getLength()));

      ColumnarMetadata storedInfo = RCFileUtil.readMetadata(ctx.getConfiguration(), file);

      // list of field numbers
      List<Integer> tFieldIds =
          Lists.transform(
              tFields,
              new Function<Field, Integer>() {
                public Integer apply(Field fd) {
                  return Integer.valueOf(fd.getFieldId());
                }
              });

      columnsBeingRead =
          RCFileUtil.findColumnsToRead(ctx.getConfiguration(), tFieldIds, storedInfo);

      for (int idx : columnsBeingRead) {
        int fid = storedInfo.getFieldId(idx);
        if (fid >= 0) {
          knownRequiredFields.add(tFields.get(tFieldIds.indexOf(fid)));
        } else {
          readUnknownsColumn = true;
        }
      }

      ColumnProjectionUtils.setReadColumnIDs(ctx.getConfiguration(), columnsBeingRead);

      // finally!
      super.initialize(split, ctx);
    }
Example #7
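Restricts reading to columns 2 and 3 with ColumnProjectionUtils.setReadColumnIDs and asserts that the projected fields deserialize to the expected values.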
  private void partialReadTest(FileSystem fs, int count, Path file)
      throws IOException, SerDeException {
    LOG.debug("reading " + count + " records");
    long start = System.currentTimeMillis();
    java.util.ArrayList<Integer> readCols = new java.util.ArrayList<Integer>();
    readCols.add(Integer.valueOf(2));
    readCols.add(Integer.valueOf(3));
    ColumnProjectionUtils.setReadColumnIDs(conf, readCols);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();

    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      cols.resetValid(8);
      Object row = serDe.deserialize(cols);

      StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
      List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
      assertEquals("Field size should be 8", 8, fieldRefs.size());

      for (int i : readCols) {
        Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
        Object standardWritableData =
            ObjectInspectorUtils.copyToStandardObject(
                fieldData,
                fieldRefs.get(i).getFieldObjectInspector(),
                ObjectInspectorCopyOption.WRITABLE);
        assertEquals("Field " + i, standardWritableData, expectedPartitalFieldsData[i]);
      }

      assertEquals(
          "Class of the serialized object should be BytesRefArrayWritable",
          BytesRefArrayWritable.class,
          serDe.getSerializedClass());
      BytesRefArrayWritable serializedBytes = (BytesRefArrayWritable) serDe.serialize(row, oi);
      assertEquals("Serialized data", patialS, serializedBytes);
    }
    reader.close();
    long cost = System.currentTimeMillis() - start;
    LOG.debug("reading fully costs:" + cost + " milliseconds");
  }
Example #8
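The fully-read counterpart of Example #7: ColumnProjectionUtils.setFullyReadColumns marks every column for reading, and each deserialized field is checked against the expected data before the row is re-serialized.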
  public void fullyReadTest(FileSystem fs, int count, Path file)
      throws IOException, SerDeException {
    LOG.debug("reading " + count + " records");
    long start = System.currentTimeMillis();
    ColumnProjectionUtils.setFullyReadColumns(conf);
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);

    LongWritable rowID = new LongWritable();
    int actualRead = 0;
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(cols);
      cols.resetValid(8);
      Object row = serDe.deserialize(cols);

      StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
      List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
      assertEquals("Field size should be 8", 8, fieldRefs.size());
      for (int i = 0; i < fieldRefs.size(); i++) {
        Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
        Object standardWritableData =
            ObjectInspectorUtils.copyToStandardObject(
                fieldData,
                fieldRefs.get(i).getFieldObjectInspector(),
                ObjectInspectorCopyOption.WRITABLE);
        assertEquals("Field " + i, standardWritableData, expectedFieldsData[i]);
      }
      // Serialize
      assertEquals(
          "Class of the serialized object should be BytesRefArrayWritable",
          BytesRefArrayWritable.class,
          serDe.getSerializedClass());
      BytesRefArrayWritable serializedText = (BytesRefArrayWritable) serDe.serialize(row, oi);
      assertEquals("Serialized data", s, serializedText);
      actualRead++;
    }
    reader.close();
    assertEquals("Expect " + count + " rows, actual read " + actualRead, actualRead, count);
    long cost = System.currentTimeMillis() - start;
    LOG.debug("reading fully costs:" + cost + " milliseconds");
  }
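The projection setup shared by all of these examples reduces to a small pattern: clone the configuration, register the wanted column ids, and open the reader. Below is a minimal sketch of that pattern, assuming the same Hive ColumnProjectionUtils and RCFile classes used above; the class name ProjectionSketch and the column ids are illustrative placeholders.

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class ProjectionSketch {
  // Opens an RCFile reader that materializes only the given columns.
  // Cloning the conf keeps the caller's configuration untouched,
  // mirroring the confCopy/jobClone pattern in the examples above.
  public static RCFile.Reader openProjected(JobConf conf, Path file) throws IOException {
    JobConf job = new JobConf(conf);
    // Illustrative placeholder ids; substitute the columns your job needs.
    ColumnProjectionUtils.setReadColumnIDs(job, Arrays.asList(0, 3));
    return new RCFile.Reader(file.getFileSystem(job), file, job);
  }
}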