示例#1
0
 /**
  * This utility method creates a new Thrift ColumnDescriptor "struct" based on an Hbase
  * HColumnDescriptor object.
  *
  * @param in Hbase HColumnDescriptor object
  * @return Thrift ColumnDescriptor
  */
 public static ColumnDescriptor colDescFromHbase(HColumnDescriptor in) {
   ColumnDescriptor col = new ColumnDescriptor();
   col.name = ByteBuffer.wrap(Bytes.add(in.getName(), KeyValue.COLUMN_FAMILY_DELIM_ARRAY));
   col.maxVersions = in.getMaxVersions();
   col.compression = in.getCompression().toString();
   col.inMemory = in.isInMemory();
   col.blockCacheEnabled = in.isBlockCacheEnabled();
   col.bloomFilterType = in.getBloomFilterType().toString();
   return col;
 }
示例#2
0
  /**
   * Test that {@link HFileOutputFormat2} RecordWriter uses compression and bloom filter settings
   * from the column family descriptor
   */
  @Test
  public void testColumnFamilySettings() throws Exception {
    Configuration conf = new Configuration(this.util.getConfiguration());
    RecordWriter<ImmutableBytesWritable, Cell> writer = null;
    TaskAttemptContext context = null;
    Path dir = util.getDataTestDir("testColumnFamilySettings");

    // Setup table descriptor
    Table table = Mockito.mock(Table.class);
    RegionLocator regionLocator = Mockito.mock(RegionLocator.class);
    HTableDescriptor htd = new HTableDescriptor(TABLE_NAME);
    Mockito.doReturn(htd).when(table).getTableDescriptor();
    for (HColumnDescriptor hcd : HBaseTestingUtility.generateColumnDescriptors()) {
      htd.addFamily(hcd);
    }

    // set up the table to return some mock keys
    setupMockStartKeys(regionLocator);

    try {
      // partial map red setup to get an operational writer for testing
      // We turn off the sequence file compression, because DefaultCodec
      // pollutes the GZip codec pool with an incompatible compressor.
      conf.set("io.seqfile.compression.type", "NONE");
      Job job = new Job(conf, "testLocalMRIncrementalLoad");
      job.setWorkingDirectory(util.getDataTestDirOnTestFS("testColumnFamilySettings"));
      setupRandomGeneratorMapper(job);
      HFileOutputFormat2.configureIncrementalLoad(job, table.getTableDescriptor(), regionLocator);
      FileOutputFormat.setOutputPath(job, dir);
      context = createTestTaskAttemptContext(job);
      HFileOutputFormat2 hof = new HFileOutputFormat2();
      writer = hof.getRecordWriter(context);

      // write out random rows
      writeRandomKeyValues(writer, context, htd.getFamiliesKeys(), ROWSPERSPLIT);
      writer.close(context);

      // Make sure that a directory was created for every CF
      FileSystem fs = dir.getFileSystem(conf);

      // commit so that the filesystem has one directory per column family
      hof.getOutputCommitter(context).commitTask(context);
      hof.getOutputCommitter(context).commitJob(context);
      FileStatus[] families = FSUtils.listStatus(fs, dir, new FSUtils.FamilyDirFilter(fs));
      assertEquals(htd.getFamilies().size(), families.length);
      for (FileStatus f : families) {
        String familyStr = f.getPath().getName();
        HColumnDescriptor hcd = htd.getFamily(Bytes.toBytes(familyStr));
        // verify that the compression on this file matches the configured
        // compression
        Path dataFilePath = fs.listStatus(f.getPath())[0].getPath();
        Reader reader = HFile.createReader(fs, dataFilePath, new CacheConfig(conf), conf);
        Map<byte[], byte[]> fileInfo = reader.loadFileInfo();

        byte[] bloomFilter = fileInfo.get(StoreFile.BLOOM_FILTER_TYPE_KEY);
        if (bloomFilter == null) bloomFilter = Bytes.toBytes("NONE");
        assertEquals(
            "Incorrect bloom filter used for column family "
                + familyStr
                + "(reader: "
                + reader
                + ")",
            hcd.getBloomFilterType(),
            BloomType.valueOf(Bytes.toString(bloomFilter)));
        assertEquals(
            "Incorrect compression used for column family "
                + familyStr
                + "(reader: "
                + reader
                + ")",
            hcd.getCompression(),
            reader.getFileContext().getCompression());
      }
    } finally {
      dir.getFileSystem(conf).delete(dir, true);
    }
  }
示例#3
0
  /**
   * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
   * endRowKey. The more store files there are the more this will be off. Also, this does not take
   * into account any rows that are in the memstore.
   *
   * <p>The values computed here should be cached so that in high qps workloads the nn is not
   * overwhelmed. Could be done in load(); Synchronized to make sure that only one thread at a time
   * is using the htable.
   *
   * @param startRowKey First row key in the range
   * @param endRowKey Last row key in the range
   * @return The estimated number of rows in the regions between the row keys (first) and the
   *     estimated row size in bytes (second).
   */
  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
    Preconditions.checkNotNull(startRowKey);
    Preconditions.checkNotNull(endRowKey);

    long rowSize = 0;
    long rowCount = 0;
    long hdfsSize = 0;
    boolean isCompressed = false;

    try {
      // Check to see if things are compressed.
      // If they are we'll estimate a compression factor.
      if (columnFamilies_ == null) {
        columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
      }
      Preconditions.checkNotNull(columnFamilies_);
      for (HColumnDescriptor desc : columnFamilies_) {
        isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
      }

      // For every region in the range.
      List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
      for (HRegionLocation location : locations) {
        long currentHdfsSize = 0;
        long currentRowSize = 0;
        long currentRowCount = 0;

        HRegionInfo info = location.getRegionInfo();
        // Get the size on hdfs
        currentHdfsSize += getHdfsSize(info);

        Scan s = new Scan(info.getStartKey());
        // Get a small sample of rows
        s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
        // Try and get every version so the row's size can be used to estimate.
        s.setMaxVersions(Short.MAX_VALUE);
        // Don't cache the blocks as we don't think these are
        // necessarily important blocks.
        s.setCacheBlocks(false);
        // Try and get deletes too so their size can be counted.
        s.setRaw(true);
        ResultScanner rs = hTable_.getScanner(s);
        try {
          // And get the the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
          // for a representative sample
          for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
            Result r = rs.next();
            if (r == null) break;
            currentRowCount += 1;
            for (KeyValue kv : r.list()) {
              // some extra row size added to make up for shared overhead
              currentRowSize +=
                  kv.getRowLength() // row key
                      + 4 // row key length field
                      + kv.getFamilyLength() // Column family bytes
                      + 4 // family length field
                      + kv.getQualifierLength() // qualifier bytes
                      + 4 // qualifier length field
                      + kv.getValueLength() // length of the value
                      + 4 // value length field
                      + 10; // extra overhead for hfile index, checksums, metadata, etc
            }
          }
          // add these values to the cumulative totals in one shot just
          // in case there was an error in between getting the hdfs
          // size and the row/column sizes.
          hdfsSize += currentHdfsSize;
          rowCount += currentRowCount;
          rowSize += currentRowSize;
        } finally {
          rs.close();
        }
      }
    } catch (IOException ioe) {
      // Print the stack trace, but we'll ignore it
      // as this is just an estimate.
      // TODO: Put this into the per query log.
      LOG.error("Error computing HBase row count estimate", ioe);
    }

    // If there are no rows then no need to estimate.
    if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);

    // if something went wrong then set a signal value.
    if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

    // estimate the number of rows.
    double bytesPerRow = rowSize / (double) rowCount;
    long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));

    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }