  /**
   * Add data to a table (suitable for a fixed table whose column families are known in advance).
   *
   * @param rowKey row key
   * @param tableName table name
   * @param column1 column qualifiers for the first column family ("article")
   * @param value1 values for the first column family, aligned with column1 by index
   * @param column2 column qualifiers for the second column family ("author")
   * @param value2 values for the second column family, aligned with column2 by index
   */
  public static void addData(
      String rowKey,
      String tableName,
      String[] column1,
      String[] value1,
      String[] column2,
      String[] value2)
      throws IOException {
    Put put = new Put(Bytes.toBytes(rowKey)); // set the row key
    // get the table
    HTable table = new HTable(conf, Bytes.toBytes(tableName)); // HTable handles record-level operations such as put, get, delete, and scan
    HColumnDescriptor[] columnFamilies =
        table
            .getTableDescriptor()
            .getColumnFamilies(); // get all column families

    for (int i = 0; i < columnFamilies.length; i++) {
      String familyName = columnFamilies[i].getNameAsString(); // column family name
      if (familyName.equals("article")) { // put data into the "article" family
        for (int j = 0; j < column1.length; j++) {
          put.add(Bytes.toBytes(familyName), Bytes.toBytes(column1[j]), Bytes.toBytes(value1[j]));
        }
      }
      if (familyName.equals("author")) { // author列族put数据
        for (int j = 0; j < column2.length; j++) {
          put.add(Bytes.toBytes(familyName), Bytes.toBytes(column2[j]), Bytes.toBytes(value2[j]));
        }
      }
    }
    table.put(put);
    System.out.println("add data Success!");
  }
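  // A minimal usage sketch for the fixed-family addData above (not part of the original source).
  // It assumes a hypothetical table named "blog" that was created with exactly the "article" and
  // "author" families the method checks for; each column array must line up with its value array
  // by index.
  public static void addDataFixedFamiliesExample() throws IOException {
    String[] articleColumns = {"title", "content", "tags"};
    String[] articleValues = {"Head First HBase", "HBase is the Hadoop database", "hbase,nosql"};
    String[] authorColumns = {"name", "nickname"};
    String[] authorValues = {"nicholas", "lee"};
    // Writes one row with cells in both families using a single Put.
    addData("rowkey1", "blog", articleColumns, articleValues, authorColumns, authorValues);
  }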
  /**
   * Add a single cell to a row.
   *
   * @param tableName table name
   * @param rowkey row key
   * @param col column to write (family, qualifier, and value)
   */
  public void addColumn(String tableName, String rowkey, HbaseColumn col) {
    try {
      HTable table = new HTable(conf, Bytes.toBytes(tableName)); // HTable handles record-level operations such as put, get, delete, and scan
      HColumnDescriptor[] cfstmp = table.getTableDescriptor().getColumnFamilies(); // get all column families
      ArrayList<String> cfs = new ArrayList<>();
      for (HColumnDescriptor cf : cfstmp) {
        cfs.add(cf.getNameAsString()); // collect the column family names
      }

      Put put = new Put(Bytes.toBytes(rowkey)); // set the row key
      if (cfs.contains(col.cf)) {
        put.add(Bytes.toBytes(col.cf), Bytes.toBytes(col.col), Bytes.toBytes(col.value));
      }
      table.put(put);
    } catch (Exception e) {
      logger.error("addColumn failed", e);
    }
  }
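  // The HbaseColumn type used by addColumn above is not included in this snippet. Below is a
  // minimal sketch of what it might look like, inferred from the fields accessed in addColumn
  // (cf, col, value); the real class in the original project may differ.
  public static class HbaseColumn {
    public String cf;    // column family name
    public String col;   // column qualifier
    public String value; // cell value

    public HbaseColumn(String cf, String col, String value) {
      this.cf = cf;
      this.col = col;
      this.value = value;
    }
  }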
  /**
   * Add a single value to a row.
   *
   * @param tableName table name
   * @param rowKey row key
   * @param columnFamily column family name
   * @param column column qualifier
   * @param value cell value
   */
  public void addData(
      String tableName, String rowKey, String columnFamily, String column, String value) {
    try {
      // get the table
      HTable table = new HTable(conf, Bytes.toBytes(tableName)); // HTable handles record-level operations such as put, get, delete, and scan
      HColumnDescriptor[] cfs = table.getTableDescriptor().getColumnFamilies(); // get all column families
      Put put = new Put(Bytes.toBytes(rowKey)); // set the row key

      for (HColumnDescriptor cf : cfs) {
        String familyName = cf.getNameAsString(); // column family name
        if (!familyName.equals(columnFamily)) {
          continue;
        }
        // the family matches, so put the data
        put.add(Bytes.toBytes(familyName), Bytes.toBytes(column), Bytes.toBytes(value));
      }
      table.put(put);
    } catch (Exception e) {
      logger.error("addData failed", e);
    }
  }
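  // Hypothetical call sites for the two instance helpers above; the table name, row key, family,
  // qualifier, and value are illustrative placeholders. If the requested family does not exist on
  // the table, no cell is added to the Put.
  public void addSingleCellExample() {
    addData("blog", "rowkey1", "article", "title", "Head First HBase");
    addColumn("blog", "rowkey1", new HbaseColumn("author", "name", "nicholas"));
  }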
  private void runIncrementalPELoad(Configuration conf, HTable table, Path outDir)
      throws Exception {
    Job job = new Job(conf, "testLocalMRIncrementalLoad");
    job.setWorkingDirectory(util.getDataTestDirOnTestFS("runIncrementalPELoad"));
    job.getConfiguration()
        .setStrings(
            "io.serializations",
            conf.get("io.serializations"),
            MutationSerialization.class.getName(),
            ResultSerialization.class.getName(),
            KeyValueSerialization.class.getName());
    setupRandomGeneratorMapper(job);
    HFileOutputFormat2.configureIncrementalLoad(
        job, table.getTableDescriptor(), table.getRegionLocator());
    FileOutputFormat.setOutputPath(job, outDir);

    Assert.assertFalse(util.getTestFileSystem().exists(outDir));

    assertEquals(table.getRegionLocator().getAllRegionLocations().size(), job.getNumReduceTasks());

    assertTrue(job.waitForCompletion(true));
  }
  private void init() {
    logger.debug("Getting region locations");
    try {
      HTable table = new HTable(storagePluginConfig.getHBaseConf(), hbaseScanSpec.getTableName());
      this.hTableDesc = table.getTableDescriptor();
      NavigableMap<HRegionInfo, ServerName> regionsMap = table.getRegionLocations();
      statsCalculator =
          new TableStatsCalculator(table, hbaseScanSpec, storagePlugin.getContext().getConfig());

      boolean foundStartRegion = false;
      regionsToScan = new TreeMap<HRegionInfo, ServerName>();
      for (Entry<HRegionInfo, ServerName> mapEntry : regionsMap.entrySet()) {
        HRegionInfo regionInfo = mapEntry.getKey();
        if (!foundStartRegion
            && hbaseScanSpec.getStartRow() != null
            && hbaseScanSpec.getStartRow().length != 0
            && !regionInfo.containsRow(hbaseScanSpec.getStartRow())) {
          continue;
        }
        foundStartRegion = true;
        regionsToScan.put(regionInfo, mapEntry.getValue());
        scanSizeInBytes += statsCalculator.getRegionSizeInBytes(regionInfo.getRegionName());
        if (hbaseScanSpec.getStopRow() != null
            && hbaseScanSpec.getStopRow().length != 0
            && regionInfo.containsRow(hbaseScanSpec.getStopRow())) {
          break;
        }
      }

      table.close();
    } catch (IOException e) {
      throw new DrillRuntimeException(
          "Error getting region info for table: " + hbaseScanSpec.getTableName(), e);
    }
    verifyColumns();
  }
  /**
   * This test is to test the scenario happened in HBASE-6901. All files are bulk loaded and
   * excluded from minor compaction. Without the fix of HBASE-6901, an
   * ArrayIndexOutOfBoundsException will be thrown.
   */
  @Ignore("Flakey: See HBASE-9051")
  @Test
  public void testExcludeAllFromMinorCompaction() throws Exception {
    Configuration conf = util.getConfiguration();
    conf.setInt("hbase.hstore.compaction.min", 2);
    generateRandomStartKeys(5);

    util.startMiniCluster();
    try (Connection conn = ConnectionFactory.createConnection();
        Admin admin = conn.getAdmin()) {
      final FileSystem fs = util.getDFSCluster().getFileSystem();
      HTable table = util.createTable(TABLE_NAME, FAMILIES);
      assertEquals("Should start with empty table", 0, util.countRows(table));

      // deep inspection: get the StoreFile dir
      final Path storePath =
          new Path(
              FSUtils.getTableDir(FSUtils.getRootDir(conf), TABLE_NAME),
              new Path(
                  admin.getTableRegions(TABLE_NAME).get(0).getEncodedName(),
                  Bytes.toString(FAMILIES[0])));
      assertEquals(0, fs.listStatus(storePath).length);

      // Generate two bulk load files
      conf.setBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude", true);
      util.startMiniMapReduceCluster();

      for (int i = 0; i < 2; i++) {
        Path testDir = util.getDataTestDirOnTestFS("testExcludeAllFromMinorCompaction_" + i);
        runIncrementalPELoad(
            conf, table.getTableDescriptor(), conn.getRegionLocator(TABLE_NAME), testDir);
        // Perform the actual load
        new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table);
      }

      // Ensure data shows up
      int expectedRows = 2 * NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT;
      assertEquals(
          "LoadIncrementalHFiles should put expected data in table",
          expectedRows,
          util.countRows(table));

      // should have a second StoreFile now
      assertEquals(2, fs.listStatus(storePath).length);

      // minor compactions shouldn't get rid of the file
      admin.compact(TABLE_NAME);
      try {
        quickPoll(
            new Callable<Boolean>() {
              @Override
              public Boolean call() throws Exception {
                return fs.listStatus(storePath).length == 1;
              }
            },
            5000);
        throw new IOException("SF# = " + fs.listStatus(storePath).length);
      } catch (AssertionError ae) {
        // this is expected behavior
      }

      // a major compaction should work though
      admin.majorCompact(TABLE_NAME);
      quickPoll(
          new Callable<Boolean>() {
            @Override
            public Boolean call() throws Exception {
              return fs.listStatus(storePath).length == 1;
            }
          },
          5000);

    } finally {
      util.shutdownMiniMapReduceCluster();
      util.shutdownMiniCluster();
    }
  }
  private void doIncrementalLoadTest(boolean shouldChangeRegions) throws Exception {
    util = new HBaseTestingUtility();
    Configuration conf = util.getConfiguration();
    byte[][] splitKeys = generateRandomSplitKeys(4);
    util.startMiniCluster();
    try {
      HTable table = util.createTable(TABLE_NAME, FAMILIES, splitKeys);
      Admin admin = table.getConnection().getAdmin();
      Path testDir = util.getDataTestDirOnTestFS("testLocalMRIncrementalLoad");
      assertEquals("Should start with empty table", 0, util.countRows(table));
      int numRegions = -1;
      try (RegionLocator r = table.getRegionLocator()) {
        numRegions = r.getStartKeys().length;
      }
      assertEquals("Should make 5 regions", numRegions, 5);

      // Generate the bulk load files
      util.startMiniMapReduceCluster();
      runIncrementalPELoad(conf, table.getTableDescriptor(), table.getRegionLocator(), testDir);
      // This doesn't write into the table, just makes files
      assertEquals("HFOF should not touch actual table", 0, util.countRows(table));

      // Make sure that a directory was created for every CF
      int dir = 0;
      for (FileStatus f : testDir.getFileSystem(conf).listStatus(testDir)) {
        for (byte[] family : FAMILIES) {
          if (Bytes.toString(family).equals(f.getPath().getName())) {
            ++dir;
          }
        }
      }
      assertEquals("Column family not found in FS.", FAMILIES.length, dir);

      // handle the split case
      if (shouldChangeRegions) {
        LOG.info("Changing regions in table");
        admin.disableTable(table.getName());
        while (util.getMiniHBaseCluster()
            .getMaster()
            .getAssignmentManager()
            .getRegionStates()
            .isRegionsInTransition()) {
          Threads.sleep(200);
          LOG.info("Waiting on table to finish disabling");
        }
        util.deleteTable(table.getName());
        byte[][] newSplitKeys = generateRandomSplitKeys(14);
        table = util.createTable(TABLE_NAME, FAMILIES, newSplitKeys);

        while (table.getRegionLocator().getAllRegionLocations().size() != 15
            || !admin.isTableAvailable(table.getName())) {
          Thread.sleep(200);
          LOG.info("Waiting for new region assignment to happen");
        }
      }

      // Perform the actual load
      new LoadIncrementalHFiles(conf).doBulkLoad(testDir, table);

      // Ensure data shows up
      int expectedRows = NMapInputFormat.getNumMapTasks(conf) * ROWSPERSPLIT;
      assertEquals(
          "LoadIncrementalHFiles should put expected data in table",
          expectedRows,
          util.countRows(table));
      Scan scan = new Scan();
      ResultScanner results = table.getScanner(scan);
      for (Result res : results) {
        assertEquals(FAMILIES.length, res.rawCells().length);
        Cell first = res.rawCells()[0];
        for (Cell kv : res.rawCells()) {
          assertTrue(CellUtil.matchingRow(first, kv));
          assertTrue(Bytes.equals(CellUtil.cloneValue(first), CellUtil.cloneValue(kv)));
        }
      }
      results.close();
      String tableDigestBefore = util.checksumRows(table);

      // Cause regions to reopen
      admin.disableTable(TABLE_NAME);
      while (!admin.isTableDisabled(TABLE_NAME)) {
        Thread.sleep(200);
        LOG.info("Waiting for table to disable");
      }
      admin.enableTable(TABLE_NAME);
      util.waitTableAvailable(TABLE_NAME);
      assertEquals(
          "Data should remain after reopening of regions",
          tableDigestBefore,
          util.checksumRows(table));
    } finally {
      util.shutdownMiniMapReduceCluster();
      util.shutdownMiniCluster();
    }
  }
  /**
   * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
   * endRowKey. The more store files there are, the more this estimate will be off. It also does
   * not take into account any rows that are in the memstore.
   *
   * <p>The values computed here should be cached so that in high-qps workloads the NameNode is
   * not overwhelmed; this could be done in load(). The method is synchronized to make sure that
   * only one thread at a time is using the htable.
   *
   * @param startRowKey First row key in the range
   * @param endRowKey Last row key in the range
   * @return The estimated number of rows in the regions between the row keys (first) and the
   *     estimated row size in bytes (second).
   */
  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
    Preconditions.checkNotNull(startRowKey);
    Preconditions.checkNotNull(endRowKey);

    long rowSize = 0;
    long rowCount = 0;
    long hdfsSize = 0;
    boolean isCompressed = false;

    try {
      // Check to see if things are compressed.
      // If they are we'll estimate a compression factor.
      if (columnFamilies_ == null) {
        columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
      }
      Preconditions.checkNotNull(columnFamilies_);
      for (HColumnDescriptor desc : columnFamilies_) {
        isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
      }

      // For every region in the range.
      List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
      for (HRegionLocation location : locations) {
        long currentHdfsSize = 0;
        long currentRowSize = 0;
        long currentRowCount = 0;

        HRegionInfo info = location.getRegionInfo();
        // Get the size on hdfs
        currentHdfsSize += getHdfsSize(info);

        Scan s = new Scan(info.getStartKey());
        // Get a small sample of rows
        s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
        // Try and get every version so the row's size can be used to estimate.
        s.setMaxVersions(Short.MAX_VALUE);
        // Don't cache the blocks as we don't think these are
        // necessarily important blocks.
        s.setCacheBlocks(false);
        // Try and get deletes too so their size can be counted.
        s.setRaw(true);
        ResultScanner rs = hTable_.getScanner(s);
        try {
          // Fetch up to ROW_COUNT_ESTIMATE_BATCH_SIZE rows as a representative sample.
          for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
            Result r = rs.next();
            if (r == null) break;
            currentRowCount += 1;
            for (KeyValue kv : r.list()) {
              // some extra row size added to make up for shared overhead
              currentRowSize +=
                  kv.getRowLength() // row key
                      + 4 // row key length field
                      + kv.getFamilyLength() // Column family bytes
                      + 4 // family length field
                      + kv.getQualifierLength() // qualifier bytes
                      + 4 // qualifier length field
                      + kv.getValueLength() // length of the value
                      + 4 // value length field
                      + 10; // extra overhead for hfile index, checksums, metadata, etc
            }
          }
          // add these values to the cumulative totals in one shot just
          // in case there was an error in between getting the hdfs
          // size and the row/column sizes.
          hdfsSize += currentHdfsSize;
          rowCount += currentRowCount;
          rowSize += currentRowSize;
        } finally {
          rs.close();
        }
      }
    } catch (IOException ioe) {
      // Log the error, but otherwise ignore it as this is just an estimate.
      // TODO: Put this into the per query log.
      LOG.error("Error computing HBase row count estimate", ioe);
    }

    // If there are no rows then no need to estimate.
    if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);

    // if something went wrong then set a signal value.
    if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

    // estimate the number of rows.
    double bytesPerRow = rowSize / (double) rowCount;
    long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));

    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }
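  // A small, self-contained illustration of the arithmetic used above, with made-up sample
  // numbers (1000 sampled rows totalling 256000 bytes, 1 GiB of store files, no compression);
  // it is not part of the original class.
  public static Pair<Long, Long> estimatedRowStatsExample() {
    long rowSize = 256_000L;        // bytes observed in the sampled rows
    long rowCount = 1_000L;         // rows returned by the sample scan
    long hdfsSize = 1_073_741_824L; // total store-file size on HDFS (1 GiB)
    boolean isCompressed = false;

    double bytesPerRow = rowSize / (double) rowCount; // 256.0
    // Double the count when compressed, since the on-disk size understates the in-memory row size.
    long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow)); // 4194304
    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }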