/** Job configuration. */
  public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Scan scan = new Scan();
    scan.addFamily(Cw09Constants.CF_FREQUENCIES_BYTES);
    scan.setBatch(Cw09Constants.CW09_INDEX_SCAN_BATCH);

    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");
    Job job = Job.getInstance(conf, "Count the total frequency of each term in the index table");
    job.setJarByClass(TermHitsCounter.class);
    // TableMapReduceUtil.initTableMapperJob(Constants.CLUEWEB09_INDEX_TABLE_NAME, scan,
    //		ThcMapper.class, Text.class, LongWritable.class, job);
    TableMapReduceUtil.initTableMapperJob(
        Cw09Constants.CLUEWEB09_INDEX_TABLE_NAME,
        scan,
        ThcMapper.class,
        Text.class,
        LongWritable.class,
        job,
        true,
        CustomizedSplitTableInputFormat.class);
    job.setCombinerClass(ThcCombiner.class);
    TableMapReduceUtil.initTableReducerJob(
        Cw09Constants.CLUEWEB09_TERM_COUNT_TABLE_NAME, ThcReducer.class, job);
    job.setNumReduceTasks(40);
    return job;
  }
  /** Job configuration. */
  public static Job configureJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String rowKeyType = args[3];
    conf.set("row.key.type", rowKeyType);
    conf.set("table.name", tableName);
    Scan scan = new Scan();
    scan.addFamily(Bytes.toBytes(columnFamily));
    scan.setBatch(ConstantsTruthy.TRUTHY_TABLE_SCAN_BATCH);

    conf.set("mapred.map.tasks.speculative.execution", "false");
    conf.set("mapred.reduce.tasks.speculative.execution", "false");
    Job job =
        Job.getInstance(
            conf, "Count the column count and indexRecordSize for each row in " + tableName);
    job.setJarByClass(TruthyIndexFeatureCounter.class);
    TableMapReduceUtil.initTableMapperJob(
        tableName, scan, TfcMapper.class, Text.class, Text.class, job, true);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    TableMapReduceUtil.addDependencyJars(job);
    return job;
  }
Beispiel #3
0
  public void testWideScanBatching() throws IOException {
    final int batch = 256;
    try {
      this.r = createNewHRegion(TESTTABLEDESC, null, null);
      int inserted = addWideContent(this.r);
      List<Cell> results = new ArrayList<Cell>();
      Scan scan = new Scan();
      scan.addFamily(A);
      scan.addFamily(B);
      scan.addFamily(C);
      scan.setMaxVersions(100);
      scan.setBatch(batch);
      InternalScanner s = r.getScanner(scan);
      int total = 0;
      int i = 0;
      boolean more;
      do {
        more = s.next(results);
        i++;
        LOG.info("iteration #" + i + ", results.size=" + results.size());

        // assert that the result set is no larger
        assertTrue(results.size() <= batch);

        total += results.size();

        if (results.size() > 0) {
          // assert that all results are from the same row
          byte[] row = CellUtil.cloneRow(results.get(0));
          for (Cell kv : results) {
            assertTrue(Bytes.equals(row, CellUtil.cloneRow(kv)));
          }
        }

        results.clear();

        // trigger ChangedReadersObservers
        Iterator<KeyValueScanner> scanners =
            ((HRegion.RegionScannerImpl) s).storeHeap.getHeap().iterator();
        while (scanners.hasNext()) {
          StoreScanner ss = (StoreScanner) scanners.next();
          ss.updateReaders();
        }
      } while (more);

      // assert that the scanner returned all values
      LOG.info("inserted " + inserted + ", scanned " + total);
      assertEquals(total, inserted);

      s.close();
    } finally {
      HRegion.closeHRegion(this.r);
    }
  }
  @SuppressWarnings({"rawtypes", "unchecked"})
  public void Read() throws IOException {
    List list = new ArrayList();
    Scan scan = new Scan();
    scan.setBatch(0);
    scan.setCaching(10000);
    scan.setMaxVersions();
    scan.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("total"));
    ResultScanner rsScanner = table.getScanner(scan);
    for (Result rs : rsScanner) {
      String date = Bytes.toString(rs.getRow());
      String total = Bytes.toString(rs.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("total")));
      list.add(date + "\t" + total);
    }

    for (int i = 0; i < 7; i++)
      System.out.println((String) list.get(i) + "\t" + (String) list.get(i + 7));
  }
Beispiel #5
0
  /**
   * Deletes the specified table with all its columns. ATTENTION: Invoking this method will delete
   * the table if it exists and therefore causes data loss.
   */
  @Override
  public void clearStorage() throws StorageException {
    HBaseAdmin adm = getAdminInterface();

    try { // first of all, check if table exists, if not - we are done
      if (!adm.tableExists(tableName)) {
        logger.debug("clearStorage() called before table {} was created, skipping.", tableName);
        return;
      }
    } catch (IOException e) {
      throw new TemporaryStorageException(e);
    }

    HTable table = null;

    try {
      table = new HTable(hconf, tableName);

      Scan scan = new Scan();
      scan.setBatch(100);
      scan.setCacheBlocks(false);
      scan.setCaching(2000);

      ResultScanner scanner = null;

      try {
        scanner = table.getScanner(scan);

        for (Result res : scanner) {
          table.delete(new Delete(res.getRow()));
        }
      } finally {
        IOUtils.closeQuietly(scanner);
      }
    } catch (IOException e) {
      throw new TemporaryStorageException(e);
    } finally {
      IOUtils.closeQuietly(table);
    }
  }
  @SuppressWarnings({"resource", "rawtypes", "unchecked"})
  public static void main(String[] args) throws IOException {
    Configuration config = HBaseConfiguration.create();
    config.set("hbase.master", "192.168.32.128:60000");
    config.set("hbase.zookeeper.property.clientPort", "2181");
    config.set("hbase.zookeeper.quorum", "192.168.32.128");
    HTable table = new HTable(config, Bytes.toBytes("weibo"));
    List list = new ArrayList();
    Scan scan = new Scan();
    scan.setBatch(0);
    scan.setCaching(10000);
    scan.setMaxVersions();
    scan.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("total"));
    ResultScanner rsScanner = table.getScanner(scan);
    for (Result rs : rsScanner) {
      String date = Bytes.toString(rs.getRow());
      String total = Bytes.toString(rs.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("total")));
      list.add(date + "\t" + total);
    }

    for (int i = 0; i < 7; i++)
      System.out.println((String) list.get(i) + "\t" + (String) list.get(i + 7));
  }
Beispiel #7
0
  /**
   * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
   * endRowKey. The more store files there are the more this will be off. Also, this does not take
   * into account any rows that are in the memstore.
   *
   * <p>The values computed here should be cached so that in high qps workloads the nn is not
   * overwhelmed. Could be done in load(); Synchronized to make sure that only one thread at a time
   * is using the htable.
   *
   * @param startRowKey First row key in the range
   * @param endRowKey Last row key in the range
   * @return The estimated number of rows in the regions between the row keys (first) and the
   *     estimated row size in bytes (second).
   */
  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
    Preconditions.checkNotNull(startRowKey);
    Preconditions.checkNotNull(endRowKey);

    long rowSize = 0;
    long rowCount = 0;
    long hdfsSize = 0;
    boolean isCompressed = false;

    try {
      // Check to see if things are compressed.
      // If they are we'll estimate a compression factor.
      if (columnFamilies_ == null) {
        columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
      }
      Preconditions.checkNotNull(columnFamilies_);
      for (HColumnDescriptor desc : columnFamilies_) {
        isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
      }

      // For every region in the range.
      List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
      for (HRegionLocation location : locations) {
        long currentHdfsSize = 0;
        long currentRowSize = 0;
        long currentRowCount = 0;

        HRegionInfo info = location.getRegionInfo();
        // Get the size on hdfs
        currentHdfsSize += getHdfsSize(info);

        Scan s = new Scan(info.getStartKey());
        // Get a small sample of rows
        s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
        // Try and get every version so the row's size can be used to estimate.
        s.setMaxVersions(Short.MAX_VALUE);
        // Don't cache the blocks as we don't think these are
        // necessarily important blocks.
        s.setCacheBlocks(false);
        // Try and get deletes too so their size can be counted.
        s.setRaw(true);
        ResultScanner rs = hTable_.getScanner(s);
        try {
          // And get the the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
          // for a representative sample
          for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
            Result r = rs.next();
            if (r == null) break;
            currentRowCount += 1;
            for (KeyValue kv : r.list()) {
              // some extra row size added to make up for shared overhead
              currentRowSize +=
                  kv.getRowLength() // row key
                      + 4 // row key length field
                      + kv.getFamilyLength() // Column family bytes
                      + 4 // family length field
                      + kv.getQualifierLength() // qualifier bytes
                      + 4 // qualifier length field
                      + kv.getValueLength() // length of the value
                      + 4 // value length field
                      + 10; // extra overhead for hfile index, checksums, metadata, etc
            }
          }
          // add these values to the cumulative totals in one shot just
          // in case there was an error in between getting the hdfs
          // size and the row/column sizes.
          hdfsSize += currentHdfsSize;
          rowCount += currentRowCount;
          rowSize += currentRowSize;
        } finally {
          rs.close();
        }
      }
    } catch (IOException ioe) {
      // Print the stack trace, but we'll ignore it
      // as this is just an estimate.
      // TODO: Put this into the per query log.
      LOG.error("Error computing HBase row count estimate", ioe);
    }

    // If there are no rows then no need to estimate.
    if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);

    // if something went wrong then set a signal value.
    if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

    // estimate the number of rows.
    double bytesPerRow = rowSize / (double) rowCount;
    long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));

    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }