/** Job configuration. */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  Scan scan = new Scan();
  scan.addFamily(Cw09Constants.CF_FREQUENCIES_BYTES);
  scan.setBatch(Cw09Constants.CW09_INDEX_SCAN_BATCH);
  conf.set("mapred.map.tasks.speculative.execution", "false");
  conf.set("mapred.reduce.tasks.speculative.execution", "false");
  Job job = Job.getInstance(conf, "Count the total frequency of each term in the index table");
  job.setJarByClass(TermHitsCounter.class);
  // TableMapReduceUtil.initTableMapperJob(Constants.CLUEWEB09_INDEX_TABLE_NAME, scan,
  //     ThcMapper.class, Text.class, LongWritable.class, job);
  TableMapReduceUtil.initTableMapperJob(Cw09Constants.CLUEWEB09_INDEX_TABLE_NAME, scan,
      ThcMapper.class, Text.class, LongWritable.class, job, true,
      CustomizedSplitTableInputFormat.class);
  job.setCombinerClass(ThcCombiner.class);
  TableMapReduceUtil.initTableReducerJob(Cw09Constants.CLUEWEB09_TERM_COUNT_TABLE_NAME,
      ThcReducer.class, job);
  job.setNumReduceTasks(40);
  return job;
}
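A helper like this is normally wrapped in a small driver that builds a Configuration, calls configureJob, and blocks on completion. The sketch below is an assumption rather than part of the original source; it presumes configureJob is declared on TermHitsCounter, which the setJarByClass call above suggests.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.mapreduce.Job;

public class TermHitsCounterDriver {
  public static void main(String[] args) throws Exception {
    // Pull in hbase-site.xml / core-site.xml settings from the classpath.
    Configuration conf = HBaseConfiguration.create();
    // Assumption: configureJob is the static helper shown above on TermHitsCounter.
    Job job = TermHitsCounter.configureJob(conf, args);
    // Submit the job, wait for it to finish, and exit non-zero on failure.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}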
/** Job configuration. */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  String tableName = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String rowKeyType = args[3];
  conf.set("row.key.type", rowKeyType);
  conf.set("table.name", tableName);
  Scan scan = new Scan();
  scan.addFamily(Bytes.toBytes(columnFamily));
  scan.setBatch(ConstantsTruthy.TRUTHY_TABLE_SCAN_BATCH);
  conf.set("mapred.map.tasks.speculative.execution", "false");
  conf.set("mapred.reduce.tasks.speculative.execution", "false");
  Job job = Job.getInstance(conf,
      "Count the column count and indexRecordSize for each row in " + tableName);
  job.setJarByClass(TruthyIndexFeatureCounter.class);
  TableMapReduceUtil.initTableMapperJob(tableName, scan, TfcMapper.class, Text.class,
      Text.class, job, true);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  TableMapReduceUtil.addDependencyJars(job);
  return job;
}
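Because the mapper is registered with initTableMapperJob(tableName, scan, TfcMapper.class, Text.class, Text.class, ...) and the job runs with zero reducers, TfcMapper must extend TableMapper<Text, Text>, and its output is written directly to the FileOutputFormat path. The skeleton below only sketches that contract; the real counting logic of TfcMapper is not shown in the source, and the method body here is a hypothetical placeholder.

import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;

public class TfcMapperSkeleton extends TableMapper<Text, Text> {
  @Override
  protected void map(ImmutableBytesWritable rowKey, Result columns, Context context)
      throws IOException, InterruptedException {
    // Hypothetical body: emit the row key and the number of cells seen for it.
    // The real TfcMapper also derives indexRecordSize, which is not reproduced here.
    String row = Bytes.toString(rowKey.get(), rowKey.getOffset(), rowKey.getLength());
    context.write(new Text(row), new Text(String.valueOf(columns.size())));
  }
}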
public void testWideScanBatching() throws IOException {
  final int batch = 256;
  try {
    this.r = createNewHRegion(TESTTABLEDESC, null, null);
    int inserted = addWideContent(this.r);
    List<Cell> results = new ArrayList<Cell>();
    Scan scan = new Scan();
    scan.addFamily(A);
    scan.addFamily(B);
    scan.addFamily(C);
    scan.setMaxVersions(100);
    scan.setBatch(batch);
    InternalScanner s = r.getScanner(scan);
    int total = 0;
    int i = 0;
    boolean more;
    do {
      more = s.next(results);
      i++;
      LOG.info("iteration #" + i + ", results.size=" + results.size());
      // assert that the result set is no larger than the batch size
      assertTrue(results.size() <= batch);
      total += results.size();
      if (results.size() > 0) {
        // assert that all results are from the same row
        byte[] row = CellUtil.cloneRow(results.get(0));
        for (Cell kv : results) {
          assertTrue(Bytes.equals(row, CellUtil.cloneRow(kv)));
        }
      }
      results.clear();
      // trigger ChangedReadersObservers
      Iterator<KeyValueScanner> scanners =
          ((HRegion.RegionScannerImpl) s).storeHeap.getHeap().iterator();
      while (scanners.hasNext()) {
        StoreScanner ss = (StoreScanner) scanners.next();
        ss.updateReaders();
      }
    } while (more);
    // assert that the scanner returned all values
    LOG.info("inserted " + inserted + ", scanned " + total);
    assertEquals(total, inserted);
    s.close();
  } finally {
    HRegion.closeHRegion(this.r);
  }
}
@SuppressWarnings({"rawtypes", "unchecked"}) public void Read() throws IOException { List list = new ArrayList(); Scan scan = new Scan(); scan.setBatch(0); scan.setCaching(10000); scan.setMaxVersions(); scan.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("total")); ResultScanner rsScanner = table.getScanner(scan); for (Result rs : rsScanner) { String date = Bytes.toString(rs.getRow()); String total = Bytes.toString(rs.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("total"))); list.add(date + "\t" + total); } for (int i = 0; i < 7; i++) System.out.println((String) list.get(i) + "\t" + (String) list.get(i + 7)); }
/**
 * Deletes all data stored in the table by removing every row. ATTENTION: invoking this method
 * wipes the table's contents if it exists and therefore causes data loss.
 */
@Override
public void clearStorage() throws StorageException {
  HBaseAdmin adm = getAdminInterface();
  try {
    // first of all, check if the table exists; if not, we are done
    if (!adm.tableExists(tableName)) {
      logger.debug("clearStorage() called before table {} was created, skipping.", tableName);
      return;
    }
  } catch (IOException e) {
    throw new TemporaryStorageException(e);
  }
  HTable table = null;
  try {
    table = new HTable(hconf, tableName);
    Scan scan = new Scan();
    scan.setBatch(100);
    scan.setCacheBlocks(false);
    scan.setCaching(2000);
    ResultScanner scanner = null;
    try {
      scanner = table.getScanner(scan);
      for (Result res : scanner) {
        table.delete(new Delete(res.getRow()));
      }
    } finally {
      IOUtils.closeQuietly(scanner);
    }
  } catch (IOException e) {
    throw new TemporaryStorageException(e);
  } finally {
    IOUtils.closeQuietly(table);
  }
}
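The loop above issues one Delete RPC per row. A minimal variant, sketched below and not taken from the original class, buffers the deletes and flushes them through the list-based HTable.delete(List<Delete>) overload to reduce round trips; the flush threshold of 1000 is an arbitrary choice.

// Sketch only: clear all rows with batched deletes, assuming the same HTable and Scan
// setup as in clearStorage() above.
private void deleteAllRowsInBatches(HTable table, Scan scan) throws IOException {
  List<Delete> pending = new ArrayList<Delete>();
  ResultScanner scanner = table.getScanner(scan);
  try {
    for (Result res : scanner) {
      pending.add(new Delete(res.getRow()));
      if (pending.size() >= 1000) { // hypothetical flush threshold
        table.delete(pending);      // the client removes successful deletes from the list
        pending.clear();
      }
    }
    if (!pending.isEmpty()) {
      table.delete(pending);
    }
  } finally {
    scanner.close();
  }
}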
@SuppressWarnings({"resource", "rawtypes", "unchecked"}) public static void main(String[] args) throws IOException { Configuration config = HBaseConfiguration.create(); config.set("hbase.master", "192.168.32.128:60000"); config.set("hbase.zookeeper.property.clientPort", "2181"); config.set("hbase.zookeeper.quorum", "192.168.32.128"); HTable table = new HTable(config, Bytes.toBytes("weibo")); List list = new ArrayList(); Scan scan = new Scan(); scan.setBatch(0); scan.setCaching(10000); scan.setMaxVersions(); scan.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("total")); ResultScanner rsScanner = table.getScanner(scan); for (Result rs : rsScanner) { String date = Bytes.toString(rs.getRow()); String total = Bytes.toString(rs.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("total"))); list.add(date + "\t" + total); } for (int i = 0; i < 7; i++) System.out.println((String) list.get(i) + "\t" + (String) list.get(i + 7)); }
/**
 * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
 * endRowKey. The more store files there are, the more this will be off. Also, this does not take
 * into account any rows that are in the memstore.
 *
 * <p>The values computed here should be cached so that in high qps workloads the nn is not
 * overwhelmed. Could be done in load(). Synchronized to make sure that only one thread at a time
 * is using the htable.
 *
 * @param startRowKey First row key in the range
 * @param endRowKey Last row key in the range
 * @return The estimated number of rows in the regions between the row keys (first) and the
 *     estimated row size in bytes (second).
 */
public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
  Preconditions.checkNotNull(startRowKey);
  Preconditions.checkNotNull(endRowKey);

  long rowSize = 0;
  long rowCount = 0;
  long hdfsSize = 0;
  boolean isCompressed = false;

  try {
    // Check to see if things are compressed.
    // If they are we'll estimate a compression factor.
    if (columnFamilies_ == null) {
      columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
    }
    Preconditions.checkNotNull(columnFamilies_);
    for (HColumnDescriptor desc : columnFamilies_) {
      isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
    }

    // For every region in the range.
    List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
    for (HRegionLocation location : locations) {
      long currentHdfsSize = 0;
      long currentRowSize = 0;
      long currentRowCount = 0;

      HRegionInfo info = location.getRegionInfo();
      // Get the size on hdfs
      currentHdfsSize += getHdfsSize(info);

      Scan s = new Scan(info.getStartKey());
      // Get a small sample of rows
      s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
      // Try and get every version so the row's size can be used to estimate.
      s.setMaxVersions(Short.MAX_VALUE);
      // Don't cache the blocks as we don't think these are
      // necessarily important blocks.
      s.setCacheBlocks(false);
      // Try and get deletes too so their size can be counted.
      s.setRaw(true);
      ResultScanner rs = hTable_.getScanner(s);
      try {
        // And get the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
        // for a representative sample.
        for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
          Result r = rs.next();
          if (r == null) break;
          currentRowCount += 1;
          for (KeyValue kv : r.list()) {
            // some extra row size added to make up for shared overhead
            currentRowSize += kv.getRowLength() // row key
                + 4 // row key length field
                + kv.getFamilyLength() // column family bytes
                + 4 // family length field
                + kv.getQualifierLength() // qualifier bytes
                + 4 // qualifier length field
                + kv.getValueLength() // length of the value
                + 4 // value length field
                + 10; // extra overhead for hfile index, checksums, metadata, etc.
          }
        }
        // Add these values to the cumulative totals in one shot just
        // in case there was an error in between getting the hdfs
        // size and the row/column sizes.
        hdfsSize += currentHdfsSize;
        rowCount += currentRowCount;
        rowSize += currentRowSize;
      } finally {
        rs.close();
      }
    }
  } catch (IOException ioe) {
    // Print the stack trace, but we'll ignore it
    // as this is just an estimate.
    // TODO: Put this into the per query log.
    LOG.error("Error computing HBase row count estimate", ioe);
  }

  // If there are no rows then there is no need to estimate.
  if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);
  // If something went wrong then set a signal value.
  if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

  // Estimate the number of rows.
  double bytesPerRow = rowSize / (double) rowCount;
  long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));
  return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
}
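To make the final step concrete with hypothetical numbers (not taken from the source): if the sampled rows total rowSize = 5,000,000 bytes over rowCount = 1,024 rows, then bytesPerRow is about 4,883. With hdfsSize = 10 GiB (10,737,418,240 bytes) and compression detected on at least one column family, the method returns roughly 2 * (10,737,418,240 / 4,883), about 4.4 million rows, paired with the ~4,883-byte average row size.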