public Scan generateScan(
    String[] rowRange, FilterList filterList, String[] family, String[] columns, int maxVersion)
    throws Exception {
  if (table == null) throw new Exception("No table handler");
  if (cacheSize < 0) throw new Exception("should set cache size before scanning");

  Scan scan = null;
  try {
    scan = new Scan();
    scan.setCaching(this.cacheSize);
    scan.setCacheBlocks(this.blockCached);
    scan.setFilter(filterList);
    if (maxVersion > 0) scan.setMaxVersions(maxVersion);

    if (rowRange != null) {
      scan.setStartRow(rowRange[0].getBytes());
      if (rowRange.length == 2) scan.setStopRow(rowRange[1].getBytes());
    }

    if (columns != null) {
      for (int i = 0; i < columns.length; i++) {
        scan.addColumn(family[0].getBytes(), columns[i].getBytes());
        // System.out.println(family[i]+";"+columns[i]);
      }
    } else {
      scan.addFamily(family[0].getBytes());
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return scan;
}
/**
 * Get a collection of Scans, one per region, that cover the range of the table having the given
 * key prefix. These will be used as the map task input splits.
 */
public static List<Scan> scansThisCubeOnly(byte[] keyPrefix, byte[][] splitKeys)
    throws IOException {
  Scan copyScan = new Scan();
  copyScan.setCaching(5000);
  copyScan.setCacheBlocks(false);

  // Hack: generate a key that probably comes after all this cube's keys but doesn't include any
  // keys not belonging to this cube.
  byte[] keyAfterCube = ArrayUtils.addAll(keyPrefix, fiftyBytesFF);

  List<Scan> scans = new ArrayList<Scan>();
  Scan scanUnderConstruction = new Scan(copyScan);

  for (byte[] splitKey : splitKeys) {
    scanUnderConstruction.setStopRow(splitKey);

    // Coerce scan to only touch keys belonging to this cube
    Scan truncated = truncateScan(scanUnderConstruction, keyPrefix, keyAfterCube);
    if (truncated != null) {
      scans.add(truncated);
    }

    scanUnderConstruction = new Scan(copyScan);
    scanUnderConstruction.setStartRow(splitKey);
  }

  // There's another region from the last split key to the end of the table.
  Scan truncated = truncateScan(scanUnderConstruction, keyPrefix, keyAfterCube);
  if (truncated != null) {
    scans.add(truncated);
  }
  return scans;
}
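The snippet above relies on a truncateScan helper (and a fiftyBytesFF constant) that is not shown here. The following is a minimal sketch, assuming the helper simply clamps each per-region scan to the cube's key range [keyPrefix, keyAfterCube) and returns null for regions that hold none of the cube's keys; the project's actual implementation may differ. It uses org.apache.hadoop.hbase.util.Bytes.

// Sketch only: clamp a scan's row range to [keyPrefix, keyAfterCube).
private static Scan truncateScan(Scan scan, byte[] keyPrefix, byte[] keyAfterCube)
    throws IOException {
  byte[] start = scan.getStartRow();
  byte[] stop = scan.getStopRow();

  // Clamp the start row up to the cube's first possible key.
  if (start.length == 0 || Bytes.compareTo(start, keyPrefix) < 0) {
    start = keyPrefix;
  }
  // Clamp the stop row down to the key just past the cube's last possible key.
  if (stop.length == 0 || Bytes.compareTo(stop, keyAfterCube) > 0) {
    stop = keyAfterCube;
  }
  // An empty or inverted range means this region contains none of the cube's keys.
  if (Bytes.compareTo(start, stop) >= 0) {
    return null;
  }

  Scan truncated = new Scan(scan); // keep the caching/cacheBlocks settings
  truncated.setStartRow(start);
  truncated.setStopRow(stop);
  return truncated;
}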
public static void readWriteHtable() throws Exception {
  Job job = new Job(conf, "ExampleReadWrite");
  job.setJarByClass(HbaseMR.class); // class that contains mapper

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  // set other scan attrs

  TableMapReduceUtil.initTableMapperJob(
      "sourceTable", // input table
      scan, // Scan instance to control CF and attribute selection
      MyMapper2.class, // mapper class
      null, // mapper output key
      null, // mapper output value
      job);
  TableMapReduceUtil.initTableReducerJob(
      "targetTable", // output table
      null, // reducer class
      job);
  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
@Test
public void shouldRunMapReduce() throws Exception {
  // given
  Job job = new Job(configuration, "Average Rating");
  job.setJarByClass(AverageRatingMapper.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  scan.addFamily(Bytes.toBytes(LoadMovieRatingData.FAMILY_NAME));

  TableMapReduceUtil.initTableMapperJob(
      LoadMovieRatingData.TABLE_NAME,
      scan,
      AverageRatingMapper.class,
      Text.class,
      DoubleWritable.class,
      job);
  job.setReducerClass(RatingExportReducer.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(
      job, new Path("/tmp/mr/mySummaryFile_" + System.currentTimeMillis()));

  // when
  boolean succeeded = job.waitForCompletion(true);

  // then
  assertThat(succeeded).isTrue();
}
public static void main(String[] args) throws Exception {
  Configuration con = new Configuration();
  String[] otherArgs = new GenericOptionsParser(con, args).getRemainingArgs();
  HBaseConfiguration conf = new HBaseConfiguration();
  Job job = new Job(conf, "AverageCalc");
  job.setJarByClass(AverageCalculator.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  scan.addFamily(Bytes.toBytes("Post"));

  FilterList li = new FilterList(FilterList.Operator.MUST_PASS_ALL);
  SingleColumnValueFilter filter =
      new SingleColumnValueFilter(
          Bytes.toBytes("Post"), Bytes.toBytes("PostTypeId"), CompareOp.EQUAL, Bytes.toBytes("1"));
  li.addFilter(filter);
  scan.setFilter(li);

  FileOutputFormat.setOutputPath(job, new Path(otherArgs[0]));
  job.setOutputKeyClass(Text.class);
  TableMapReduceUtil.initTableMapperJob(
      "bigd24-hbase-sample", scan, Mapper1.class, Text.class, IntWritable.class, job);
  job.setReducerClass(Reducer1.class);
  job.setOutputValueClass(FloatWritable.class);
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
  new JobConf().setSpeculativeExecution(false);
  Configuration conf = new Configuration();
  conf.set("es.nodes", ES_NODES);
  conf.set("es.resource", ES_RESOURCE);
  conf.set("es.mapping.id", HBaseTableMapper.ID_FIELD.toString());
  conf.set("es.batch.size.bytes", "10mb");
  conf.set("es.batch.size.entries", "10000");
  conf.set("es.batch.write.refresh", "false");

  Job job = new Job(conf);
  job.setJarByClass(BulkIndex.class);
  job.setMapperClass(HBaseTableMapper.class);
  job.setNumReduceTasks(0);
  job.setSpeculativeExecution(false);
  job.setOutputFormatClass(BulkProcessorOutputFormat.class);
  job.setMapOutputValueClass(Text.class);

  Scan scan = new Scan();
  scan.setCaching(1000);
  scan.setCacheBlocks(false);

  TableMapReduceUtil.initTableMapperJob(
      BulkLoad.HBASE_TABLE_NAME,
      scan,
      HBaseTableMapper.class,
      NullWritable.class,
      MapWritable.class,
      job);

  job.waitForCompletion(true);
}
public static void htableFile() throws Exception {
  Job job = new Job(conf, "ExampleSummaryToFile");
  job.setJarByClass(HbaseMR.class); // class that contains mapper

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  // set other scan attrs

  TableMapReduceUtil.initTableMapperJob(
      "sourceTable", // input table
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper class
      Text.class, // mapper output key
      IntWritable.class, // mapper output value
      job);
  job.setReducerClass(MyReducer4.class); // reducer class
  job.setNumReduceTasks(1); // at least one, adjust as required
  // set the output path on the job being submitted; adjust directories as required
  FileOutputFormat.setOutputPath(job, new Path("/tmp/mr/mySummaryFile"));

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
/**
 * Test # of blocks read to ensure disabling cache-fill on Scan works.
 *
 * @throws Exception
 */
@Test
public void testBlocksStoredWhenCachingDisabled() throws Exception {
  byte[] TABLE = Bytes.toBytes("testBlocksReadWhenCachingDisabled");
  String FAMILY = "cf1";

  HBaseConfiguration conf = getConf();
  this.region = initHRegion(TABLE, getName(), conf, FAMILY);

  try {
    putData(FAMILY, "row", "col1", 1);
    putData(FAMILY, "row", "col2", 2);
    region.flushcache();

    // Execute a scan with caching turned off
    // Expected blocks stored: 0
    long blocksStart = getBlkCount();
    Scan scan = new Scan();
    scan.setCacheBlocks(false);
    RegionScanner rs = region.getScanner(scan);
    List<KeyValue> result = new ArrayList<KeyValue>(2);
    rs.next(result);
    assertEquals(2 * BLOOM_TYPE.length, result.size());
    rs.close();
    long blocksEnd = getBlkCount();

    assertEquals(blocksStart, blocksEnd);

    // Execute with caching turned on
    // Expected blocks stored: 2
    blocksStart = blocksEnd;
    scan.setCacheBlocks(true);
    rs = region.getScanner(scan);
    result = new ArrayList<KeyValue>(2);
    rs.next(result);
    assertEquals(2 * BLOOM_TYPE.length, result.size());
    rs.close();
    blocksEnd = getBlkCount();

    assertEquals(2 * BLOOM_TYPE.length, blocksEnd - blocksStart);
  } finally {
    HRegion.closeHRegion(this.region);
    this.region = null;
  }
}
public static Job startJob(String[] args) throws IOException {
  // args[0] = hbase table name
  // args[1] = zookeeper

  Configuration hConf = HBaseConfiguration.create(new Configuration());
  hConf.set("hbase.zookeeper.quorum", args[1]);
  hConf.set("scan.table", args[0]);
  hConf.set("hbase.zookeeper.property.clientPort", "2181");

  Scan scan = new Scan();
  // scan.setFilter(rowColBloomFilter());

  Job job = new Job(hConf);
  job.setJobName("BSBM-Q11-RepartitionJoin");
  job.setJarByClass(RepartitionJoinQ11.class);
  // Change caching to speed up the scan
  scan.setCaching(500);
  scan.setMaxVersions(200);
  scan.setCacheBlocks(false);

  // Mapper settings
  TableMapReduceUtil.initTableMapperJob(
      args[0], // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      RepartitionMapper.class, // mapper class
      CompositeKeyWritable.class, // mapper output key
      KeyValueArrayWritable.class, // mapper output value
      job);

  // Repartition settings
  job.setPartitionerClass(CompositePartitioner.class);
  job.setSortComparatorClass(CompositeSortComparator.class);
  job.setGroupingComparatorClass(CompositeGroupingComparator.class);

  // Reducer settings
  job.setReducerClass(SharedServices.RepartitionJoin_Reducer.class); // reducer class
  job.setNumReduceTasks(1); // at least one, adjust as required
  FileOutputFormat.setOutputPath(job, new Path("output/BSBMQ11"));

  try {
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }

  return job;
}
public static void main(String[] args) throws IOException, SolrServerException {
  final Configuration conf;
  HttpSolrServer solrServer = new HttpSolrServer("http://c1master:8983/solr");
  conf = HBaseConfiguration.create();

  // Define HBase table name
  HTable table = new HTable(conf, "test_global_shop");
  Scan scan = new Scan();
  // Define HBase column family
  scan.addFamily(Bytes.toBytes("shop"));
  scan.setCaching(1000);
  scan.setCacheBlocks(false);
  ResultScanner ss = table.getScanner(scan);

  System.out.println("start Storing...");
  int i = 0;
  try {
    for (Result r : ss) {
      SolrInputDocument solrDoc = new SolrInputDocument();
      solrDoc.addField("key", new String(r.getRow()));
      for (KeyValue kv : r.raw()) {
        String fieldName = new String(kv.getQualifier());
        String fieldValue = new String(kv.getValue());
        if (fieldName.equalsIgnoreCase("address")
            || fieldName.equalsIgnoreCase("category")
            || fieldName.equalsIgnoreCase("name")
            || fieldName.equalsIgnoreCase("province")
            || fieldName.equalsIgnoreCase("tel")) {
          solrDoc.addField(fieldName, fieldValue);
        }
      }
      solrServer.add(solrDoc);
      solrServer.commit(true, true, true);
      i = i + 1;
      System.out.println("Already Success " + i + " number data");
    }
    System.out.println("done !");
  } catch (IOException e) {
    System.out.println("error !");
    e.printStackTrace();
  } finally {
    ss.close();
    table.close();
  }
}
private static Scan getConfiguredScanForJob(Configuration conf, String[] args)
    throws IOException {
  Scan s = new Scan();
  // Set Scan Versions
  s.setMaxVersions(Integer.MAX_VALUE);
  s.setCacheBlocks(false);
  // Set Scan Column Family
  if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
    s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
  }
  // Set RowFilter or Prefix Filter if applicable.
  Filter rowFilter = getRowFilter(args);
  if (rowFilter != null) {
    LOG.info("Setting Row Filter for counter.");
    s.setFilter(rowFilter);
  }
  return s;
}
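The getRowFilter helper referenced above is not shown. A hypothetical sketch is given below, assuming the last command-line argument is an optional row-key prefix that is turned into a PrefixFilter; the real helper may parse its arguments differently.

// Sketch only: build an optional row filter from a row-key prefix argument.
private static Filter getRowFilter(String[] args) {
  if (args == null || args.length == 0) {
    return null; // no prefix supplied, scan everything
  }
  String rowPrefix = args[args.length - 1];
  if (rowPrefix == null || rowPrefix.isEmpty()) {
    return null;
  }
  return new PrefixFilter(Bytes.toBytes(rowPrefix));
}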
// MapReduce Stage-1 Job
public static Job startJob_Stage1(String[] args, Configuration hConf) throws IOException {
  // args[0] = hbase table name
  // args[1] = zookeeper

  /*
   * MapReduce Stage-1 Job
   * Retrieve a list of subjects and their attributes
   */
  Scan scan1 = new Scan();
  Job job1 = new Job(hConf);
  job1.setJobName("BSBM-Q8-RepartitionJoin");
  job1.setJarByClass(RepartitionJoinQ8.class);
  // Change caching and number of time stamps to speed up the scan
  scan1.setCaching(500);
  scan1.setMaxVersions(200);
  scan1.setCacheBlocks(false);

  // Mapper settings
  TableMapReduceUtil.initTableMapperJob(
      args[0], // input HBase table name
      scan1, // Scan instance to control CF and attribute selection
      RepartitionMapper.class, // mapper class
      CompositeKeyWritable.class, // mapper output key
      KeyValueArrayWritable.class, // mapper output value
      job1);

  // Reducer settings
  job1.setReducerClass(RepartitionReducer.class);
  job1.setOutputFormatClass(TextOutputFormat.class);
  // job1.setNumReduceTasks(1); // Uncomment this if running into problems on 2+ node cluster

  FileOutputFormat.setOutputPath(job1, new Path("output/BSBMQ8"));

  try {
    job1.waitForCompletion(true);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }

  return job1;
}
/**
 * Deletes the specified table with all its columns. ATTENTION: Invoking this method will delete
 * the table if it exists and therefore causes data loss.
 */
@Override
public void clearStorage() throws StorageException {
  HBaseAdmin adm = getAdminInterface();

  try {
    // first of all, check if table exists, if not - we are done
    if (!adm.tableExists(tableName)) {
      logger.debug("clearStorage() called before table {} was created, skipping.", tableName);
      return;
    }
  } catch (IOException e) {
    throw new TemporaryStorageException(e);
  }

  HTable table = null;
  try {
    table = new HTable(hconf, tableName);

    Scan scan = new Scan();
    scan.setBatch(100);
    scan.setCacheBlocks(false);
    scan.setCaching(2000);

    ResultScanner scanner = null;
    try {
      scanner = table.getScanner(scan);
      for (Result res : scanner) {
        table.delete(new Delete(res.getRow()));
      }
    } finally {
      IOUtils.closeQuietly(scanner);
    }
  } catch (IOException e) {
    throw new TemporaryStorageException(e);
  } finally {
    IOUtils.closeQuietly(table);
  }
}
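Issuing one delete RPC per row, as the loop above does, can be slow on large tables. A minimal variant is sketched below, assuming the same table and scanner variables as above; it buffers Delete objects and sends them in batches via the classic client's HTable.delete(List<Delete>).

// Sketch only: buffer deletes and send them in batches instead of one RPC per row.
List<Delete> batch = new ArrayList<Delete>(1000);
for (Result res : scanner) {
  batch.add(new Delete(res.getRow()));
  if (batch.size() >= 1000) {
    table.delete(batch); // one batched delete RPC round trip
    batch.clear();
  }
}
if (!batch.isEmpty()) {
  table.delete(batch);
}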
public static void readHtable() throws Exception {
  Job job = new Job(conf, "ExampleRead");
  job.setJarByClass(HbaseMR.class);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs

  TableMapReduceUtil.initTableMapperJob(
      "", // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);
  // because we aren't emitting anything from the mapper
  job.setOutputFormatClass(NullOutputFormat.class);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
/**
 * It should be noted that the stop row of a scan is not included by default.
 *
 * @param rowRange
 * @param filterList
 * @param family
 * @param columns
 * @param maxVersion
 * @return
 * @throws Exception
 */
public ResultScanner getResultSet(
    String[] rowRange, FilterList filterList, String[] family, String[] columns, int maxVersion)
    throws Exception {
  if (table == null) throw new Exception("No table handler");
  if (cacheSize < 0) throw new Exception("should set cache size before scanning");

  Scan scan = null;
  ResultScanner rscanner = null;

  try {
    scan = new Scan();
    scan.setCaching(this.cacheSize);
    scan.setCacheBlocks(blockCached);
    scan.setFilter(filterList);
    if (maxVersion > 0) scan.setMaxVersions(maxVersion);

    // the scan excludes the stop row itself, so the stop row has to be adjusted slightly
    if (rowRange != null) {
      scan.setStartRow(rowRange[0].getBytes());
      if (rowRange.length == 2 && rowRange[1] != null) scan.setStopRow(rowRange[1].getBytes());
    }

    if (columns != null) {
      for (int i = 0; i < columns.length; i++) {
        scan.addColumn(family[0].getBytes(), columns[i].getBytes());
      }
    }

    rscanner = this.table.getScanner(scan);
  } catch (Exception e) {
    e.printStackTrace();
  }
  return rscanner;
}
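As the javadoc notes, Scan treats the stop row as exclusive. A common workaround, sketched here with hypothetical row keys, is to append a trailing zero byte to the stop key, which is the smallest row key that sorts strictly after it, so the original stop row itself is still returned (uses java.util.Arrays).

// Sketch only: make the stop row inclusive by scanning up to "stopRow + 0x00".
byte[] stopRow = "row-0099".getBytes(); // hypothetical stop key
byte[] inclusiveStop = Arrays.copyOf(stopRow, stopRow.length + 1); // pads with a trailing 0x00
Scan scan = new Scan();
scan.setStartRow("row-0001".getBytes()); // hypothetical start key
scan.setStopRow(inclusiveStop);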
@Test
public void shouldRunMapReduce() throws Exception {
  // given
  Configuration configuration = HBaseConfiguration.create();
  TableFactory.recreateTable(
      configuration, Bytes.toString(UsersDao.TABLE_NAME), Bytes.toString(UsersDao.FAMILY_NAME));
  UserDataFactory.insertTestData();

  // map reduce
  Job job = new Job(configuration, "Count Users");
  job.setJarByClass(CountUsersMapper.class);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addColumn(UsersDao.FAMILY_NAME, UsersDao.FORENAME_COL);

  // mapper
  TableMapReduceUtil.initTableMapperJob(
      Bytes.toString(UsersDao.TABLE_NAME),
      scan,
      CountUsersMapper.class,
      ImmutableBytesWritable.class,
      Result.class,
      job);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setNumReduceTasks(0);

  // when
  boolean succeeded = job.waitForCompletion(true);

  // then
  assertThat(succeeded).isTrue();
  assertThat(job.getCounters().findCounter(CountUsersMapper.Counters.USER_COUNT).getValue())
      .isGreaterThan(99);
}
public static boolean scanTable(Configuration config) throws IOException, URISyntaxException {
  /** Connection to the cluster. A single connection shared by all application threads. */
  Connection connection = null;
  /** A lightweight handle to a specific table. Used from a single thread. */
  Table table = null;
  Scan scan = null;
  ResultScanner scanner = null;
  String[] columns = null;

  try {
    // establish the connection to the cluster.
    connection = ConnectionFactory.createConnection(config);
    // retrieve a handle to the target table.
    table = connection.getTable(TABLE_NAME1);

    // Instantiating the Scan class
    scan = new Scan();
    // Recommended tuning, see Ch12 "HBase Performance Tuning"
    scan.setCaching(5000); // rows returned from the server per next() call
    scan.setCacheBlocks(false); // disable the server-side block cache

    // Scanning the required columns. The family contains columns such as account, amount, aut,
    // balance, card, commerce, commerceData, date, f, f2, f3, f4, hour, m, money, ms, r, ref
    // and trxcd; they are appended generically in the loop below.
    scan.addFamily(CF);

    // Getting the scan result
    scanner = table.getScanner(scan);
    System.out.println("Rows found");

    StringBuffer line = new StringBuffer();
    int counter = 0;
    // Reading values from scan result
    for (Result result = scanner.next(); result != null; result = scanner.next()) {
      columns = getColumnsInColumnFamily(result, CF);
      line.append(Bytes.toString(result.getRow()));
      for (String column : columns) {
        line.append("^" + Bytes.toString(result.getValue(CF, Bytes.toBytes(column))));
      }
      line.append("\n");
      counter++;
      System.out.println("Line: " + counter);
    }

    DistributedFileSystem hdfs = new DistributedFileSystem();
    hdfs.initialize(new URI("hdfs://quickstart.cloudera:8020"), config);
    Path homeDir = hdfs.getHomeDirectory();
    // Print the home directory
    System.out.println("Home folder HDFS-" + homeDir);

    Path newFilePath = new Path(homeDir + "/datatest/" + TABLE_NAME + "/file.csv");
    byte[] byt = line.toString().getBytes();
    FSDataOutputStream fsOutStream = hdfs.create(newFilePath);
    fsOutStream.write(byt);
    fsOutStream.close();
    hdfs.close();
    System.out.println("File exported to " + homeDir + "/datatest/" + TABLE_NAME + "/file.csv");
  } finally {
    // close everything down
    if (scanner != null) scanner.close();
    if (table != null) table.close();
    if (connection != null) connection.close();
  }
  return true;
}
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println(
        "ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} "
            + "{compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
    return;
  }

  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String delimiter = args[5];

  String rowKeyColumn = "";
  if (args.length > 6) {
    rowKeyColumn = args[6];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  HBaseConfiguration.addHbaseResources(job.getConfiguration());
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
  job.getConfiguration().set(DELIMITER_CONF, delimiter);

  job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
  job.setJobName("ExportHBaseTableToDelimiteredSeq");

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(
      table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper
      null, // mapper output key
      null, // mapper output value
      job);

  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

  if (compressionCodec.equals("snappy")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    // nothing
  }

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
private PageBean getDataMapTemp(SearchParam searchParam, Integer currentPage, Integer pageSize)
    throws IOException {
  String tableName = searchParam.getTableName();
  String startRow = searchParam.getStartKey();
  String stopRow = searchParam.getEndKey();

  List<Map<String, String>> mapList = new LinkedList<Map<String, String>>();
  ResultScanner scanner = null;
  // paging wrapper object; its fields are populated below
  PageBean tbData = new PageBean();
  Scan scan = null;
  HbaseWhereEngine whereEngine = null;
  HTableInterface table = HbaseTool.getTable(tableName);

  if ("ca_summary_optimize".equals(tableName) && compoundFieldMap.isEmpty()) {
    compoundFieldMap = CompoundFieldConfigService.getParentComField(tableName);
  }

  try {
    // build the "where" engine used to filter rows
    if (StringUtils.isNotEmpty(searchParam.getWhere())) {
      whereEngine =
          new HbaseWhereEngine(tableName, searchParam.getFamily(), searchParam.getWhere());
    }

    String[] selectArray = null;
    if (StringUtils.isNotEmpty(searchParam.getSelect())) {
      selectArray = searchParam.getSelect().split(",");
    }
    byte[] cf = Bytes.toBytes(searchParam.getFamily());

    // if keys is not null, then we know exactly which records to select
    if (StringUtils.isNotEmpty(searchParam.getKeys())) {
      List<Get> getKeysList = new ArrayList<Get>();
      for (String key : searchParam.getKeys().split(",")) {
        Get get = new Get(Bytes.toBytes(key));
        getKeysList.add(get);

        if (selectArray != null) {
          for (String field : selectArray) {
            String temp[] = processField(field, searchParam.getFamily());
            if ("ca_summary_optimize".equals(tableName)
                && compoundFieldMap.containsKey(temp[1])) {
              get.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(compoundFieldMap.get(temp[1])));
            } else {
              get.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(temp[1]));
            }
          }
        }

        if (selectArray != null && whereEngine != null) {
          Set<String> varSet = whereEngine.getVarSet();
          for (String var : varSet) {
            get.addColumn(cf, Bytes.toBytes(var));
          }
        }
      }

      Result[] resultsFromKeys = table.get(getKeysList);
      for (Result rr : resultsFromKeys) {
        if (!rr.isEmpty()) {
          if (whereEngine != null && !whereEngine.meetCondition(rr)) {
            continue;
          }
          Map<String, String> map = new TreeMap<String, String>();
          map.put("_id", Bytes.toString(rr.getRow()));
          for (String field : selectArray) {
            String value = HbaseWhereEngine.getColumnValue(tableName, cf, rr, field);
            if (!field.equals("id")) {
              map.put(field, value);
            }
          }
          mapList.add(map);
        }
      }
      pageSize = mapList.size();
      tbData.setCurrentPage(currentPage);
      tbData.setLength(pageSize);
      tbData.setTotalRecords(mapList.size());
    } else {
      // if keys is null, select the records between startKey and endKey, or the top pageSize rows
      // maximum number of results per page
      if (pageSize == null || pageSize == 0L) pageSize = 100;
      if (currentPage == null || currentPage == 0) currentPage = 1;

      // compute the index of the first and last record of the requested page
      Integer firstPage = (currentPage - 1) * pageSize;
      Integer endPage = firstPage + pageSize;

      scan = getScan(startRow, stopRow);
      // attach the filters to the scan (true indicates paging; the method is defined elsewhere)
      scan.setFilter(packageFilters(searchParam, true));
      // cache 1000 rows per RPC
      scan.setCaching(1000);
      scan.setCacheBlocks(false);

      if (selectArray != null) {
        for (String field : selectArray) {
          String temp[] = processField(field, searchParam.getFamily());
          if ("ca_summary_optimize".equals(tableName) && compoundFieldMap.containsKey(temp[1])) {
            scan.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(compoundFieldMap.get(temp[1])));
          } else {
            scan.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(temp[1]));
          }
        }
      }

      if (selectArray != null && whereEngine != null) {
        Set<String> varSet = whereEngine.getVarSet();
        for (String var : varSet) {
          scan.addColumn(cf, Bytes.toBytes(var));
        }
      }

      scanner = table.getScanner(scan);
      int i = 0;
      List<byte[]> rowList = new LinkedList<byte[]>();

      // iterate over the scanner and collect the row keys that fall on the requested page
      for (Result result : scanner) {
        String row = toStr(result.getRow());
        if (i >= firstPage && i < endPage) {
          rowList.add(getBytes(row));
        }
        if (i >= endPage) {
          break;
        }
        i++;
      }

      // build Get objects for the collected row keys
      List<Get> getList = getList(rowList, cf, selectArray, tableName);
      Result[] results = table.get(getList);
      for (Result result : results) {
        if (whereEngine != null && !whereEngine.meetCondition(result)) {
          continue;
        }
        Map<byte[], byte[]> fmap = packFamilyMap(tableName, result, cf, selectArray);
        Map<String, String> rmap = packRowMap(fmap);
        rmap.put("_id", toStr(result.getRow()));
        mapList.add(rmap);
      }

      // populate the paging metadata
      tbData.setCurrentPage(currentPage);
      tbData.setLength(pageSize);
      tbData.setTotalRecords(i);
    }
    tbData.setResults(mapList);
  } catch (Exception e) {
    e.printStackTrace();
  } finally {
    if (table != null) {
      table.close();
    }
    closeScanner(scanner);
  }
  compoundFieldMap.clear();
  return tbData;
}
private ResultScanner createHBaseResultScanner(RecordScan scan)
    throws RepositoryException, InterruptedException {
  Scan hbaseScan = new Scan();

  hbaseScan.setMaxVersions(1);

  if (scan.getRawStartRecordId() != null) {
    hbaseScan.setStartRow(scan.getRawStartRecordId());
  } else if (scan.getStartRecordId() != null) {
    hbaseScan.setStartRow(scan.getStartRecordId().toBytes());
  }

  if (scan.getRawStopRecordId() != null) {
    hbaseScan.setStopRow(scan.getRawStopRecordId());
  } else if (scan.getStopRecordId() != null) {
    hbaseScan.setStopRow(scan.getStopRecordId().toBytes());
  }

  // Filters
  FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);

  // filter out deleted records
  filterList.addFilter(REAL_RECORDS_FILTER);

  // add user's filter
  if (scan.getRecordFilter() != null) {
    Filter filter = filterFactory.createHBaseFilter(scan.getRecordFilter(), this, filterFactory);
    filterList.addFilter(filter);
  }

  hbaseScan.setFilter(filterList);

  hbaseScan.setCaching(scan.getCaching());
  hbaseScan.setCacheBlocks(scan.getCacheBlocks());

  ReturnFields returnFields = scan.getReturnFields();
  if (returnFields != null && returnFields.getType() != ReturnFields.Type.ALL) {
    RecordDecoder.addSystemColumnsToScan(hbaseScan);
    switch (returnFields.getType()) {
      case ENUM:
        for (QName field : returnFields.getFields()) {
          FieldTypeImpl fieldType = (FieldTypeImpl) typeManager.getFieldTypeByName(field);
          hbaseScan.addColumn(RecordCf.DATA.bytes, fieldType.getQualifier());
        }
        break;
      case NONE:
        // nothing to add
        break;
      default:
        throw new RuntimeException("Unrecognized ReturnFields type: " + returnFields.getType());
    }
  } else {
    hbaseScan.addFamily(RecordCf.DATA.bytes);
  }

  ResultScanner hbaseScanner;
  try {
    hbaseScanner = recordTable.getScanner(hbaseScan);
  } catch (IOException e) {
    throw new RecordException("Error creating scanner", e);
  }
  return hbaseScanner;
}
public static void main(String args[]) throws Exception {
  try {
    Configuration config = HBaseConfiguration.create();
    config.set("mapred.output.dir", "/mnt/data/workspace/weatheranalysis/mapred/monthly");

    Job job = new Job(config, "MonthlySummary");
    String sourceTable = DailyWeatherHbaseOperations.TABLE_NAME;

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
        sourceTable, // input table
        scan, // Scan instance to control CF and attribute selection
        MonthlyWeatherMapper.class, // mapper class
        Text.class, // mapper output key
        Text.class, // mapper output value
        job);
    job.setReducerClass(MonthlyWeatherReducer.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required

    Path out = new Path("/mnt/data/workspace/weatheranalysis/mapred/monthly");
    File outDir = new File(out.toString());
    FileUtil.fullyDelete(outDir);
    FileOutputFormat.setOutputPath(job, out);

    MonthlyWeatherHbaseOperations.useTable();
    // Example (commented out in the original): the total precipitation for one station for a
    // year could be read back via MonthlyWeatherHbaseOperations.get(425010, 2013) and summed
    // over the returned months.

    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
    System.out.println("Job Completed.");
  } catch (IOException e) {
    e.printStackTrace();
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
}
@Override
public Scan getRawScan(ItemData d, Map<String, HCompareOp> ops) {
  int startShopID = 0;
  int startItemID = 0;

  int endShopID = MAX_SHOP_ID;
  int endItemID = MAX_ITEM_ID;

  // some performance improvement
  // if shop_id is specified with EQUAL, narrow the row range to that shop
  HCompareOp shopIDOp = ops.get("shop_id");
  if (shopIDOp == HCompareOp.EQUAL) {
    startShopID = d.shopID;
    endShopID = startShopID;
  }

  // likewise if item_id is also specified with EQUAL
  HCompareOp itemIDOp = ops.get("item_id");
  if (itemIDOp == HCompareOp.EQUAL) {
    startItemID = d.itemID;
    endItemID = startItemID;
  }

  log.info(String.format("scan start row, shop_id=%d, item_id=%d", startShopID, startItemID));
  log.info(String.format("scan stop row, shop_id=%d, item_id=%d", endShopID, endItemID));

  byte[] startRow = encodeRowkey(startShopID, startItemID);
  byte[] endRow = encodeRowkey(endShopID, endItemID);

  Scan s = new Scan(startRow, endRow);
  s.addFamily(DATA_FAMILY);
  s.addFamily(META_FAMILY);
  s.setCacheBlocks(false);
  s.setMaxVersions();
  s.setCaching(DEFAULT_SCAN_CACHE);

  FilterList fl = new FilterList();
  for (String column : ops.keySet()) {
    byte[] value;
    byte[] family = DATA_FAMILY;
    if ("ctime".equals(column)) {
      value = Bytes.toBytes(d.ctime);
      family = META_FAMILY;
    } else if ("shop_id".equals(column)) {
      value = Bytes.toBytes(d.shopID);
    } else if ("item_id".equals(column)) {
      value = Bytes.toBytes(d.itemID);
    } else if ("genre_id".equals(column)) {
      value = Bytes.toBytes(d.genreID);
    } else if ("price".equals(column)) {
      value = Bytes.toBytes(d.price);
    } else if ("full_item_url".equals(column)) {
      value = Bytes.toBytes(d.fullItemUrl);
    } else if ("item_name".equals(column)) {
      value = Bytes.toBytes(d.itemName);
    } else {
      // ignore
      continue;
    }

    byte[] qualifier = Bytes.toBytes(column);
    HCompareOp hop = ops.get(column);
    CompareOp op = HClient.toCompareOp(hop);

    SingleColumnValueFilter filter = new SingleColumnValueFilter(family, qualifier, op, value);
    filter.setFilterIfMissing(true);
    fl.addFilter(filter);
  }
  s.setFilter(fl);

  return s;
}
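The encodeRowkey helper used above is not shown. A hypothetical sketch follows, assuming the row key is simply the shop id followed by the item id, each encoded as a 4-byte big-endian integer; the real key layout in the original project may differ (for example, it might use salted or hashed prefixes).

// Sketch only: fixed-width row key = shopID (4 bytes) || itemID (4 bytes).
private static byte[] encodeRowkey(int shopID, int itemID) {
  // Bytes.toBytes(int) produces a 4-byte big-endian encoding, so lexicographic row-key order
  // matches numeric order for non-negative ids.
  return Bytes.add(Bytes.toBytes(shopID), Bytes.toBytes(itemID));
}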
/**
 * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
 * endRowKey. The more store files there are, the more this will be off. Also, this does not take
 * into account any rows that are in the memstore.
 *
 * <p>The values computed here should be cached so that in high qps workloads the nn is not
 * overwhelmed. Could be done in load(); Synchronized to make sure that only one thread at a time
 * is using the htable.
 *
 * @param startRowKey First row key in the range
 * @param endRowKey Last row key in the range
 * @return The estimated number of rows in the regions between the row keys (first) and the
 *     estimated row size in bytes (second).
 */
public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
  Preconditions.checkNotNull(startRowKey);
  Preconditions.checkNotNull(endRowKey);

  long rowSize = 0;
  long rowCount = 0;
  long hdfsSize = 0;
  boolean isCompressed = false;

  try {
    // Check to see if things are compressed.
    // If they are we'll estimate a compression factor.
    if (columnFamilies_ == null) {
      columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
    }
    Preconditions.checkNotNull(columnFamilies_);
    for (HColumnDescriptor desc : columnFamilies_) {
      isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
    }

    // For every region in the range.
    List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
    for (HRegionLocation location : locations) {
      long currentHdfsSize = 0;
      long currentRowSize = 0;
      long currentRowCount = 0;

      HRegionInfo info = location.getRegionInfo();

      // Get the size on hdfs
      currentHdfsSize += getHdfsSize(info);

      Scan s = new Scan(info.getStartKey());
      // Get a small sample of rows
      s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
      // Try and get every version so the row's size can be used to estimate.
      s.setMaxVersions(Short.MAX_VALUE);
      // Don't cache the blocks as we don't think these are
      // necessarily important blocks.
      s.setCacheBlocks(false);
      // Try and get deletes too so their size can be counted.
      s.setRaw(true);

      ResultScanner rs = hTable_.getScanner(s);
      try {
        // And get the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
        // for a representative sample
        for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
          Result r = rs.next();
          if (r == null) break;
          currentRowCount += 1;
          for (KeyValue kv : r.list()) {
            // some extra row size added to make up for shared overhead
            currentRowSize +=
                kv.getRowLength() // row key
                    + 4 // row key length field
                    + kv.getFamilyLength() // column family bytes
                    + 4 // family length field
                    + kv.getQualifierLength() // qualifier bytes
                    + 4 // qualifier length field
                    + kv.getValueLength() // length of the value
                    + 4 // value length field
                    + 10; // extra overhead for hfile index, checksums, metadata, etc
          }
        }
        // Add these values to the cumulative totals in one shot just
        // in case there was an error in between getting the hdfs
        // size and the row/column sizes.
        hdfsSize += currentHdfsSize;
        rowCount += currentRowCount;
        rowSize += currentRowSize;
      } finally {
        rs.close();
      }
    }
  } catch (IOException ioe) {
    // Print the stack trace, but we'll ignore it
    // as this is just an estimate.
    // TODO: Put this into the per query log.
    LOG.error("Error computing HBase row count estimate", ioe);
  }

  // If there are no rows then there is no need to estimate.
  if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);
  // If something went wrong then set a signal value.
  if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

  // Estimate the number of rows.
  double bytesPerRow = rowSize / (double) rowCount;
  long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));
  return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
}
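As a worked example of the final formula (all numbers hypothetical): if the sampled cells add up to rowSize = 1,000,000 bytes over rowCount = 1,000 rows, then bytesPerRow = 1,000; with hdfsSize = 10,000,000,000 bytes and a compressed column family, the estimate is 2 * 10,000,000,000 / 1,000 = 20,000,000 rows, with the factor of 2 standing in for the assumed compression ratio.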