Example #1
  public Scan generateScan(
      String[] rowRange, FilterList filterList, String[] family, String[] columns, int maxVersion)
      throws Exception {
    if (table == null) throw new Exception("No table handler");
    if (cacheSize < 0) throw new Exception("should set cache size before scanning");

    Scan scan = null;

    try {
      scan = new Scan();
      scan.setCaching(this.cacheSize);
      scan.setCacheBlocks(this.blockCached);
      scan.setFilter(filterList);
      if (maxVersion > 0) scan.setMaxVersions(maxVersion);
      if (rowRange != null) {
        scan.setStartRow(rowRange[0].getBytes());
        if (rowRange.length == 2) scan.setStopRow(rowRange[1].getBytes());
      }

      if (columns != null) {
        for (int i = 0; i < columns.length; i++) {
          scan.addColumn(family[0].getBytes(), columns[i].getBytes());
          // System.out.println(family[i]+";"+columns[i]);
        }
      } else {
        scan.addFamily(family[0].getBytes());
      }

    } catch (Exception e) {
      e.printStackTrace();
    }

    return scan;
  }
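A minimal usage sketch for the method above (the wrapper instance and the row, family, and qualifier names here are hypothetical; cacheSize and blockCached are assumed to have been set on the wrapper beforehand):

  // Build a scan over ["row-000", "row-999") for family "cf", qualifiers "q1" and "q2",
  // keeping only the newest version of each cell.
  FilterList filters = new FilterList(FilterList.Operator.MUST_PASS_ALL);
  Scan scan =
      helper.generateScan(
          new String[] {"row-000", "row-999"},
          filters,
          new String[] {"cf"},
          new String[] {"q1", "q2"},
          1);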
  /**
   * Get a collection of Scans, one per region, that cover the range of the table having the given
   * key prefix. These will be used as the map task input splits.
   */
  public static List<Scan> scansThisCubeOnly(byte[] keyPrefix, byte[][] splitKeys)
      throws IOException {
    Scan copyScan = new Scan();
    copyScan.setCaching(5000);
    copyScan.setCacheBlocks(false);

    // Hack: generate a key that probably comes after all this cube's keys but doesn't include any
    // keys not belonging to this cube.
    byte[] keyAfterCube = ArrayUtils.addAll(keyPrefix, fiftyBytesFF);

    List<Scan> scans = new ArrayList<Scan>();
    Scan scanUnderConstruction = new Scan(copyScan);

    for (byte[] splitKey : splitKeys) {
      scanUnderConstruction.setStopRow(splitKey);

      // Coerce scan to only touch keys belonging to this cube
      Scan truncated = truncateScan(scanUnderConstruction, keyPrefix, keyAfterCube);
      if (truncated != null) {
        scans.add(truncated);
      }

      scanUnderConstruction = new Scan(copyScan);
      scanUnderConstruction.setStartRow(splitKey);
    }

    // There's another region from last split key to the end of the table.
    Scan truncated = truncateScan(scanUnderConstruction, keyPrefix, keyAfterCube);
    if (truncated != null) {
      scans.add(truncated);
    }

    return scans;
  }
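The truncateScan helper is referenced above but not shown; a hypothetical sketch of the clamping it needs to do (assuming org.apache.hadoop.hbase.util.Bytes; not the original implementation) could look like this:

  private static Scan truncateScanSketch(Scan scan, byte[] keyPrefix, byte[] keyAfterCube) {
    byte[] start = scan.getStartRow();
    byte[] stop = scan.getStopRow();

    // Region lies entirely before or entirely after this cube: nothing to scan.
    if (stop.length != 0 && Bytes.compareTo(stop, keyPrefix) <= 0) return null;
    if (start.length != 0 && Bytes.compareTo(start, keyAfterCube) >= 0) return null;

    // Otherwise clamp both ends of the scan to [keyPrefix, keyAfterCube).
    if (start.length == 0 || Bytes.compareTo(start, keyPrefix) < 0) scan.setStartRow(keyPrefix);
    if (stop.length == 0 || Bytes.compareTo(stop, keyAfterCube) > 0) scan.setStopRow(keyAfterCube);
    return scan;
  }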
Example #3
  public static void readWriteHtable() throws Exception {
    Job job = new Job(conf, "ExampleReadWrite");
    job.setJarByClass(HbaseMR.class); // class that contains mapper

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for
    // MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
        "sourceTable", // input table
        scan, // Scan instance to control CF and attribute selection
        MyMapper2.class, // mapper class
        null, // mapper output key
        null, // mapper output value
        job);
    TableMapReduceUtil.initTableReducerJob(
        "targetTable", // output table
        null, // reducer class
        job);
    job.setNumReduceTasks(0);

    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
  }
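Because the job above sets zero reduce tasks, the mapper's output is written straight to the target table by TableOutputFormat, so the mapper must emit Put (or Delete) values. A hypothetical sketch of such a mapper (MyMapper2 itself is not shown in the source):

  public static class MyMapper2 extends TableMapper<ImmutableBytesWritable, Put> {
    @Override
    protected void map(ImmutableBytesWritable row, Result value, Context context)
        throws IOException, InterruptedException {
      Put put = new Put(row.get());
      for (KeyValue kv : value.raw()) {
        put.add(kv); // copy every cell of the source row unchanged
      }
      context.write(row, put);
    }
  }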
  @Test
  public void shouldRunMapReduce() throws Exception {
    // given
    Job job = new Job(configuration, "Average Rating");
    job.setJarByClass(AverageRatingMapper.class);

    Scan scan = new Scan();
    scan.setCaching(500);
    scan.setCacheBlocks(false);
    scan.addFamily(Bytes.toBytes(LoadMovieRatingData.FAMILY_NAME));

    TableMapReduceUtil.initTableMapperJob(
        LoadMovieRatingData.TABLE_NAME,
        scan,
        AverageRatingMapper.class,
        Text.class,
        DoubleWritable.class,
        job);
    job.setReducerClass(RatingExportReducer.class);
    job.setNumReduceTasks(1);
    FileOutputFormat.setOutputPath(
        job, new Path("/tmp/mr/mySummaryFile_" + System.currentTimeMillis()));

    // when
    boolean succeeded = job.waitForCompletion(true);

    // then
    assertThat(succeeded).isTrue();
  }
 public static void main(String[] args) throws Exception {
   Configuration con = new Configuration();
   String[] otherArgs = new GenericOptionsParser(con, args).getRemainingArgs();
   Configuration conf = HBaseConfiguration.create(con);
   Job job = new Job(conf, "AverageCalc");
   job.setJarByClass(AverageCalculator.class);
   Scan scan = new Scan();
   scan.setCaching(500);
   scan.setCacheBlocks(false);
   scan.addFamily(Bytes.toBytes("Post"));
   FilterList li = new FilterList(FilterList.Operator.MUST_PASS_ALL);
   SingleColumnValueFilter filter =
       new SingleColumnValueFilter(
           Bytes.toBytes("Post"),
           Bytes.toBytes("PostTypeId"),
           CompareOp.EQUAL,
           Bytes.toBytes("1"));
   li.addFilter(filter);
   scan.setFilter(li);
   FileOutputFormat.setOutputPath(job, new Path(otherArgs[0]));
   job.setOutputKeyClass(Text.class);
   TableMapReduceUtil.initTableMapperJob(
       "bigd24-hbase-sample", scan, Mapper1.class, Text.class, IntWritable.class, job);
   job.setReducerClass(Reducer1.class);
   job.setOutputValueClass(FloatWritable.class);
   System.exit(job.waitForCompletion(true) ? 0 : 1);
 }
Example #6
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("es.nodes", ES_NODES);
    conf.set("es.resource", ES_RESOURCE);
    conf.set("es.mapping.id", HBaseTableMapper.ID_FIELD.toString());
    conf.set("es.batch.size.bytes", "10mb");
    conf.set("es.batch.size.entries", "10000");
    conf.set("es.batch.write.refresh", "false");

    Job job = new Job(conf);
    job.setJarByClass(BulkIndex.class);
    job.setMapperClass(HBaseTableMapper.class);
    job.setNumReduceTasks(0);
    job.setSpeculativeExecution(false);
    job.setOutputFormatClass(BulkProcessorOutputFormat.class);
    job.setMapOutputValueClass(Text.class);

    Scan scan = new Scan();
    scan.setCaching(1000);
    scan.setCacheBlocks(false);

    TableMapReduceUtil.initTableMapperJob(
        BulkLoad.HBASE_TABLE_NAME,
        scan,
        HBaseTableMapper.class,
        NullWritable.class,
        MapWritable.class,
        job);

    job.waitForCompletion(true);
  }
Example #7
  public static void htableFile() throws Exception {
    Job job = new Job(conf, "ExampleSummaryToFile");
    job.setJarByClass(HbaseMR.class); // class that contains mapper

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for
    // MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
        "sourceTable", // input table
        scan, // Scan instance to control CF and attribute selection
        MyMapper.class, // mapper class
        Text.class, // mapper output key
        IntWritable.class, // mapper output value
        job);
    job.setReducerClass(MyReducer4.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required
    FileOutputFormat.setOutputPath(job, new Path("/tmp/mr/mySummaryFile")); // adjust directories as required

    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
  }
  /**
   * Test # of blocks read to ensure disabling cache-fill on Scan works.
   *
   * @throws Exception
   */
  @Test
  public void testBlocksStoredWhenCachingDisabled() throws Exception {
    byte[] TABLE = Bytes.toBytes("testBlocksReadWhenCachingDisabled");
    String FAMILY = "cf1";

    HBaseConfiguration conf = getConf();
    this.region = initHRegion(TABLE, getName(), conf, FAMILY);

    try {
      putData(FAMILY, "row", "col1", 1);
      putData(FAMILY, "row", "col2", 2);
      region.flushcache();

      // Execute a scan with caching turned off
      // Expected blocks stored: 0
      long blocksStart = getBlkCount();
      Scan scan = new Scan();
      scan.setCacheBlocks(false);
      RegionScanner rs = region.getScanner(scan);
      List<KeyValue> result = new ArrayList<KeyValue>(2);
      rs.next(result);
      assertEquals(2 * BLOOM_TYPE.length, result.size());
      rs.close();
      long blocksEnd = getBlkCount();

      assertEquals(blocksStart, blocksEnd);

      // Execute with caching turned on
      // Expected blocks stored: 2
      blocksStart = blocksEnd;
      scan.setCacheBlocks(true);
      rs = region.getScanner(scan);
      result = new ArrayList<KeyValue>(2);
      rs.next(result);
      assertEquals(2 * BLOOM_TYPE.length, result.size());
      rs.close();
      blocksEnd = getBlkCount();

      assertEquals(2 * BLOOM_TYPE.length, blocksEnd - blocksStart);
    } finally {
      HRegion.closeHRegion(this.region);
      this.region = null;
    }
  }
  public static Job startJob(String[] args) throws IOException {

    // args[0] = hbase table name
    // args[1] = zookeeper

    Configuration hConf = HBaseConfiguration.create(new Configuration());
    hConf.set("hbase.zookeeper.quorum", args[1]);
    hConf.set("scan.table", args[0]);
    hConf.set("hbase.zookeeper.property.clientPort", "2181");

    Scan scan = new Scan();
    // scan.setFilter(rowColBloomFilter());

    Job job = new Job(hConf);
    job.setJobName("BSBM-Q11-RepartitionJoin");
    job.setJarByClass(RepartitionJoinQ11.class);
    // Change caching to speed up the scan
    scan.setCaching(500);
    scan.setMaxVersions(200);
    scan.setCacheBlocks(false);

    // Mapper settings
    TableMapReduceUtil.initTableMapperJob(
        args[0], // input HBase table name
        scan, // Scan instance to control CF and attribute selection
        RepartitionMapper.class, // mapper
        CompositeKeyWritable.class, // mapper output key
        KeyValueArrayWritable.class, // mapper output value
        job);

    // Repartition settings
    job.setPartitionerClass(CompositePartitioner.class);
    job.setSortComparatorClass(CompositeSortComparator.class);
    job.setGroupingComparatorClass(CompositeGroupingComparator.class);

    // Reducer settings
    job.setReducerClass(SharedServices.RepartitionJoin_Reducer.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required

    FileOutputFormat.setOutputPath(job, new Path("output/BSBMQ11"));

    try {
      System.exit(job.waitForCompletion(true) ? 0 : 1);
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (InterruptedException e) {
      e.printStackTrace();
    }

    return job;
  }
  public static void main(String[] args) throws IOException, SolrServerException {

    final Configuration conf;
    HttpSolrServer solrServer = new HttpSolrServer("http://c1master:8983/solr");
    conf = HBaseConfiguration.create();

    // Define Hbase Table Name
    HTable table = new HTable(conf, "test_global_shop");
    Scan scan = new Scan();

    // Define Hbase Column Family
    scan.addFamily(Bytes.toBytes("shop"));
    scan.setCaching(1000);
    scan.setCacheBlocks(false);
    ResultScanner ss = table.getScanner(scan);

    System.out.println("start Storing...");
    int i = 0;
    try {
      for (Result r : ss) {
        SolrInputDocument solrDoc = new SolrInputDocument();
        solrDoc.addField("key", new String(r.getRow()));
        for (KeyValue kv : r.raw()) {
          String fieldName = new String(kv.getQualifier());
          String fieldValue = new String(kv.getValue());
          if (fieldName.equalsIgnoreCase("address")
              || fieldName.equalsIgnoreCase("category")
              || fieldName.equalsIgnoreCase("name")
              || fieldName.equalsIgnoreCase("province")
              || fieldName.equalsIgnoreCase("tel")) {
            solrDoc.addField(fieldName, fieldValue);
          }
        }
        solrServer.add(solrDoc);
        solrServer.commit(true, true, true);
        i = i + 1;
        System.out.println("Already Succcess " + i + " number data");
      }
      ss.close();
      table.close();
      System.out.println("done !");
    } catch (IOException e) {
    } finally {
      ss.close();
      table.close();
      System.out.println("error !");
    }
  }
Example #11
 private static Scan getConfiguredScanForJob(Configuration conf, String[] args)
     throws IOException {
   Scan s = new Scan();
   // Set Scan Versions
   s.setMaxVersions(Integer.MAX_VALUE);
   s.setCacheBlocks(false);
   // Set Scan Column Family
   if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
     s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
   }
   // Set RowFilter or Prefix Filter if applicable.
   Filter rowFilter = getRowFilter(args);
   if (rowFilter != null) {
     LOG.info("Setting Row Filter for counter.");
     s.setFilter(rowFilter);
   }
   return s;
 }
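The getRowFilter helper is not part of this snippet; a hypothetical sketch that treats an optional trailing argument as a row-key prefix (not the original implementation) might be:

 private static Filter getRowFilter(String[] args) {
   // No extra argument supplied: scan every row.
   if (args.length < 2 || args[args.length - 1].isEmpty()) {
     return null;
   }
   // Otherwise restrict the scan to rows starting with the given prefix.
   return new PrefixFilter(Bytes.toBytes(args[args.length - 1]));
 }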
  // MapReduce Stage-1 Job
  public static Job startJob_Stage1(String[] args, Configuration hConf) throws IOException {

    // args[0] = hbase table name
    // args[1] = zookeeper

    /*
     * MapReduce Stage-1 Job
     * Retrieve a list of subjects and their attributes
     */
    Scan scan1 = new Scan();
    Job job1 = new Job(hConf);
    job1.setJobName("BSBM-Q8-RepartitionJoin");
    job1.setJarByClass(RepartitionJoinQ8.class);
    // Change caching and number of time stamps to speed up the scan
    scan1.setCaching(500);
    scan1.setMaxVersions(200);
    scan1.setCacheBlocks(false);

    // Mapper settings
    TableMapReduceUtil.initTableMapperJob(
        args[0], // input HBase table name
        scan1, // Scan instance to control CF and attribute selection
        RepartitionMapper.class, // mapper class
        CompositeKeyWritable.class, // mapper output key
        KeyValueArrayWritable.class, // mapper output value
        job1);

    // Reducer settings
    job1.setReducerClass(RepartitionReducer.class);

    job1.setOutputFormatClass(TextOutputFormat.class);
    // job1.setNumReduceTasks(1);  // Uncomment this if running into problems on 2+ node cluster
    FileOutputFormat.setOutputPath(job1, new Path("output/BSBMQ8"));

    try {
      job1.waitForCompletion(true);
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (InterruptedException e) {
      e.printStackTrace();
    }

    return job1;
  }
Example #13
  /**
   * Deletes the specified table with all its columns. ATTENTION: Invoking this method will delete
   * the table if it exists and therefore causes data loss.
   */
  @Override
  public void clearStorage() throws StorageException {
    HBaseAdmin adm = getAdminInterface();

    try { // first of all, check if table exists, if not - we are done
      if (!adm.tableExists(tableName)) {
        logger.debug("clearStorage() called before table {} was created, skipping.", tableName);
        return;
      }
    } catch (IOException e) {
      throw new TemporaryStorageException(e);
    }

    HTable table = null;

    try {
      table = new HTable(hconf, tableName);

      Scan scan = new Scan();
      scan.setBatch(100);
      scan.setCacheBlocks(false);
      scan.setCaching(2000);

      ResultScanner scanner = null;

      try {
        scanner = table.getScanner(scan);

        for (Result res : scanner) {
          table.delete(new Delete(res.getRow()));
        }
      } finally {
        IOUtils.closeQuietly(scanner);
      }
    } catch (IOException e) {
      throw new TemporaryStorageException(e);
    } finally {
      IOUtils.closeQuietly(table);
    }
  }
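Deleting row by row issues one RPC per row; a slightly cheaper variant (a sketch under the same old HTable API, reusing the table and scanner from the method above) collects the deletes into batches:

  List<Delete> batch = new ArrayList<Delete>(1000);
  for (Result res : scanner) {
    batch.add(new Delete(res.getRow()));
    if (batch.size() >= 1000) {
      table.delete(batch); // HTable.delete(List) drops the entries that succeeded
      batch.clear();
    }
  }
  if (!batch.isEmpty()) {
    table.delete(batch);
  }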
Example #14
 public static void readHtable() throws Exception {
   Job job = new Job(conf, "ExampleRead");
   job.setJarByClass(HbaseMR.class);
   Scan scan = new Scan();
   scan.setCaching(500); // 1 is the default in Scan, which will be bad for
   // MapReduce jobs
   scan.setCacheBlocks(false); // don't set to true for MR jobs
   TableMapReduceUtil.initTableMapperJob(
       "", // input HBase table name
       scan, // Scan instance to control CF and attribute selection
       MyMapper.class, // mapper
       null, // mapper output key
       null, // mapper output value
       job);
   job.setOutputFormatClass(NullOutputFormat.class); // because we aren't emitting anything from the mapper
   boolean b = job.waitForCompletion(true);
   if (!b) {
     throw new IOException("error with job!");
   }
 }
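MyMapper is not shown; since this job writes nothing (NullOutputFormat), a hypothetical mapper only needs to do its per-row work, for example bumping a counter:

 public static class MyMapper extends TableMapper<Text, Text> {
   @Override
   protected void map(ImmutableBytesWritable row, Result value, Context context)
       throws IOException, InterruptedException {
     // Process the row here; this sketch just counts the rows it sees.
     context.getCounter("example", "rows").increment(1);
   }
 }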
Example #15
  /**
   * Note that by default the stop row of a scan is exclusive.
   *
   * @param rowRange start row and (optional) stop row
   * @param filterList filters to apply to the scan
   * @param family column families; only family[0] is used when columns are given
   * @param columns qualifiers to select from family[0], or null to select all columns
   * @param maxVersion maximum number of versions to return, if positive
   * @return a ResultScanner over the matching rows, or null if the scan could not be created
   * @throws Exception if no table handle or cache size has been set
   */
  public ResultScanner getResultSet(
      String[] rowRange, FilterList filterList, String[] family, String[] columns, int maxVersion)
      throws Exception {
    if (table == null) throw new Exception("No table handler");
    if (cacheSize < 0) throw new Exception("should set cache size before scanning");

    Scan scan = null;
    ResultScanner rscanner = null;

    try {
      scan = new Scan();

      scan.setCaching(this.cacheSize);
      scan.setCacheBlocks(blockCached);
      scan.setFilter(filterList);
      if (maxVersion > 0) scan.setMaxVersions(maxVersion);

      // the scan excludes the stop row, so adjust the stop row if it should be included
      if (rowRange != null) {
        scan.setStartRow(rowRange[0].getBytes());
        if (rowRange.length == 2 && rowRange[1] != null) scan.setStopRow((rowRange[1]).getBytes());
      }

      if (columns != null) {
        for (int i = 0; i < columns.length; i++) {
          scan.addColumn(family[0].getBytes(), columns[i].getBytes());
        }
      }

      rscanner = this.table.getScanner(scan);

    } catch (Exception e) {
      e.printStackTrace();
    }

    return rscanner;
  }
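If the caller does want the stop row itself returned, a common workaround (a sketch independent of the class above, using hypothetical row keys) is to pass the smallest key that sorts immediately after it, obtained by appending a zero byte:

  // "row-999" is exclusive by default; "row-999\x00" makes it effectively inclusive.
  byte[] inclusiveStop = Bytes.add(Bytes.toBytes("row-999"), new byte[] {0});
  Scan scan = new Scan();
  scan.setStartRow(Bytes.toBytes("row-000"));
  scan.setStopRow(inclusiveStop);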
  @Test
  public void shouldRunMapReduce() throws Exception {
    // given
    Configuration configuration = HBaseConfiguration.create();

    TableFactory.recreateTable(
        configuration, Bytes.toString(UsersDao.TABLE_NAME), Bytes.toString(UsersDao.FAMILY_NAME));
    UserDataFactory.insertTestData();

    // map reduce
    Job job = new Job(configuration, "Count Users");
    job.setJarByClass(CountUsersMapper.class);

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addColumn(UsersDao.FAMILY_NAME, UsersDao.FORENAME_COL);

    // mapper
    TableMapReduceUtil.initTableMapperJob(
        Bytes.toString(UsersDao.TABLE_NAME),
        scan,
        CountUsersMapper.class,
        ImmutableBytesWritable.class,
        Result.class,
        job);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    // when
    boolean succeeded = job.waitForCompletion(true);

    // then
    assertThat(succeeded).isTrue();
    assertThat(job.getCounters().findCounter(CountUsersMapper.Counters.USER_COUNT).getValue())
        .isGreaterThan(99);
  }
  public static boolean scanTable(Configuration config) throws IOException, URISyntaxException {

    /** Connection to the cluster. A single connection shared by all application threads. */
    Connection connection = null;
    /** A lightweight handle to a specific table. Used from a single thread. */
    Table table = null;

    Scan scan = null;

    ResultScanner scanner = null;

    String[] columns = null;

    try {
      // establish the connection to the cluster.
      connection = ConnectionFactory.createConnection(config);
      // retrieve a handle to the target table.
      table = connection.getTable(TABLE_NAME1);
      // Instantiating the Scan class
      scan = new Scan();
      // Recommended tuning Ch12 HBase Performance Tuning
      scan.setCaching(5000); // Defined rows returned from server during next()
      scan.setCacheBlocks(false); // Disable server side cache
      // Scanning the required columns
      scan.addFamily(CF);
      // Getting the scan result
      scanner = table.getScanner(scan);

      System.out.println("Rows founded");
      StringBuffer line = new StringBuffer();

      int counter = 0;
      // Reading values from scan result
      for (Result result = scanner.next(); result != null; result = scanner.next()) {
        columns = getColumnsInColumnFamily(result, CF);

        // clean line
        //		    	line.delete(0, line.length());

        line.append(Bytes.toString(result.getRow()));
        for (String column : columns) {
          line.append("^" + Bytes.toString(result.getValue(CF, Bytes.toBytes(column))));
        }
        line.append("\n");
        counter++;
        System.out.println("Linea: " + counter);

      }

      DistributedFileSystem hdfs = new DistributedFileSystem();
      hdfs.initialize(new URI("hdfs://quickstart.cloudera:8020"), config);

      Path homeDir = hdfs.getHomeDirectory();
      // Print the home directory
      System.out.println("Home folder HDFS-" + homeDir);

      Path newFilePath = new Path(homeDir + "/datatest/" + TABLE_NAME + "/file.csv");

      byte[] byt = line.toString().getBytes();
      FSDataOutputStream fsOutStream = hdfs.create(newFilePath);
      fsOutStream.write(byt);

      fsOutStream.close();
      hdfs.close();

      System.out.println("File exported to " + homeDir + "/datatest/" + TABLE_NAME + "/file.csv");

    } finally {
      // close everything down, innermost first
      if (scanner != null) scanner.close();
      if (table != null) table.close();
      if (connection != null) connection.close();
    }
    return true;
  }
  public static void main(String[] args)
      throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length == 0) {
      System.out.println(
          "ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} {compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
      return;
    }

    String table = args[0];
    String columnFamily = args[1];
    String outputPath = args[2];
    String compressionCodec = args[3];
    String schemaFilePath = args[4];
    String delimiter = args[5];

    String rowKeyColumn = "";
    if (args.length > 6) {
      rowKeyColumn = args[6];
    }

    Job job = Job.getInstance();
    job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);

    HBaseConfiguration.addHbaseResources(job.getConfiguration());

    job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
    job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
    job.getConfiguration().set(DELIMITER_CONF, delimiter);

    job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
    job.setJobName("ExportHBaseTableToDelimiteredSeq ");

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for
    // MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.addFamily(Bytes.toBytes(columnFamily));

    TableMapReduceUtil.initTableMapperJob(
        table, // input HBase table name
        scan, // Scan instance to control CF and attribute selection
        MyMapper.class, // mapper
        null, // mapper output key
        null, // mapper output value
        job);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if (compressionCodec.equals("snappy")) {
      SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
    } else if (compressionCodec.equals("gzip")) {
      SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
      // nothing
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(0);

    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
  }
Example #19
  private PageBean getDataMapTemp(SearchParam searchParam, Integer currentPage, Integer pageSize)
      throws IOException {
    String tableName = searchParam.getTableName();
    String startRow = searchParam.getStartKey();
    String stopRow = searchParam.getEndKey();

    List<Map<String, String>> mapList = null;
    mapList = new LinkedList<Map<String, String>>();
    ResultScanner scanner = null;
    // wrapper object created for pagination; its fields are set below
    PageBean tbData = new PageBean();
    Scan scan = null;
    HbaseWhereEngine whereEngine = null;
    HTableInterface table = HbaseTool.getTable(tableName); // obtain the table handle
    if ("ca_summary_optimize".equals(tableName) && compoundFieldMap.isEmpty()) {
      compoundFieldMap = CompoundFieldConfigService.getParentComField(tableName);
    }
    try {
      if (StringUtils.isNotEmpty(searchParam.getWhere())) {
        whereEngine =
            new HbaseWhereEngine(tableName, searchParam.getFamily(), searchParam.getWhere());
      }

      String[] selectArray = null;
      if (StringUtils.isNotEmpty(searchParam.getSelect())) {
        selectArray = searchParam.getSelect().split(",");
      }
      byte[] cf = Bytes.toBytes(searchParam.getFamily());

      // if keys is not null, we know exactly which records to select
      if (StringUtils.isNotEmpty(searchParam.getKeys())) {
        List<Get> getKeysList = new ArrayList<Get>();

        for (String key : searchParam.getKeys().split(",")) {
          Get get = new Get(Bytes.toBytes(key));
          getKeysList.add(get);

          if (selectArray != null) {
            for (String field : selectArray) {
              String temp[] = processField(field, searchParam.getFamily());
              if ("ca_summary_optimize".equals(tableName)
                  && compoundFieldMap.containsKey(temp[1])) {
                get.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(compoundFieldMap.get(temp[1])));
              } else {
                get.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(temp[1]));
              }
            }
          }

          if (selectArray != null && whereEngine != null) {
            Set<String> varSet = whereEngine.getVarSet();
            for (String var : varSet) {
              get.addColumn(cf, Bytes.toBytes(var));
            }
          }
        }

        Result[] resultsFromKeys = table.get(getKeysList);

        for (Result rr : resultsFromKeys) {
          if (!rr.isEmpty()) {
            if (whereEngine != null && !whereEngine.meetCondition(rr)) {
              continue;
            }
            Map<String, String> map = new TreeMap<String, String>();

            map.put("_id", Bytes.toString(rr.getRow()));
            for (String field : selectArray) {
              String value = HbaseWhereEngine.getColumnValue(tableName, cf, rr, field);
              if (!field.equals("id")) {
                map.put(field, value);
              }
            }
            mapList.add(map);
          }
        }
        pageSize = mapList.size();
        tbData.setCurrentPage(currentPage);
        tbData.setLength(pageSize);
        tbData.setTotalRecords(mapList.size());
      } else { // if keys is null, select records between the start key and end key, or the top pageSize rows
        // determine the maximum number of results to return
        if (pageSize == null || pageSize == 0L) pageSize = 100;
        if (currentPage == null || currentPage == 0) currentPage = 1;
        // compute the start and end offsets of the requested page

        Integer firstPage = (currentPage - 1) * pageSize;

        Integer endPage = firstPage + pageSize;

        scan = getScan(startRow, stopRow);
        // attach filters to the scan (true indicates pagination; the method is given below)
        scan.setFilter(packageFilters(searchParam, true));

        // cache 1,000 rows per RPC
        scan.setCaching(1000);
        scan.setCacheBlocks(false);

        if (selectArray != null) {
          for (String field : selectArray) {
            String temp[] = processField(field, searchParam.getFamily());
            if ("ca_summary_optimize".equals(tableName) && compoundFieldMap.containsKey(temp[1])) {
              scan.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(compoundFieldMap.get(temp[1])));
            } else {
              scan.addColumn(Bytes.toBytes(temp[0]), Bytes.toBytes(temp[1]));
            }
          }
        }

        if (selectArray != null && whereEngine != null) {
          Set<String> varSet = whereEngine.getVarSet();
          for (String var : varSet) {
            scan.addColumn(cf, Bytes.toBytes(var));
          }
        }

        scanner = table.getScanner(scan);
        int i = 0;
        List<byte[]> rowList = new LinkedList<byte[]>();
        // iterate over the scanner and collect the row keys of the rows to return
        for (Result result : scanner) {
          String row = toStr(result.getRow());
          if (i >= firstPage && i < endPage) { // filter firstPage
            rowList.add(getBytes(row));
          }
          if (i >= endPage) {
            break;
          }
          i++;
        }
        // build Get objects for the collected row keys
        List<Get> getList = getList(rowList, cf, selectArray, tableName);
        Result[] results = table.get(getList);
        for (Result result : results) {
          if (whereEngine != null && !whereEngine.meetCondition(result)) {
            continue;
          }
          Map<byte[], byte[]> fmap = packFamilyMap(tableName, result, cf, selectArray);
          Map<String, String> rmap = packRowMap(fmap);
          rmap.put("_id", toStr(result.getRow()));
          mapList.add(rmap);
        }

        // populate the pagination object

        tbData.setCurrentPage(currentPage);
        tbData.setLength(pageSize); // (pageSize);
        tbData.setTotalRecords(i);
      }
      tbData.setResults(mapList);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      if (table != null) {
        table.close();
      }
      closeScanner(scanner);
    }
    compoundFieldMap.clear();
    return tbData;
  }
Example #20
  private ResultScanner createHBaseResultScanner(RecordScan scan)
      throws RepositoryException, InterruptedException {
    Scan hbaseScan = new Scan();

    hbaseScan.setMaxVersions(1);

    if (scan.getRawStartRecordId() != null) {
      hbaseScan.setStartRow(scan.getRawStartRecordId());
    } else if (scan.getStartRecordId() != null) {
      hbaseScan.setStartRow(scan.getStartRecordId().toBytes());
    }

    if (scan.getRawStopRecordId() != null) {
      hbaseScan.setStopRow(scan.getRawStopRecordId());
    } else if (scan.getStopRecordId() != null) {
      hbaseScan.setStopRow(scan.getStopRecordId().toBytes());
    }

    // Filters
    FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);

    // filter out deleted records
    filterList.addFilter(REAL_RECORDS_FILTER);

    // add user's filter
    if (scan.getRecordFilter() != null) {
      Filter filter = filterFactory.createHBaseFilter(scan.getRecordFilter(), this, filterFactory);
      filterList.addFilter(filter);
    }

    hbaseScan.setFilter(filterList);

    hbaseScan.setCaching(scan.getCaching());

    hbaseScan.setCacheBlocks(scan.getCacheBlocks());

    ReturnFields returnFields = scan.getReturnFields();
    if (returnFields != null && returnFields.getType() != ReturnFields.Type.ALL) {
      RecordDecoder.addSystemColumnsToScan(hbaseScan);
      switch (returnFields.getType()) {
        case ENUM:
          for (QName field : returnFields.getFields()) {
            FieldTypeImpl fieldType = (FieldTypeImpl) typeManager.getFieldTypeByName(field);
            hbaseScan.addColumn(RecordCf.DATA.bytes, fieldType.getQualifier());
          }
          break;
        case NONE:
          // nothing to add
          break;
        default:
          throw new RuntimeException("Unrecognized ReturnFields type: " + returnFields.getType());
      }
    } else {
      hbaseScan.addFamily(RecordCf.DATA.bytes);
    }

    ResultScanner hbaseScanner;
    try {
      hbaseScanner = recordTable.getScanner(hbaseScan);
    } catch (IOException e) {
      throw new RecordException("Error creating scanner", e);
    }
    return hbaseScanner;
  }
  public static void main(String args[]) throws Exception {

    try {

      Configuration config = HBaseConfiguration.create();
      config.set("mapred.output.dir", "/mnt/data/workspace/weatheranalysis/mapred/monthly");
      Job job = new Job(config, "MonthlySummary");

      String sourceTable = DailyWeatherHbaseOperations.TABLE_NAME;

      Scan scan = new Scan();
      scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
      scan.setCacheBlocks(false); // don't set to true for MR jobs
      // set other scan attrs

      TableMapReduceUtil.initTableMapperJob(
          sourceTable, // input table
          scan, // Scan instance to control CF and attribute selection
          MonthlyWeatherMapper.class, // mapper class
          Text.class, // mapper output key
          Text.class, // mapper output value
          job);

      job.setReducerClass(MonthlyWeatherReducer.class); // reducer class
      job.setNumReduceTasks(1); // at least one, adjust as required

      Path out = new Path("/mnt/data/workspace/weatheranalysis/mapred/monthly");
      File outDir = new File(out.toString());
      FileUtil.fullyDelete(outDir);
      FileOutputFormat.setOutputPath(job, out);

      MonthlyWeatherHbaseOperations.useTable();

      // The required total precipitation for a particular station for a year:
      /*
       * List<Float> ForAllMonthsPpt = new ArrayList<Float>();
       * List<MonthlyWeather> lr = MonthlyWeatherHbaseOperations.get(425010, 2013);
       * while (lr.isEmpty()) {
       *   ForAllMonthsPpt.add(lr.get(4));
       * }
       */
      boolean b = job.waitForCompletion(true);
      if (!b) {
        throw new IOException("error with job!");
      }
      System.out.println("Job Completed.");

    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
  }
Example #22
  @Override
  public Scan getRawScan(ItemData d, Map<String, HCompareOp> ops) {
    int startShopID = 0;
    int startItemID = 0;

    int endShopID = MAX_SHOP_ID;
    int endItemID = MAX_ITEM_ID;

    // some performance improvement
    // shop_id is specified
    HCompareOp shopIDOp = ops.get("shop_id");
    if (shopIDOp == HCompareOp.EQUAL) {
      startShopID = d.shopID;
      endShopID = startShopID;
    }

    // item_id is also specified
    HCompareOp itemIDOp = ops.get("item_id");
    if (itemIDOp == HCompareOp.EQUAL) {
      startItemID = d.itemID;
      endItemID = startItemID;
    }

    log.info(String.format("scan start row, shop_id=%d, item_id=%d", startShopID, startItemID));
    log.info(String.format("scan stop row, shop_id=%d, item_id=%d", endShopID, endItemID));

    byte[] startRow = encodeRowkey(startShopID, startItemID);
    byte[] endRow = encodeRowkey(endShopID, endItemID);
    Scan s = new Scan(startRow, endRow);
    s.addFamily(DATA_FAMILY);
    s.addFamily(META_FAMILY);
    s.setCacheBlocks(false);
    s.setMaxVersions();
    s.setCaching(DEFAULT_SCAN_CACHE);

    FilterList fl = new FilterList();
    for (String column : ops.keySet()) {
      byte[] value;
      byte[] family = DATA_FAMILY;

      if ("ctime".equals(column)) {
        value = Bytes.toBytes(d.ctime);
        family = META_FAMILY;

      } else if ("shop_id".equals(column)) {
        value = Bytes.toBytes(d.shopID);

      } else if ("item_id".equals(column)) {
        value = Bytes.toBytes(d.itemID);

      } else if ("genre_id".equals(column)) {
        value = Bytes.toBytes(d.genreID);

      } else if ("price".equals(column)) {
        value = Bytes.toBytes(d.price);

      } else if ("full_item_url".equals(column)) {
        value = Bytes.toBytes(d.fullItemUrl);

      } else if ("item_name".equals(column)) {
        value = Bytes.toBytes(d.itemName);

      } else {
        // ignore
        continue;
      }

      byte[] qualifier = Bytes.toBytes(column);
      HCompareOp hop = ops.get(column);
      CompareOp op = HClient.toCompareOp(hop);

      SingleColumnValueFilter filter = new SingleColumnValueFilter(family, qualifier, op, value);
      filter.setFilterIfMissing(true);
      fl.addFilter(filter);
    }

    s.setFilter(fl);
    return s;
  }
Example #23
  /**
   * Get an estimate of the number of rows and bytes per row in regions between startRowKey and
   * endRowKey. The more store files there are the more this will be off. Also, this does not take
   * into account any rows that are in the memstore.
   *
   * <p>The values computed here should be cached so that in high qps workloads the nn is not
   * overwhelmed. Could be done in load(); Synchronized to make sure that only one thread at a time
   * is using the htable.
   *
   * @param startRowKey First row key in the range
   * @param endRowKey Last row key in the range
   * @return The estimated number of rows in the regions between the row keys (first) and the
   *     estimated row size in bytes (second).
   */
  public synchronized Pair<Long, Long> getEstimatedRowStats(byte[] startRowKey, byte[] endRowKey) {
    Preconditions.checkNotNull(startRowKey);
    Preconditions.checkNotNull(endRowKey);

    long rowSize = 0;
    long rowCount = 0;
    long hdfsSize = 0;
    boolean isCompressed = false;

    try {
      // Check to see if things are compressed.
      // If they are we'll estimate a compression factor.
      if (columnFamilies_ == null) {
        columnFamilies_ = hTable_.getTableDescriptor().getColumnFamilies();
      }
      Preconditions.checkNotNull(columnFamilies_);
      for (HColumnDescriptor desc : columnFamilies_) {
        isCompressed |= desc.getCompression() != Compression.Algorithm.NONE;
      }

      // For every region in the range.
      List<HRegionLocation> locations = getRegionsInRange(hTable_, startRowKey, endRowKey);
      for (HRegionLocation location : locations) {
        long currentHdfsSize = 0;
        long currentRowSize = 0;
        long currentRowCount = 0;

        HRegionInfo info = location.getRegionInfo();
        // Get the size on hdfs
        currentHdfsSize += getHdfsSize(info);

        Scan s = new Scan(info.getStartKey());
        // Get a small sample of rows
        s.setBatch(ROW_COUNT_ESTIMATE_BATCH_SIZE);
        // Try and get every version so the row's size can be used to estimate.
        s.setMaxVersions(Short.MAX_VALUE);
        // Don't cache the blocks as we don't think these are
        // necessarily important blocks.
        s.setCacheBlocks(false);
        // Try and get deletes too so their size can be counted.
        s.setRaw(true);
        ResultScanner rs = hTable_.getScanner(s);
        try {
          // And get the ROW_COUNT_ESTIMATE_BATCH_SIZE fetched rows
          // for a representative sample
          for (int i = 0; i < ROW_COUNT_ESTIMATE_BATCH_SIZE; i++) {
            Result r = rs.next();
            if (r == null) break;
            currentRowCount += 1;
            for (KeyValue kv : r.list()) {
              // some extra row size added to make up for shared overhead
              currentRowSize +=
                  kv.getRowLength() // row key
                      + 4 // row key length field
                      + kv.getFamilyLength() // Column family bytes
                      + 4 // family length field
                      + kv.getQualifierLength() // qualifier bytes
                      + 4 // qualifier length field
                      + kv.getValueLength() // length of the value
                      + 4 // value length field
                      + 10; // extra overhead for hfile index, checksums, metadata, etc
            }
          }
          // add these values to the cumulative totals in one shot just
          // in case there was an error in between getting the hdfs
          // size and the row/column sizes.
          hdfsSize += currentHdfsSize;
          rowCount += currentRowCount;
          rowSize += currentRowSize;
        } finally {
          rs.close();
        }
      }
    } catch (IOException ioe) {
      // Print the stack trace, but we'll ignore it
      // as this is just an estimate.
      // TODO: Put this into the per query log.
      LOG.error("Error computing HBase row count estimate", ioe);
    }

    // If there are no rows then no need to estimate.
    if (rowCount == 0) return new Pair<Long, Long>(0L, 0L);

    // if something went wrong then set a signal value.
    if (rowSize <= 0 || hdfsSize <= 0) return new Pair<Long, Long>(-1L, -1L);

    // estimate the number of rows.
    double bytesPerRow = rowSize / (double) rowCount;
    long estimatedRowCount = (long) ((isCompressed ? 2 : 1) * (hdfsSize / bytesPerRow));

    return new Pair<Long, Long>(estimatedRowCount, (long) bytesPerRow);
  }
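As an illustration with made-up numbers: if the sampled rows total 64 KB across 1,024 rows, bytesPerRow comes out to 64; with 640 MB of uncompressed data on HDFS the method would estimate roughly 10 million rows, and about twice that if any column family is compressed.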