@Override
public void searchDB(String keyword) {
  long t0 = System.nanoTime();
  try {
    // First MapReduce phase setup
    HBaseConfiguration conf = config;
    // Hand the search keyword to the mappers; the property name "search.keyword"
    // is an assumption -- the original code never passed the keyword to the jobs.
    conf.set("search.keyword", keyword);
    Job job = new Job(conf, "MapReducePhase1");
    job.setJarByClass(MapReduceHbaseDB.class);
    Scan scan = new Scan();
    scan.addColumns("myColumnFamily");
    scan.setCaching(10000);

    // Second MapReduce phase setup
    HBaseConfiguration conf2 = new HBaseConfiguration();
    conf2.set("search.keyword", keyword);
    Job job2 = new Job(conf2, "MapReducePhase2");
    job2.setJarByClass(MapReduceHbaseDB.class);
    Scan scan2 = new Scan();
    scan2.addColumns("resultF");
    scan2.setCaching(10000);

    // Execution of the first MapReduce phase
    TableMapReduceUtil.initTableMapperJob(
        "myTable", scan, Mapper1.class, Text.class, Text.class, job);
    TableMapReduceUtil.initTableReducerJob("result", Reducer1.class, job);
    job.waitForCompletion(true);
    long t2 = System.nanoTime();

    // Execution of the second MapReduce phase
    TableMapReduceUtil.initTableMapperJob(
        "result", scan2, Mapper2.class, Text.class, IntWritable.class, job2);
    TableMapReduceUtil.initTableReducerJob("result2", Reducer2.class, job2);
    job2.waitForCompletion(true);
    long t1 = System.nanoTime();

    double totalTime = (t1 - t0) / 1000000000.0;
    System.out.println("Total time for the search : " + totalTime + " seconds");
    double firstPhaseTime = (t2 - t0) / 1000000000.0;
    System.out.println("Time for the first mapreduce phase : " + firstPhaseTime + " seconds");
    double secondPhaseTime = (t1 - t2) / 1000000000.0;
    // Fixed label: the original printed "first" here for the second phase.
    System.out.println("Time for the second mapreduce phase : " + secondPhaseTime + " seconds");
  } catch (IOException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  }
}
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
  // First argument is the table name.
  String tableName = args[0];
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(CachingRowCounter.class);
  // Columns are space delimited
  StringBuilder sb = new StringBuilder();
  final int columnoffset = 1;
  for (int i = columnoffset; i < args.length; i++) {
    if (i > columnoffset) {
      sb.append(" ");
    }
    sb.append(args[i]);
  }

  Scan scan = new Scan();
  scan.setFilter(new FirstKeyOnlyFilter());
  if (sb.length() > 0) {
    for (String columnName : sb.toString().split(" ")) {
      String[] fields = columnName.split(":");
      if (fields.length == 1) {
        scan.addFamily(Bytes.toBytes(fields[0]));
      } else {
        scan.addColumn(Bytes.toBytes(fields[0]), Bytes.toBytes(fields[1]));
      }
    }
  }
  scan.setCaching(100);

  job.setOutputFormatClass(NullOutputFormat.class);
  TableMapReduceUtil.initTableMapperJob(
      tableName, scan, RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(0);
  return job;
}
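Several of these drivers (here, and the test snippets further down that read RowCounterMapper.Counters.ROWS) wire in a RowCounterMapper with ImmutableBytesWritable/Result output types but never show its body. A minimal sketch, modeled on the mapper shipped with HBase's own RowCounter tool; only the counter name is taken from the surrounding snippets, the rest is an assumption:

import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;

// Sketch: counts rows by bumping a counter and emits no map output, which is
// why the drivers pair it with NullOutputFormat and zero reduce tasks.
public static class RowCounterMapper extends TableMapper<ImmutableBytesWritable, Result> {
  public static enum Counters { ROWS }

  @Override
  public void map(ImmutableBytesWritable row, Result values, Context context)
      throws IOException {
    // One increment per row; the FirstKeyOnlyFilter on the Scan keeps the data
    // shipped per row down to a single cell.
    context.getCounter(Counters.ROWS).increment(1);
  }
}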
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  conf.set("es.nodes", ES_NODES);
  conf.set("es.resource", ES_RESOURCE);
  conf.set("es.mapping.id", HBaseTableMapper.ID_FIELD.toString());
  conf.set("es.batch.size.bytes", "10mb");
  conf.set("es.batch.size.entries", "10000");
  conf.set("es.batch.write.refresh", "false");

  Job job = new Job(conf);
  job.setJarByClass(BulkIndex.class);
  job.setMapperClass(HBaseTableMapper.class);
  job.setNumReduceTasks(0);
  // The original also called setSpeculativeExecution(false) on a throwaway
  // JobConf, which had no effect; disabling it on the job itself is enough.
  job.setSpeculativeExecution(false);
  job.setOutputFormatClass(BulkProcessorOutputFormat.class);
  job.setMapOutputValueClass(Text.class);

  Scan scan = new Scan();
  scan.setCaching(1000);
  scan.setCacheBlocks(false);

  TableMapReduceUtil.initTableMapperJob(
      BulkLoad.HBASE_TABLE_NAME,
      scan,
      HBaseTableMapper.class,
      NullWritable.class,
      MapWritable.class,
      job);

  job.waitForCompletion(true);
}
/** Job configuration. */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  Scan scan = new Scan();
  scan.addFamily(Cw09Constants.CF_FREQUENCIES_BYTES);
  scan.setBatch(Cw09Constants.CW09_INDEX_SCAN_BATCH);
  conf.set("mapred.map.tasks.speculative.execution", "false");
  conf.set("mapred.reduce.tasks.speculative.execution", "false");

  Job job = Job.getInstance(conf, "Count the total frequency of each term in the index table");
  job.setJarByClass(TermHitsCounter.class);
  // TableMapReduceUtil.initTableMapperJob(Constants.CLUEWEB09_INDEX_TABLE_NAME, scan,
  //     ThcMapper.class, Text.class, LongWritable.class, job);
  TableMapReduceUtil.initTableMapperJob(
      Cw09Constants.CLUEWEB09_INDEX_TABLE_NAME,
      scan,
      ThcMapper.class,
      Text.class,
      LongWritable.class,
      job,
      true,
      CustomizedSplitTableInputFormat.class);
  job.setCombinerClass(ThcCombiner.class);
  TableMapReduceUtil.initTableReducerJob(
      Cw09Constants.CLUEWEB09_TERM_COUNT_TABLE_NAME, ThcReducer.class, job);
  job.setNumReduceTasks(40);
  return job;
}
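The driver above registers ThcCombiner between the Text/LongWritable map output and the table reducer, but the class body is not shown in the source. A plausible minimal sketch, assuming the combiner simply pre-sums per-term hit counts on the map side:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch: pre-aggregates term counts locally to cut shuffle volume. The real
// ThcCombiner is not in the source; plain summing is an assumption.
public static class ThcCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
  private final LongWritable total = new LongWritable();

  @Override
  protected void reduce(Text term, Iterable<LongWritable> counts, Context context)
      throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable count : counts) {
      sum += count.get();
    }
    total.set(sum);
    context.write(term, total);
  }
}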
private void testMapReduceInternal(
    long origKeyPrefix,
    Scan scan,
    int numValues,
    int startWithValue,
    int seekIntervalMinValue,
    int seekIntervalMaxValue)
    throws IOException, InterruptedException, ClassNotFoundException {
  int valuesCountInSeekInterval = writeTestData(
      origKeyPrefix, numValues, startWithValue, seekIntervalMinValue, seekIntervalMaxValue);

  // Reading data
  Configuration conf = testingUtility.getConfiguration();
  Job job = new Job(conf, "testMapReduceInternal()-Job");
  job.setJarByClass(this.getClass());
  TableMapReduceUtil.initTableMapperJob(
      TABLE_NAME, scan, RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);

  // Substituting standard TableInputFormat which was set in
  // TableMapReduceUtil.initTableMapperJob(...)
  job.setInputFormatClass(WdTableInputFormat.class);
  keyDistributor.addInfo(job.getConfiguration());

  job.setOutputFormatClass(NullOutputFormat.class);
  job.setNumReduceTasks(0);

  boolean succeeded = job.waitForCompletion(true);
  Assert.assertTrue(succeeded);

  long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
  Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}
@Test
public void shouldJoinTables() throws Exception {
  // given
  Job job = new Job(configuration, "Joins");
  job.setJarByClass(AverageRatingMapper.class);

  List<Scan> scans = new ArrayList<>();
  Scan scan1 = new Scan();
  scan1.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(LoadMovieData.TABLE_NAME));
  scans.add(scan1);
  Scan scan2 = new Scan();
  scan2.setAttribute(
      Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(LoadMovieRatingData.TABLE_NAME));
  scans.add(scan2);

  TableMapReduceUtil.initTableMapperJob(scans, FilterMapper.class, null, null, job);
  // FileOutputFormat.setOutputPath(job, new Path("/tmp/sages/movies_with_ratings_" +
  //     System.currentTimeMillis()));
  TableMapReduceUtil.initTableReducerJob(TABLE_NAME, null, job);
  job.setNumReduceTasks(0);

  // when
  boolean succeeded = job.waitForCompletion(true);

  // then
  assertThat(succeeded).isTrue();
}
public static void main(String[] args) throws Exception {
  Configuration con = new Configuration();
  String[] otherArgs = new GenericOptionsParser(con, args).getRemainingArgs();
  // Build the HBase configuration on top of the parsed one, so generic options
  // (-D, -conf, ...) are not silently dropped as they were in the original.
  HBaseConfiguration conf = new HBaseConfiguration(con);
  Job job = new Job(conf, "AverageCalc");
  job.setJarByClass(AverageCalculator.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  scan.addFamily(Bytes.toBytes("Post"));
  FilterList li = new FilterList(FilterList.Operator.MUST_PASS_ALL);
  SingleColumnValueFilter filter = new SingleColumnValueFilter(
      Bytes.toBytes("Post"), Bytes.toBytes("PostTypeId"), CompareOp.EQUAL, Bytes.toBytes("1"));
  li.addFilter(filter);
  scan.setFilter(li);

  FileOutputFormat.setOutputPath(job, new Path(otherArgs[0]));
  job.setOutputKeyClass(Text.class);
  TableMapReduceUtil.initTableMapperJob(
      "bigd24-hbase-sample", scan, Mapper1.class, Text.class, IntWritable.class, job);
  job.setReducerClass(Reducer1.class);
  job.setOutputValueClass(FloatWritable.class);
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static void htableFile() throws Exception {
  Job job = new Job(conf, "ExampleSummaryToFile");
  job.setJarByClass(HbaseMR.class); // class that contains mapper

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  // set other scan attrs

  TableMapReduceUtil.initTableMapperJob(
      "sourceTable", // input table
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper class
      Text.class, // mapper output key
      IntWritable.class, // mapper output value
      job);
  job.setReducerClass(MyReducer4.class); // reducer class
  job.setNumReduceTasks(1); // at least one, adjust as required
  // The original passed a fresh JobConf here, which configured the wrong object;
  // the output path must be set on the job being submitted.
  FileOutputFormat.setOutputPath(job, new Path("/tmp/mr/mySummaryFile")); // adjust directories as required

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
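MyReducer4 is referenced above but not shown. In the stock summary-to-file pattern this snippet follows, the reducer just sums the per-key counts emitted by the mapper and writes them to the output file; a minimal sketch under that assumption:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch: totals the IntWritable counts per key and writes "key <TAB> total"
// records. The real MyReducer4 body is not in the source.
public static class MyReducer4 extends Reducer<Text, IntWritable, Text, IntWritable> {
  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();
    }
    context.write(key, new IntWritable(sum));
  }
}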
public static void readWriteHtable() throws Exception {
  Job job = new Job(conf, "ExampleReadWrite");
  job.setJarByClass(HbaseMR.class); // class that contains mapper

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  // set other scan attrs

  TableMapReduceUtil.initTableMapperJob(
      "sourceTable", // input table
      scan, // Scan instance to control CF and attribute selection
      MyMapper2.class, // mapper class
      null, // mapper output key
      null, // mapper output value
      job);
  TableMapReduceUtil.initTableReducerJob(
      "targetTable", // output table
      null, // reducer class
      job);
  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
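For a map-only copy job like this one, the mapper has to emit Puts keyed by row so the TableOutputFormat installed by initTableReducerJob can write them to targetTable. A sketch along the lines of the standard read/write example, using the same old-style API (KeyValue, Result.raw()) as the rest of this snippet; the actual MyMapper2 body is not in the source:

import java.io.IOException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;

// Sketch: copies every cell of each scanned row into a Put for the target table.
public static class MyMapper2 extends TableMapper<ImmutableBytesWritable, Put> {
  @Override
  public void map(ImmutableBytesWritable row, Result value, Context context)
      throws IOException, InterruptedException {
    Put put = new Put(row.get());
    for (KeyValue kv : value.raw()) {
      put.add(kv); // keep family, qualifier, timestamp and value as-is
    }
    context.write(row, put);
  }
}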
/** Job configuration. */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  String tableName = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String rowKeyType = args[3];
  conf.set("row.key.type", rowKeyType);
  conf.set("table.name", tableName);

  Scan scan = new Scan();
  scan.addFamily(Bytes.toBytes(columnFamily));
  scan.setBatch(ConstantsTruthy.TRUTHY_TABLE_SCAN_BATCH);
  conf.set("mapred.map.tasks.speculative.execution", "false");
  conf.set("mapred.reduce.tasks.speculative.execution", "false");

  Job job = Job.getInstance(
      conf, "Count the column count and indexRecordSize for each row in " + tableName);
  job.setJarByClass(TruthyIndexFeatureCounter.class);
  TableMapReduceUtil.initTableMapperJob(
      tableName, scan, TfcMapper.class, Text.class, Text.class, job, true);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  TableMapReduceUtil.addDependencyJars(job);
  return job;
}
@Test
public void shouldRunMapReduce() throws Exception {
  // given
  Job job = new Job(configuration, "Average Rating");
  job.setJarByClass(AverageRatingMapper.class);

  Scan scan = new Scan();
  scan.setCaching(500);
  scan.setCacheBlocks(false);
  scan.addFamily(Bytes.toBytes(LoadMovieRatingData.FAMILY_NAME));

  TableMapReduceUtil.initTableMapperJob(
      LoadMovieRatingData.TABLE_NAME,
      scan,
      AverageRatingMapper.class,
      Text.class,
      DoubleWritable.class,
      job);
  job.setReducerClass(RatingExportReducer.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(
      job, new Path("/tmp/mr/mySummaryFile_" + System.currentTimeMillis()));

  // when
  boolean succeeded = job.waitForCompletion(true);

  // then
  assertThat(succeeded).isTrue();
}
public int run(String[] args) throws Exception {
  if (args.length != 1) {
    System.out.println("usage: CountRows <table name>");
    return 1;
  }
  Configuration conf = getConf();
  try {
    String tableName = args[0];
    LOG.info("Before map/reduce startup");
    Job job = new Job(conf, "query: count rows");
    job.setJarByClass(this.getClass());
    job.getConfiguration().set(TABLE_NAME, args[0]);

    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob(
        tableName, scan, CountRowMapper.class, ImmutableBytesWritable.class, Put.class, job);
    // TableMapReduceUtil.initTableReducerJob(tableName,
    //     IdentityTableReducer.class, job);
    job.setNumReduceTasks(0);

    LOG.info("Started " + tableName);
    job.waitForCompletion(true);
    LOG.info("After map/reduce completion");
  } catch (Exception e) {
    e.printStackTrace();
    return 1;
  }
  return 0;
}
public static void main(String[] args) throws Exception {
  Configuration conf = HBaseConfiguration.create();
  Job job = new Job(conf, SensorMR.class.getName() + "--<your name>"); // TODO
  job.setJarByClass(SensorMR.class);

  Scan scan = new Scan();
  scan.setFilter(new FirstKeyOnlyFilter());

  TableMapReduceUtil.initTableMapperJob(
      tableRawData, scan, Mapper1.class, ImmutableBytesWritable.class, FloatWritable.class, job);
  TableMapReduceUtil.initTableReducerJob(tableSummaryData, Reducer1.class, job);
  TableMapReduceUtil.addDependencyJars(job);
  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
public static Job startJob(String[] args) throws IOException {
  // args[0] = hbase table name
  // args[1] = zookeeper

  Configuration hConf = HBaseConfiguration.create(new Configuration());
  hConf.set("hbase.zookeeper.quorum", args[1]);
  hConf.set("scan.table", args[0]);
  hConf.set("hbase.zookeeper.property.clientPort", "2181");

  Scan scan = new Scan();
  // scan.setFilter(rowColBloomFilter());

  Job job = new Job(hConf);
  job.setJobName("BSBM-Q11-RepartitionJoin");
  job.setJarByClass(RepartitionJoinQ11.class);
  // Change caching to speed up the scan
  scan.setCaching(500);
  scan.setMaxVersions(200);
  scan.setCacheBlocks(false);

  // Mapper settings
  TableMapReduceUtil.initTableMapperJob(
      args[0], // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      RepartitionMapper.class, // mapper class
      CompositeKeyWritable.class, // mapper output key
      KeyValueArrayWritable.class, // mapper output value
      job);

  // Repartition settings
  job.setPartitionerClass(CompositePartitioner.class);
  job.setSortComparatorClass(CompositeSortComparator.class);
  job.setGroupingComparatorClass(CompositeGroupingComparator.class);

  // Reducer settings
  job.setReducerClass(SharedServices.RepartitionJoin_Reducer.class); // reducer class
  job.setNumReduceTasks(1); // at least one, adjust as required

  FileOutputFormat.setOutputPath(job, new Path("output/BSBMQ11"));

  try {
    // The original wrapped this in System.exit(...), which made the trailing
    // return unreachable; waiting and returning matches the method signature.
    job.waitForCompletion(true);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
  return job;
}
/**
 * @param args the command line arguments
 * @throws java.io.IOException
 */
public static void main(String[] args) throws Exception {
  // The original declared "Conf conf = new Conf()", which is not a Hadoop type;
  // an HBase-aware Configuration is presumably what was meant.
  Configuration conf = HBaseConfiguration.create();
  Job job = new Job(conf, "TweetsLanguage");
  Scan scan = new Scan();
  TableMapReduceUtil.initTableMapperJob(
      "hhscyber:tweets", scan, LanguageMapper.class, null, null, job);
  job.setNumReduceTasks(0);
  TableMapReduceUtil.initTableReducerJob("hhscyber:tweets_lang", null, job);
  job.waitForCompletion(true);
}
private static void createMapReduceJob(
    String tableNameToIndex, Configuration conf, int caching, int versions)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Set the details to TableInputFormat
  Scan s = new Scan();
  s.setCaching(caching);
  s.setMaxVersions(versions);
  conf.set(TableInputFormat.INPUT_TABLE, tableNameToIndex);

  Set<Entry<String, List<String>>> entrySet = cfs.entrySet();
  for (Entry<String, List<String>> entry : entrySet) {
    List<String> quals = entry.getValue();
    addColumn(quals, Bytes.toBytes(entry.getKey()), s);
  }

  Job job = new Job(conf, "CreateIndex");
  String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

  TableMapReduceUtil.initTableMapperJob(
      tableNameToIndex, // input table
      s, // Scan instance to control CF and attribute selection
      IndexCreationMapper.class, // mapper class
      ImmutableBytesWritable.class, // mapper output key
      Put.class, // mapper output value
      job);
  TableMapReduceUtil.initTableReducerJob(
      IndexUtils.getIndexTableName(tableNameToIndex), // output table
      null, // reducer class
      job);

  if (hfileOutPath != null) {
    HTable table = new HTable(conf, tableNameToIndex);
    job.setReducerClass(KeyValueSortReducer.class);
    Path outputDir = new Path(hfileOutPath);
    FileOutputFormat.setOutputPath(job, outputDir);
    HFileOutputFormat.configureIncrementalLoad(job, table);
  } else {
    job.setNumReduceTasks(0);
  }

  TableMapReduceUtil.addDependencyJars(
      job.getConfiguration(), com.google.common.base.Preconditions.class);
  job.waitForCompletion(true);
  assert job.isComplete();
}
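When the hfileOutPath branch is taken, the job only writes HFiles; they still have to be handed to the region servers afterwards. A hedged sketch of that follow-up step using the stock bulk-load tool from the same API era (this step is not present in the source, and the snippet assumes the conf, hfileOutPath, and tableNameToIndex variables from the method above, inside a context that may throw Exception):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

// Sketch: after waitForCompletion, move the generated HFiles into the index table.
LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
loader.doBulkLoad(
    new Path(hfileOutPath),
    new HTable(conf, IndexUtils.getIndexTableName(tableNameToIndex)));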
@Override
public int run(String[] strings) throws Exception {
  Configuration conf = new Configuration();
  // String inputFileName = "/cluster/gmm.seq";
  String outputFileName = "/cluster/matrix_intermediate_" + level + ".seq";
  int result;
  System.out.println("level:" + level);
  conf.set("level", level + "");

  String table = "ClusterDatabase";
  // String seqFileName = "/cluster/gmm.seq";

  Scan scan = new Scan();
  scan.setStartRow((level + "|").getBytes());
  scan.setStopRow(
      Bytes.add((level + "|").getBytes(), Bytes.toBytes("ffffffffffffffffffffffffffffffff")));
  scan.addColumn("Cluster".getBytes(), "GMM".getBytes());

  // try (FileSystem fileSystem = FileSystem.get(conf)) {
  FileSystem fileSystem = FileSystem.get(conf);
  Path outputpath = new Path(outputFileName);
  if (fileSystem.exists(outputpath)) {
    fileSystem.delete(outputpath, true);
  }

  Job job = new Job(conf, "Matrix Creation I From HBase");
  job.setJarByClass(MatrixCreationI.class);
  TableMapReduceUtil.initTableMapperJob(
      table, scan, MatrixMapper.class, IntWritable.class, Text.class, job);
  job.setReducerClass(MatrixReducer.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  // job.setInputFormatClass(TableInputFormat.class);
  // job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  job.setNumReduceTasks(1);
  // FileInputFormat.addInputPath(job, new Path(inputFileName + "/part*"));
  FileOutputFormat.setOutputPath(job, outputpath);
  result = job.waitForCompletion(true) ? 0 : 1;
  // }
  return result;
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: SimpleRowCounter <tablename>");
    return -1;
  }
  String tableName = args[0];

  Scan scan = new Scan();
  scan.setFilter(new FirstKeyOnlyFilter());

  Job job = new Job(getConf(), getClass().getSimpleName());
  job.setJarByClass(getClass()); // Set the jar by finding where a given class came from.
  TableMapReduceUtil.initTableMapperJob(
      tableName, scan, RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(0); // no reduce phase is needed
  job.setOutputFormatClass(NullOutputFormat.class); // no output is needed
  return job.waitForCompletion(true) ? 0 : 1;
}
// MapReduce Stage-1 Job
public static Job startJob_Stage1(String[] args, Configuration hConf) throws IOException {
  // args[0] = hbase table name
  // args[1] = zookeeper

  /*
   * MapReduce Stage-1 Job
   * Retrieve a list of subjects and their attributes
   */
  Scan scan1 = new Scan();
  Job job1 = new Job(hConf);
  job1.setJobName("BSBM-Q8-RepartitionJoin");
  job1.setJarByClass(RepartitionJoinQ8.class);
  // Change caching and number of time stamps to speed up the scan
  scan1.setCaching(500);
  scan1.setMaxVersions(200);
  scan1.setCacheBlocks(false);

  // Mapper settings
  TableMapReduceUtil.initTableMapperJob(
      args[0], // input HBase table name
      scan1, // Scan instance to control CF and attribute selection
      RepartitionMapper.class, // mapper class
      CompositeKeyWritable.class, // mapper output key
      KeyValueArrayWritable.class, // mapper output value
      job1);

  // Reducer settings
  job1.setReducerClass(RepartitionReducer.class);
  job1.setOutputFormatClass(TextOutputFormat.class);
  // job1.setNumReduceTasks(1); // Uncomment this if running into problems on 2+ node cluster
  FileOutputFormat.setOutputPath(job1, new Path("output/BSBMQ8"));

  try {
    job1.waitForCompletion(true);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
  return job1;
}
public static void readHtable() throws Exception {
  Job job = new Job(conf, "ExampleRead");
  job.setJarByClass(HbaseMR.class);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs

  TableMapReduceUtil.initTableMapperJob(
      "", // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper class
      null, // mapper output key
      null, // mapper output value
      job);
  job.setOutputFormatClass(NullOutputFormat.class); // because we aren't emitting anything from mapper

  boolean b = job.waitForCompletion(true);
  if (!b) {
    throw new IOException("error with job!");
  }
}
@Test
public void shouldRunMapReduce() throws Exception {
  // given
  Configuration configuration = HBaseConfiguration.create();
  TableFactory.recreateTable(
      configuration, Bytes.toString(UsersDao.TABLE_NAME), Bytes.toString(UsersDao.FAMILY_NAME));
  UserDataFactory.insertTestData();

  // map reduce
  Job job = new Job(configuration, "Count Users");
  job.setJarByClass(CountUsersMapper.class);

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addColumn(UsersDao.FAMILY_NAME, UsersDao.FORENAME_COL);

  // mapper
  TableMapReduceUtil.initTableMapperJob(
      Bytes.toString(UsersDao.TABLE_NAME),
      scan,
      CountUsersMapper.class,
      ImmutableBytesWritable.class,
      Result.class,
      job);
  job.setOutputFormatClass(NullOutputFormat.class);
  job.setNumReduceTasks(0);

  // when
  boolean succeeded = job.waitForCompletion(true);

  // then
  assertThat(succeeded).isTrue();
  assertThat(job.getCounters().findCounter(CountUsersMapper.Counters.USER_COUNT).getValue())
      .isGreaterThan(99);
}
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration hbaseConf = HBaseConfiguration.create();
  // ZooKeeper quorum; replace with your own servers' addresses.
  hbaseConf.set("hbase.zookeeper.quorum", "DamHadoop1,DamHadoop2,DamHadoop3");
  // Job jobConf = new Job(hbaseConf, "FreqCounter");
  Job jobConf = Job.getInstance(hbaseConf, "FreqCounter");
  jobConf.setJobName("Hbase_FreqCounter");
  jobConf.setJarByClass(FreqCounter.class);

  Scan scan = new Scan();
  String columns = "details";
  scan.addFamily(Bytes.toBytes(columns));
  scan.setFilter(new FirstKeyOnlyFilter());

  TableMapReduceUtil.initTableMapperJob(
      "import_tests", scan, CountMapper.class, ImmutableBytesWritable.class, IntWritable.class,
      jobConf);
  TableMapReduceUtil.initTableReducerJob("summary_user", CountReducer.class, jobConf);
  System.exit(jobConf.waitForCompletion(true) ? 0 : 1);
}
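CountMapper and CountReducer are referenced above but not shown. Under the usual frequency-counter pattern, the mapper emits a constant 1 per scanned row and the reducer sums into the summary table; a hedged sketch of the mapper side only (how the real CountMapper derives its output key from the row key is not in the source, so this simply reuses the row):

import java.io.IOException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;

// Sketch: emits (row key, 1) for every row; the FirstKeyOnlyFilter on the scan
// means each row ships a single cell, which is all a counting job needs.
public static class CountMapper extends TableMapper<ImmutableBytesWritable, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);

  @Override
  public void map(ImmutableBytesWritable row, Result values, Context context)
      throws IOException, InterruptedException {
    context.write(row, ONE);
  }
}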
public static void main(String[] args)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (args.length == 0) {
    System.out.println(
        "ExportHBaseTableToDelimiteredSeq {tableName} {ColumnFamily} {outputPath} "
            + "{compressionCodec} {schemaLocationOnLocal} {delimiter} {rowKeyColumn.optional}");
    return;
  }
  String table = args[0];
  String columnFamily = args[1];
  String outputPath = args[2];
  String compressionCodec = args[3];
  String schemaFilePath = args[4];
  String delimiter = args[5];
  String rowKeyColumn = "";
  if (args.length > 6) {
    rowKeyColumn = args[6];
  }

  Job job = Job.getInstance();
  job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn);
  HBaseConfiguration.addHbaseResources(job.getConfiguration());
  job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath);
  job.getConfiguration().set(OUTPUT_PATH_CONF, outputPath);
  job.getConfiguration().set(DELIMITER_CONF, delimiter);
  job.setJarByClass(ExportHBaseTableToDelimiteredSeq.class);
  job.setJobName("ExportHBaseTableToDelimiteredSeq");

  Scan scan = new Scan();
  scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
  scan.setCacheBlocks(false); // don't set to true for MR jobs
  scan.addFamily(Bytes.toBytes(columnFamily));

  TableMapReduceUtil.initTableMapperJob(
      table, // input HBase table name
      scan, // Scan instance to control CF and attribute selection
      MyMapper.class, // mapper class
      null, // mapper output key
      null, // mapper output value
      job);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));
  if (compressionCodec.equals("snappy")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  } else if (compressionCodec.equals("gzip")) {
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } // else: leave the output uncompressed

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(NullWritable.class);
  job.setNumReduceTasks(0);

  boolean b = job.waitForCompletion(true);
}
public boolean doTest()
    throws InstantiationException, IllegalAccessException, ClassNotFoundException, SQLException,
        IOException, InterruptedException {
  /* create config */
  conf_h = HBaseConfiguration.create();
  conf_h.set("hbase.zookeeper.quorum", "localhost");
  conf_h.set("hbase.zookeeper.property.clientPort", "2181");
  Connection con_h = null;
  try {
    con_h = ConnectionFactory.createConnection(conf_h);
  } catch (IOException e) {
    e.printStackTrace();
  }
  Admin admin = con_h.getAdmin();
  HTableDescriptor tableDesc = new HTableDescriptor(tableName_chi);
  HColumnDescriptor colFamDesc = new HColumnDescriptor("count");
  colFamDesc.setMaxVersions(1);
  tableDesc.addFamily(colFamDesc);
  admin.createTable(tableDesc);

  /* count and insert into chiTable */
  Scan scan = new Scan();
  scan.addColumn(Bytes.toBytes("products"), Bytes.toBytes("product_category_id"));
  scan.addColumn(Bytes.toBytes("orders"), Bytes.toBytes("order_date"));
  Job job = Job.getInstance(conf_h, "Count"); // creates a new Job with no particular cluster
  job.setJarByClass(ChiSquaredTest2_abc.class);
  TableMapReduceUtil.initTableMapperJob(
      "retail_order", scan, Map1.class, Text.class, IntWritable.class, job);
  TableMapReduceUtil.initTableReducerJob("chiTable", Reduce1.class, job);
  job.waitForCompletion(true); // submit the job to the cluster and wait for it to finish

  /* extract values from chiTable */
  int totalY = 0;
  int totalN = 0;
  ArrayList<CellOfHTable> chiTable = new ArrayList<CellOfHTable>();
  Table table_h = con_h.getTable(tableName_chi);
  Scan s = new Scan();
  s.addFamily(Bytes.toBytes("count"));
  ResultScanner results = table_h.getScanner(s);
  for (Result r : results) {
    // (id, count_Y, count_N); missing cells count as zero
    CellOfHTable c = new CellOfHTable(
        r.getRow(),
        r.getValue(Bytes.toBytes("count"), Bytes.toBytes("Y")) == null
            ? Bytes.toBytes(0)
            : r.getValue(Bytes.toBytes("count"), Bytes.toBytes("Y")),
        r.getValue(Bytes.toBytes("count"), Bytes.toBytes("N")) == null
            ? Bytes.toBytes(0)
            : r.getValue(Bytes.toBytes("count"), Bytes.toBytes("N")));
    chiTable.add(c);
    totalY = totalY + c.countY;
    totalN = totalN + c.countN;
  }
  results.close();
  table_h.close();
  admin.disableTable(tableName_chi);
  admin.deleteTable(tableName_chi);

  /* compute the chi-squared statistic */
  double chisquare = 0.0;
  for (int i = 0; i < chiTable.size(); i++) {
    CellOfHTable c = chiTable.get(i);
    double expectY =
        (double) (c.countY + c.countN) * (double) totalY / (double) (totalY + totalN);
    chisquare += ((double) c.countY - expectY) * ((double) c.countY - expectY) / expectY;
    double expectN =
        (double) (c.countY + c.countN) * (double) totalN / (double) (totalY + totalN);
    chisquare += ((double) c.countN - expectN) * ((double) c.countN - expectN) / expectN;
  }

  System.out.println(chisquare);
  ChiSquareDist csd = new ChiSquareDist(chiTable.size() - 1);
  return chisquare > csd.inverseF(1.0 - alpha);
}
public static void main(String args[]) throws Exception {
  try {
    Configuration config = HBaseConfiguration.create();
    config.set("mapred.output.dir", "/mnt/data/workspace/weatheranalysis/mapred/monthly");
    Job job = new Job(config, "MonthlySummary");
    String sourceTable = DailyWeatherHbaseOperations.TABLE_NAME;

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    // set other scan attrs

    TableMapReduceUtil.initTableMapperJob(
        sourceTable, // input table
        scan, // Scan instance to control CF and attribute selection
        MonthlyWeatherMapper.class, // mapper class
        Text.class, // mapper output key
        Text.class, // mapper output value
        job);
    job.setReducerClass(MonthlyWeatherReducer.class); // reducer class
    job.setNumReduceTasks(1); // at least one, adjust as required

    Path out = new Path("/mnt/data/workspace/weatheranalysis/mapred/monthly");
    File outDir = new File(out.toString());
    FileUtil.fullyDelete(outDir);
    FileOutputFormat.setOutputPath(job, out);

    MonthlyWeatherHbaseOperations.useTable();

    // The required total precipitation for a particular station for a year
    /*
    List<Float> ForAllMonthsPpt = new ArrayList<Float>();
    List<MonthlyWeather> lr = MonthlyWeatherHbaseOperations.get(425010, 2013);
    while (lr.isEmpty()) {
      ForAllMonthsPpt.add(lr.get(4));
    }
    */

    // The original called waitForCompletion twice; a single call is enough.
    boolean b = job.waitForCompletion(true);
    if (!b) {
      throw new IOException("error with job!");
    }
    System.out.println("Job Completed.");
  } catch (IOException e) {
    e.printStackTrace();
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
}