/**
   * Generates random data, compresses it, optionally indexes it, and computes an MD5 hash of the
   * data. Then reads it all back, hashes it again, and verifies that the two digests match.
   *
   * @param testWithIndex whether to create an LZO index for the compressed file
   * @param charsToOutput how many characters of random data to output
   * @throws IOException
   * @throws NoSuchAlgorithmException
   * @throws InterruptedException
   */
  private void runTest(boolean testWithIndex, int charsToOutput)
      throws IOException, NoSuchAlgorithmException, InterruptedException {

    Configuration conf = new Configuration();
    // reduce the local block size to force a split of the tiny file
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

    FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir_, true);
    localFs.mkdirs(outputDir_);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir_);

    TaskAttemptContext attemptContext =
        new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
      Path lzoFile = new Path(outputDir_, lzoFileName_);
      LzoIndex.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir_);

    List<InputSplit> splits = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
      assertEquals(3, splits.size());
    } else {
      assertEquals(1, splits.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : splits) {
      RecordReader<LongWritable, Text> rr =
          inputFormat.createRecordReader(inputSplit, attemptContext);
      rr.initialize(inputSplit, attemptContext);

      while (rr.nextKeyValue()) {
        Text value = rr.getCurrentValue();

        md5_.update(value.getBytes(), 0, value.getLength());
      }

      rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
  }
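
For context, createTestInput is a helper defined elsewhere in this test class. A minimal sketch of what it might look like, assuming it writes charsToOutput random characters through the LZO-compressing output format configured above and hashes exactly the value bytes that the read loop hashes (line length, seed, and character range are illustrative assumptions):

  private byte[] createTestInput(Path outputDir, FileSystem localFs,
      TaskAttemptContext attempt, int charsToOutput)
      throws IOException, NoSuchAlgorithmException, InterruptedException {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    // the job was configured for LZO output, so this writer compresses transparently;
    // the output path comes from the job configuration, not from outputDir
    TextOutputFormat<NullWritable, Text> outputFormat = new TextOutputFormat<NullWritable, Text>();
    RecordWriter<NullWritable, Text> writer = outputFormat.getRecordWriter(attempt);
    Random random = new Random(0); // fixed seed keeps failures reproducible

    int written = 0;
    while (written < charsToOutput) {
      // build one line of up to 72 random lowercase characters
      StringBuilder line = new StringBuilder();
      int lineLength = Math.min(72, charsToOutput - written);
      for (int i = 0; i < lineLength; i++) {
        line.append((char) ('a' + random.nextInt(26)));
      }
      written += lineLength;

      Text value = new Text(line.toString());
      writer.write(NullWritable.get(), value);
      // hash only the value bytes, mirroring the md5_.update call on the read side
      md5.update(value.getBytes(), 0, value.getLength());
    }
    writer.close(attempt);
    return md5.digest();
  }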
  /**
   * Runs a local MapReduce job that reads records from an HCatalog table and verifies that the
   * record count matches the expected value.
   *
   * @param readCount the expected number of records
   * @param filter the partition filter to push down (may be null)
   * @return the records read by the mapper
   * @throws Exception
   */
  List<HCatRecord> runMRRead(int readCount, String filter) throws Exception {
    MapRead.readCount = 0;
    readRecords.clear();

    Configuration conf = new Configuration();
    conf.set(HiveConf.ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname, "true");
    Job job = new Job(conf, "hcat mapreduce read test");
    job.setJarByClass(this.getClass());
    job.setMapperClass(HCatMapReduceTest.MapRead.class);

    // input/output settings
    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    HCatInputFormat.setInput(job, dbName, tableName, filter);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);

    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceOutput");
    if (fs.exists(path)) {
      fs.delete(path, true);
    }

    TextOutputFormat.setOutputPath(job, path);

    job.waitForCompletion(true);
    Assert.assertEquals(readCount, MapRead.readCount);

    return readRecords;
  }
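
MapRead and readRecords are defined elsewhere in HCatMapReduceTest; the calling code above only requires the mapper to count and collect whatever HCatInputFormat hands it. A plausible minimal sketch (everything beyond the names referenced above is an assumption):

  public static class MapRead extends Mapper<WritableComparable, HCatRecord, BytesWritable, Text> {
    static int readCount = 0; // reset by runMRRead() before each job

    @Override
    public void map(WritableComparable key, HCatRecord value, Context context)
        throws IOException, InterruptedException {
      // count and stash each record so the test can assert on what was read
      readCount++;
      readRecords.add(value);
    }
  }

Sharing state through statics like this only works because the job runs in the local runner, so the mapper and the test execute in the same JVM.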
Example #3
  @Override
  public int run(String[] args) throws Exception {

    if (args.length != 2) {
      System.out.println("usage : need <input path>  <output path>");
      return 1;
    }
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);

    Configuration conf = getConf();

    Job job = new Job(conf, getClass().getName() + "--answer");
    job.setJarByClass(CombinerAnswer.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    // job.setCombinerClass(MyReducer.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setMapOutputKeyClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextInputFormat.setInputPaths(job, inputPath);
    TextOutputFormat.setOutputPath(job, outputPath);

    return job.waitForCompletion(true) ? 0 : 1;
  }
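
MyMapper and MyReducer are not shown; given the Text/IntWritable map output types and the commented-out combiner line, a word-count-style pair fits this exercise. A sketch under that assumption (tokenization details are illustrative):

  public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      // emit (word, 1) for every whitespace-separated token on the line
      for (String token : value.toString().split("\\s+")) {
        if (!token.isEmpty()) {
          word.set(token);
          context.write(word, ONE);
        }
      }
    }
  }

  public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // sum the counts for this word
      int sum = 0;
      for (IntWritable v : values) {
        sum += v.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }

Because summing is commutative and associative, uncommenting job.setCombinerClass(MyReducer.class) pre-aggregates counts on the map side without changing the final output, which appears to be the point of this answer class.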
Example #4
  @SuppressWarnings({"unchecked", "rawtypes"})
  public void start(
      Path outputDir, int numReducers, boolean concurrent, String accessKey, String secretKey)
      throws GoraException, IOException, Exception {
    LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers);

    // DataStore<Long,CINode> store = DataStoreFactory.getDataStore(Long.class, CINode.class, new Configuration());
    auth = new BasicAWSCredentials(accessKey, secretKey);

    DataStore<Long, cidynamonode> store =
        WSDataStoreFactory.createDataStore(
            DynamoDBStore.class, DynamoDBKey.class, cidynamonode.class, auth);

    job = new Job(getConf());

    if (!job.getConfiguration()
        .get("io.serializations")
        .contains("org.apache.hadoop.io.serializer.JavaSerialization")) {
      job.getConfiguration()
          .set(
              "io.serializations",
              job.getConfiguration().get("io.serializations")
                  + ",org.apache.hadoop.io.serializer.JavaSerialization");
    }

    job.setJobName("Link Verifier");
    job.setNumReduceTasks(numReducers);
    job.setJarByClass(getClass());

    Query query = store.newQuery();
    // if (!concurrent) {
    // no concurrency filtering, only need prev field
    // query.setFields("prev");
    // } else {
    // readFlushed(job.getConfiguration());
    // }

    GoraMapper.initMapperJob(
        job, query, store, DynamoDBKey.class, VLongWritable.class, VerifyMapper.class, true);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);

    job.setReducerClass(VerifyReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    store.close();

    job.submit();
  }
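
Unlike the other drivers here, this method ends with job.submit(), which returns as soon as the job is handed off rather than blocking until it finishes. A caller that needs the verify result could poll the stored job handle along these lines (a sketch using the standard Job API; the error handling is illustrative):

    // elsewhere, after start(...) returns: wait for the submitted job to finish
    while (!job.isComplete()) {
      Thread.sleep(1000); // poll once a second
    }
    if (!job.isSuccessful()) {
      throw new IOException("Link Verifier job failed");
    }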
 public static void main(String[] args)
     throws IOException, ClassNotFoundException, InterruptedException {
   Configuration conf = new Configuration();
   conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
   Job job = Job.getInstance(conf);
   job.setJarByClass(MapsideJoinDriver.class);
   job.setMapperClass(MapsideJoinMapper.class);
   job.setInputFormatClass(CompositeInputFormat.class);
   String expr =
       CompositeInputFormat.compose(
           "inner", KeyValueTextInputFormat.class, new Path(args[0]), new Path(args[1]));
   // String expr = CompositeInputFormat.compose("outer",KeyValueTextInputFormat.class , new
   // Path(args[0]),new Path(args[1]));
   job.getConfiguration().set("mapreduce.join.expr", expr);
   job.setNumReduceTasks(0);
   TextOutputFormat.setOutputPath(job, new Path(args[2]));
   job.waitForCompletion(true);
 }
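
MapsideJoinMapper is not shown. With CompositeInputFormat, each map call receives the join key plus a TupleWritable whose positions correspond to the composed sources, so a minimal mapper could look like this (the output layout is an assumption):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.join.TupleWritable;

public class MapsideJoinMapper extends Mapper<Text, TupleWritable, Text, Text> {
  @Override
  protected void map(Text key, TupleWritable value, Context context)
      throws IOException, InterruptedException {
    // for the "inner" join above, both positions are always present:
    // value.get(0) holds the record from args[0], value.get(1) the one from args[1]
    context.write(key, new Text(value.get(0) + "\t" + value.get(1)));
  }
}

With zero reduce tasks, the joined pairs go straight to the output path; the commented-out "outer" expression would additionally keep keys present in only one input, in which case the mapper would need TupleWritable.has(i) checks before calling get(i).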
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 4) {
      System.err.println(
          "Usage: ReplicatedJoin <user data> <comment data> <out> [inner|leftouter]");
      System.exit(1);
    }

    String joinType = otherArgs[3];
    if (!(joinType.equalsIgnoreCase("inner") || joinType.equalsIgnoreCase("leftouter"))) {
      System.err.println("Join type not set to inner or leftouter");
      System.exit(2);
    }

    // Configure the join type
    Job job = new Job(conf, "Replicated Join");
    job.getConfiguration().set("join.type", joinType);
    job.setJarByClass(ReplicatedJoinDriver.class);

    job.setMapperClass(ReplicatedJoinMapper.class);
    job.setNumReduceTasks(0);

    TextInputFormat.setInputPaths(job, new Path(otherArgs[1]));
    TextOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the DistributedCache
    DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());

    DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 3);
  }
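
ReplicatedJoinMapper (not shown) carries the actual join: setup() loads the cached user file into an in-memory map, and map() streams the comment records past it, honoring join.type. The setLocalFiles call above presumably makes DistributedCache.getLocalCacheFiles resolve under the local job runner, where the framework does not populate that property itself. A condensed sketch of the pattern (field positions and the output layout are assumptions):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReplicatedJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
  private final Map<String, String> userIdToInfo = new HashMap<String, String>();
  private String joinType;

  @Override
  protected void setup(Context context) throws IOException {
    joinType = context.getConfiguration().get("join.type", "inner");
    // load the user file shipped via the DistributedCache into memory
    Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    BufferedReader reader = new BufferedReader(new FileReader(files[0].toString()));
    String line;
    while ((line = reader.readLine()) != null) {
      String[] fields = line.split("\t", 2); // assumed layout: userId \t user info
      userIdToInfo.put(fields[0], line);
    }
    reader.close();
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    String[] fields = value.toString().split("\t", 2); // assumed layout: userId \t comment
    String user = userIdToInfo.get(fields[0]);
    if (user != null) {
      context.write(new Text(user), value); // inner match: pair user info with the comment
    } else if ("leftouter".equalsIgnoreCase(joinType)) {
      context.write(new Text(""), value); // left outer: keep the unmatched comment
    }
  }
}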