/**
 * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
 * that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  // reducing block size to force a split of the tiny file
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  conf.set("io.compression.codecs", LzopCodec.class.getName());
  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);
    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }
    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
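// The test above refers to fields of the enclosing test class that are not shown
// (md5_, outputDir_, lzoFileName_, OUTPUT_BIG). A minimal sketch of what such a fixture
// might look like is given below; the concrete values and names are assumptions, not
// taken from the original test class.
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.apache.hadoop.fs.Path;

public class LzoTextInputFormatTestFixture {
  // output large enough to span several LZO blocks, forcing more than one split (assumed value)
  protected static final int OUTPUT_BIG = 10 * 1024 * 1024;
  // MD5 digest reused across the write and read phases of the test
  protected final MessageDigest md5_;
  // directory the test writes its compressed output into (assumed location)
  protected final Path outputDir_ = new Path("/tmp/lzo-input-format-test");
  // name the record writer gives its output file, plus the .lzo suffix (assumed)
  protected final String lzoFileName_ = "part-r-00001.lzo";

  protected LzoTextInputFormatTestFixture() throws NoSuchAlgorithmException {
    md5_ = MessageDigest.getInstance("MD5");
  }
}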
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  GenericOptionsParser optionparser = new GenericOptionsParser(conf, args);
  conf = optionparser.getConfiguration();

  Job job = new Job(conf, conf.get("job_name"));
  job.setJarByClass(DeliverFormatForUVMR.class);
  FileInputFormat.addInputPaths(job, conf.get("input_dir"));
  String outputDir = conf.get("output_dir");
  String tmpDir = outputDir + "_tmp";
  Path tmpOut = new Path(tmpDir);
  FileOutputFormat.setOutputPath(job, tmpOut);
  tmpOut.getFileSystem(conf).delete(tmpOut, true);

  job.setMapperClass(DeliverFormatForUVMapper.class);
  job.setCombinerClass(DeliverFormatForUVCombiner.class);
  job.setReducerClass(DeliverFormatForUVReducer.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setNumReduceTasks(conf.getInt("reduce_num", 20));

  int code = job.waitForCompletion(true) ? 0 : 1;
  if (code == 0) {
    // this job is for combining small files into one
    Job combineJob = new Job(conf, "CombineTmpData");
    combineJob.setJarByClass(DeliverFormatForUVMR.class);

    FileInputFormat.addInputPath(combineJob, new Path(tmpDir));
    FileOutputFormat.setOutputPath(combineJob, new Path(outputDir));

    combineJob.setMapperClass(IdentityMapper.class);
    combineJob.setReducerClass(IdentityReducer.class);
    combineJob.setInputFormatClass(KeyValueTextInputFormat.class);
    combineJob.setOutputFormatClass(TextOutputFormat.class);
    combineJob.setOutputKeyClass(Text.class);
    combineJob.setOutputValueClass(Text.class);

    TextOutputFormat.setCompressOutput(combineJob, true);
    TextOutputFormat.setOutputCompressorClass(combineJob, LzopCodec.class);
    combineJob.setNumReduceTasks(1);
    code = combineJob.waitForCompletion(true) ? 0 : 1;
  }

  FileSystem.get(conf).delete(tmpOut, true);
  LzoIndexer lzoIndexer = new LzoIndexer(conf);
  lzoIndexer.index(new Path(outputDir));
  System.exit(code);
  return code;
}
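// The run() method above follows the org.apache.hadoop.util.Tool contract (getConf(), run(String[])),
// and reads its settings (job_name, input_dir, output_dir, reduce_num) from -D options parsed by
// GenericOptionsParser. A minimal sketch of a launcher, assuming DeliverFormatForUVMR extends
// Configured and implements Tool (the launcher class itself is a hypothetical addition):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class DeliverFormatForUVMRLauncher {
  public static void main(String[] args) throws Exception {
    // e.g. hadoop jar uv.jar DeliverFormatForUVMRLauncher \
    //        -Djob_name=deliver_uv -Dinput_dir=/logs/in -Doutput_dir=/out/uv -Dreduce_num=20
    // ToolRunner hands the parsed Configuration to the Tool before calling run()
    int exitCode = ToolRunner.run(new Configuration(), new DeliverFormatForUVMR(), args);
    System.exit(exitCode);
  }
}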
/**
 * Run a local map reduce job to read records from an HCatalog table and verify that the count is
 * as expected.
 *
 * @param readCount Expected number of records read by the mapper.
 * @param filter Partition filter passed to HCatInputFormat.setInput.
 * @return The records read by the mapper.
 * @throws Exception
 */
List<HCatRecord> runMRRead(int readCount, String filter) throws Exception {
  MapRead.readCount = 0;
  readRecords.clear();

  Configuration conf = new Configuration();
  conf.set(HiveConf.ConfVars.METASTORE_INTEGER_JDO_PUSHDOWN.varname, "true");
  Job job = new Job(conf, "hcat mapreduce read test");
  job.setJarByClass(this.getClass());
  job.setMapperClass(HCatMapReduceTest.MapRead.class);

  // input/output settings
  job.setInputFormatClass(HCatInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  HCatInputFormat.setInput(job, dbName, tableName, filter);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(0);

  Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceOutput");
  if (fs.exists(path)) {
    fs.delete(path, true);
  }

  TextOutputFormat.setOutputPath(job, path);

  job.waitForCompletion(true);
  Assert.assertEquals(readCount, MapRead.readCount);

  return readRecords;
}
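// HCatMapReduceTest.MapRead is not shown above. A minimal sketch of a map-only reader that counts
// records and stashes them for later assertions is given below. It is an assumption about the
// mapper, not the original class: package paths assume the org.apache.hive.hcatalog layout, and the
// driver actually keeps readRecords in the enclosing test class, while here it lives on the mapper
// to keep the sketch self-contained.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hive.hcatalog.data.HCatRecord;

public class MapRead extends Mapper<WritableComparable, HCatRecord, BytesWritable, Text> {
  // shared with the driver so the test can assert on how many records were read
  static int readCount = 0;
  static final List<HCatRecord> readRecords = new ArrayList<HCatRecord>();

  @Override
  protected void map(WritableComparable key, HCatRecord value, Context context)
      throws IOException, InterruptedException {
    readCount++;
    readRecords.add(value);
    // map-only job (numReduceTasks == 0); nothing needs to be emitted for the assertions
  }
}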
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.println("usage : need <input path> <output path>");
    return 1;
  }

  Path inputPath = new Path(args[0]);
  Path outputPath = new Path(args[1]);

  Configuration conf = getConf();
  Job job = new Job(conf, getClass().getName() + "--answer");
  job.setJarByClass(CombinerAnswer.class);
  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  // job.setCombinerClass(MyReducer.class);

  job.setMapOutputValueClass(IntWritable.class);
  job.setMapOutputKeyClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  TextInputFormat.setInputPaths(job, inputPath);
  TextOutputFormat.setOutputPath(job, outputPath);

  return job.waitForCompletion(true) ? 0 : 1;
}
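// MyMapper and MyReducer are not shown above. A minimal word-count style pair that matches the
// configured key/value classes (Text / IntWritable) is sketched below; it is an assumption about
// what the exercise intends, not the original classes. Re-enabling the commented-out
// job.setCombinerClass(MyReducer.class) would be safe for this pair because summing is commutative
// and associative, so the reducer can be applied to partial map output without changing results.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer tokens = new StringTokenizer(value.toString());
    while (tokens.hasMoreTokens()) {
      word.set(tokens.nextToken());
      context.write(word, ONE); // one count per occurrence
    }
  }
}

class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable value : values) {
      sum += value.get();
    }
    context.write(key, new IntWritable(sum)); // total count per word
  }
}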
/**
 * Creates an lzo file with random data.
 *
 * @param outputDir Output directory.
 * @param fs File system we're using.
 * @param attemptContext Task attempt context, contains task id etc.
 * @param charsToOutput Number of characters of random data to write.
 * @throws IOException
 * @throws InterruptedException
 */
private byte[] createTestInput(
    Path outputDir, FileSystem fs, TaskAttemptContext attemptContext, int charsToOutput)
    throws IOException, InterruptedException {
  TextOutputFormat<Text, Text> output = new TextOutputFormat<Text, Text>();
  RecordWriter<Text, Text> rw = null;
  md5_.reset();

  try {
    rw = output.getRecordWriter(attemptContext);

    char[] chars = "abcdefghijklmnopqrstuvwxyz\u00E5\u00E4\u00F6".toCharArray();
    Random r = new Random(System.currentTimeMillis());
    Text key = new Text();
    Text value = new Text();
    int charsMax = chars.length - 1;
    for (int i = 0; i < charsToOutput; ) {
      i += fillText(chars, r, charsMax, key);
      i += fillText(chars, r, charsMax, value);
      rw.write(key, value);
      md5_.update(key.getBytes(), 0, key.getLength());
      // text output format writes tab between the key and value
      md5_.update("\t".getBytes("UTF-8"));
      md5_.update(value.getBytes(), 0, value.getLength());
    }
  } finally {
    if (rw != null) {
      rw.close(attemptContext);
      OutputCommitter committer = output.getOutputCommitter(attemptContext);
      committer.commitTask(attemptContext);
      committer.cleanupJob(attemptContext);
    }
  }

  byte[] result = md5_.digest();
  md5_.reset();
  return result;
}
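// fillText() is referenced above but not shown. A plausible helper that fills a Text with a random
// run of characters from the given alphabet and reports how many characters were written (so the
// caller can track progress toward charsToOutput) could look like this; the class name and the
// exact length distribution are assumptions.
import java.util.Random;
import org.apache.hadoop.io.Text;

final class RandomTextFiller {
  private RandomTextFiller() {}

  static int fillText(char[] chars, Random r, int charsMax, Text text) {
    // pick a modest random length and build a string from the allowed alphabet
    int length = r.nextInt(charsMax * 2) + 1;
    StringBuilder sb = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
      sb.append(chars[r.nextInt(charsMax)]);
    }
    text.set(sb.toString());
    return length;
  }
}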
@SuppressWarnings({"unchecked", "rawtypes"}) public void start( Path outputDir, int numReducers, boolean concurrent, String accessKey, String secretKey) throws GoraException, IOException, Exception { LOG.info("Running Verify with outputDir=" + outputDir + ", numReducers=" + numReducers); // DataStore<Long,CINode> store = DataStoreFactory.getDataStore(Long.class, CINode.class, new // Configuration()); auth = new BasicAWSCredentials(accessKey, secretKey); DataStore<Long, cidynamonode> store = WSDataStoreFactory.createDataStore( DynamoDBStore.class, DynamoDBKey.class, cidynamonode.class, auth); job = new Job(getConf()); if (!job.getConfiguration() .get("io.serializations") .contains("org.apache.hadoop.io.serializer.JavaSerialization")) { job.getConfiguration() .set( "io.serializations", job.getConfiguration().get("io.serializations") + ",org.apache.hadoop.io.serializer.JavaSerialization"); } job.setJobName("Link Verifier"); job.setNumReduceTasks(numReducers); job.setJarByClass(getClass()); Query query = store.newQuery(); // if (!concurrent) { // no concurrency filtering, only need prev field // query.setFields("prev"); // } else { // readFlushed(job.getCon figuration()); // } GoraMapper.initMapperJob( job, query, store, DynamoDBKey.class, VLongWritable.class, VerifyMapper.class, true); job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false); job.setReducerClass(VerifyReducer.class); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, outputDir); store.close(); job.submit(); }
public static void main(String[] args)
    throws IOException, ClassNotFoundException, InterruptedException {
  // TODO Auto-generated method stub
  Configuration conf = new Configuration();
  conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");

  Job job = Job.getInstance(conf);
  job.setJarByClass(MapsideJoinDriver.class);
  job.setMapperClass(MapsideJoinMapper.class);
  job.setInputFormatClass(CompositeInputFormat.class);

  String expr =
      CompositeInputFormat.compose(
          "inner", KeyValueTextInputFormat.class, new Path(args[0]), new Path(args[1]));
  // String expr = CompositeInputFormat.compose("outer", KeyValueTextInputFormat.class, new Path(args[0]), new Path(args[1]));
  job.getConfiguration().set("mapreduce.join.expr", expr);

  job.setNumReduceTasks(0);
  TextOutputFormat.setOutputPath(job, new Path(args[2]));
  job.waitForCompletion(true);
}
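// MapsideJoinMapper is not shown. With the new-API CompositeInputFormat, each input record is the
// shared join key plus a TupleWritable holding one value per joined source, in the order the
// sources were listed in compose(). A minimal map-only join mapper (an assumed sketch, not the
// original class) could emit the key with the two sides concatenated:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.join.TupleWritable;

public class MapsideJoinMapper extends Mapper<Text, TupleWritable, Text, Text> {
  private final Text joined = new Text();

  @Override
  protected void map(Text key, TupleWritable value, Context context)
      throws IOException, InterruptedException {
    // value.get(0) comes from args[0], value.get(1) from args[1]; with "inner" both are present
    joined.set(value.get(0) + "," + value.get(1));
    context.write(key, joined);
  }
}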
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 4) {
    System.err.println(
        "Usage: ReplicatedJoin <user data> <comment data> <out> [inner|leftouter]");
    System.exit(1);
  }

  String joinType = otherArgs[3];
  if (!(joinType.equalsIgnoreCase("inner") || joinType.equalsIgnoreCase("leftouter"))) {
    System.err.println("Join type not set to inner or leftouter");
    System.exit(2);
  }

  // Configure the join type
  Job job = new Job(conf, "Replicated Join");
  job.getConfiguration().set("join.type", joinType);
  job.setJarByClass(ReplicatedJoinDriver.class);

  job.setMapperClass(ReplicatedJoinMapper.class);
  job.setNumReduceTasks(0);

  TextInputFormat.setInputPaths(job, new Path(otherArgs[1]));
  TextOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Configure the DistributedCache
  DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());
  DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]);

  System.exit(job.waitForCompletion(true) ? 0 : 3);
}
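// ReplicatedJoinMapper is not shown. The replicated-join pattern loads the small (user) dataset
// from the DistributedCache in setup() and streams the large (comment) dataset through map(),
// joining entirely on the map side. A minimal sketch, assuming the cached user file is plain text
// with the user id in the first tab-separated column (the real record format and key extraction
// are assumptions, not the original code):
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ReplicatedJoinMapper extends Mapper<Object, Text, Text, Text> {
  private final Map<String, String> userIdToRecord = new HashMap<String, String>();
  private final Text outKey = new Text();
  private final Text outValue = new Text();
  private String joinType;

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    joinType = context.getConfiguration().get("join.type", "inner");
    Path[] cached = DistributedCache.getLocalCacheFiles(context.getConfiguration());
    if (cached == null) {
      return; // nothing cached; every comment will be unmatched
    }
    for (Path file : cached) {
      BufferedReader reader = new BufferedReader(new FileReader(file.toString()));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          String[] fields = line.split("\t", 2);
          userIdToRecord.put(fields[0], line); // key the small table by user id
        }
      } finally {
        reader.close();
      }
    }
  }

  @Override
  protected void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    String[] fields = value.toString().split("\t", 2);
    String userRecord = userIdToRecord.get(fields[0]);
    if (userRecord != null) {
      outKey.set(value);
      outValue.set(userRecord);
      context.write(outKey, outValue); // matched: emit comment joined with its user record
    } else if ("leftouter".equalsIgnoreCase(joinType)) {
      outKey.set(value);
      outValue.set("");
      context.write(outKey, outValue); // left outer: keep unmatched comments
    }
  }
}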