private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so the split length must not exceed
  // Integer.MAX_VALUE.
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    // Unlike Iterator.hasNext(), nextKeyValue() actually advances the reader.
    hasNext = recordReader.nextKeyValue();
  }
  return batch;
}
@Override
protected void configureJob(Job job) throws IOException {
  Configuration conf = job.getConfiguration();

  job.setJarByClass(PartialBuilder.class);

  FileInputFormat.setInputPaths(job, getDataPath());
  FileOutputFormat.setOutputPath(job, getOutputPath(conf));

  job.setOutputKeyClass(TreeID.class);
  job.setOutputValueClass(MapredOutput.class);

  job.setMapperClass(Step1Mapper.class);
  job.setNumReduceTasks(0); // no reducers

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  // For this implementation to work, mapred.map.tasks needs to be set to the actual
  // number of mappers Hadoop will use:
  TextInputFormat inputFormat = new TextInputFormat();
  List<?> splits = inputFormat.getSplits(job);
  if (splits == null || splits.isEmpty()) {
    log.warn("Unable to compute number of splits?");
  } else {
    int numSplits = splits.size();
    log.info("Setting mapred.map.tasks = {}", numSplits);
    conf.setInt("mapred.map.tasks", numSplits);
  }
}
public Path write(Message... messages) throws Exception {
  synchronized (WriteUsingMR.class) {
    outputPath = TestUtils.someTemporaryFilePath();
    Path inputPath = TestUtils.someTemporaryFilePath();
    FileSystem fileSystem = inputPath.getFileSystem(conf);
    fileSystem.create(inputPath);

    inputMessages = Collections.unmodifiableList(Arrays.asList(messages));

    final Job job = new Job(conf, "write");

    // input not really used
    TextInputFormat.addInputPath(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(WritingMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(ProtoParquetOutputFormat.class);
    ProtoParquetOutputFormat.setOutputPath(job, outputPath);
    ProtoParquetOutputFormat.setProtobufClass(job, TestUtils.inferRecordsClass(messages));

    waitForJob(job);

    inputMessages = null;
    return outputPath;
  }
}
@Override
public int run(String[] args) throws Exception {
  String instance = args[0];
  String zookeepers = args[1];
  String user = args[2];
  String tokenFile = args[3];
  String input = args[4];
  String tableName = args[5];

  Job job = Job.getInstance(getConf());
  job.setJobName(TokenFileWordCount.class.getName());
  job.setJarByClass(this.getClass());

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, input);

  job.setMapperClass(MapClass.class);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(AccumuloOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Mutation.class);

  // AccumuloInputFormat is not used here, but it uses the same functions.
  AccumuloOutputFormat.setZooKeeperInstance(
      job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers));
  AccumuloOutputFormat.setConnectorInfo(job, user, tokenFile);
  AccumuloOutputFormat.setCreateTables(job, true);
  AccumuloOutputFormat.setDefaultTableName(job, tableName);

  job.waitForCompletion(true);
  return 0;
}
/**
 * Generate random data, compress it, then index and MD5-hash it. Read it all back, hash it
 * again, and verify that the round trip produced identical data.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output?
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  // reducing block size to force a split of the tiny file
  conf.setLong("fs.local.block.size", charsToOutput / 2);
  conf.set("io.compression.codecs", LzopCodec.class.getName());

  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);
    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }
    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Override
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.out.println("usage: need <input path> <output path>");
    return 1;
  }
  Path inputPath = new Path(args[0]);
  Path outputPath = new Path(args[1]);

  Configuration conf = getConf();
  Job job = new Job(conf, getClass().getName() + "--answer");
  job.setJarByClass(CombinerAnswer.class);

  job.setMapperClass(MyMapper.class);
  job.setReducerClass(MyReducer.class);
  // job.setCombinerClass(MyReducer.class);

  job.setMapOutputValueClass(IntWritable.class);
  job.setMapOutputKeyClass(Text.class);

  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  TextInputFormat.setInputPaths(job, inputPath);
  TextOutputFormat.setOutputPath(job, outputPath);

  return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 3) {
    System.err.println("Usage: RedisOutput <user data> <redis hosts> <hash name>");
    System.exit(1);
  }

  Path inputPath = new Path(otherArgs[0]);
  String hosts = otherArgs[1];
  String hashName = otherArgs[2];

  Job job = new Job(conf, "Redis Output");
  job.setJarByClass(RedisOutputDriver.class);

  job.setMapperClass(RedisOutputMapper.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, inputPath);

  job.setOutputFormatClass(RedisHashOutputFormat.class);
  RedisHashOutputFormat.setRedisHosts(job, hosts);
  RedisHashOutputFormat.setRedisHashKey(job, hashName);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  int code = job.waitForCompletion(true) ? 0 : 2;
  System.exit(code);
}
@Override
public int run(String[] args) {
  Opts opts = new Opts();
  opts.parseArgs(BulkIngestExample.class.getName(), args);

  Configuration conf = getConf();
  PrintStream out = null;
  try {
    Job job = JobUtil.getJob(conf);
    job.setJobName("bulk ingest example");
    job.setJarByClass(this.getClass());

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MapClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ReduceClass.class);
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    opts.setAccumuloConfigs(job);
    Connector connector = opts.getConnector();

    TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
    AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

    FileSystem fs = FileSystem.get(conf);
    out = new PrintStream(
        new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

    Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
    for (Text split : splits)
      out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

    job.setNumReduceTasks(splits.size() + 1);
    out.close();

    job.setPartitionerClass(RangePartitioner.class);
    RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

    job.waitForCompletion(true);

    Path failures = new Path(opts.workDir, "failures");
    fs.delete(failures, true);
    fs.mkdirs(new Path(opts.workDir, "failures"));
    connector
        .tableOperations()
        .importDirectory(
            opts.getTableName(), opts.workDir + "/files", opts.workDir + "/failures", false);
  } catch (Exception e) {
    throw new RuntimeException(e);
  } finally {
    if (out != null)
      out.close();
  }
  return 0;
}
public int run(String[] args) throws Exception {
  String input = args[0];

  Job job = new Job(conf, "jobName");
  job.setJarByClass(HbaseMR.class);

  job.setMapperClass(Map.class);
  job.setNumReduceTasks(0);

  job.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.setInputPaths(job, input);

  job.setOutputFormatClass(NullOutputFormat.class);

  return job.waitForCompletion(true) ? 0 : 1;
}
/** @param args */
@Override
public int run(String[] args) throws Exception {
  Opts opts = new Opts();
  opts.parseArgs(getClass().getName(), args);

  Job job = JobUtil.getJob(getConf());
  job.setJobName(getClass().getSimpleName());
  job.setJarByClass(getClass());

  opts.setAccumuloConfigs(job);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(AccumuloOutputFormat.class);

  job.setMapperClass(NGramMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Mutation.class);

  job.setNumReduceTasks(0);
  job.setSpeculativeExecution(false);

  if (!opts.getConnector().tableOperations().exists(opts.tableName)) {
    log.info("Creating table " + opts.tableName);
    opts.getConnector().tableOperations().create(opts.tableName);

    SortedSet<Text> splits = new TreeSet<Text>();
    String numbers[] = "1 2 3 4 5 6 7 8 9".split("\\s");
    String lower[] = "a b c d e f g h i j k l m n o p q r s t u v w x y z".split("\\s");
    String upper[] = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split("\\s");
    for (String[] array : new String[][] {numbers, lower, upper}) {
      for (String s : array) {
        splits.add(new Text(s));
      }
    }
    opts.getConnector().tableOperations().addSplits(opts.tableName, splits);
  }

  TextInputFormat.addInputPath(job, new Path(opts.inputDirectory));
  job.waitForCompletion(true);
  return job.isSuccessful() ? 0 : 1;
}
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 4) {
    System.err.println(
        "Usage: ReplicatedJoin <user data> <comment data> <out> [inner|leftouter]");
    System.exit(1);
  }

  String joinType = otherArgs[3];
  if (!(joinType.equalsIgnoreCase("inner") || joinType.equalsIgnoreCase("leftouter"))) {
    System.err.println("Join type not set to inner or leftouter");
    System.exit(2);
  }

  // Configure the join type
  Job job = new Job(conf, "Replicated Join");
  job.getConfiguration().set("join.type", joinType);
  job.setJarByClass(ReplicatedJoinDriver.class);

  job.setMapperClass(ReplicatedJoinMapper.class);
  job.setNumReduceTasks(0);

  TextInputFormat.setInputPaths(job, new Path(otherArgs[1]));
  TextOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Configure the DistributedCache
  DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());
  DistributedCache.setLocalFiles(job.getConfiguration(), otherArgs[0]);

  System.exit(job.waitForCompletion(true) ? 0 : 3);
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  // Delegate split computation to the wrapped TextInputFormat instance.
  return textIF.getSplits(context);
}
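// Illustrative companion sketch, not from the source: a wrapper InputFormat that holds the
// TextInputFormat field named textIF (as above) would typically delegate record reading the
// same way it delegates split computation. The override below is a hypothetical example only.
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Hand the split straight to the wrapped TextInputFormat, which supplies a line-oriented reader.
  return textIF.createRecordReader(split, context);
}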
/**
 * Run a local MapReduce job to load data from in-memory records into an HCatalog table.
 *
 * @param partitionValues
 * @param partitionColumns
 * @param records data to be written to the HCatalog table
 * @param writeCount
 * @param assertWrite
 * @param asSingleMapTask
 * @param customDynamicPathPattern
 * @return
 * @throws Exception
 */
Job runMRCreate(
    Map<String, String> partitionValues,
    List<HCatFieldSchema> partitionColumns,
    List<HCatRecord> records,
    int writeCount,
    boolean assertWrite,
    boolean asSingleMapTask,
    String customDynamicPathPattern)
    throws Exception {
  writeRecords = records;
  MapCreate.writeCount = 0;

  Configuration conf = new Configuration();
  Job job = new Job(conf, "hcat mapreduce write test");
  job.setJarByClass(this.getClass());
  job.setMapperClass(HCatMapReduceTest.MapCreate.class);

  // input/output settings
  job.setInputFormatClass(TextInputFormat.class);

  if (asSingleMapTask) {
    // One input path would mean only one map task
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount);
    TextInputFormat.setInputPaths(job, path);
  } else {
    // Create two input paths so that two map tasks get triggered. There could be other ways
    // to trigger two map tasks.
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount / 2);
    Path path2 = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput2");
    createInputFile(path2, (writeCount - writeCount / 2));
    TextInputFormat.setInputPaths(job, path, path2);
  }

  job.setOutputFormatClass(HCatOutputFormat.class);

  OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
  if (customDynamicPathPattern != null) {
    job.getConfiguration()
        .set(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN, customDynamicPathPattern);
  }
  HCatOutputFormat.setOutput(job, outputJobInfo);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(DefaultHCatRecord.class);

  job.setNumReduceTasks(0);

  HCatOutputFormat.setSchema(job, new HCatSchema(partitionColumns));

  boolean success = job.waitForCompletion(true);

  // Ensure counters are set when data has actually been read.
  if (partitionValues != null) {
    assertTrue(
        job.getCounters().getGroup("FileSystemCounters").findCounter("FILE_BYTES_READ").getValue()
            > 0);
  }

  if (!HCatUtil.isHadoop23()) {
    // Local mode output committer hook is not invoked in Hadoop 1.x
    if (success) {
      new FileOutputCommitterContainer(job, null).commitJob(job);
    } else {
      new FileOutputCommitterContainer(job, null).abortJob(job, JobStatus.State.FAILED);
    }
  }

  if (assertWrite) {
    // we assert only if we expected to assert with this call.
    Assert.assertEquals(writeCount, MapCreate.writeCount);
  }

  if (isTableExternal()) {
    externalTableLocation = outputJobInfo.getTableInfo().getTableLocation();
  }

  return job;
}