void checkFormat(Job job) throws Exception {
  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));
  MyClassMessagePackBase64LineInputFormat format = new MyClassMessagePackBase64LineInputFormat();
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  for (int j = 0; j < splits.size(); j++) {
    RecordReader<LongWritable, MyClassWritable> reader =
        format.createRecordReader(splits.get(j), attemptContext);
    reader.initialize(splits.get(j), attemptContext);
    int count = 0;
    try {
      while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();
        MyClassWritable val = reader.getCurrentValue();
        MyClass mc = val.get();
        assertEquals(mc.v, count);
        assertEquals(mc.s, Integer.toString(count));
        count++;
      }
    } finally {
      reader.close();
    }
  }
}
/**
 * Generate random data, compress it, index and md5 hash the data. Then read it all back and md5
 * that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
    throws IOException, NoSuchAlgorithmException, InterruptedException {
  Configuration conf = new Configuration();
  conf.setLong("fs.local.block.size", charsToOutput / 2); // reducing block size to force a split of the tiny file
  conf.set("io.compression.codecs", LzopCodec.class.getName());
  Assume.assumeTrue(CoreTestUtil.okToRunLzoTests(conf));

  FileSystem.getLocal(conf).close(); // remove cached filesystem (if any)
  FileSystem localFs = FileSystem.getLocal(conf);
  localFs.delete(outputDir_, true);
  localFs.mkdirs(outputDir_);

  Job job = new Job(conf);
  TextOutputFormat.setCompressOutput(job, true);
  TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
  TextOutputFormat.setOutputPath(job, outputDir_);

  TaskAttemptContext attemptContext =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID("123", 0, false, 1, 2));

  // create some input data
  byte[] expectedMd5 = createTestInput(outputDir_, localFs, attemptContext, charsToOutput);

  if (testWithIndex) {
    Path lzoFile = new Path(outputDir_, lzoFileName_);
    LzoIndex.createIndex(localFs, lzoFile);
  }

  LzoTextInputFormat inputFormat = new LzoTextInputFormat();
  TextInputFormat.setInputPaths(job, outputDir_);

  List<InputSplit> is = inputFormat.getSplits(job);
  // verify we have the right number of lzo chunks
  if (testWithIndex && OUTPUT_BIG == charsToOutput) {
    assertEquals(3, is.size());
  } else {
    assertEquals(1, is.size());
  }

  // let's read it all and calculate the md5 hash
  for (InputSplit inputSplit : is) {
    RecordReader<LongWritable, Text> rr =
        inputFormat.createRecordReader(inputSplit, attemptContext);
    rr.initialize(inputSplit, attemptContext);
    while (rr.nextKeyValue()) {
      Text value = rr.getCurrentValue();
      md5_.update(value.getBytes(), 0, value.getLength());
    }
    rr.close();
  }

  localFs.close();
  assertTrue(Arrays.equals(expectedMd5, md5_.digest()));
}
@Test
public void readOutsidePig()
    throws ClassCastException, ParseException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, IOException, InterruptedException {
  // simulate Pig front-end runtime
  final SequenceFileLoader<IntWritable, Text> storage =
      new SequenceFileLoader<IntWritable, Text>(
          "-c " + IntWritableConverter.class.getName(), "-c " + TextConverter.class.getName());
  Job job = new Job();
  storage.setUDFContextSignature("12345");
  storage.setLocation(tempFilename, job);

  // simulate Pig back-end runtime
  RecordReader<DataInputBuffer, DataInputBuffer> reader = new RawSequenceFileRecordReader();
  FileSplit fileSplit =
      new FileSplit(
          new Path(tempFilename), 0, new File(tempFilename).length(), new String[] {"localhost"});
  TaskAttemptContext context =
      new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());
  reader.initialize(fileSplit, context);
  InputSplit[] wrappedSplits = new InputSplit[] {fileSplit};
  int inputIndex = 0;
  List<OperatorKey> targetOps = Arrays.asList(new OperatorKey("54321", 0));
  int splitIndex = 0;
  PigSplit split = new PigSplit(wrappedSplits, inputIndex, targetOps, splitIndex);
  split.setConf(job.getConfiguration());
  storage.prepareToRead(reader, split);

  // read tuples and validate
  validate(new LoadFuncTupleIterator(storage));
}
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
    throws InterruptedException, IOException {
  final RecordReader<LongWritable, Text> rr = new SortRecordReader();
  rr.initialize(split, ctx);
  return rr;
}
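Because this override initializes the reader eagerly, the reader can be consumed directly without relying on the framework (which would otherwise call initialize() itself, as the runNewMapper example further below does). A minimal sketch of such direct consumption; the enclosing SortInputFormat class name and the input path are assumptions, not taken from the snippet:

// Hedged sketch: drive the eagerly-initialized reader by hand, outside a running job.
// SortInputFormat (the assumed enclosing class) and the input path are hypothetical.
void readSortedInput(Configuration conf) throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, new Path("/tmp/sort-input")); // hypothetical location
  SortInputFormat inputFormat = new SortInputFormat();
  TaskAttemptContext ctx = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  for (InputSplit split : inputFormat.getSplits(job)) {
    RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(split, ctx);
    // no explicit rr.initialize(...) here: createRecordReader above has already done it
    while (rr.nextKeyValue()) {
      // consume rr.getCurrentKey() / rr.getCurrentValue()
    }
    rr.close();
  }
}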
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
    throws IOException, InterruptedException {
  TextInputFormat textInputFormat = new TextInputFormat();
  long fileLength = fileStatus.getLen();
  // Hadoop does unsafe casting from long to int, so split length should not be greater than int
  // max value
  long splitLength = (fileLength < Integer.MAX_VALUE) ? fileLength : Integer.MAX_VALUE;
  InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, splitLength, null);
  TaskAttemptContext taskAttemptContext =
      new TaskAttemptContextImpl(
          hadoopConf, TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
  RecordReader<LongWritable, Text> recordReader =
      textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
  recordReader.initialize(fileSplit, taskAttemptContext);
  boolean hasNext = recordReader.nextKeyValue();
  List<Map.Entry> batch = new ArrayList<>();
  while (hasNext && batch.size() < batchSize) {
    batch.add(
        new Pair(
            fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
            String.valueOf(recordReader.getCurrentValue())));
    hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
  }
  return batch;
}
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  rr.initialize(split, context);
  conf = context.getConfiguration();
  nextKeyValue();
  if (!empty) {
    keyclass = key.getClass().asSubclass(WritableComparable.class);
    valueclass = value.getClass();
    if (cmp == null) {
      cmp = WritableComparator.get(keyclass, conf);
    }
  }
}
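This initialize() pre-reads the first record so that keyclass, valueclass, and the comparator are known up front; it relies on nextKeyValue() maintaining the key, value, and empty fields it inspects. A sketch of what such a companion nextKeyValue() might look like, under the assumption that those fields simply cache the wrapped reader's state (the real implementation may differ):

// Hedged sketch of the assumed companion method: advance the wrapped reader,
// cache the current pair, and mark the stream empty once it is exhausted.
public boolean nextKeyValue() throws IOException, InterruptedException {
  if (rr.nextKeyValue()) {
    key = rr.getCurrentKey();
    value = rr.getCurrentValue();
    empty = false;
    return true;
  }
  empty = true;
  return false;
}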
@Test
public void testPreferredServerUnreachable() throws Exception {
  InfinispanInputSplit invalidSplit = createInfinispanSplit();

  Configuration configuration = miniHadoopCluster.getConfiguration();
  TaskAttemptContextImpl fakeTaskContext =
      new TaskAttemptContextImpl(configuration, new TaskAttemptID());
  InfinispanInputFormat<Integer, WebPage> inputFormat = new InfinispanInputFormat<>();
  RecordReader<Integer, WebPage> reader =
      inputFormat.createRecordReader(invalidSplit, fakeTaskContext);
  reader.initialize(invalidSplit, fakeTaskContext);

  reader.nextKeyValue();
  assertNotNull(reader.getCurrentKey());
}
@Override
public Iterator<HCatRecord> read() throws HCatException {
  HCatInputFormat inpFmt = new HCatInputFormat();
  RecordReader<WritableComparable, HCatRecord> rr;
  try {
    TaskAttemptContext cntxt =
        ShimLoader.getHadoopShims()
            .getHCatShim()
            .createTaskAttemptContext(conf, new TaskAttemptID());
    rr = inpFmt.createRecordReader(split, cntxt);
    rr.initialize(split, cntxt);
  } catch (IOException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  } catch (InterruptedException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  }
  return new HCatRecordItr(rr);
}
private void runNewMapper(
    final JobConf job, MRTaskReporter reporter, final MRInputLegacy in, KeyValueWriter out)
    throws IOException, InterruptedException {

  // Initialize input in-line since it sets parameters which may be used by the processor.
  // Done only for MRInput.
  // TODO use new method in MRInput to get required info
  // in.initialize(job, master);

  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = getTaskAttemptContext();

  // make a mapper
  org.apache.hadoop.mapreduce.Mapper mapper;
  try {
    mapper =
        (org.apache.hadoop.mapreduce.Mapper)
            ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException(cnfe);
  }

  org.apache.hadoop.mapreduce.RecordReader input = new NewRecordReader(in);

  org.apache.hadoop.mapreduce.RecordWriter output = new NewOutputCollector(out);

  org.apache.hadoop.mapreduce.InputSplit split = in.getNewInputSplit();

  org.apache.hadoop.mapreduce.MapContext mapContext =
      new MapContextImpl(
          job, taskAttemptId, input, output, getCommitter(), processorContext, split);

  org.apache.hadoop.mapreduce.Mapper.Context mapperContext =
      new WrappedMapper().getMapContext(mapContext);

  input.initialize(split, mapperContext);
  mapper.run(mapperContext);
  this.statusUpdate();
  input.close();
  output.close(mapperContext);
}
public static RecordReader getRecordReader(
    String cobolLocation, String datafileLocation, String delimiter, int fileFormat)
    throws IOException, InterruptedException {
  Configuration conf = new Configuration(false);
  conf.set("fs.default.name", "file:///");
  conf.set(Constants.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF, cobolLocation);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FIELD_DELIMITER, delimiter);
  conf.set(Constants.COPYBOOK_INPUTFORMAT_FILE_STRUCTURE, Integer.toString(fileFormat));
  File testFile = new File(datafileLocation);
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat = ReflectionUtils.newInstance(CopybookInputFormat.class, conf);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  reader.initialize(split, context);
  return reader;
}
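A short usage sketch for the helper above; the copybook path, data file path, delimiter, and file-format value are placeholders, and the loop simply drains the reader that getRecordReader() returns already initialized:

// Hedged usage sketch: all arguments below are hypothetical example values.
static long countRecords() throws IOException, InterruptedException {
  RecordReader reader = getRecordReader("/tmp/sample.cbl", "/tmp/sample.dat", "\t", 0);
  long records = 0;
  try {
    while (reader.nextKeyValue()) {
      records++; // keys/values are available via reader.getCurrentKey() / getCurrentValue()
    }
  } finally {
    reader.close();
  }
  return records;
}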
private static void run(BenchmarkArgs parsedArgs)
    throws TTransportException, IOException, InterruptedException {
  HiveInputDescription input = new HiveInputDescription();
  input.setDbName(parsedArgs.getDatabase());
  input.setTableName(parsedArgs.getTable());
  input.setPartitionFilter(parsedArgs.getPartitionFilter());

  HiveConf hiveConf = new HiveConf(InputBenchmark.class);
  ThriftHiveMetastore.Iface client =
      HiveMetastores.create(parsedArgs.getHiveHost(), parsedArgs.getHivePort());

  System.err.println("Initialize profile with input data");
  HiveApiInputFormat.setProfileInputDesc(hiveConf, input, HiveApiInputFormat.DEFAULT_PROFILE_ID);

  HiveApiInputFormat defaultInputFormat = new HiveApiInputFormat();
  if (parsedArgs.isTrackMetrics()) {
    defaultInputFormat.setObserver(
        new MetricsObserver("default", parsedArgs.getRecordPrintPeriod()));
  }

  List<InputSplit> splits = defaultInputFormat.getSplits(hiveConf, client);
  System.err.println("getSplits returned " + splits.size() + " splits");

  long numRows = 0;
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit split = splits.get(i);
    TaskAttemptID taskID = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(hiveConf, taskID);
    if (i % parsedArgs.getSplitPrintPeriod() == 0) {
      System.err.println("Handling split " + i + " of " + splits.size());
    }
    RecordReader<WritableComparable, HiveReadableRecord> reader =
        defaultInputFormat.createRecordReader(split, taskContext);
    reader.initialize(split, taskContext);
    numRows += readFully(reader);
  }

  System.err.println("Parsed " + numRows + " rows");
}
/** Get the record reader for the next chunk in this CombineFileSplit. */
protected boolean initNextRecordReader() throws IOException {
  if (curReader != null) {
    curReader.close();
    curReader = null;
    if (idx > 0) {
      progress += split.getLength(idx - 1); // done processing so far
    }
  }

  // if all chunks have been processed, nothing more to do.
  if (idx == split.getNumPaths()) {
    return false;
  }

  // get a record reader for the idx-th chunk
  try {
    Configuration conf = context.getConfiguration();
    // setup some helper config variables.
    conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
    conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
    conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));

    curReader = rrConstructor.newInstance(new Object[] {split, context, Integer.valueOf(idx)});

    if (idx > 0) {
      // initialize() for the first RecordReader will be called by MapTask;
      // we're responsible for initializing subsequent RecordReaders.
      curReader.initialize(split, context);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  idx++;
  return true;
}
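initNextRecordReader() is meant to be driven from nextKeyValue(): whenever the current chunk's reader is exhausted, the next chunk is opened until a record is found or the CombineFileSplit runs out of paths. A sketch of that driver, consistent with how CombineFileRecordReader pairs the two methods (paraphrased, not quoted from the source):

// Sketch of the driving method: keep switching chunk readers until one yields a record.
public boolean nextKeyValue() throws IOException, InterruptedException {
  while (curReader == null || !curReader.nextKeyValue()) {
    if (!initNextRecordReader()) {
      return false; // all chunks in the CombineFileSplit are exhausted
    }
  }
  return true;
}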
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = new Job(conf);

  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(),
            context.getTaskAttemptID(),
            reader,
            null,
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt,
            iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
  unfiltered.initialize(inputSplit, taskAttemptContext);
}
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(
    InputSplit split, JobConf jobConf, final Reporter reporter) throws IOException {

  HBaseSplit hbaseSplit = (HBaseSplit) split;
  TableSplit tableSplit = hbaseSplit.getTableSplit();

  setHTable(HiveHBaseInputFormatUtil.getTable(jobConf));
  setScan(HiveHBaseInputFormatUtil.getScan(jobConf));

  Job job = new Job(jobConf);
  TaskAttemptContext tac =
      ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(), reporter);

  final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader =
      createRecordReader(tableSplit, tac);
  try {
    recordReader.initialize(tableSplit, tac);
  } catch (InterruptedException e) {
    throw new IOException("Failed to initialize RecordReader", e);
  }

  return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

    @Override
    public void close() throws IOException {
      recordReader.close();
    }

    @Override
    public ImmutableBytesWritable createKey() {
      return new ImmutableBytesWritable();
    }

    @Override
    public ResultWritable createValue() {
      return new ResultWritable(new Result());
    }

    @Override
    public long getPos() throws IOException {
      return 0;
    }

    @Override
    public float getProgress() throws IOException {
      float progress = 0.0F;
      try {
        progress = recordReader.getProgress();
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return progress;
    }

    @Override
    public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
      boolean next = false;
      try {
        next = recordReader.nextKeyValue();
        if (next) {
          rowKey.set(recordReader.getCurrentValue().getRow());
          value.setResult(recordReader.getCurrentValue());
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }
      return next;
    }
  };
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  originalRR.initialize(((TaggedInputSplit) split).getInputSplit(), context);
}
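This wrapper only unwraps the TaggedInputSplit before delegating; the other RecordReader methods typically forward to the delegate unchanged. A minimal sketch of those forwarding methods, assuming the field originalRR shares the wrapper's key/value type parameters (shown here generically as K and V):

// Hedged sketch of the rest of a delegating RecordReader<K, V>; only initialize()
// needs the TaggedInputSplit unwrapping shown above, the others forward directly.
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  return originalRR.nextKeyValue();
}

@Override
public K getCurrentKey() throws IOException, InterruptedException {
  return originalRR.getCurrentKey();
}

@Override
public V getCurrentValue() throws IOException, InterruptedException {
  return originalRR.getCurrentValue();
}

@Override
public float getProgress() throws IOException, InterruptedException {
  return originalRR.getProgress();
}

@Override
public void close() throws IOException {
  originalRR.close();
}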