@Override
public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  initInputFormat(HadoopCompat.getConfiguration(context));
  return new RecordReaderWrapper<K, V>(realInputFormat);
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
  JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);
  initInputFormat(jobConf);

  org.apache.hadoop.mapred.InputSplit[] splits =
      realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

  if (splits == null) {
    return null;
  }

  List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

  // Convert concrete old-API FileSplits into new-API FileSplits; wrap everything else.
  for (org.apache.hadoop.mapred.InputSplit split : splits) {
    if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
      org.apache.hadoop.mapred.FileSplit mapredFileSplit =
          ((org.apache.hadoop.mapred.FileSplit) split);
      resultSplits.add(new FileSplit(
          mapredFileSplit.getPath(),
          mapredFileSplit.getStart(),
          mapredFileSplit.getLength(),
          mapredFileSplit.getLocations()));
    } else {
      resultSplits.add(new InputSplitWrapper(split));
    }
  }

  return resultSplits;
}
@Override
public RecordReader<LongWritable, BinaryWritable<M>> createRecordReader(
    InputSplit split, TaskAttemptContext taskAttempt) {
  Configuration conf = HadoopCompat.getConfiguration(taskAttempt);
  // Resolve the protobuf type reference from the configuration on first use.
  if (typeRef == null) {
    setTypeRef(conf);
  }
  return new LzoProtobufBlockRecordReader(typeRef);
}
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
    throws IOException, InterruptedException {
  Configuration conf = HadoopCompat.getConfiguration(job);
  Path path = getDefaultWorkFile(job, LzopCodec.DEFAULT_LZO_EXTENSION);
  return new LineRecordWriter<K, V>(
      LzoUtils.getIndexedLzoOutputStream(conf, path),
      conf.get("mapred.textoutputformat.separator", "\t"));
}
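For context, a minimal driver-side sketch of wiring this writer into a job. The enclosing class name LzoTextOutputFormat is an assumption here; only the separator key is taken from getRecordWriter above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class LzoTextOutputDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Separator key read by getRecordWriter above; defaults to "\t" when unset.
    conf.set("mapred.textoutputformat.separator", ",");
    Job job = Job.getInstance(conf, "lzo-text-output");
    // Assumption: LzoTextOutputFormat is the class that defines getRecordWriter above.
    job.setOutputFormatClass(LzoTextOutputFormat.class);
    // ... mapper, input format, and paths would be configured here.
  }
}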
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit split = (FileSplit) genericSplit;
  start_ = split.getStart();
  end_ = start_ + split.getLength();
  final Path file = split.getPath();
  Configuration job = HadoopCompat.getConfiguration(context);

  errorTracker = new InputErrorTracker(job);

  LOG.info("input split: " + file + " " + start_ + ":" + end_);

  FileSystem fs = file.getFileSystem(job);
  CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);
  if (codec == null) {
    throw new IOException("No codec for file " + file + " found, cannot run");
  }

  // Open the file and seek to the start of the split.
  fileIn_ = fs.open(split.getPath());

  // Creates input stream and also reads the file header.
  createInputReader(codec.createInputStream(fileIn_), job);

  if (start_ != 0) {
    fileIn_.seek(start_);
    skipToNextSyncPoint(false);
    start_ = fileIn_.getPos();
    LOG.info("Start is now " + start_);
  } else {
    skipToNextSyncPoint(true);
  }
  pos_ = start_;
}
private void testLuceneIndexRecordReader(
    ArrayList<String> queryStrings,
    ArrayList<Path> indexPaths,
    ArrayList<ArrayList<ArrayList<Integer>>> indexesQueriesDocIds) throws Exception {

  LuceneIndexInputSplit split = createStrictMock(LuceneIndexInputSplit.class);
  expect(split.getIndexDirs()).andReturn(indexPaths);
  replay(split);

  Configuration conf = new Configuration();
  TaskAttemptContext context = createStrictMock(TaskAttemptContext.class);
  expect(HadoopCompat.getConfiguration(context)).andStubReturn(conf);
  ((Progressable) context).progress(); // casting to avoid Hadoop 2 incompatibility
  expectLastCall().atLeastOnce();
  replay(context);

  LuceneIndexInputFormat.setQueries(queryStrings, conf);

  LuceneIndexRecordReader<IntWritable> rr = createMockBuilder(MockRecordReader.class)
      .addMockedMethod("openIndex")
      .addMockedMethod("createSearcher")
      .createMock();

  Query[] queries = new Query[queryStrings.size()];
  for (int i = 0; i < queries.length; i++) {
    Query query = createStrictMock(Query.class);
    replay(query);
    queries[i] = query;
    expect(rr.deserializeQuery(queryStrings.get(i))).andReturn(query);
  }

  for (int index = 0; index < indexPaths.size(); index++) {
    IndexReader reader = createStrictMock(IndexReader.class);
    expect(reader.maxDoc()).andStubReturn(4);
    replay(reader);
    expect(rr.openIndex(indexPaths.get(index), conf)).andReturn(reader);

    IndexSearcher searcher = createStrictMock(IndexSearcher.class);
    expect(rr.createSearcher(reader)).andReturn(searcher);

    for (int query = 0; query < queries.length; query++) {
      final ArrayList<Integer> ids = indexesQueriesDocIds.get(index).get(query);
      final Capture<Collector> collectorCapture = new Capture<Collector>();
      expect(searcher.getIndexReader()).andReturn(reader);
      searcher.search(eq(queries[query]), capture(collectorCapture));
      expectLastCall().andAnswer(new IAnswer<Void>() {
        @Override
        public Void answer() throws Throwable {
          for (int id : ids) {
            collectorCapture.getValue().collect(id);
          }
          return null;
        }
      });
      for (int docId : ids) {
        expect(searcher.doc(docId)).andReturn(docs[docId]);
      }
    }
    replay(searcher);
  }
  replay(rr);

  rr.initialize(split, context);

  float prevProgress = -1;
  for (int index = 0; index < indexesQueriesDocIds.size(); index++) {
    for (int query = 0; query < indexesQueriesDocIds.get(index).size(); query++) {
      for (int docId : indexesQueriesDocIds.get(index).get(query)) {
        assertTrue(rr.nextKeyValue());
        assertEquals(query, rr.getCurrentKey().get());
        assertEquals(docsAndValues.get(docs[docId]), (Integer) rr.getCurrentValue().get());
        float newProgress = rr.getProgress();
        assertTrue(newProgress > prevProgress);
        assertTrue(newProgress <= 1.0);
      }
    }
  }

  assertFalse(rr.nextKeyValue());
  assertFalse(rr.nextKeyValue());

  verifyAll();
}
/**
 * Sets the job's input format to {@link MapReduceInputFormatWrapper} and stores the supplied real
 * {@link InputFormat} class name in the job configuration. This configuration is read on the
 * remote tasks to instantiate the actual InputFormat correctly.
 */
public static void setInputFormat(Class<?> realInputFormatClass, Job job) {
  job.setInputFormatClass(MapReduceInputFormatWrapper.class);
  setWrappedInputFormat(realInputFormatClass, HadoopCompat.getConfiguration(job));
}
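A minimal usage sketch of this helper from a driver; the choice of org.apache.hadoop.mapred.TextInputFormat as the wrapped old-API format and the job name are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class WrapperDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "wrapped-old-api-input");
    // Registers MapReduceInputFormatWrapper as the job's InputFormat and records
    // the real old-API InputFormat class in the configuration for the remote tasks.
    MapReduceInputFormatWrapper.setInputFormat(
        org.apache.hadoop.mapred.TextInputFormat.class, job);
    // ... mapper, output format, and paths would be configured here.
  }
}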
@Override
public void initialize(InputSplit split, final TaskAttemptContext context)
    throws IOException, InterruptedException {
  org.apache.hadoop.mapred.InputSplit oldSplit;

  if (split.getClass() == FileSplit.class) {
    oldSplit = new org.apache.hadoop.mapred.FileSplit(
        ((FileSplit) split).getPath(),
        ((FileSplit) split).getStart(),
        ((FileSplit) split).getLength(),
        split.getLocations());
  } else {
    oldSplit = ((InputSplitWrapper) split).realSplit;
  }

  @SuppressWarnings("unchecked")
  Reporter reporter = new Reporter() { // Reporter interface over ctx

    final TaskInputOutputContext ioCtx =
        context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

    public void progress() {
      HadoopCompat.progress(context);
    }

    // @Override
    public float getProgress() {
      return (ioCtx != null) ? ioCtx.getProgress() : 0;
    }

    public void setStatus(String status) {
      if (ioCtx != null) HadoopCompat.setStatus(ioCtx, status);
    }

    public void incrCounter(String group, String counter, long amount) {
      if (ioCtx != null) HadoopCompat.incrementCounter(ioCtx.getCounter(group, counter), amount);
    }

    @SuppressWarnings("unchecked")
    public void incrCounter(Enum<?> key, long amount) {
      if (ioCtx != null) HadoopCompat.incrementCounter(ioCtx.getCounter(key), amount);
    }

    public org.apache.hadoop.mapred.InputSplit getInputSplit()
        throws UnsupportedOperationException {
      throw new UnsupportedOperationException();
    }

    public Counter getCounter(String group, String name) {
      return ioCtx != null ? (Counter) HadoopCompat.getCounter(ioCtx, group, name) : null;
    }

    @SuppressWarnings("unchecked")
    public Counter getCounter(Enum<?> name) {
      return ioCtx != null ? (Counter) ioCtx.getCounter(name) : null;
    }
  };

  realReader = realInputFormat.getRecordReader(
      oldSplit, (JobConf) HadoopCompat.getConfiguration(context), reporter);

  keyObj = realReader.createKey();
  valueObj = realReader.createValue();
}
public static void setInputFormatClass(Class<?> clazz, Job job) {
  job.setInputFormatClass(FourMcEbProtoInputFormat.class);
  setClassConf(clazz, HadoopCompat.getConfiguration(job));
}
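A hedged usage sketch; ExampleProto stands in for a generated protobuf message class and is not part of the snippet above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class FourMcDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "4mc-protobuf-input");
    // ExampleProto is a placeholder for a generated protobuf message class; the helper
    // stores it in the configuration and selects FourMcEbProtoInputFormat for the job.
    FourMcEbProtoInputFormat.setInputFormatClass(ExampleProto.class, job);
  }
}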