  @Override
  public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {

    initInputFormat(HadoopCompat.getConfiguration(context));
    return new RecordReaderWrapper<K, V>(realInputFormat);
  }
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);

    initInputFormat(jobConf);

    org.apache.hadoop.mapred.InputSplit[] splits =
        realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

    if (splits == null) {
      return null;
    }

    List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

    // Convert each old-API (mapred) split into its new-API (mapreduce) counterpart;
    // plain FileSplits are translated directly, any other split type is wrapped.
    for (org.apache.hadoop.mapred.InputSplit split : splits) {
      if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
        org.apache.hadoop.mapred.FileSplit mapredFileSplit =
            ((org.apache.hadoop.mapred.FileSplit) split);
        resultSplits.add(
            new FileSplit(
                mapredFileSplit.getPath(),
                mapredFileSplit.getStart(),
                mapredFileSplit.getLength(),
                mapredFileSplit.getLocations()));
      } else {
        resultSplits.add(new InputSplitWrapper(split));
      }
    }

    return resultSplits;
  }
Example #3
 @Override
 public RecordReader<LongWritable, BinaryWritable<M>> createRecordReader(
     InputSplit split, TaskAttemptContext taskAttempt) {
   Configuration conf = HadoopCompat.getConfiguration(taskAttempt);
   // Lazily resolve the protobuf type from the job configuration on first use.
   if (typeRef == null) {
     setTypeRef(conf);
   }
   return new LzoProtobufBlockRecordReader(typeRef);
 }
  @Override
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
      throws IOException, InterruptedException {

    Configuration conf = HadoopCompat.getConfiguration(job);
    Path path = getDefaultWorkFile(job, LzopCodec.DEFAULT_LZO_EXTENSION);

    // Wrap the indexed LZO output stream from LzoUtils in a standard line record writer.
    return new LineRecordWriter<K, V>(
        LzoUtils.getIndexedLzoOutputStream(conf, path),
        conf.get("mapred.textoutputformat.separator", "\t"));
  }
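For orientation, a driver would plug an output format like the one above into a job roughly as follows. This is a minimal sketch: the LzoTextOutputFormat class and package name and the output path handling are assumptions based on elephant-bird's usual naming, not taken from the snippet itself.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.twitter.elephantbird.mapreduce.output.LzoTextOutputFormat;

public class LzoTextOutputDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "lzo-text-output");

    // Use the LZO text output format, whose getRecordWriter compresses each part file
    // with LZOP and, via LzoUtils.getIndexedLzoOutputStream, can index it for splitting.
    job.setOutputFormatClass(LzoTextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));

    // ... configure input format, mapper and reducer as usual, then submit.
  }
}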
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start_ = split.getStart();
    end_ = start_ + split.getLength();
    final Path file = split.getPath();
    Configuration job = HadoopCompat.getConfiguration(context);

    errorTracker = new InputErrorTracker(job);

    LOG.info("input split: " + file + " " + start_ + ":" + end_);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
      throw new IOException("No codec for file " + file + " found, cannot run");
    }

    // Open the file and seek to the start of the split.
    fileIn_ = fs.open(split.getPath());

    // Creates input stream and also reads the file header.
    createInputReader(codec.createInputStream(fileIn_), job);

    if (start_ != 0) {
      fileIn_.seek(start_);
      skipToNextSyncPoint(false);
      start_ = fileIn_.getPos();
      LOG.info("Start is now " + start_);
    } else {
      skipToNextSyncPoint(true);
    }
    pos_ = start_;
  }
  private void testLuceneIndexRecordReader(
      ArrayList<String> queryStrings,
      ArrayList<Path> indexPaths,
      ArrayList<ArrayList<ArrayList<Integer>>> indexesQueriesDocIds)
      throws Exception {

    LuceneIndexInputSplit split = createStrictMock(LuceneIndexInputSplit.class);
    expect(split.getIndexDirs()).andReturn(indexPaths);
    replay(split);

    Configuration conf = new Configuration();
    TaskAttemptContext context = createStrictMock(TaskAttemptContext.class);
    expect(HadoopCompat.getConfiguration(context)).andStubReturn(conf);
    ((Progressable) context).progress(); // casting to avoid Hadoop 2 incompatibility
    expectLastCall().atLeastOnce();
    replay(context);

    LuceneIndexInputFormat.setQueries(queryStrings, conf);

    LuceneIndexRecordReader<IntWritable> rr =
        createMockBuilder(MockRecordReader.class)
            .addMockedMethod("openIndex")
            .addMockedMethod("createSearcher")
            .createMock();

    Query[] queries = new Query[queryStrings.size()];
    for (int i = 0; i < queries.length; i++) {
      Query query = createStrictMock(Query.class);
      replay(query);
      queries[i] = query;
      expect(rr.deserializeQuery(queryStrings.get(i))).andReturn(query);
    }

    for (int index = 0; index < indexPaths.size(); index++) {
      IndexReader reader = createStrictMock(IndexReader.class);
      expect(reader.maxDoc()).andStubReturn(4);
      replay(reader);
      expect(rr.openIndex(indexPaths.get(index), conf)).andReturn(reader);

      IndexSearcher searcher = createStrictMock(IndexSearcher.class);
      expect(rr.createSearcher(reader)).andReturn(searcher);

      for (int query = 0; query < queries.length; query++) {
        final ArrayList<Integer> ids = indexesQueriesDocIds.get(index).get(query);
        final Capture<Collector> collectorCapture = new Capture<Collector>();
        expect(searcher.getIndexReader()).andReturn(reader);
        searcher.search(eq(queries[query]), capture(collectorCapture));

        expectLastCall()
            .andAnswer(
                new IAnswer<Void>() {
                  @Override
                  public Void answer() throws Throwable {
                    for (int id : ids) {
                      collectorCapture.getValue().collect(id);
                    }
                    return null;
                  }
                });

        for (int docId : ids) {
          expect(searcher.doc(docId)).andReturn(docs[docId]);
        }
      }
      replay(searcher);
    }

    replay(rr);

    rr.initialize(split, context);

    float prevProgress = -1;
    for (int index = 0; index < indexesQueriesDocIds.size(); index++) {
      for (int query = 0; query < indexesQueriesDocIds.get(index).size(); query++) {
        for (int docId : indexesQueriesDocIds.get(index).get(query)) {
          assertTrue(rr.nextKeyValue());
          assertEquals(query, rr.getCurrentKey().get());
          assertEquals(docsAndValues.get(docs[docId]), (Integer) rr.getCurrentValue().get());
          float newProgress = rr.getProgress();
          assertTrue(newProgress >= prevProgress);
          assertTrue(newProgress <= 1.0);
          // Track the last value so the progress checks verify a non-decreasing sequence.
          prevProgress = newProgress;
        }
      }
    }

    assertFalse(rr.nextKeyValue());
    assertFalse(rr.nextKeyValue());

    verifyAll();
  }
 /**
  * Sets jobs input format to {@link MapReduceInputFormatWrapper} and stores supplied real {@link
  * InputFormat} class name in job configuration. This configuration is read on the remote tasks to
  * instantiate actual InputFormat correctly.
  */
 public static void setInputFormat(Class<?> realInputFormatClass, Job job) {
   job.setInputFormatClass(MapReduceInputFormatWrapper.class);
   setWrappedInputFormat(realInputFormatClass, HadoopCompat.getConfiguration(job));
 }
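A minimal driver sketch showing how this is used, assuming a Hadoop 2-style Job API and elephant-bird's usual package for the wrapper; the job name and input path are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.twitter.elephantbird.mapreduce.input.MapReduceInputFormatWrapper;

public class WrapperDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "wrapped-old-api-input");

    // Makes MapReduceInputFormatWrapper the job's InputFormat and records the
    // old-API (mapred) TextInputFormat class name in the configuration, so the
    // remote tasks can instantiate it, as described in the javadoc above.
    MapReduceInputFormatWrapper.setInputFormat(TextInputFormat.class, job);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    // ... set mapper, reducer and output format as usual, then submit the job.
  }
}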
    @Override
    public void initialize(InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {

      org.apache.hadoop.mapred.InputSplit oldSplit;

      // Translate the new-API (mapreduce) split back to an old-API (mapred) split
      // for the wrapped InputFormat.
      if (split.getClass() == FileSplit.class) {
        oldSplit =
            new org.apache.hadoop.mapred.FileSplit(
                ((FileSplit) split).getPath(),
                ((FileSplit) split).getStart(),
                ((FileSplit) split).getLength(),
                split.getLocations());
      } else {
        oldSplit = ((InputSplitWrapper) split).realSplit;
      }

      @SuppressWarnings("unchecked")
      Reporter reporter = new Reporter() { // Reporter interface over ctx

            final TaskInputOutputContext ioCtx =
                context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

            public void progress() {
              HadoopCompat.progress(context);
            }

            // @Override omitted: Reporter.getProgress() does not exist in all supported Hadoop versions
            public float getProgress() {
              return (ioCtx != null) ? ioCtx.getProgress() : 0;
            }

            public void setStatus(String status) {
              if (ioCtx != null) HadoopCompat.setStatus(ioCtx, status);
            }

            public void incrCounter(String group, String counter, long amount) {
              if (ioCtx != null)
                HadoopCompat.incrementCounter(ioCtx.getCounter(group, counter), amount);
            }

            @SuppressWarnings("unchecked")
            public void incrCounter(Enum<?> key, long amount) {
              if (ioCtx != null) HadoopCompat.incrementCounter(ioCtx.getCounter(key), amount);
            }

            public org.apache.hadoop.mapred.InputSplit getInputSplit()
                throws UnsupportedOperationException {
              throw new UnsupportedOperationException();
            }

            public Counter getCounter(String group, String name) {
              return ioCtx != null ? (Counter) HadoopCompat.getCounter(ioCtx, group, name) : null;
            }

            @SuppressWarnings("unchecked")
            public Counter getCounter(Enum<?> name) {
              return ioCtx != null ? (Counter) ioCtx.getCounter(name) : null;
            }
          };

      realReader =
          realInputFormat.getRecordReader(
              oldSplit, (JobConf) HadoopCompat.getConfiguration(context), reporter);

      keyObj = realReader.createKey();
      valueObj = realReader.createValue();
    }
Example #9
 public static void setInputFormatClass(Class<?> clazz, Job job) {
   job.setInputFormatClass(FourMcEbProtoInputFormat.class);
   setClassConf(clazz, HadoopCompat.getConfiguration(job));
 }
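And a usage sketch in the same spirit; MyMessage stands in for a generated protobuf class, and the imports for it and for FourMcEbProtoInputFormat are omitted because their packages are not shown here:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class FourMcDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "4mc-protobuf-input");

    // Records the MyMessage class in the job configuration (via setClassConf) and makes
    // FourMcEbProtoInputFormat the job's InputFormat, mirroring setInputFormat above.
    FourMcEbProtoInputFormat.setInputFormatClass(MyMessage.class, job);
  }
}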