@Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);

    initInputFormat(jobConf);

    org.apache.hadoop.mapred.InputSplit[] splits =
        realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

    if (splits == null) {
      return null;
    }

    List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

    for (org.apache.hadoop.mapred.InputSplit split : splits) {
      if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
        org.apache.hadoop.mapred.FileSplit mapredFileSplit =
            ((org.apache.hadoop.mapred.FileSplit) split);
        resultSplits.add(
            new FileSplit(
                mapredFileSplit.getPath(),
                mapredFileSplit.getStart(),
                mapredFileSplit.getLength(),
                mapredFileSplit.getLocations()));
      } else {
        resultSplits.add(new InputSplitWrapper(split));
      }
    }

    return resultSplits;
  }
Example #2
  private long[] getInputSizes(InputFormat[] inputFormats, JobConf[] jobConfs) throws IOException {
    long[] inputSizes = new long[inputFormats.length];

    for (int i = 0; i < inputFormats.length; i++) {
      InputFormat inputFormat = inputFormats[i];
      InputSplit[] splits = inputFormat.getSplits(jobConfs[i], 1);

      for (InputSplit split : splits) {
        inputSizes[i] += split.getLength();
      }
    }

    return inputSizes;
  }
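A hypothetical caller of the helper above might compare the per-input totals before deciding how to process each dataset; the inputFormats/jobConfs arrays and the LOG field in this sketch are assumptions for illustration, not part of the original source.

  // Sketch: pick the smallest input by total split length, e.g. to decide
  // which dataset is cheap enough to buffer. Assumes inputFormats/jobConfs
  // are the same arrays passed to getInputSizes(), and LOG is a class logger.
  long[] sizes = getInputSizes(inputFormats, jobConfs);
  int smallest = 0;
  for (int i = 1; i < sizes.length; i++) {
    if (sizes[i] < sizes[smallest]) {
      smallest = i;
    }
  }
  LOG.info("Smallest input: #" + smallest + " (" + sizes[smallest] + " bytes)");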
Example #3
 public static InputSplit deserializeInputSplit(String base64, String className)
     throws IOException, ReflectiveOperationException {
   Constructor<?> constructor;
   try {
     constructor = Class.forName(className).getDeclaredConstructor();
   } catch (NoSuchMethodException e) {
     // getDeclaredConstructor() throws rather than returning null when the class
     // has no no-arg constructor, so surface a descriptive message here.
     throw new ReflectiveOperationException(
         "Class " + className + " does not implement a default constructor.", e);
   }
   constructor.setAccessible(true);
   InputSplit split = (InputSplit) constructor.newInstance();
   ByteArrayDataInput byteArrayDataInput = ByteStreams.newDataInput(Base64.decodeBase64(base64));
   split.readFields(byteArrayDataInput);
   return split;
 }
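The method above only shows the decoding direction. Below is a minimal sketch of the matching encoder, assuming the same Guava ByteStreams and commons-codec Base64 helpers already used above; the name serializeInputSplit is made up for illustration.

 public static String serializeInputSplit(InputSplit split) throws IOException {
   // Let the split write its fields into an in-memory DataOutput, then
   // Base64-encode the bytes so they can travel as a plain string.
   ByteArrayDataOutput byteArrayDataOutput = ByteStreams.newDataOutput();
   split.write(byteArrayDataOutput);
   return Base64.encodeBase64String(byteArrayDataOutput.toByteArray());
 }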
    public RecordReader<Text, Text> getRecordReader(
        InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {

      reporter.setStatus(genericSplit.toString());
      FileSplit split = (FileSplit) genericSplit;
      final Path file = split.getPath();
      FileSystem fs = file.getFileSystem(job);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (compressionCodecs != null && compressionCodecs.getCodec(file) != null) {
        throw new RuntimeException("Not handling compression!");
      }

      return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
    }
 /**
  * Instantiates a FileCollectionRecordReader using the specified split (which is assumed to be a
  * CombineFileSplit).
  *
  * @param genericSplit contains files to be processed, assumed to be a CombineFileSplit
  * @param job JobConf of this job
  * @param reporter to report progress
  */
 @SuppressWarnings("unchecked")
 @Override
 public RecordReader<Text, SplitAwareWrapper<Document>> getRecordReader(
     InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
   reporter.setStatus(genericSplit.toString());
   return new FileCollectionRecordReader(job, (PositionAwareSplit<CombineFileSplit>) genericSplit);
 }
    @Override
    public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) {

      //      try{ compatible with hadoop-0.14 TODO MC
      reporter.setStatus(split.toString());
      /*
           } catch (IOException e) {
             throw new RuntimeException("Cannot set status for reported:", e);
           }
      */
      // find part name
      SegmentPart segmentPart;
      final String spString;
      try {
        segmentPart = SegmentPart.get((FileSplit) split);
        spString = segmentPart.toString();
      } catch (IOException e) {
        throw new RuntimeException("Cannot identify segment:", e);
      }

      try {
        return new SequenceFileRecordReader(job, (FileSplit) split) {

          @Override
          public synchronized boolean next(Writable key, Writable value) throws IOException {
            LOG.debug("Running OIF.next()");

            MetaWrapper wrapper = (MetaWrapper) value;
            try {
              wrapper.set(getValueClass().newInstance());
            } catch (Exception e) {
              throw new IOException(e.toString());
            }

            boolean res = super.next(key, (Writable) wrapper.get());
            wrapper.setMeta(SEGMENT_PART_KEY, spString);
            return res;
          }

          @Override
          public Writable createValue() {
            return new MetaWrapper();
          }
        };
      } catch (IOException e) {
        throw new RuntimeException("Cannot create RecordReader: ", e);
      }
    }
 @Override
 public void write(DataOutput out) throws IOException {
   WritableUtils.writeString(out, realSplit.getClass().getName());
   ((Writable) realSplit).write(out);
 }
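The write() above records the concrete split class name before delegating to the split itself; the matching readFields() is not shown in the excerpt. The sketch below assumes realSplit is declared as org.apache.hadoop.mapred.InputSplit, that a conf field (possibly null) is available on the wrapper, and uses Hadoop's WritableUtils and ReflectionUtils.

 @Override
 public void readFields(DataInput in) throws IOException {
   // Recover the concrete split class recorded by write(), instantiate it,
   // and let it deserialize its own fields from the remaining bytes.
   String className = WritableUtils.readString(in);
   try {
     Class<?> splitClass = Class.forName(className);
     realSplit = (org.apache.hadoop.mapred.InputSplit)
         ReflectionUtils.newInstance(splitClass, conf); // conf may be null
     ((Writable) realSplit).readFields(in);
   } catch (ClassNotFoundException e) {
     throw new IOException("Split class not found: " + className, e);
   }
 }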
 @Override
 public String[] getLocations() throws IOException {
   return realSplit.getLocations();
 }
 @Override
 public long getLength() throws IOException {
   return realSplit.getLength();
 }
 @Override
 public RecordReader<LongWritable, Text> getRecordReader(
     InputSplit split, JobConf job, Reporter reporter) throws IOException {
   reporter.setStatus(split.toString());
   return new ExampleRecordReader(job, (FileSplit) split);
 }
 // constructor used by the old API
 ESRecordReader(
     org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
   reporter.setStatus(split.toString());
   init((ESInputSplit) split, job);
 }
 @Override
 public RecordReader<LongWritable, Text> getRecordReader(
     InputSplit split, JobConf conf, Reporter reporter) throws IOException {
   reporter.setStatus(split.toString());
   return new DeprecatedLzoLineRecordReader(conf, (FileSplit) split);
 }