Example 1
  static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, Path path) {
    // ORC uses a custom InputFormat but is always splittable
    if (inputFormat.getClass().getSimpleName().equals("OrcInputFormat")) {
      return true;
    }

    // use reflection to find the protected isSplitable(FileSystem, Path) method
    // (note Hadoop's single-"t" spelling) declared on the format class or a superclass
    Method method = null;
    for (Class<?> clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) {
      try {
        method = clazz.getDeclaredMethod("isSplitable", FileSystem.class, Path.class);
        break;
      } catch (NoSuchMethodException ignored) {
      }
    }

    if (method == null) {
      return false;
    }
    try {
      method.setAccessible(true);
      return (boolean) method.invoke(inputFormat, fileSystem, path);
    } catch (InvocationTargetException | IllegalAccessException e) {
      throw Throwables.propagate(e);
    }
  }
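A minimal usage sketch for the helper above, assuming it is visible to the caller and that the usual org.apache.hadoop.fs and org.apache.hadoop.mapred imports are present; the file path is hypothetical:
  static void printSplittable() throws IOException {
    JobConf conf = new JobConf();
    FileSystem fileSystem = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/data.txt"); // hypothetical file

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(conf);

    // TextInputFormat#isSplitable consults the compression codec, so a plain text
    // file reports true, while e.g. a gzip-compressed file reports false.
    System.out.println(isSplittable(inputFormat, fileSystem, path));
  }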
Example 2
  private long[] getInputSizes(InputFormat[] inputFormats, JobConf[] jobConfs) throws IOException {
    long[] inputSizes = new long[inputFormats.length];

    for (int i = 0; i < inputFormats.length; i++) {
      InputFormat inputFormat = inputFormats[i];
      InputSplit[] splits = inputFormat.getSplits(jobConfs[i], 1);

      for (InputSplit split : splits) {
        inputSizes[i] += split.getLength();
      }
    }

    return inputSizes;
  }
Example 3
  private long[] getInputSplitSizes(InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits)
      throws IOException {
    long[] inputSizes = new long[inputFormats.length];

    for (int i = 0; i < inputFormats.length; i++) {
      InputFormat inputFormat = inputFormats[i];
      InputSplit[] splits = inputFormat.getSplits(jobConfs[i], numSplits);

      // record the number of splits produced (not their byte size)
      inputSizes[i] = splits.length;
    }

    return inputSizes;
  }
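A short sketch exercising both helpers above, assuming it lives in the same class and that the standard org.apache.hadoop.mapred imports are in place; the input directory is hypothetical. Note that getInputSizes sums split lengths in bytes, while getInputSplitSizes reports how many splits each format produced:
  private void printInputStats() throws IOException {
    JobConf conf = new JobConf();
    FileInputFormat.setInputPaths(conf, new Path("/tmp/input")); // hypothetical directory

    TextInputFormat textFormat = new TextInputFormat();
    textFormat.configure(conf);

    InputFormat[] formats = {textFormat};
    JobConf[] confs = {conf};

    long[] totalBytes = getInputSizes(formats, confs); // total bytes per format
    long[] splitCounts = getInputSplitSizes(formats, confs, 4); // split count per format

    System.out.println("bytes=" + totalBytes[0] + ", splits=" + splitCounts[0]);
  }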
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);

    initInputFormat(jobConf);

    org.apache.hadoop.mapred.InputSplit[] splits =
        realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

    if (splits == null) {
      return null;
    }

    List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

    for (org.apache.hadoop.mapred.InputSplit split : splits) {
      if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
        org.apache.hadoop.mapred.FileSplit mapredFileSplit =
            ((org.apache.hadoop.mapred.FileSplit) split);
        resultSplits.add(
            new FileSplit(
                mapredFileSplit.getPath(),
                mapredFileSplit.getStart(),
                mapredFileSplit.getLength(),
                mapredFileSplit.getLocations()));
      } else {
        resultSplits.add(new InputSplitWrapper(split));
      }
    }

    return resultSplits;
  }
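The conversion inside the loop above can be read in isolation as a small helper; a sketch only, independent of this wrapper class:
  static org.apache.hadoop.mapreduce.lib.input.FileSplit toNewApiSplit(
      org.apache.hadoop.mapred.FileSplit oldSplit) throws IOException {
    // Copy path, offset, length and host locations into the new-API FileSplit.
    return new org.apache.hadoop.mapreduce.lib.input.FileSplit(
        oldSplit.getPath(), oldSplit.getStart(), oldSplit.getLength(), oldSplit.getLocations());
  }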
Example 5
  public static RecordReader<?, ?> createRecordReader(
      Configuration configuration,
      Path path,
      long start,
      long length,
      Properties schema,
      List<HiveColumnHandle> columns) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns =
        ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes =
        ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));

    // Tell Hive which columns we want to read; this lets Hive optimize reading of
    // column-oriented files
    setReadColumns(configuration, readHiveColumnIndexes);

    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = new JobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);

    // propagate serialization configuration to getRecordReader
    schema
        .stringPropertyNames()
        .stream()
        .filter(name -> name.startsWith("serialization."))
        .forEach(name -> jobConf.set(name, schema.getProperty(name)));

    try {
      return retry()
          .stopOnIllegalExceptions()
          .run(
              "createRecordReader",
              () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
    } catch (Exception e) {
      throw new PrestoException(
          HIVE_CANNOT_OPEN_SPLIT,
          format(
              "Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
              path, start, length, getInputFormatName(schema), e.getMessage()),
          e);
    }
  }
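A generic, stripped-down version of the core call above: build a FileSplit for a byte range and ask an old-API InputFormat for a record reader. This is only a sketch; the path and offsets are hypothetical, and TextInputFormat stands in for whatever format the schema actually resolves to:
  static RecordReader<LongWritable, Text> openRange(Configuration configuration) throws IOException {
    JobConf jobConf = new JobConf(configuration);
    // org.apache.hadoop.mapred.FileSplit, as in the method above; null hosts are allowed
    FileSplit fileSplit = new FileSplit(new Path("/tmp/data.txt"), 0, 1024, (String[]) null);

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);
    return inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
  }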
    @Override
    public void initialize(InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {

      org.apache.hadoop.mapred.InputSplit oldSplit;

      if (split.getClass() == FileSplit.class) {
        oldSplit =
            new org.apache.hadoop.mapred.FileSplit(
                ((FileSplit) split).getPath(),
                ((FileSplit) split).getStart(),
                ((FileSplit) split).getLength(),
                split.getLocations());
      } else {
        oldSplit = ((InputSplitWrapper) split).realSplit;
      }

      @SuppressWarnings("unchecked")
      Reporter reporter = new Reporter() { // adapt the new-API context to the old Reporter interface

            final TaskInputOutputContext ioCtx =
                context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

            public void progress() {
              HadoopCompat.progress(context);
            }

            // @Override
            public float getProgress() {
              return (ioCtx != null) ? ioCtx.getProgress() : 0;
            }

            public void setStatus(String status) {
              if (ioCtx != null) HadoopCompat.setStatus(ioCtx, status);
            }

            public void incrCounter(String group, String counter, long amount) {
              if (ioCtx != null)
                HadoopCompat.incrementCounter(ioCtx.getCounter(group, counter), amount);
            }

            @SuppressWarnings("unchecked")
            public void incrCounter(Enum<?> key, long amount) {
              if (ioCtx != null) HadoopCompat.incrementCounter(ioCtx.getCounter(key), amount);
            }

            public org.apache.hadoop.mapred.InputSplit getInputSplit()
                throws UnsupportedOperationException {
              throw new UnsupportedOperationException();
            }

            public Counter getCounter(String group, String name) {
              return ioCtx != null ? (Counter) HadoopCompat.getCounter(ioCtx, group, name) : null;
            }

            @SuppressWarnings("unchecked")
            public Counter getCounter(Enum<?> name) {
              return ioCtx != null ? (Counter) ioCtx.getCounter(name) : null;
            }
          };

      realReader =
          realInputFormat.getRecordReader(
              oldSplit, (JobConf) HadoopCompat.getConfiguration(context), reporter);

      keyObj = realReader.createKey();
      valueObj = realReader.createValue();
    }
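In a wrapper like this, the method that typically follows is the new-API nextKeyValue() driving the old-API reader through the key and value objects initialized above; a minimal sketch, not the actual implementation:
    @Override
    public boolean nextKeyValue() throws IOException {
      // Delegate to the old-API reader, reusing the pre-created key/value objects.
      return realReader.next(keyObj, valueObj);
    }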
  @Override
  public RecordReader<MapWritable, MapWritable> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    return baseInputFormat.getRecordReader(split, job, reporter);
  }
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    return baseInputFormat.getSplits(job, numSplits);
  }
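For context, the two overrides above fit naturally into a small delegating InputFormat; a minimal sketch assuming a constructor-injected baseInputFormat field and the usual imports (java.io.IOException, org.apache.hadoop.io.MapWritable, org.apache.hadoop.mapred.*). The class name is illustrative:
public class DelegatingMapWritableInputFormat implements InputFormat<MapWritable, MapWritable> {

  private final InputFormat<MapWritable, MapWritable> baseInputFormat;

  public DelegatingMapWritableInputFormat(InputFormat<MapWritable, MapWritable> baseInputFormat) {
    this.baseInputFormat = baseInputFormat;
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // Splits come straight from the wrapped format.
    return baseInputFormat.getSplits(job, numSplits);
  }

  @Override
  public RecordReader<MapWritable, MapWritable> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    // Record reading is delegated as well; the wrapper adds no behavior of its own.
    return baseInputFormat.getRecordReader(split, job, reporter);
  }
}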