static boolean isSplittable(InputFormat<?, ?> inputFormat, FileSystem fileSystem, Path path) {
    // ORC uses a custom InputFormat but is always splittable
    if (inputFormat.getClass().getSimpleName().equals("OrcInputFormat")) {
        return true;
    }

    // use reflection to find the protected isSplitable method (note Hadoop's single-t spelling)
    // declared on FileInputFormat or one of its subclasses
    Method method = null;
    for (Class<?> clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) {
        try {
            method = clazz.getDeclaredMethod("isSplitable", FileSystem.class, Path.class);
            break;
        }
        catch (NoSuchMethodException ignored) {
        }
    }

    if (method == null) {
        return false;
    }
    try {
        method.setAccessible(true);
        return (boolean) method.invoke(inputFormat, fileSystem, path);
    }
    catch (InvocationTargetException | IllegalAccessException e) {
        throw Throwables.propagate(e);
    }
}
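// A minimal sketch of how the reflective splittability check above could be exercised,
// assuming it lives in a utility class called SplitUtil and that the caller sits in the
// same package (the method is package-private). The class name, local path, and use of
// TextInputFormat are illustrative, not taken from the original source.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitUtilExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);

        // TextInputFormat extends FileInputFormat, so the reflective walk up the class
        // hierarchy finds the protected isSplitable(FileSystem, Path) override.
        TextInputFormat format = new TextInputFormat();
        format.configure(new JobConf(conf));

        // Uncompressed text should report as splittable; a gzip-compressed file would not.
        boolean splittable = SplitUtil.isSplittable(format, fs, new Path("/tmp/data.txt"));
        System.out.println("splittable = " + splittable);
    }
}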
private long[] getInputSizes(InputFormat[] inputFormats, JobConf[] jobConfs) throws IOException {
    long[] inputSizes = new long[inputFormats.length];
    for (int i = 0; i < inputFormats.length; i++) {
        InputFormat inputFormat = inputFormats[i];
        // total input size for each format is the sum of the lengths of its splits
        InputSplit[] splits = inputFormat.getSplits(jobConfs[i], 1);
        for (InputSplit split : splits) {
            inputSizes[i] += split.getLength();
        }
    }
    return inputSizes;
}
private long[] getInputSplitSizes(InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits)
        throws IOException {
    // returns the number of splits produced by each input format, not byte sizes
    long[] inputSizes = new long[inputFormats.length];
    for (int i = 0; i < inputFormats.length; i++) {
        InputFormat inputFormat = inputFormats[i];
        InputSplit[] splits = inputFormat.getSplits(jobConfs[i], numSplits);
        inputSizes[i] = splits.length;
    }
    return inputSizes;
}
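// A minimal standalone sketch of what the two helpers above compute, assuming a
// TextInputFormat over an illustrative local input directory: getInputSizes sums
// split lengths in bytes, while getInputSplitSizes counts the splits themselves.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputSizeExample {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/input"));

        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(jobConf);

        // Equivalent of getInputSizes: total bytes across all splits (split hint of 1).
        long totalBytes = 0;
        for (InputSplit split : inputFormat.getSplits(jobConf, 1)) {
            totalBytes += split.getLength();
        }

        // Equivalent of getInputSplitSizes: the number of splits produced for a hint of 8.
        int splitCount = inputFormat.getSplits(jobConf, 8).length;

        System.out.println("bytes=" + totalBytes + ", splits=" + splitCount);
    }
}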
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    JobConf jobConf = (JobConf) HadoopCompat.getConfiguration(context);

    initInputFormat(jobConf);

    org.apache.hadoop.mapred.InputSplit[] splits =
            realInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());

    if (splits == null) {
        return null;
    }

    List<InputSplit> resultSplits = new ArrayList<InputSplit>(splits.length);

    for (org.apache.hadoop.mapred.InputSplit split : splits) {
        if (split.getClass() == org.apache.hadoop.mapred.FileSplit.class) {
            org.apache.hadoop.mapred.FileSplit mapredFileSplit =
                    ((org.apache.hadoop.mapred.FileSplit) split);
            resultSplits.add(new FileSplit(
                    mapredFileSplit.getPath(),
                    mapredFileSplit.getStart(),
                    mapredFileSplit.getLength(),
                    mapredFileSplit.getLocations()));
        }
        else {
            resultSplits.add(new InputSplitWrapper(split));
        }
    }

    return resultSplits;
}
public static RecordReader<?, ?> createRecordReader(
        Configuration configuration,
        Path path,
        long start,
        long length,
        Properties schema,
        List<HiveColumnHandle> columns) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns =
            ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes =
            ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));

    // tell Hive which columns we would like to read; this lets Hive optimize reads of column-oriented files
    setReadColumns(configuration, readHiveColumnIndexes);

    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
    JobConf jobConf = new JobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);

    // propagate serialization configuration to getRecordReader
    schema.stringPropertyNames().stream()
            .filter(name -> name.startsWith("serialization."))
            .forEach(name -> jobConf.set(name, schema.getProperty(name)));

    try {
        return retry()
                .stopOnIllegalExceptions()
                .run("createRecordReader",
                        () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
    }
    catch (Exception e) {
        throw new PrestoException(
                HIVE_CANNOT_OPEN_SPLIT,
                format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
                        path, start, length, getInputFormatName(schema), e.getMessage()),
                e);
    }
}
@Override
public void initialize(InputSplit split, final TaskAttemptContext context)
        throws IOException, InterruptedException {
    org.apache.hadoop.mapred.InputSplit oldSplit;

    if (split.getClass() == FileSplit.class) {
        oldSplit = new org.apache.hadoop.mapred.FileSplit(
                ((FileSplit) split).getPath(),
                ((FileSplit) split).getStart(),
                ((FileSplit) split).getLength(),
                split.getLocations());
    }
    else {
        oldSplit = ((InputSplitWrapper) split).realSplit;
    }

    @SuppressWarnings("unchecked")
    Reporter reporter = new Reporter() { // Reporter interface over ctx
        final TaskInputOutputContext ioCtx =
                context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

        public void progress() {
            HadoopCompat.progress(context);
        }

        // @Override
        public float getProgress() {
            return (ioCtx != null) ? ioCtx.getProgress() : 0;
        }

        public void setStatus(String status) {
            if (ioCtx != null) {
                HadoopCompat.setStatus(ioCtx, status);
            }
        }

        public void incrCounter(String group, String counter, long amount) {
            if (ioCtx != null) {
                HadoopCompat.incrementCounter(ioCtx.getCounter(group, counter), amount);
            }
        }

        @SuppressWarnings("unchecked")
        public void incrCounter(Enum<?> key, long amount) {
            if (ioCtx != null) {
                HadoopCompat.incrementCounter(ioCtx.getCounter(key), amount);
            }
        }

        public org.apache.hadoop.mapred.InputSplit getInputSplit() throws UnsupportedOperationException {
            throw new UnsupportedOperationException();
        }

        public Counter getCounter(String group, String name) {
            return ioCtx != null ? (Counter) HadoopCompat.getCounter(ioCtx, group, name) : null;
        }

        @SuppressWarnings("unchecked")
        public Counter getCounter(Enum<?> name) {
            return ioCtx != null ? (Counter) ioCtx.getCounter(name) : null;
        }
    };

    realReader = realInputFormat.getRecordReader(
            oldSplit, (JobConf) HadoopCompat.getConfiguration(context), reporter);

    keyObj = realReader.createKey();
    valueObj = realReader.createValue();
}
@Override
public RecordReader<MapWritable, MapWritable> getRecordReader(
        InputSplit split, JobConf job, Reporter reporter) throws IOException {
    return baseInputFormat.getRecordReader(split, job, reporter);
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    return baseInputFormat.getSplits(job, numSplits);
}