/**
 * Checks whether a directory is readable as Parquet: it must contain a Parquet
 * summary file, a Drill metadata cache file, or at least one file that is
 * itself readable as Parquet.
 */
boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
  Path p = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
  try {
    if (fs.exists(p)) {
      return true;
    } else {
      if (metaDataFileExists(fs, dir)) {
        return true;
      }
      PathFilter filter = new DrillPathFilter();
      FileStatus[] files = fs.listStatus(dir.getPath(), filter);
      if (files.length == 0) {
        return false;
      }
      return super.isFileReadable(fs, files[0]);
    }
  } catch (IOException e) {
    logger.info("Failure while attempting to check for Parquet metadata file.", e);
    return false;
  }
}
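// Illustrative sketch (an assumption, not quoted from the class above): the
// metaDataFileExists(fs, dir) helper referenced by isDirReadable() can be as
// simple as an existence probe for Drill's metadata cache file in that
// directory. The constant Metadata.METADATA_FILENAME and the use of the plain
// Hadoop FileSystem type are assumed here for illustration
// (imports: org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path).
boolean metaDataFileExists(FileSystem fs, FileStatus dir) throws IOException {
  // probe for the cache file; IOException propagates to the caller's catch block
  return fs.exists(new Path(dir.getPath(), Metadata.METADATA_FILENAME));
}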
/**
 * Performs the initial setup required for the record reader. Initializes the
 * input stream, the handling of the output record batch, and the actual reader
 * to be used.
 *
 * @param context operator context from which buffers will be allocated and managed
 * @param outputMutator used to create the schema in the output record batch
 * @throws ExecutionSetupException if the output, input, or reader cannot be set up
 */
@Override
public void setup(OperatorContext context, OutputMutator outputMutator) throws ExecutionSetupException {
  oContext = context;
  readBuffer = context.getManagedBuffer(READ_BUFFER);
  whitespaceBuffer = context.getManagedBuffer(WHITE_SPACE_BUFFER);

  // setup Output, Input, and Reader
  try {
    TextOutput output = null;
    TextInput input = null;
    InputStream stream = null;

    // setup Output using OutputMutator
    if (settings.isHeaderExtractionEnabled()) {
      // extract the header and use it to set up a set of VarCharVectors
      String[] fieldNames = extractHeader();
      output = new FieldVarCharOutput(outputMutator, fieldNames, getColumns(), isStarQuery());
    } else {
      // simply use a RepeatedVarCharVector
      output = new RepeatedVarCharOutput(outputMutator, getColumns(), isStarQuery());
    }

    // setup Input using InputStream
    stream = dfs.openPossiblyCompressedStream(split.getPath());
    input = new TextInput(settings, stream, readBuffer,
        split.getStart(), split.getStart() + split.getLength());

    // setup Reader using Input and Output
    reader = new TextReader(settings, input, output, whitespaceBuffer);
    reader.start();
  } catch (SchemaChangeException | IOException e) {
    throw new ExecutionSetupException(
        String.format("Failure while setting up text reader for file %s", split.getPath()), e);
  } catch (IllegalArgumentException e) {
    throw UserException.dataReadError(e)
        .addContext("File Path", split.getPath().toString())
        .build(logger);
  }
}
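// Illustrative sketch (hypothetical, not part of the reader class): how a
// caller is expected to drive the lifecycle established by setup() above.
// Drill's RecordReader contract is setup() once, next() until it reports zero
// records, then close(); the drive() helper below is an assumed harness, not
// Drill code. It also shows the design split above: header extraction yields
// named per-field vectors (FieldVarCharOutput), otherwise all cells land in a
// single repeated `columns` vector (RepeatedVarCharOutput).
void drive(RecordReader reader, OperatorContext context, OutputMutator mutator) throws Exception {
  reader.setup(context, mutator);   // builds output vectors, opens the stream, starts the TextReader
  try {
    while (reader.next() > 0) {
      // each pass fills one output record batch; downstream operators consume it here
    }
  } finally {
    reader.close();                 // releases the input stream and managed buffers
  }
}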
/**
 * Extracts the header from the text file. Currently the header is assumed to
 * be the first line when headerExtractionEnabled is set to true.
 * TODO: enhance to support more common header patterns.
 *
 * @return field name strings
 */
private String[] extractHeader() throws SchemaChangeException, IOException, ExecutionSetupException {
  assert (settings.isHeaderExtractionEnabled());
  assert (oContext != null);

  // don't skip the header line even if skipFirstLine is set to true
  settings.setSkipFirstLine(false);

  // setup Output using OutputMutator
  // we use a separate output mutator to avoid reshaping the query output with header data
  HeaderOutputMutator hOutputMutator = new HeaderOutputMutator();
  TextOutput hOutput = new RepeatedVarCharOutput(hOutputMutator, getColumns(), true);
  this.allocate(hOutputMutator.fieldVectorMap);

  // setup Input using InputStream
  // we must read the file header irrespective of the split given to this reader
  InputStream hStream = dfs.openPossiblyCompressedStream(split.getPath());
  TextInput hInput = new TextInput(settings, hStream,
      oContext.getManagedBuffer(READ_BUFFER), 0, split.getLength());

  // setup Reader using Input and Output
  this.reader = new TextReader(settings, hInput, hOutput,
      oContext.getManagedBuffer(WHITE_SPACE_BUFFER));
  reader.start();

  // extract the first row only
  reader.parseNext();

  // grab the field names from the output
  String[] fieldNames = ((RepeatedVarCharOutput) hOutput).getTextOutput();

  // cleanup and set to skip the first line next time we read input
  reader.close();
  hOutputMutator.close();
  settings.setSkipFirstLine(true);
  return fieldNames;
}
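// Self-contained illustration (hypothetical, plain Java, not Drill code) of
// the contract extractHeader() implements: parse only the first line with the
// configured delimiter, return its cells as the field names, and resume the
// data pass on the second line. The real TextReader additionally honors
// quoting and escape settings, which this sketch deliberately omits.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;

class HeaderExtractionDemo {
  static String[] extractHeader(BufferedReader reader, char delimiter) throws IOException {
    String first = reader.readLine();  // header assumed to be the first line
    return first == null
        ? new String[0]
        : first.split(Pattern.quote(String.valueOf(delimiter)));
  }

  public static void main(String[] args) throws IOException {
    BufferedReader r = new BufferedReader(new StringReader("id,name,amount\n1,foo,10\n"));
    String[] names = extractHeader(r, ',');      // -> {"id", "name", "amount"}
    System.out.println(String.join("|", names)); // prints: id|name|amount
    System.out.println(r.readLine());            // data pass resumes at line two: 1,foo,10
  }
}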