/** Ensures metadataReader is initialized for the split. */
private void ensureMetadataReader() throws IOException {
  ensureOrcReader();
  if (metadataReader != null) return;
  long startTime = counters.startTimeCounter();
  metadataReader = orcReader.metadata();
  counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
}
/** Ensures orcReader is initialized for the split. */
private void ensureOrcReader() throws IOException {
  if (orcReader != null) return;
  Path path = HdfsUtils.getFileIdPath(fs, split.getPath(), fileId);
  if (DebugUtils.isTraceOrcEnabled()) {
    LOG.info("Creating reader for " + path + " (" + split.getPath() + ")");
  }
  long startTime = counters.startTimeCounter();
  ReaderOptions opts = OrcFile.readerOptions(conf).filesystem(fs).fileMetadata(fileMetadata);
  orcReader = EncodedOrcFile.createReader(path, opts);
  counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
}
/** Gets file metadata for the split from cache, or reads it from the file. */
private OrcFileMetadata getOrReadFileMetadata() throws IOException {
  OrcFileMetadata metadata = metadataCache.getFileMetadata(fileId);
  if (metadata != null) {
    counters.incrCounter(Counter.METADATA_CACHE_HIT);
    return metadata;
  }
  counters.incrCounter(Counter.METADATA_CACHE_MISS);
  ensureOrcReader();
  // We assume this call doesn't touch HDFS because everything is already read; don't add time.
  metadata = new OrcFileMetadata(fileId, orcReader);
  return metadataCache.putFileMetadata(metadata);
}
/**
 * If the stripe metadata in cache does not have all the indexes for the current query, loads
 * the missing ones. This is a temporary kludge until a real metadata cache becomes available.
 */
private void updateLoadedIndexes(OrcStripeMetadata stripeMetadata, StripeInformation stripe,
    boolean[] stripeIncludes, boolean[] sargColumns) throws IOException {
  // We only synchronize on writes for now - the metadata cache design is very temporary;
  // we pre-allocate the array and never remove entries, so readers should be safe.
  synchronized (stripeMetadata) {
    if (stripeMetadata.hasAllIndexes(stripeIncludes)) return;
    long startTime = counters.startTimeCounter();
    stripeMetadata.loadMissingIndexes(metadataReader, stripe, stripeIncludes, sargColumns);
    counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
  }
}
/** Reads the metadata for all stripes in the file. */
private ArrayList<OrcStripeMetadata> readStripesMetadata(
    boolean[] globalInc, boolean[] sargColumns) throws IOException {
  ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length);
  OrcBatchKey stripeKey = new OrcBatchKey(fileId, 0, 0);
  for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
    int stripeIx = stripeIxMod + stripeIxFrom;
    stripeKey.stripeIx = stripeIx;
    OrcStripeMetadata value = metadataCache.getStripeMetadata(stripeKey);
    if (value == null || !value.hasAllIndexes(globalInc)) {
      counters.incrCounter(Counter.METADATA_CACHE_MISS);
      ensureMetadataReader();
      StripeInformation si = fileMetadata.getStripes().get(stripeIx);
      if (value == null) {
        long startTime = counters.startTimeCounter();
        value = new OrcStripeMetadata(stripeKey, metadataReader, si, globalInc, sargColumns);
        counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
        value = metadataCache.putStripeMetadata(value);
        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info("Caching stripe " + stripeIx
              + " metadata with includes: " + DebugUtils.toString(globalInc));
        }
        // Create a new key object to reuse for gets; we've used the old one to put in cache.
        stripeKey = new OrcBatchKey(fileId, 0, 0);
      }
      // We might have gotten an old value from the cache; recheck that it has the indexes.
      if (!value.hasAllIndexes(globalInc)) {
        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info("Updating indexes in stripe " + stripeIx
              + " metadata for includes: " + DebugUtils.toString(globalInc));
        }
        updateLoadedIndexes(value, si, globalInc, sargColumns);
      }
    } else {
      counters.incrCounter(Counter.METADATA_CACHE_HIT);
    }
    result.add(value);
    consumer.setStripeMetadata(value);
  }
  return result;
}
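/** Sets the SELECTED_ROWGROUPS counter to the number of row groups that will be read. */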
private void adjustRgMetric(int rgCount, boolean[] rgsToRead, boolean isNone, boolean isAll) {
  int count = 0;
  if (!isAll) {
    for (boolean b : rgsToRead) {
      if (b) {
        count++;
      }
    }
  } else if (!isNone) {
    count = rgCount;
  }
  counters.setCounter(QueryFragmentCounters.Counter.SELECTED_ROWGROUPS, count);
}
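/** Adds the time elapsed since startTime to the total I/O time counter. */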
private void recordReaderTime(long startTime) {
  counters.incrTimeCounter(Counter.TOTAL_IO_TIME_US, startTime);
}
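/**
 * Reads the data for the split: file and stripe metadata, then encoded column data, feeding
 * results to the consumer. Always returns null; errors are reported via the consumer rather
 * than thrown.
 */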
protected Void performDataRead() throws IOException {
  long startTime = counters.startTimeCounter();
  if (LlapIoImpl.LOGL.isInfoEnabled()) {
    LlapIoImpl.LOG.info("Processing data for " + split.getPath());
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return null;
  }
  counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath()));
  orcReader = null;
  // 1. Get file metadata from cache, or create the reader and read it.
  // Don't cache the filesystem object for now; Tez closes it and the FS cache will fix all that.
  fs = split.getPath().getFileSystem(conf);
  fileId = determineFileId(fs, split);
  counters.setDesc(QueryFragmentCounters.Desc.FILE, fileId);

  try {
    fileMetadata = getOrReadFileMetadata();
    consumer.setFileMetadata(fileMetadata);
    validateFileMetadata();
    if (columnIds == null) {
      columnIds = createColumnIds(fileMetadata);
    }

    // 2. Determine which stripes to read based on the split.
    determineStripesToRead();
  } catch (Throwable t) {
    recordReaderTime(startTime);
    consumer.setError(t);
    return null;
  }

  if (readState.length == 0) {
    consumer.setDone();
    recordReaderTime(startTime);
    return null; // No data to read.
  }
  counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length);

  // 3. Apply SARG if needed, and otherwise determine what RGs to read.
  int stride = fileMetadata.getRowIndexStride();
  ArrayList<OrcStripeMetadata> stripeMetadatas = null;
  boolean[] globalIncludes = null;
  boolean[] sargColumns = null;
  try {
    globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), columnIds, true);
    if (sarg != null && stride != 0) {
      // TODO: move this to a common method
      int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(
          sarg.getLeaves(), columnNames, 0);
      // included will not be null; row options will fill the array with trues if null.
      sargColumns = new boolean[globalIncludes.length];
      for (int i : filterColumns) {
        // Filter columns may have an index of -1, which could be a partition column in the SARG.
        if (i > 0) {
          sargColumns[i] = true;
        }
      }

      // If a SARG is present, get relevant stripe metadata from the cache or the readers.
      stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns);
    }

    // Now, apply the SARG if any; without a SARG, this will just initialize readState.
    boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas);
    if (!hasData) {
      consumer.setDone();
      recordReaderTime(startTime);
      return null; // No data to read.
    }
  } catch (Throwable t) {
    cleanupReaders();
    consumer.setError(t);
    recordReaderTime(startTime);
    return null;
  }

  if (processStop()) {
    cleanupReaders();
    recordReaderTime(startTime);
    return null;
  }

  // 4. Get data from the high-level cache.
  //    If some columns are fully in cache, this will also give us the modified list of columns
  //    to read for every stripe (null means read all of them - the usual path). In any case,
  //    readState will be modified for column x RG combinations that were fetched from the
  //    high-level cache.
  List<Integer>[] stripeColsToRead = null;
  if (cache != null) {
    try {
      stripeColsToRead = produceDataFromCache(stride);
    } catch (Throwable t) {
      // produceDataFromCache handles its own cleanup.
      consumer.setError(t);
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }
  }

  // 5. Create the encoded data reader.
  //    If we have a high-level cache, we will intercept the data and add it there;
  //    otherwise just pass the data directly to the consumer.
  Consumer<OrcEncodedColumnBatch> dataConsumer = (cache == null) ? this.consumer : this;
  try {
    ensureOrcReader(); // Reader creation updates HDFS counters, don't do it here.
    DataWrapperForOrc dw = new DataWrapperForOrc();
    stripeReader = orcReader.encodedReader(fileId, dw, dw, POOL_FACTORY);
    stripeReader.setDebugTracing(DebugUtils.isTraceOrcEnabled());
  } catch (Throwable t) {
    consumer.setError(t);
    recordReaderTime(startTime);
    cleanupReaders();
    return null;
  }

  // 6. Read data.
  // TODO: an I/O threadpool could be used here - one thread per stripe; for now, linear.
  OrcBatchKey stripeKey = new OrcBatchKey(fileId, -1, 0);
  for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
    if (processStop()) {
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }
    int stripeIx = stripeIxFrom + stripeIxMod;
    boolean[][] colRgs = null;
    boolean[] stripeIncludes = null;
    OrcStripeMetadata stripeMetadata = null;
    StripeInformation stripe;
    try {
      List<Integer> cols = stripeColsToRead == null ? null : stripeColsToRead[stripeIxMod];
      if (cols != null && cols.isEmpty()) continue; // No need to read this stripe.
      stripe = fileMetadata.getStripes().get(stripeIx);
      if (DebugUtils.isTraceOrcEnabled()) {
        LlapIoImpl.LOG.info("Reading stripe " + stripeIx + ": "
            + stripe.getOffset() + ", " + stripe.getLength());
      }
      colRgs = readState[stripeIxMod];

      // We assume that the NO_RGS value is only set from the SARG filter, and for all columns;
      // intermediate changes for individual columns will unset values in the array.
      // Skip this case for a 0-column read. We could probably special-case it just like we do
      // in EncodedReaderImpl, but for now it's not that important.
      if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue;

      // 6.1. Determine the columns to read (usually the same as requested).
      if (cache == null || cols == null || cols.size() == colRgs.length) {
        cols = columnIds;
        stripeIncludes = globalIncludes;
      } else {
        // We are reading a subset of the original columns; remove unnecessary bitmasks etc.
        // This will never happen without the high-level cache.
        stripeIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), cols, true);
        colRgs = genStripeColRgs(cols, colRgs);
      }

      // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering.
      boolean isFoundInCache = false;
      if (stripeMetadatas != null) {
        stripeMetadata = stripeMetadatas.get(stripeIxMod);
      } else {
        stripeKey.stripeIx = stripeIx;
        stripeMetadata = metadataCache.getStripeMetadata(stripeKey);
        isFoundInCache = (stripeMetadata != null);
        if (!isFoundInCache) {
          counters.incrCounter(Counter.METADATA_CACHE_MISS);
          ensureMetadataReader();
          long startTimeHdfs = counters.startTimeCounter();
          stripeMetadata = new OrcStripeMetadata(
              stripeKey, metadataReader, stripe, stripeIncludes, sargColumns);
          counters.incrTimeCounter(Counter.HDFS_TIME_US, startTimeHdfs);
          stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata);
          if (DebugUtils.isTraceOrcEnabled()) {
            LlapIoImpl.LOG.info("Caching stripe " + stripeIx
                + " metadata with includes: " + DebugUtils.toString(stripeIncludes));
          }
          // Create a new key object to reuse for gets; we've used the old one to put in cache.
          stripeKey = new OrcBatchKey(fileId, -1, 0);
        }
        consumer.setStripeMetadata(stripeMetadata);
      }
      if (!stripeMetadata.hasAllIndexes(stripeIncludes)) {
        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info("Updating indexes in stripe " + stripeIx
              + " metadata for includes: " + DebugUtils.toString(stripeIncludes));
        }
        assert isFoundInCache;
        counters.incrCounter(Counter.METADATA_CACHE_MISS);
        ensureMetadataReader();
        updateLoadedIndexes(stripeMetadata, stripe, stripeIncludes, sargColumns);
      } else if (isFoundInCache) {
        counters.incrCounter(Counter.METADATA_CACHE_HIT);
      }
    } catch (Throwable t) {
      consumer.setError(t);
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }
    if (processStop()) {
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }

    // 6.3. Finally, hand off to the stripe reader to produce the data.
    //      This is a sync call that will feed data to the consumer.
    try {
      // TODO: readEncodedColumns is not supposed to throw; errors should be propagated through
      // the consumer. It is potentially holding locked buffers, and must perform its own
      // cleanup. Also, readEncodedColumns is currently not stoppable. The consumer will
      // discard the data it receives for one stripe. We could probably interrupt it, if it
      // checked for that.
      stripeReader.readEncodedColumns(stripeIx, stripe, stripeMetadata.getRowIndexes(),
          stripeMetadata.getEncodings(), stripeMetadata.getStreams(), stripeIncludes,
          colRgs, dataConsumer);
    } catch (Throwable t) {
      consumer.setError(t);
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }
  }

  // Done with all the things.
  recordReaderTime(startTime);
  dataConsumer.setDone();
  if (DebugUtils.isTraceMttEnabled()) {
    LlapIoImpl.LOG.info("done processing " + split);
  }

  // Close the stripe reader; we are done reading.
  cleanupReaders();
  return null;
}