/** * Determines which RGs need to be read, after stripes have been determined. SARG is applied, and * readState is populated for each stripe accordingly. */ private boolean determineRgsToRead( boolean[] globalIncludes, int rowIndexStride, ArrayList<OrcStripeMetadata> metadata) throws IOException { SargApplier sargApp = null; if (sarg != null && rowIndexStride != 0) { List<OrcProto.Type> types = fileMetadata.getTypes(); String[] colNamesForSarg = OrcInputFormat.getSargColumnNames( columnNames, types, globalIncludes, fileMetadata.isOriginalFormat()); sargApp = new SargApplier(sarg, colNamesForSarg, rowIndexStride, types, globalIncludes.length); } boolean hasAnyData = false; // readState should have been initialized by this time with an empty array. for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { int stripeIx = stripeIxMod + stripeIxFrom; StripeInformation stripe = fileMetadata.getStripes().get(stripeIx); int rgCount = getRgCount(stripe, rowIndexStride); boolean[] rgsToRead = null; if (sargApp != null) { OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod); rgsToRead = sargApp.pickRowGroups( stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getBloomFilterIndexes(), true); } boolean isNone = rgsToRead == SargApplier.READ_NO_RGS, isAll = rgsToRead == SargApplier.READ_ALL_RGS; hasAnyData = hasAnyData || !isNone; if (DebugUtils.isTraceOrcEnabled()) { if (isNone) { LlapIoImpl.LOG.info("SARG eliminated all RGs for stripe " + stripeIx); } else if (!isAll) { LlapIoImpl.LOG.info( "SARG picked RGs for stripe " + stripeIx + ": " + DebugUtils.toString(rgsToRead)); } else { LlapIoImpl.LOG.info("Will read all " + rgCount + " RGs for stripe " + stripeIx); } } assert isAll || isNone || rgsToRead.length == rgCount; readState[stripeIxMod] = new boolean[columnIds.size()][]; for (int j = 0; j < columnIds.size(); ++j) { readState[stripeIxMod][j] = (isAll || isNone) ? 
rgsToRead : Arrays.copyOf(rgsToRead, rgsToRead.length); } adjustRgMetric(rgCount, rgsToRead, isNone, isAll); } return hasAnyData; }
/** Reads the metadata for all stripes in the file. */ private ArrayList<OrcStripeMetadata> readStripesMetadata( boolean[] globalInc, boolean[] sargColumns) throws IOException { ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length); OrcBatchKey stripeKey = new OrcBatchKey(fileId, 0, 0); for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { stripeKey.stripeIx = stripeIxMod + stripeIxFrom; OrcStripeMetadata value = metadataCache.getStripeMetadata(stripeKey); if (value == null || !value.hasAllIndexes(globalInc)) { counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); StripeInformation si = fileMetadata.getStripes().get(stripeKey.stripeIx); if (value == null) { long startTime = counters.startTimeCounter(); value = new OrcStripeMetadata(stripeKey, metadataReader, si, globalInc, sargColumns); counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime); value = metadataCache.putStripeMetadata(value); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Caching stripe " + stripeKey.stripeIx + " metadata with includes: " + DebugUtils.toString(globalInc)); } // Create new key object to reuse for gets; we've used the old one to put in cache. stripeKey = new OrcBatchKey(fileId, 0, 0); } // We might have got an old value from cache; recheck it has indexes. if (!value.hasAllIndexes(globalInc)) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Updating indexes in stripe " + stripeKey.stripeIx + " metadata for includes: " + DebugUtils.toString(globalInc)); } updateLoadedIndexes(value, si, globalInc, sargColumns); } } else { counters.incrCounter(Counter.METADATA_CACHE_HIT); } result.add(value); consumer.setStripeMetadata(value); } return result; }
/**
 * Atomically drops one reference from this object.
 *
 * @return the reference count after the decrement
 * @throws AssertionError if the count becomes negative; the count has already been
 *     decremented (and traced, if lock tracing is on) by the time this is thrown
 */
int decRef() {
  final int remaining = refCount.decrementAndGet();
  if (DebugUtils.isTraceLockingEnabled()) {
    LlapIoImpl.LOG.info("Unlocked " + this + "; refcount " + remaining);
  }
  if (remaining >= 0) {
    return remaining;
  }
  throw new AssertionError("Unexpected refCount " + remaining + ": " + this);
}
/**
 * Lazily creates {@code orcReader} for the current split; does nothing if it already exists.
 * The time spent creating the reader is added to the HDFS time counter.
 *
 * @throws IOException declared for the path resolution and reader creation calls
 */
private void ensureOrcReader() throws IOException {
  if (orcReader == null) {
    Path readerPath = HdfsUtils.getFileIdPath(fs, split.getPath(), fileId);
    if (DebugUtils.isTraceOrcEnabled()) {
      LOG.info("Creating reader for " + readerPath + " (" + split.getPath() + ")");
    }
    long creationStart = counters.startTimeCounter();
    // The already-known file metadata is handed to the reader through its options.
    orcReader =
        EncodedOrcFile.createReader(
            readerPath, OrcFile.readerOptions(conf).filesystem(fs).fileMetadata(fileMetadata));
    counters.incrTimeCounter(Counter.HDFS_TIME_US, creationStart);
  }
}
/**
 * Tries to mark this object as evicted by atomically moving its reference count from 0 to
 * {@code EVICTED_REFCOUNT}.
 *
 * @return whether the object was invalidated; {@code false} if the reference count is not 0,
 *     i.e. the object is locked or already evicted
 */
@Override
public boolean invalidate() {
  boolean evicted = false;
  while (!evicted) {
    int observed = refCount.get();
    if (observed != 0) {
      return false;
    }
    // Retry from the top if the count changed between the read and the swap.
    evicted = refCount.compareAndSet(observed, EVICTED_REFCOUNT);
  }
  if (DebugUtils.isTraceLockingEnabled()) {
    LlapIoImpl.LOG.info("Invalidated " + this + " due to eviction");
  }
  return true;
}
/**
 * Atomically adds one reference to this object, unless it has been evicted.
 *
 * @return the reference count after the increment, or -1 if the object is already evicted
 *     (in which case the count is left untouched)
 */
int incRef() {
  int before;
  int after;
  do {
    before = refCount.get();
    if (before == EVICTED_REFCOUNT) {
      return -1;
    }
    assert before >= 0 : "oldRefCount is " + before + " " + this;
    after = before + 1;
    // A failed swap means the count changed underneath us; re-read and try again.
  } while (!refCount.compareAndSet(before, after));
  if (DebugUtils.isTraceLockingEnabled()) {
    LlapIoImpl.LOG.info("Locked " + this + "; new ref count " + after);
  }
  return after;
}
/**
 * Delegates the disk read to {@code orcDataReader}, recording the elapsed time as HDFS time
 * and, when ORC tracing and info logging are both on, logging the resulting disk ranges.
 *
 * @param range disk ranges to read, passed through to the underlying reader
 * @param baseOffset base offset, passed through to the underlying reader
 * @param doForceDirect passed through to the underlying reader
 * @return the disk range list returned by the underlying reader
 * @throws IOException if the underlying read fails
 */
@Override
public DiskRangeList readFileData(DiskRangeList range, long baseOffset, boolean doForceDirect)
    throws IOException {
  final long readStart = counters.startTimeCounter();
  final DiskRangeList ranges = orcDataReader.readFileData(range, baseOffset, doForceDirect);
  counters.recordHdfsTime(readStart);
  if (!DebugUtils.isTraceOrcEnabled() || !LOG.isInfoEnabled()) {
    return ranges;
  }
  LOG.info(
      "Disk ranges after disk read (file "
          + fileId
          + ", base offset "
          + baseOffset
          + "): "
          + RecordReaderUtils.stringifyDiskRanges(ranges));
  return ranges;
}
@Override public void returnData(OrcEncodedColumnBatch ecb) { for (ColumnStreamData[] datas : ecb.getColumnData()) { if (datas == null) continue; for (ColumnStreamData data : datas) { if (data == null || data.decRef() != 0) continue; if (DebugUtils.isTraceLockingEnabled()) { for (MemoryBuffer buf : data.getCacheBuffers()) { LlapIoImpl.LOG.info("Unlocking " + buf + " at the end of processing"); } } lowLevelCache.releaseBuffers(data.getCacheBuffers()); CSD_POOL.offer(data); } } // We can offer ECB even with some streams not discarded; reset() will clear the arrays. ECB_POOL.offer(ecb); }
/** Determine which stripes to read for a split. Populates stripeIxFrom and readState. */ public void determineStripesToRead() { // The unit of caching for ORC is (rg x column) (see OrcBatchKey). List<StripeInformation> stripes = fileMetadata.getStripes(); long offset = split.getStart(), maxOffset = offset + split.getLength(); stripeIxFrom = -1; int stripeIxTo = -1; if (LlapIoImpl.LOGL.isDebugEnabled()) { String tmp = "FileSplit {" + split.getStart() + ", " + split.getLength() + "}; stripes "; for (StripeInformation stripe : stripes) { tmp += "{" + stripe.getOffset() + ", " + stripe.getLength() + "}, "; } LlapIoImpl.LOG.debug(tmp); } int stripeIx = 0; for (StripeInformation stripe : stripes) { long stripeStart = stripe.getOffset(); if (offset > stripeStart) { // We assume splits will never start in the middle of the stripe. ++stripeIx; continue; } if (stripeIxFrom == -1) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes from " + stripeIx + " (" + stripeStart + " >= " + offset + ")"); } stripeIxFrom = stripeIx; } if (stripeStart >= maxOffset) { stripeIxTo = stripeIx; if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes until " + stripeIxTo + " (" + stripeStart + " >= " + maxOffset + "); " + (stripeIxTo - stripeIxFrom) + " stripes"); } break; } ++stripeIx; } if (stripeIxFrom == -1) { if (LlapIoImpl.LOG.isInfoEnabled()) { LlapIoImpl.LOG.info("Not including any stripes - empty split"); } } if (stripeIxTo == -1 && stripeIxFrom != -1) { stripeIxTo = stripeIx; if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes until " + stripeIx + " (end of file); " + (stripeIxTo - stripeIxFrom) + " stripes"); } } readState = new boolean[stripeIxTo - stripeIxFrom][][]; }
protected Void performDataRead() throws IOException { long startTime = counters.startTimeCounter(); if (LlapIoImpl.LOGL.isInfoEnabled()) { LlapIoImpl.LOG.info("Processing data for " + split.getPath()); } if (processStop()) { recordReaderTime(startTime); return null; } counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath())); orcReader = null; // 1. Get file metadata from cache, or create the reader and read it. // Don't cache the filesystem object for now; Tez closes it and FS cache will fix all that fs = split.getPath().getFileSystem(conf); fileId = determineFileId(fs, split); counters.setDesc(QueryFragmentCounters.Desc.FILE, fileId); try { fileMetadata = getOrReadFileMetadata(); consumer.setFileMetadata(fileMetadata); validateFileMetadata(); if (columnIds == null) { columnIds = createColumnIds(fileMetadata); } // 2. Determine which stripes to read based on the split. determineStripesToRead(); } catch (Throwable t) { recordReaderTime(startTime); consumer.setError(t); return null; } if (readState.length == 0) { consumer.setDone(); recordReaderTime(startTime); return null; // No data to read. } counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length); // 3. Apply SARG if needed, and otherwise determine what RGs to read. int stride = fileMetadata.getRowIndexStride(); ArrayList<OrcStripeMetadata> stripeMetadatas = null; boolean[] globalIncludes = null; boolean[] sargColumns = null; try { globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), columnIds, true); if (sarg != null && stride != 0) { // TODO: move this to a common method int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sarg.getLeaves(), columnNames, 0); // included will not be null, row options will fill the array with trues if null sargColumns = new boolean[globalIncludes.length]; for (int i : filterColumns) { // filter columns may have -1 as index which could be partition column in SARG. 
if (i > 0) { sargColumns[i] = true; } } // If SARG is present, get relevant stripe metadata from cache or readers. stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns); } // Now, apply SARG if any; w/o sarg, this will just initialize readState. boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas); if (!hasData) { consumer.setDone(); recordReaderTime(startTime); return null; // No data to read. } } catch (Throwable t) { cleanupReaders(); consumer.setError(t); recordReaderTime(startTime); return null; } if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } // 4. Get data from high-level cache. // If some cols are fully in cache, this will also give us the modified list of columns to // read for every stripe (null means read all of them - the usual path). In any case, // readState will be modified for column x rgs that were fetched from high-level cache. List<Integer>[] stripeColsToRead = null; if (cache != null) { try { stripeColsToRead = produceDataFromCache(stride); } catch (Throwable t) { // produceDataFromCache handles its own cleanup. consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } } // 5. Create encoded data reader. // In case if we have high-level cache, we will intercept the data and add it there; // otherwise just pass the data directly to the consumer. Consumer<OrcEncodedColumnBatch> dataConsumer = (cache == null) ? this.consumer : this; try { ensureOrcReader(); // Reader creating updates HDFS counters, don't do it here. DataWrapperForOrc dw = new DataWrapperForOrc(); stripeReader = orcReader.encodedReader(fileId, dw, dw, POOL_FACTORY); stripeReader.setDebugTracing(DebugUtils.isTraceOrcEnabled()); } catch (Throwable t) { consumer.setError(t); recordReaderTime(startTime); cleanupReaders(); return null; } // 6. Read data. // TODO: I/O threadpool could be here - one thread per stripe; for now, linear. 
OrcBatchKey stripeKey = new OrcBatchKey(fileId, -1, 0); for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } int stripeIx = stripeIxFrom + stripeIxMod; boolean[][] colRgs = null; boolean[] stripeIncludes = null; OrcStripeMetadata stripeMetadata = null; StripeInformation stripe; try { List<Integer> cols = stripeColsToRead == null ? null : stripeColsToRead[stripeIxMod]; if (cols != null && cols.isEmpty()) continue; // No need to read this stripe. stripe = fileMetadata.getStripes().get(stripeIx); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Reading stripe " + stripeIx + ": " + stripe.getOffset() + ", " + stripe.getLength()); } colRgs = readState[stripeIxMod]; // We assume that NO_RGS value is only set from SARG filter and for all columns; // intermediate changes for individual columns will unset values in the array. // Skip this case for 0-column read. We could probably special-case it just like we do // in EncodedReaderImpl, but for now it's not that important. if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue; // 6.1. Determine the columns to read (usually the same as requested). if (cache == null || cols == null || cols.size() == colRgs.length) { cols = columnIds; stripeIncludes = globalIncludes; } else { // We are reading subset of the original columns, remove unnecessary bitmasks/etc. // This will never happen w/o high-level cache. stripeIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), cols, true); colRgs = genStripeColRgs(cols, colRgs); } // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering. 
boolean isFoundInCache = false; if (stripeMetadatas != null) { stripeMetadata = stripeMetadatas.get(stripeIxMod); } else { stripeKey.stripeIx = stripeIx; stripeMetadata = metadataCache.getStripeMetadata(stripeKey); isFoundInCache = (stripeMetadata != null); if (!isFoundInCache) { counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); long startTimeHdfs = counters.startTimeCounter(); stripeMetadata = new OrcStripeMetadata( stripeKey, metadataReader, stripe, stripeIncludes, sargColumns); counters.incrTimeCounter(Counter.HDFS_TIME_US, startTimeHdfs); stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Caching stripe " + stripeKey.stripeIx + " metadata with includes: " + DebugUtils.toString(stripeIncludes)); } stripeKey = new OrcBatchKey(fileId, -1, 0); } consumer.setStripeMetadata(stripeMetadata); } if (!stripeMetadata.hasAllIndexes(stripeIncludes)) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Updating indexes in stripe " + stripeKey.stripeIx + " metadata for includes: " + DebugUtils.toString(stripeIncludes)); } assert isFoundInCache; counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); updateLoadedIndexes(stripeMetadata, stripe, stripeIncludes, sargColumns); } else if (isFoundInCache) { counters.incrCounter(Counter.METADATA_CACHE_HIT); } } catch (Throwable t) { consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } // 6.3. Finally, hand off to the stripe reader to produce the data. // This is a sync call that will feed data to the consumer. try { // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru // consumer. It is potentially holding locked buffers, and must perform its own cleanup. // Also, currently readEncodedColumns is not stoppable. 
The consumer will discard the // data it receives for one stripe. We could probably interrupt it, if it checked that. stripeReader.readEncodedColumns( stripeIx, stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getEncodings(), stripeMetadata.getStreams(), stripeIncludes, colRgs, dataConsumer); } catch (Throwable t) { consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } } // Done with all the things. recordReaderTime(startTime); dataConsumer.setDone(); if (DebugUtils.isTraceMttEnabled()) { LlapIoImpl.LOG.info("done processing " + split); } // Close the stripe reader, we are done reading. cleanupReaders(); return null; }