/** * Determines which RGs need to be read, after stripes have been determined. SARG is applied, and * readState is populated for each stripe accordingly. */ private boolean determineRgsToRead( boolean[] globalIncludes, int rowIndexStride, ArrayList<OrcStripeMetadata> metadata) throws IOException { SargApplier sargApp = null; if (sarg != null && rowIndexStride != 0) { List<OrcProto.Type> types = fileMetadata.getTypes(); String[] colNamesForSarg = OrcInputFormat.getSargColumnNames( columnNames, types, globalIncludes, fileMetadata.isOriginalFormat()); sargApp = new SargApplier(sarg, colNamesForSarg, rowIndexStride, types, globalIncludes.length); } boolean hasAnyData = false; // readState should have been initialized by this time with an empty array. for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { int stripeIx = stripeIxMod + stripeIxFrom; StripeInformation stripe = fileMetadata.getStripes().get(stripeIx); int rgCount = getRgCount(stripe, rowIndexStride); boolean[] rgsToRead = null; if (sargApp != null) { OrcStripeMetadata stripeMetadata = metadata.get(stripeIxMod); rgsToRead = sargApp.pickRowGroups( stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getBloomFilterIndexes(), true); } boolean isNone = rgsToRead == SargApplier.READ_NO_RGS, isAll = rgsToRead == SargApplier.READ_ALL_RGS; hasAnyData = hasAnyData || !isNone; if (DebugUtils.isTraceOrcEnabled()) { if (isNone) { LlapIoImpl.LOG.info("SARG eliminated all RGs for stripe " + stripeIx); } else if (!isAll) { LlapIoImpl.LOG.info( "SARG picked RGs for stripe " + stripeIx + ": " + DebugUtils.toString(rgsToRead)); } else { LlapIoImpl.LOG.info("Will read all " + rgCount + " RGs for stripe " + stripeIx); } } assert isAll || isNone || rgsToRead.length == rgCount; readState[stripeIxMod] = new boolean[columnIds.size()][]; for (int j = 0; j < columnIds.size(); ++j) { readState[stripeIxMod][j] = (isAll || isNone) ? rgsToRead : Arrays.copyOf(rgsToRead, rgsToRead.length); } adjustRgMetric(rgCount, rgsToRead, isNone, isAll); } return hasAnyData; }
/** Reads the metadata for all stripes in the file. */ private ArrayList<OrcStripeMetadata> readStripesMetadata( boolean[] globalInc, boolean[] sargColumns) throws IOException { ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length); OrcBatchKey stripeKey = new OrcBatchKey(fileId, 0, 0); for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { stripeKey.stripeIx = stripeIxMod + stripeIxFrom; OrcStripeMetadata value = metadataCache.getStripeMetadata(stripeKey); if (value == null || !value.hasAllIndexes(globalInc)) { counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); StripeInformation si = fileMetadata.getStripes().get(stripeKey.stripeIx); if (value == null) { long startTime = counters.startTimeCounter(); value = new OrcStripeMetadata(stripeKey, metadataReader, si, globalInc, sargColumns); counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime); value = metadataCache.putStripeMetadata(value); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Caching stripe " + stripeKey.stripeIx + " metadata with includes: " + DebugUtils.toString(globalInc)); } // Create new key object to reuse for gets; we've used the old one to put in cache. stripeKey = new OrcBatchKey(fileId, 0, 0); } // We might have got an old value from cache; recheck it has indexes. if (!value.hasAllIndexes(globalInc)) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Updating indexes in stripe " + stripeKey.stripeIx + " metadata for includes: " + DebugUtils.toString(globalInc)); } updateLoadedIndexes(value, si, globalInc, sargColumns); } } else { counters.incrCounter(Counter.METADATA_CACHE_HIT); } result.add(value); consumer.setStripeMetadata(value); } return result; }
/** * Takes the data from high-level cache for all stripes and returns to consumer. * * @return List of columns to read per stripe, if any columns were fully eliminated by cache. */ private List<Integer>[] produceDataFromCache(int rowIndexStride) throws IOException { OrcCacheKey key = new OrcCacheKey(fileId, -1, -1, -1); // For each stripe, keep a list of columns that are not fully in cache (null => all of them). @SuppressWarnings("unchecked") List<Integer>[] stripeColsNotInCache = new List[readState.length]; for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { key.stripeIx = stripeIxFrom + stripeIxMod; boolean[][] cols = readState[stripeIxMod]; boolean[] isMissingAnyRgs = new boolean[cols.length]; int totalRgCount = getRgCount(fileMetadata.getStripes().get(key.stripeIx), rowIndexStride); for (int rgIx = 0; rgIx < totalRgCount; ++rgIx) { OrcEncodedColumnBatch col = ECB_POOL.take(); col.init(fileId, key.stripeIx, rgIx, cols.length); boolean hasAnyCached = false; try { key.rgIx = rgIx; for (int colIxMod = 0; colIxMod < cols.length; ++colIxMod) { boolean[] readMask = cols[colIxMod]; // Check if RG is eliminated by SARG if ((readMask == SargApplier.READ_NO_RGS) || (readMask != SargApplier.READ_ALL_RGS && (readMask.length <= rgIx || !readMask[rgIx]))) continue; key.colIx = columnIds.get(colIxMod); ColumnStreamData[] cached = cache.get(key); if (cached == null) { isMissingAnyRgs[colIxMod] = true; continue; } assert cached.length == OrcEncodedColumnBatch.MAX_DATA_STREAMS; col.setAllStreamsData(colIxMod, key.colIx, cached); hasAnyCached = true; if (readMask == SargApplier.READ_ALL_RGS) { // We were going to read all RGs, but some were in cache, allocate the mask. cols[colIxMod] = readMask = new boolean[totalRgCount]; Arrays.fill(readMask, true); } readMask[rgIx] = false; // Got from cache, don't read from disk. } } catch (Throwable t) { // TODO: Any cleanup needed to release data in col back to cache should be here. throw (t instanceof IOException) ? (IOException) t : new IOException(t); } if (hasAnyCached) { consumer.consumeData(col); } } boolean makeStripeColList = false; // By default assume we'll fetch all original columns. for (int colIxMod = 0; colIxMod < cols.length; ++colIxMod) { if (isMissingAnyRgs[colIxMod]) { if (makeStripeColList) { stripeColsNotInCache[stripeIxMod].add(columnIds.get(colIxMod)); } } else if (!makeStripeColList) { // Some columns were fully in cache. Make a per-stripe col list, add previous columns. makeStripeColList = true; stripeColsNotInCache[stripeIxMod] = new ArrayList<Integer>(cols.length - 1); for (int i = 0; i < colIxMod; ++i) { stripeColsNotInCache[stripeIxMod].add(columnIds.get(i)); } } } } return stripeColsNotInCache; }
/** Determine which stripes to read for a split. Populates stripeIxFrom and readState. */ public void determineStripesToRead() { // The unit of caching for ORC is (rg x column) (see OrcBatchKey). List<StripeInformation> stripes = fileMetadata.getStripes(); long offset = split.getStart(), maxOffset = offset + split.getLength(); stripeIxFrom = -1; int stripeIxTo = -1; if (LlapIoImpl.LOGL.isDebugEnabled()) { String tmp = "FileSplit {" + split.getStart() + ", " + split.getLength() + "}; stripes "; for (StripeInformation stripe : stripes) { tmp += "{" + stripe.getOffset() + ", " + stripe.getLength() + "}, "; } LlapIoImpl.LOG.debug(tmp); } int stripeIx = 0; for (StripeInformation stripe : stripes) { long stripeStart = stripe.getOffset(); if (offset > stripeStart) { // We assume splits will never start in the middle of the stripe. ++stripeIx; continue; } if (stripeIxFrom == -1) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes from " + stripeIx + " (" + stripeStart + " >= " + offset + ")"); } stripeIxFrom = stripeIx; } if (stripeStart >= maxOffset) { stripeIxTo = stripeIx; if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes until " + stripeIxTo + " (" + stripeStart + " >= " + maxOffset + "); " + (stripeIxTo - stripeIxFrom) + " stripes"); } break; } ++stripeIx; } if (stripeIxFrom == -1) { if (LlapIoImpl.LOG.isInfoEnabled()) { LlapIoImpl.LOG.info("Not including any stripes - empty split"); } } if (stripeIxTo == -1 && stripeIxFrom != -1) { stripeIxTo = stripeIx; if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Including stripes until " + stripeIx + " (end of file); " + (stripeIxTo - stripeIxFrom) + " stripes"); } } readState = new boolean[stripeIxTo - stripeIxFrom][][]; }
protected Void performDataRead() throws IOException { long startTime = counters.startTimeCounter(); if (LlapIoImpl.LOGL.isInfoEnabled()) { LlapIoImpl.LOG.info("Processing data for " + split.getPath()); } if (processStop()) { recordReaderTime(startTime); return null; } counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath())); orcReader = null; // 1. Get file metadata from cache, or create the reader and read it. // Don't cache the filesystem object for now; Tez closes it and FS cache will fix all that fs = split.getPath().getFileSystem(conf); fileId = determineFileId(fs, split); counters.setDesc(QueryFragmentCounters.Desc.FILE, fileId); try { fileMetadata = getOrReadFileMetadata(); consumer.setFileMetadata(fileMetadata); validateFileMetadata(); if (columnIds == null) { columnIds = createColumnIds(fileMetadata); } // 2. Determine which stripes to read based on the split. determineStripesToRead(); } catch (Throwable t) { recordReaderTime(startTime); consumer.setError(t); return null; } if (readState.length == 0) { consumer.setDone(); recordReaderTime(startTime); return null; // No data to read. } counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length); // 3. Apply SARG if needed, and otherwise determine what RGs to read. int stride = fileMetadata.getRowIndexStride(); ArrayList<OrcStripeMetadata> stripeMetadatas = null; boolean[] globalIncludes = null; boolean[] sargColumns = null; try { globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), columnIds, true); if (sarg != null && stride != 0) { // TODO: move this to a common method int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sarg.getLeaves(), columnNames, 0); // included will not be null, row options will fill the array with trues if null sargColumns = new boolean[globalIncludes.length]; for (int i : filterColumns) { // filter columns may have -1 as index which could be partition column in SARG. if (i > 0) { sargColumns[i] = true; } } // If SARG is present, get relevant stripe metadata from cache or readers. stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns); } // Now, apply SARG if any; w/o sarg, this will just initialize readState. boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas); if (!hasData) { consumer.setDone(); recordReaderTime(startTime); return null; // No data to read. } } catch (Throwable t) { cleanupReaders(); consumer.setError(t); recordReaderTime(startTime); return null; } if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } // 4. Get data from high-level cache. // If some cols are fully in cache, this will also give us the modified list of columns to // read for every stripe (null means read all of them - the usual path). In any case, // readState will be modified for column x rgs that were fetched from high-level cache. List<Integer>[] stripeColsToRead = null; if (cache != null) { try { stripeColsToRead = produceDataFromCache(stride); } catch (Throwable t) { // produceDataFromCache handles its own cleanup. consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } } // 5. Create encoded data reader. // In case if we have high-level cache, we will intercept the data and add it there; // otherwise just pass the data directly to the consumer. Consumer<OrcEncodedColumnBatch> dataConsumer = (cache == null) ? this.consumer : this; try { ensureOrcReader(); // Reader creating updates HDFS counters, don't do it here. DataWrapperForOrc dw = new DataWrapperForOrc(); stripeReader = orcReader.encodedReader(fileId, dw, dw, POOL_FACTORY); stripeReader.setDebugTracing(DebugUtils.isTraceOrcEnabled()); } catch (Throwable t) { consumer.setError(t); recordReaderTime(startTime); cleanupReaders(); return null; } // 6. Read data. // TODO: I/O threadpool could be here - one thread per stripe; for now, linear. OrcBatchKey stripeKey = new OrcBatchKey(fileId, -1, 0); for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) { if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } int stripeIx = stripeIxFrom + stripeIxMod; boolean[][] colRgs = null; boolean[] stripeIncludes = null; OrcStripeMetadata stripeMetadata = null; StripeInformation stripe; try { List<Integer> cols = stripeColsToRead == null ? null : stripeColsToRead[stripeIxMod]; if (cols != null && cols.isEmpty()) continue; // No need to read this stripe. stripe = fileMetadata.getStripes().get(stripeIx); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Reading stripe " + stripeIx + ": " + stripe.getOffset() + ", " + stripe.getLength()); } colRgs = readState[stripeIxMod]; // We assume that NO_RGS value is only set from SARG filter and for all columns; // intermediate changes for individual columns will unset values in the array. // Skip this case for 0-column read. We could probably special-case it just like we do // in EncodedReaderImpl, but for now it's not that important. if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue; // 6.1. Determine the columns to read (usually the same as requested). if (cache == null || cols == null || cols.size() == colRgs.length) { cols = columnIds; stripeIncludes = globalIncludes; } else { // We are reading subset of the original columns, remove unnecessary bitmasks/etc. // This will never happen w/o high-level cache. stripeIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), cols, true); colRgs = genStripeColRgs(cols, colRgs); } // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering. boolean isFoundInCache = false; if (stripeMetadatas != null) { stripeMetadata = stripeMetadatas.get(stripeIxMod); } else { stripeKey.stripeIx = stripeIx; stripeMetadata = metadataCache.getStripeMetadata(stripeKey); isFoundInCache = (stripeMetadata != null); if (!isFoundInCache) { counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); long startTimeHdfs = counters.startTimeCounter(); stripeMetadata = new OrcStripeMetadata( stripeKey, metadataReader, stripe, stripeIncludes, sargColumns); counters.incrTimeCounter(Counter.HDFS_TIME_US, startTimeHdfs); stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata); if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Caching stripe " + stripeKey.stripeIx + " metadata with includes: " + DebugUtils.toString(stripeIncludes)); } stripeKey = new OrcBatchKey(fileId, -1, 0); } consumer.setStripeMetadata(stripeMetadata); } if (!stripeMetadata.hasAllIndexes(stripeIncludes)) { if (DebugUtils.isTraceOrcEnabled()) { LlapIoImpl.LOG.info( "Updating indexes in stripe " + stripeKey.stripeIx + " metadata for includes: " + DebugUtils.toString(stripeIncludes)); } assert isFoundInCache; counters.incrCounter(Counter.METADATA_CACHE_MISS); ensureMetadataReader(); updateLoadedIndexes(stripeMetadata, stripe, stripeIncludes, sargColumns); } else if (isFoundInCache) { counters.incrCounter(Counter.METADATA_CACHE_HIT); } } catch (Throwable t) { consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } if (processStop()) { cleanupReaders(); recordReaderTime(startTime); return null; } // 6.3. Finally, hand off to the stripe reader to produce the data. // This is a sync call that will feed data to the consumer. try { // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru // consumer. It is potentially holding locked buffers, and must perform its own cleanup. // Also, currently readEncodedColumns is not stoppable. The consumer will discard the // data it receives for one stripe. We could probably interrupt it, if it checked that. stripeReader.readEncodedColumns( stripeIx, stripe, stripeMetadata.getRowIndexes(), stripeMetadata.getEncodings(), stripeMetadata.getStreams(), stripeIncludes, colRgs, dataConsumer); } catch (Throwable t) { consumer.setError(t); cleanupReaders(); recordReaderTime(startTime); return null; } } // Done with all the things. recordReaderTime(startTime); dataConsumer.setDone(); if (DebugUtils.isTraceMttEnabled()) { LlapIoImpl.LOG.info("done processing " + split); } // Close the stripe reader, we are done reading. cleanupReaders(); return null; }