Example #1
0
 /**
  * Builds the column list used to read every column of the file.
  *
  * <p>The list holds the indexes {@code 1 .. types.size() - 1} of the metadata's type list, in
  * ascending order. Index 0 is deliberately left out (presumably the root type - confirm against
  * the ORC type layout).
  *
  * @param metadata file metadata whose type list determines the number of columns
  * @return a new mutable list of column indexes; empty when there is at most one type
  */
 private static List<Integer> createColumnIds(OrcFileMetadata metadata) {
   final int typeCount = metadata.getTypes().size();
   List<Integer> allColumns = new ArrayList<Integer>(typeCount);
   int columnIx = 1;
   while (columnIx < typeCount) {
     allColumns.add(columnIx);
     columnIx++;
   }
   return allColumns;
 }
Example #2
0
  /**
   * Decides, stripe by stripe, which row groups (RGs) have to be read, once the stripes for the
   * split are known. When a SARG is present and the file has a row index, the SARG is evaluated
   * against each stripe's indexes; otherwise every stripe keeps the "no mask" value. The outcome
   * is stored in readState as one mask per requested column for each stripe.
   *
   * @param globalIncludes include flags used to resolve SARG column names and to size the applier
   * @param rowIndexStride row index stride of the file; 0 disables SARG evaluation
   * @param metadata per-stripe metadata, indexed like readState; only accessed when a SARG is
   *     actually applied
   * @return true if at least one stripe was not completely eliminated
   * @throws IOException propagated from SARG evaluation
   */
  private boolean determineRgsToRead(
      boolean[] globalIncludes, int rowIndexStride, ArrayList<OrcStripeMetadata> metadata)
      throws IOException {
    SargApplier applier = null;
    if (sarg != null && rowIndexStride != 0) {
      List<OrcProto.Type> fileTypes = fileMetadata.getTypes();
      String[] sargColumnNames =
          OrcInputFormat.getSargColumnNames(
              columnNames, fileTypes, globalIncludes, fileMetadata.isOriginalFormat());
      applier =
          new SargApplier(
              sarg, sargColumnNames, rowIndexStride, fileTypes, globalIncludes.length);
    }

    boolean anyStripeHasData = false;
    // readState is expected to be allocated (one empty slot per stripe) before this is called.
    for (int slot = 0; slot < readState.length; ++slot) {
      int stripeIx = stripeIxFrom + slot;
      StripeInformation stripeInfo = fileMetadata.getStripes().get(stripeIx);
      int rgCount = getRgCount(stripeInfo, rowIndexStride);

      boolean[] pickedRgs = null;
      if (applier != null) {
        OrcStripeMetadata stripeMeta = metadata.get(slot);
        pickedRgs =
            applier.pickRowGroups(
                stripeInfo,
                stripeMeta.getRowIndexes(),
                stripeMeta.getBloomFilterIndexes(),
                true);
      }

      // The sentinels are compared by identity, not by content.
      boolean nothingToRead = pickedRgs == SargApplier.READ_NO_RGS;
      boolean everythingToRead = pickedRgs == SargApplier.READ_ALL_RGS;
      if (!nothingToRead) {
        anyStripeHasData = true;
      }

      if (DebugUtils.isTraceOrcEnabled()) {
        if (nothingToRead) {
          LlapIoImpl.LOG.info("SARG eliminated all RGs for stripe " + stripeIx);
        } else if (everythingToRead) {
          LlapIoImpl.LOG.info("Will read all " + rgCount + " RGs for stripe " + stripeIx);
        } else {
          LlapIoImpl.LOG.info(
              "SARG picked RGs for stripe " + stripeIx + ": " + DebugUtils.toString(pickedRgs));
        }
      }
      assert everythingToRead || nothingToRead || pickedRgs.length == rgCount;

      // Sentinel values are shared between columns; a real mask is copied per column so that
      // later per-column modifications do not leak into the other columns.
      boolean shareMask = everythingToRead || nothingToRead;
      boolean[][] perColumnMasks = new boolean[columnIds.size()][];
      readState[slot] = perColumnMasks;
      for (int colIx = 0; colIx < columnIds.size(); ++colIx) {
        if (shareMask) {
          perColumnMasks[colIx] = pickedRgs;
        } else {
          perColumnMasks[colIx] = Arrays.copyOf(pickedRgs, pickedRgs.length);
        }
      }

      adjustRgMetric(rgCount, pickedRgs, nothingToRead, everythingToRead);
    }
    return anyStripeHasData;
  }
Example #3
0
 /**
  * Sanity-checks the file's compression settings against the LLAP cache configuration.
  *
  * <p>Nothing is checked for uncompressed files. For compressed files, a warning is logged when
  * the ORC compression buffer size is below the configured minimum cache allocation size. The
  * method only logs; it never rejects the file.
  *
  * @throws IOException declared for callers; not thrown by the visible code itself
  */
 private void validateFileMetadata() throws IOException {
   if (fileMetadata.getCompressionKind() != CompressionKind.NONE) {
     int compressionBufferSize = fileMetadata.getCompressionBufferSize();
     int cacheMinAlloc = HiveConf.getIntVar(conf, HiveConf.ConfVars.LLAP_ORC_CACHE_MIN_ALLOC);
     if (compressionBufferSize < cacheMinAlloc) {
       StringBuilder warning = new StringBuilder();
       warning.append("ORC compression buffer size (");
       warning.append(compressionBufferSize);
       warning.append(") is smaller than LLAP low-level ");
       warning.append("cache minimum allocation size (");
       warning.append(cacheMinAlloc);
       warning.append("). Decrease the value for ");
       warning.append(HiveConf.ConfVars.LLAP_ORC_CACHE_MIN_ALLOC.toString());
       warning.append(" to avoid wasting memory");
       LOG.warn(warning.toString());
     }
   }
 }
Example #4
0
 /**
  * Reads the metadata for every stripe selected for this split, using the metadata cache where
  * possible.
  *
  * <p>For each stripe, the cached entry is used as-is when it exists and already has all indexes
  * for {@code globalInc} (counted as a cache hit). Otherwise (counted as a cache miss) the
  * metadata is either read through the metadata reader and put into the cache, or the missing
  * indexes are loaded into the existing entry. Every resulting entry is also passed to the
  * consumer.
  *
  * @param globalInc include flags used for the index checks and passed to the metadata loader
  * @param sargColumns per-column flags passed through to the metadata loader
  * @return one metadata entry per stripe of the split, in stripe order
  * @throws IOException propagated from the metadata reader / index loading calls
  */
 private ArrayList<OrcStripeMetadata> readStripesMetadata(
     boolean[] globalInc, boolean[] sargColumns) throws IOException {
   ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length);
   OrcBatchKey stripeKey = new OrcBatchKey(fileId, 0, 0);
   for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
     // Keep the absolute stripe index in a local. stripeKey can be replaced by a fresh key
     // further down, and that fresh key has not been assigned this stripe's index, so it must
     // not be used for lookups or log messages after the replacement.
     final int stripeIx = stripeIxMod + stripeIxFrom;
     stripeKey.stripeIx = stripeIx;
     OrcStripeMetadata value = metadataCache.getStripeMetadata(stripeKey);
     if (value == null || !value.hasAllIndexes(globalInc)) {
       counters.incrCounter(Counter.METADATA_CACHE_MISS);
       ensureMetadataReader();
       StripeInformation si = fileMetadata.getStripes().get(stripeIx);
       if (value == null) {
         long startTime = counters.startTimeCounter();
         value = new OrcStripeMetadata(stripeKey, metadataReader, si, globalInc, sargColumns);
         counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
         // The cache returns the entry to use, which is not necessarily the one just built.
         value = metadataCache.putStripeMetadata(value);
         if (DebugUtils.isTraceOrcEnabled()) {
           LlapIoImpl.LOG.info(
               "Caching stripe "
                   + stripeIx
                   + " metadata with includes: "
                   + DebugUtils.toString(globalInc));
         }
         // Create new key object to reuse for gets; we've used the old one to put in cache.
         stripeKey = new OrcBatchKey(fileId, 0, 0);
       }
       // We might have got an old value from cache; recheck it has indexes.
       if (!value.hasAllIndexes(globalInc)) {
         if (DebugUtils.isTraceOrcEnabled()) {
           LlapIoImpl.LOG.info(
               "Updating indexes in stripe "
                   + stripeIx
                   + " metadata for includes: "
                   + DebugUtils.toString(globalInc));
         }
         updateLoadedIndexes(value, si, globalInc, sargColumns);
       }
     } else {
       counters.incrCounter(Counter.METADATA_CACHE_HIT);
     }
     result.add(value);
     consumer.setStripeMetadata(value);
   }
   return result;
 }
Example #5
0
 /**
  * Takes the data from high-level cache for all stripes and returns to consumer.
  *
  * <p>As a side effect, readState is updated: every (column, RG) pair that was served from the
  * cache is marked as "do not read" in the corresponding mask.
  *
  * @param rowIndexStride row index stride, used to compute the number of RGs per stripe
  * @return List of columns to read per stripe, if any columns were fully eliminated by cache.
  *     An entry is left null when no per-stripe list was needed (read all original columns).
  * @throws IOException if anything fails while fetching from cache; non-IOException failures are
  *     wrapped into an IOException
  */
 private List<Integer>[] produceDataFromCache(int rowIndexStride) throws IOException {
   // A single mutable key is reused for all cache lookups; its fields are overwritten below.
   OrcCacheKey key = new OrcCacheKey(fileId, -1, -1, -1);
   // For each stripe, keep a list of columns that are not fully in cache (null => all of them).
   @SuppressWarnings("unchecked")
   List<Integer>[] stripeColsNotInCache = new List[readState.length];
   for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
     key.stripeIx = stripeIxFrom + stripeIxMod;
     // Per-column read masks for this stripe; entries may be replaced/modified in place below.
     boolean[][] cols = readState[stripeIxMod];
     // Set to true for a column as soon as one RG that needs reading is not found in cache.
     boolean[] isMissingAnyRgs = new boolean[cols.length];
     int totalRgCount = getRgCount(fileMetadata.getStripes().get(key.stripeIx), rowIndexStride);
     for (int rgIx = 0; rgIx < totalRgCount; ++rgIx) {
       // One batch per RG, collecting whatever columns of this RG are available in cache.
       OrcEncodedColumnBatch col = ECB_POOL.take();
       col.init(fileId, key.stripeIx, rgIx, cols.length);
       boolean hasAnyCached = false;
       try {
         key.rgIx = rgIx;
         for (int colIxMod = 0; colIxMod < cols.length; ++colIxMod) {
           boolean[] readMask = cols[colIxMod];
           // Check if RG is eliminated by SARG
           // (the sentinel masks are compared by identity; a real mask is consulted by index).
           if ((readMask == SargApplier.READ_NO_RGS)
               || (readMask != SargApplier.READ_ALL_RGS
                   && (readMask.length <= rgIx || !readMask[rgIx]))) continue;
           key.colIx = columnIds.get(colIxMod);
           ColumnStreamData[] cached = cache.get(key);
           if (cached == null) {
             // Needed but not cached: this column will have to be read for this stripe.
             isMissingAnyRgs[colIxMod] = true;
             continue;
           }
           assert cached.length == OrcEncodedColumnBatch.MAX_DATA_STREAMS;
           col.setAllStreamsData(colIxMod, key.colIx, cached);
           hasAnyCached = true;
           if (readMask == SargApplier.READ_ALL_RGS) {
             // We were going to read all RGs, but some were in cache, allocate the mask.
             cols[colIxMod] = readMask = new boolean[totalRgCount];
             Arrays.fill(readMask, true);
           }
           readMask[rgIx] = false; // Got from cache, don't read from disk.
         }
       } catch (Throwable t) {
         // TODO: Any cleanup needed to release data in col back to cache should be here.
         throw (t instanceof IOException) ? (IOException) t : new IOException(t);
       }
       // Only hand the batch to the consumer if at least one column was filled from cache.
       if (hasAnyCached) {
         consumer.consumeData(col);
       }
     }
     // Build the per-stripe column list lazily: it is only created once the first column with
     // no missing RGs is seen. Note that a column whose RGs were all skipped above also has no
     // missing RGs and is therefore left out of the list as well.
     boolean makeStripeColList = false; // By default assume we'll fetch all original columns.
     for (int colIxMod = 0; colIxMod < cols.length; ++colIxMod) {
       if (isMissingAnyRgs[colIxMod]) {
         if (makeStripeColList) {
           stripeColsNotInCache[stripeIxMod].add(columnIds.get(colIxMod));
         }
       } else if (!makeStripeColList) {
         // Some columns were fully in cache. Make a per-stripe col list, add previous columns.
         makeStripeColList = true;
         stripeColsNotInCache[stripeIxMod] = new ArrayList<Integer>(cols.length - 1);
         for (int i = 0; i < colIxMod; ++i) {
           stripeColsNotInCache[stripeIxMod].add(columnIds.get(i));
         }
       }
     }
   }
   return stripeColsNotInCache;
 }
Example #6
0
  /**
   * Works out which stripes of the file belong to the current split.
   *
   * <p>The first stripe whose offset is not before the split start becomes stripeIxFrom; the
   * range ends (exclusively) at the first stripe whose offset is at or past the split end, or at
   * the end of the file. readState is then allocated with one (still empty) slot per selected
   * stripe. If no stripe qualifies, stripeIxFrom stays -1 and readState has length 0.
   */
  public void determineStripesToRead() {
    // The unit of caching for ORC is (rg x column) (see OrcBatchKey).
    List<StripeInformation> stripes = fileMetadata.getStripes();
    long offset = split.getStart();
    long maxOffset = offset + split.getLength();
    stripeIxFrom = -1;
    int stripeIxTo = -1;

    if (LlapIoImpl.LOGL.isDebugEnabled()) {
      StringBuilder layout = new StringBuilder("FileSplit {");
      layout.append(split.getStart()).append(", ").append(split.getLength());
      layout.append("}; stripes ");
      for (StripeInformation stripe : stripes) {
        layout.append("{").append(stripe.getOffset());
        layout.append(", ").append(stripe.getLength()).append("}, ");
      }
      LlapIoImpl.LOG.debug(layout.toString());
    }

    int stripeIx = 0;
    for (StripeInformation stripe : stripes) {
      long stripeStart = stripe.getOffset();
      // Stripes starting before the split are skipped.
      // We assume splits will never start in the middle of the stripe.
      if (stripeStart >= offset) {
        if (stripeIxFrom == -1) {
          if (DebugUtils.isTraceOrcEnabled()) {
            LlapIoImpl.LOG.info(
                "Including stripes from " + stripeIx + " (" + stripeStart + " >= " + offset + ")");
          }
          stripeIxFrom = stripeIx;
        }
        if (stripeStart >= maxOffset) {
          // This stripe starts at or after the end of the split; it is the exclusive bound.
          stripeIxTo = stripeIx;
          if (DebugUtils.isTraceOrcEnabled()) {
            LlapIoImpl.LOG.info(
                "Including stripes until "
                    + stripeIxTo
                    + " ("
                    + stripeStart
                    + " >= "
                    + maxOffset
                    + "); "
                    + (stripeIxTo - stripeIxFrom)
                    + " stripes");
          }
          break;
        }
      }
      ++stripeIx;
    }

    if (stripeIxFrom == -1) {
      if (LlapIoImpl.LOG.isInfoEnabled()) {
        LlapIoImpl.LOG.info("Not including any stripes - empty split");
      }
    } else if (stripeIxTo == -1) {
      // Ran off the end of the stripe list without finding a bound: read through the last one.
      stripeIxTo = stripeIx;
      if (DebugUtils.isTraceOrcEnabled()) {
        LlapIoImpl.LOG.info(
            "Including stripes until "
                + stripeIx
                + " (end of file); "
                + (stripeIxTo - stripeIxFrom)
                + " stripes");
      }
    }
    readState = new boolean[stripeIxTo - stripeIxFrom][][];
  }
Example #7
0
  /**
   * Runs the whole read pipeline for the current split: loads file metadata, selects the stripes
   * of the split, picks the row groups to read (applying the SARG if there is one), serves what
   * it can from the high-level cache, and reads the remainder through the encoded stripe reader.
   *
   * <p>Results and failures are delivered through the consumer: data via the data consumer,
   * completion via setDone, and any Throwable caught in the steps below via setError. The method
   * also checks processStop() between steps and returns early when it reports true.
   *
   * @return always null
   * @throws IOException declared for callers; calls made outside the try blocks (e.g. obtaining
   *     the FileSystem or the file id) are not wrapped and may propagate
   */
  protected Void performDataRead() throws IOException {
    long startTime = counters.startTimeCounter();
    if (LlapIoImpl.LOGL.isInfoEnabled()) {
      LlapIoImpl.LOG.info("Processing data for " + split.getPath());
    }
    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }
    counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath()));
    orcReader = null;
    // 1. Get file metadata from cache, or create the reader and read it.
    // Don't cache the filesystem object for now; Tez closes it and FS cache will fix all that
    fs = split.getPath().getFileSystem(conf);
    fileId = determineFileId(fs, split);
    counters.setDesc(QueryFragmentCounters.Desc.FILE, fileId);

    try {
      fileMetadata = getOrReadFileMetadata();
      consumer.setFileMetadata(fileMetadata);
      validateFileMetadata();
      // No explicit column list was supplied: build one covering all columns of the file.
      if (columnIds == null) {
        columnIds = createColumnIds(fileMetadata);
      }

      // 2. Determine which stripes to read based on the split.
      determineStripesToRead();
    } catch (Throwable t) {
      // NOTE(review): unlike the later catch blocks, this one does not call cleanupReaders().
      recordReaderTime(startTime);
      consumer.setError(t);
      return null;
    }

    if (readState.length == 0) {
      consumer.setDone();
      recordReaderTime(startTime);
      return null; // No data to read.
    }
    counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length);

    // 3. Apply SARG if needed, and otherwise determine what RGs to read.
    int stride = fileMetadata.getRowIndexStride();
    // Stays null unless a SARG is applied; step 6.2 then fetches stripe metadata on demand.
    ArrayList<OrcStripeMetadata> stripeMetadatas = null;
    boolean[] globalIncludes = null;
    boolean[] sargColumns = null;
    try {
      globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), columnIds, true);
      if (sarg != null && stride != 0) {
        // TODO: move this to a common method
        int[] filterColumns =
            RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sarg.getLeaves(), columnNames, 0);
        // included will not be null, row options will fill the array with trues if null
        sargColumns = new boolean[globalIncludes.length];
        for (int i : filterColumns) {
          // filter columns may have -1 as index which could be partition column in SARG.
          if (i > 0) {
            sargColumns[i] = true;
          }
        }

        // If SARG is present, get relevant stripe metadata from cache or readers.
        stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns);
      }

      // Now, apply SARG if any; w/o sarg, this will just initialize readState.
      boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas);
      if (!hasData) {
        consumer.setDone();
        recordReaderTime(startTime);
        return null; // No data to read.
      }
    } catch (Throwable t) {
      cleanupReaders();
      consumer.setError(t);
      recordReaderTime(startTime);
      return null;
    }

    if (processStop()) {
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }

    // 4. Get data from high-level cache.
    //    If some cols are fully in cache, this will also give us the modified list of columns to
    //    read for every stripe (null means read all of them - the usual path). In any case,
    //    readState will be modified for column x rgs that were fetched from high-level cache.
    List<Integer>[] stripeColsToRead = null;
    if (cache != null) {
      try {
        stripeColsToRead = produceDataFromCache(stride);
      } catch (Throwable t) {
        // produceDataFromCache handles its own cleanup.
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
    }

    // 5. Create encoded data reader.
    // In case if we have high-level cache, we will intercept the data and add it there;
    // otherwise just pass the data directly to the consumer.
    Consumer<OrcEncodedColumnBatch> dataConsumer = (cache == null) ? this.consumer : this;
    try {
      ensureOrcReader();
      // Reader creating updates HDFS counters, don't do it here.
      DataWrapperForOrc dw = new DataWrapperForOrc();
      stripeReader = orcReader.encodedReader(fileId, dw, dw, POOL_FACTORY);
      stripeReader.setDebugTracing(DebugUtils.isTraceOrcEnabled());
    } catch (Throwable t) {
      consumer.setError(t);
      recordReaderTime(startTime);
      cleanupReaders();
      return null;
    }

    // 6. Read data.
    // TODO: I/O threadpool could be here - one thread per stripe; for now, linear.
    // Reusable lookup key for the metadata cache; replaced after it has been put into the cache.
    OrcBatchKey stripeKey = new OrcBatchKey(fileId, -1, 0);
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      if (processStop()) {
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
      // stripeIxMod indexes the per-split arrays; stripeIx is the index within the file.
      int stripeIx = stripeIxFrom + stripeIxMod;
      boolean[][] colRgs = null;
      boolean[] stripeIncludes = null;
      OrcStripeMetadata stripeMetadata = null;
      StripeInformation stripe;
      try {
        List<Integer> cols = stripeColsToRead == null ? null : stripeColsToRead[stripeIxMod];
        if (cols != null && cols.isEmpty()) continue; // No need to read this stripe.
        stripe = fileMetadata.getStripes().get(stripeIx);

        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info(
              "Reading stripe " + stripeIx + ": " + stripe.getOffset() + ", " + stripe.getLength());
        }
        colRgs = readState[stripeIxMod];
        // We assume that NO_RGS value is only set from SARG filter and for all columns;
        // intermediate changes for individual columns will unset values in the array.
        // Skip this case for 0-column read. We could probably special-case it just like we do
        // in EncodedReaderImpl, but for now it's not that important.
        if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue;

        // 6.1. Determine the columns to read (usually the same as requested).
        if (cache == null || cols == null || cols.size() == colRgs.length) {
          cols = columnIds;
          stripeIncludes = globalIncludes;
        } else {
          // We are reading subset of the original columns, remove unnecessary bitmasks/etc.
          // This will never happen w/o high-level cache.
          stripeIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), cols, true);
          colRgs = genStripeColRgs(cols, colRgs);
        }

        // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering.
        boolean isFoundInCache = false;
        if (stripeMetadatas != null) {
          stripeMetadata = stripeMetadatas.get(stripeIxMod);
        } else {
          stripeKey.stripeIx = stripeIx;
          stripeMetadata = metadataCache.getStripeMetadata(stripeKey);
          isFoundInCache = (stripeMetadata != null);
          if (!isFoundInCache) {
            counters.incrCounter(Counter.METADATA_CACHE_MISS);
            ensureMetadataReader();
            long startTimeHdfs = counters.startTimeCounter();
            stripeMetadata =
                new OrcStripeMetadata(
                    stripeKey, metadataReader, stripe, stripeIncludes, sargColumns);
            counters.incrTimeCounter(Counter.HDFS_TIME_US, startTimeHdfs);
            stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata);
            if (DebugUtils.isTraceOrcEnabled()) {
              LlapIoImpl.LOG.info(
                  "Caching stripe "
                      + stripeKey.stripeIx
                      + " metadata with includes: "
                      + DebugUtils.toString(stripeIncludes));
            }
            // The old key object was handed to the cached metadata; use a fresh one for lookups.
            stripeKey = new OrcBatchKey(fileId, -1, 0);
          }
          consumer.setStripeMetadata(stripeMetadata);
        }
        // The metadata we hold may lack indexes for some of the columns included for this
        // stripe; load the missing ones before reading.
        if (!stripeMetadata.hasAllIndexes(stripeIncludes)) {
          if (DebugUtils.isTraceOrcEnabled()) {
            // NOTE(review): stripeKey may have been replaced above, in which case
            // stripeKey.stripeIx is no longer this stripe's index - confirm.
            LlapIoImpl.LOG.info(
                "Updating indexes in stripe "
                    + stripeKey.stripeIx
                    + " metadata for includes: "
                    + DebugUtils.toString(stripeIncludes));
          }
          assert isFoundInCache;
          counters.incrCounter(Counter.METADATA_CACHE_MISS);
          ensureMetadataReader();
          updateLoadedIndexes(stripeMetadata, stripe, stripeIncludes, sargColumns);
        } else if (isFoundInCache) {
          counters.incrCounter(Counter.METADATA_CACHE_HIT);
        }
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
      if (processStop()) {
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }

      // 6.3. Finally, hand off to the stripe reader to produce the data.
      //      This is a sync call that will feed data to the consumer.
      try {
        // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru
        // consumer. It is potentially holding locked buffers, and must perform its own cleanup.
        // Also, currently readEncodedColumns is not stoppable. The consumer will discard the
        // data it receives for one stripe. We could probably interrupt it, if it checked that.
        stripeReader.readEncodedColumns(
            stripeIx,
            stripe,
            stripeMetadata.getRowIndexes(),
            stripeMetadata.getEncodings(),
            stripeMetadata.getStreams(),
            stripeIncludes,
            colRgs,
            dataConsumer);
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
    }

    // Done with all the things.
    recordReaderTime(startTime);
    dataConsumer.setDone();
    if (DebugUtils.isTraceMttEnabled()) {
      LlapIoImpl.LOG.info("done processing " + split);
    }

    // Close the stripe reader, we are done reading.
    cleanupReaders();
    return null;
  }