Beispiel #1
0
  /**
   * Decides, stripe by stripe, which row groups (RGs) have to be read once the stripe range is
   * known. When a SARG is present and the row index stride is non-zero, the SARG is evaluated
   * against each stripe's metadata; otherwise the per-stripe selection stays {@code null}. The
   * outcome is stored in readState for every stripe and every requested column.
   *
   * @param globalIncludes included-columns mask, forwarded to the SARG helpers
   * @param rowIndexStride row index stride of the file; 0 disables SARG evaluation
   * @param metadata per-stripe metadata, indexed like readState; only read when a SARG applies
   * @return true if at least one stripe was not marked with {@code SargApplier.READ_NO_RGS}
   * @throws IOException propagated from the helpers invoked here
   */
  private boolean determineRgsToRead(
      boolean[] globalIncludes, int rowIndexStride, ArrayList<OrcStripeMetadata> metadata)
      throws IOException {
    final SargApplier applier;
    if (sarg == null || rowIndexStride == 0) {
      applier = null;
    } else {
      List<OrcProto.Type> fileTypes = fileMetadata.getTypes();
      String[] sargColumnNames =
          OrcInputFormat.getSargColumnNames(
              columnNames, fileTypes, globalIncludes, fileMetadata.isOriginalFormat());
      applier =
          new SargApplier(
              sarg, sargColumnNames, rowIndexStride, fileTypes, globalIncludes.length);
    }

    boolean anyStripeHasData = false;
    // readState must already be an (empty) array with one slot per stripe of the split.
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      final int stripeIx = stripeIxMod + stripeIxFrom;
      final StripeInformation stripeInfo = fileMetadata.getStripes().get(stripeIx);
      final int rgCount = getRgCount(stripeInfo, rowIndexStride);

      boolean[] selectedRgs = null;
      if (applier != null) {
        OrcStripeMetadata stripeMeta = metadata.get(stripeIxMod);
        selectedRgs =
            applier.pickRowGroups(
                stripeInfo, stripeMeta.getRowIndexes(), stripeMeta.getBloomFilterIndexes(), true);
      }

      // The two marker values are recognized by reference identity, not by content.
      final boolean isNone = selectedRgs == SargApplier.READ_NO_RGS;
      final boolean isAll = selectedRgs == SargApplier.READ_ALL_RGS;
      anyStripeHasData |= !isNone;

      if (DebugUtils.isTraceOrcEnabled()) {
        final String traceMessage;
        if (isNone) {
          traceMessage = "SARG eliminated all RGs for stripe " + stripeIx;
        } else if (isAll) {
          traceMessage = "Will read all " + rgCount + " RGs for stripe " + stripeIx;
        } else {
          traceMessage =
              "SARG picked RGs for stripe " + stripeIx + ": " + DebugUtils.toString(selectedRgs);
        }
        LlapIoImpl.LOG.info(traceMessage);
      }
      assert isAll || isNone || selectedRgs.length == rgCount;

      // Marker arrays are shared between columns; a real selection is copied per column.
      final boolean shareMarker = isAll || isNone;
      final int columnCount = columnIds.size();
      final boolean[][] perColumn = new boolean[columnCount][];
      readState[stripeIxMod] = perColumn;
      for (int col = 0; col < columnCount; ++col) {
        perColumn[col] = shareMarker ? selectedRgs : Arrays.copyOf(selectedRgs, selectedRgs.length);
      }

      adjustRgMetric(rgCount, selectedRgs, isNone, isAll);
    }
    return anyStripeHasData;
  }
Beispiel #2
0
 /**
  * Reads the metadata for every stripe covered by readState (the stripes selected for the
  * split), using the metadata cache where possible.
  *
  * <p>For each stripe, the cached value is used if it exists and already has all the indexes
  * required by {@code globalInc}. Otherwise the metadata is created and put into the cache,
  * and/or its indexes are updated. Each resulting value is also handed to the consumer.
  *
  * @param globalInc included-columns mask used to check and load the indexes
  * @param sargColumns SARG column mask, passed through to the metadata loading calls
  * @return the stripe metadata, one entry per readState slot, in stripe order
  * @throws IOException propagated from the metadata reader and loading calls
  */
 private ArrayList<OrcStripeMetadata> readStripesMetadata(
     boolean[] globalInc, boolean[] sargColumns) throws IOException {
   ArrayList<OrcStripeMetadata> result = new ArrayList<OrcStripeMetadata>(readState.length);
   OrcBatchKey stripeKey = new OrcBatchKey(fileId, 0, 0);
   for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
     // Keep the index in a local: stripeKey may be replaced by a fresh (index 0) key below,
     // so it must not be used for lookups or log messages after that point.
     int stripeIx = stripeIxMod + stripeIxFrom;
     stripeKey.stripeIx = stripeIx;
     OrcStripeMetadata value = metadataCache.getStripeMetadata(stripeKey);
     if (value == null || !value.hasAllIndexes(globalInc)) {
       counters.incrCounter(Counter.METADATA_CACHE_MISS);
       ensureMetadataReader();
       StripeInformation si = fileMetadata.getStripes().get(stripeIx);
       if (value == null) {
         long startTime = counters.startTimeCounter();
         value = new OrcStripeMetadata(stripeKey, metadataReader, si, globalInc, sargColumns);
         counters.incrTimeCounter(Counter.HDFS_TIME_US, startTime);
         // The cache returns the value to use from here on, which may differ from ours.
         value = metadataCache.putStripeMetadata(value);
         if (DebugUtils.isTraceOrcEnabled()) {
           LlapIoImpl.LOG.info(
               "Caching stripe "
                   + stripeIx
                   + " metadata with includes: "
                   + DebugUtils.toString(globalInc));
         }
         // Create new key object to reuse for gets; we've used the old one to put in cache.
         stripeKey = new OrcBatchKey(fileId, 0, 0);
       }
       // We might have got an old value from cache; recheck it has indexes.
       if (!value.hasAllIndexes(globalInc)) {
         if (DebugUtils.isTraceOrcEnabled()) {
           LlapIoImpl.LOG.info(
               "Updating indexes in stripe "
                   + stripeIx
                   + " metadata for includes: "
                   + DebugUtils.toString(globalInc));
         }
         updateLoadedIndexes(value, si, globalInc, sargColumns);
       }
     } else {
       counters.incrCounter(Counter.METADATA_CACHE_HIT);
     }
     result.add(value);
     consumer.setStripeMetadata(value);
   }
   return result;
 }
Beispiel #3
0
 /**
  * Drops one reference from this object.
  *
  * @return the reference count after the decrement
  * @throws AssertionError if the count becomes negative
  */
 int decRef() {
   final int remaining = refCount.decrementAndGet();
   if (DebugUtils.isTraceLockingEnabled()) {
     LlapIoImpl.LOG.info("Unlocked " + this + "; refcount " + remaining);
   }
   if (remaining >= 0) {
     return remaining;
   }
   // More releases than acquisitions; report it loudly.
   throw new AssertionError("Unexpected refCount " + remaining + ": " + this);
 }
Beispiel #4
0
 /**
  * Lazily creates orcReader for the split; does nothing when the reader already exists.
  *
  * @throws IOException propagated from path resolution or reader creation
  */
 private void ensureOrcReader() throws IOException {
   if (orcReader == null) {
     final Path readerPath = HdfsUtils.getFileIdPath(fs, split.getPath(), fileId);
     if (DebugUtils.isTraceOrcEnabled()) {
       LOG.info("Creating reader for " + readerPath + " (" + split.getPath() + ")");
     }
     // Reader creation is accounted for in the HDFS time counter.
     final long createStart = counters.startTimeCounter();
     final ReaderOptions readerOptions =
         OrcFile.readerOptions(conf).filesystem(fs).fileMetadata(fileMetadata);
     orcReader = EncodedOrcFile.createReader(readerPath, readerOptions);
     counters.incrTimeCounter(Counter.HDFS_TIME_US, createStart);
   }
 }
Beispiel #5
0
 /**
  * Tries to switch the reference count from 0 to EVICTED_REFCOUNT.
  *
  * @return whether the invalidation succeeded; false if the reference count is not 0
  *     (i.e. the object is locked or was already evicted)
  */
 @Override
 public boolean invalidate() {
   do {
     if (refCount.get() != 0) {
       return false;
     }
     // The count was 0 a moment ago; retry if someone changed it before the swap.
   } while (!refCount.compareAndSet(0, EVICTED_REFCOUNT));

   if (DebugUtils.isTraceLockingEnabled()) {
     LlapIoImpl.LOG.info("Invalidated " + this + " due to eviction");
   }
   return true;
 }
Beispiel #6
0
 /**
  * Adds one reference to this object unless it has been evicted.
  *
  * @return the reference count after the increment, or -1 if the count is EVICTED_REFCOUNT
  */
 int incRef() {
   int current;
   int updated;
   do {
     current = refCount.get();
     if (current == EVICTED_REFCOUNT) {
       return -1;
     }
     assert current >= 0 : "oldRefCount is " + current + " " + this;
     updated = current + 1;
     // Retry when another thread changed the count between the read and the swap.
   } while (!refCount.compareAndSet(current, updated));

   if (DebugUtils.isTraceLockingEnabled()) {
     LlapIoImpl.LOG.info("Locked " + this + "; new ref count " + updated);
   }
   return updated;
 }
Beispiel #7
0
 /**
  * Delegates the read to orcDataReader, records the time spent as HDFS time, and traces the
  * resulting disk ranges when ORC tracing and info logging are both enabled.
  *
  * @return the range list returned by the underlying data reader
  * @throws IOException propagated from the underlying data reader
  */
 @Override
 public DiskRangeList readFileData(DiskRangeList range, long baseOffset, boolean doForceDirect)
     throws IOException {
   final long readStart = counters.startTimeCounter();
   final DiskRangeList readRanges = orcDataReader.readFileData(range, baseOffset, doForceDirect);
   counters.recordHdfsTime(readStart);

   final boolean shouldTrace = DebugUtils.isTraceOrcEnabled() && LOG.isInfoEnabled();
   if (shouldTrace) {
     final String rangesText = RecordReaderUtils.stringifyDiskRanges(readRanges);
     LOG.info(
         "Disk ranges after disk read (file " + fileId + ", base offset " + baseOffset + "): "
             + rangesText);
   }
   return readRanges;
 }
Beispiel #8
0
 /**
  * Takes back a column batch from the consumer. Every stream whose reference count reaches
  * zero has its cache buffers released and is offered to CSD_POOL; the batch itself is always
  * offered to ECB_POOL.
  */
 @Override
 public void returnData(OrcEncodedColumnBatch ecb) {
   for (ColumnStreamData[] columnStreams : ecb.getColumnData()) {
     if (columnStreams != null) {
       for (ColumnStreamData stream : columnStreams) {
         // Only the holder of the last reference cleans the stream up.
         final boolean wasLastReference = stream != null && stream.decRef() == 0;
         if (wasLastReference) {
           if (DebugUtils.isTraceLockingEnabled()) {
             for (MemoryBuffer buffer : stream.getCacheBuffers()) {
               LlapIoImpl.LOG.info("Unlocking " + buffer + " at the end of processing");
             }
           }
           lowLevelCache.releaseBuffers(stream.getCacheBuffers());
           CSD_POOL.offer(stream);
         }
       }
     }
   }
   // The batch can be pooled even if some streams are still referenced; reset() will clear
   // the arrays.
   ECB_POOL.offer(ecb);
 }
Beispiel #9
0
  /**
   * Determines which stripes to read for the split. Populates stripeIxFrom and readState.
   *
   * <p>A stripe is included when its start offset lies in [split start, split start + split
   * length). stripeIxFrom is set to the index of the first included stripe, or stays -1 when
   * no stripe qualifies. readState is (re)allocated with one empty slot per included stripe,
   * so its length is 0 for an empty split.
   */
  public void determineStripesToRead() {
    // The unit of caching for ORC is (rg x column) (see OrcBatchKey).
    List<StripeInformation> stripes = fileMetadata.getStripes();
    long offset = split.getStart(), maxOffset = offset + split.getLength();
    stripeIxFrom = -1;
    int stripeIxTo = -1;
    if (LlapIoImpl.LOGL.isDebugEnabled()) {
      // Use a builder: repeated String += in the loop copies the whole message per stripe.
      StringBuilder layout = new StringBuilder("FileSplit {");
      layout.append(split.getStart()).append(", ").append(split.getLength());
      layout.append("}; stripes ");
      for (StripeInformation stripe : stripes) {
        layout.append("{").append(stripe.getOffset());
        layout.append(", ").append(stripe.getLength()).append("}, ");
      }
      LlapIoImpl.LOG.debug(layout.toString());
    }

    // After the loop, stripeIx is either the first stripe at/after the split end (on break)
    // or stripes.size() (when the split runs to the end of the file).
    int stripeIx = 0;
    for (; stripeIx < stripes.size(); ++stripeIx) {
      long stripeStart = stripes.get(stripeIx).getOffset();
      if (offset > stripeStart) {
        // We assume splits will never start in the middle of the stripe.
        continue;
      }
      if (stripeIxFrom == -1) {
        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info(
              "Including stripes from " + stripeIx + " (" + stripeStart + " >= " + offset + ")");
        }
        stripeIxFrom = stripeIx;
      }
      if (stripeStart >= maxOffset) {
        stripeIxTo = stripeIx;
        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info(
              "Including stripes until "
                  + stripeIxTo
                  + " ("
                  + stripeStart
                  + " >= "
                  + maxOffset
                  + "); "
                  + (stripeIxTo - stripeIxFrom)
                  + " stripes");
        }
        break;
      }
    }

    if (stripeIxFrom == -1) {
      // No stripe starts at or after the split offset: nothing to read. stripeIxTo is only
      // ever set after stripeIxFrom, so it is still -1 here as well.
      if (LlapIoImpl.LOG.isInfoEnabled()) {
        LlapIoImpl.LOG.info("Not including any stripes - empty split");
      }
      readState = new boolean[0][][];
      return;
    }
    if (stripeIxTo == -1) {
      // The loop ran off the end of the stripe list without reaching the split end.
      stripeIxTo = stripeIx;
      if (DebugUtils.isTraceOrcEnabled()) {
        LlapIoImpl.LOG.info(
            "Including stripes until "
                + stripeIx
                + " (end of file); "
                + (stripeIxTo - stripeIxFrom)
                + " stripes");
      }
    }
    readState = new boolean[stripeIxTo - stripeIxFrom][][];
  }
Beispiel #10
0
  /**
   * Performs the whole read of one split and feeds the result to the consumer.
   *
   * <p>The steps are: (1) obtain the file metadata, (2) determine the stripes covered by the
   * split, (3) apply the SARG (if any) to determine the row groups to read, (4) serve what is
   * possible from the high-level cache, (5) create the encoded data reader, and (6) read the
   * remaining data stripe by stripe.
   *
   * <p>Failures inside the try blocks below are not thrown; they are reported via
   * {@code consumer.setError}. Normal completion is reported via {@code setDone}. The elapsed
   * time is recorded with {@code recordReaderTime} on every exit path.
   *
   * @return always {@code null}
   * @throws IOException from the calls made outside the try blocks (e.g. resolving the file
   *     system and the file ID)
   */
  protected Void performDataRead() throws IOException {
    long startTime = counters.startTimeCounter();
    if (LlapIoImpl.LOGL.isInfoEnabled()) {
      LlapIoImpl.LOG.info("Processing data for " + split.getPath());
    }
    // processStop() is consulted before each expensive phase; a true result aborts the read.
    // NOTE(review): presumably it reports that the read was stopped/cancelled - confirm.
    if (processStop()) {
      recordReaderTime(startTime);
      return null;
    }
    counters.setDesc(QueryFragmentCounters.Desc.TABLE, getDbAndTableName(split.getPath()));
    orcReader = null;
    // 1. Get file metadata from cache, or create the reader and read it.
    // Don't cache the filesystem object for now; Tez closes it and FS cache will fix all that
    fs = split.getPath().getFileSystem(conf);
    fileId = determineFileId(fs, split);
    counters.setDesc(QueryFragmentCounters.Desc.FILE, fileId);

    try {
      fileMetadata = getOrReadFileMetadata();
      consumer.setFileMetadata(fileMetadata);
      validateFileMetadata();
      if (columnIds == null) {
        columnIds = createColumnIds(fileMetadata);
      }

      // 2. Determine which stripes to read based on the split.
      determineStripesToRead();
    } catch (Throwable t) {
      // NOTE(review): unlike the later error paths, this one does not call cleanupReaders();
      // confirm that no reader can be open at this point.
      recordReaderTime(startTime);
      consumer.setError(t);
      return null;
    }

    if (readState.length == 0) {
      consumer.setDone();
      recordReaderTime(startTime);
      return null; // No data to read.
    }
    counters.setDesc(QueryFragmentCounters.Desc.STRIPES, stripeIxFrom + "," + readState.length);

    // 3. Apply SARG if needed, and otherwise determine what RGs to read.
    int stride = fileMetadata.getRowIndexStride();
    ArrayList<OrcStripeMetadata> stripeMetadatas = null;
    boolean[] globalIncludes = null;
    boolean[] sargColumns = null;
    try {
      globalIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), columnIds, true);
      if (sarg != null && stride != 0) {
        // TODO: move this to a common method
        int[] filterColumns =
            RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sarg.getLeaves(), columnNames, 0);
        // included will not be null, row options will fill the array with trues if null
        sargColumns = new boolean[globalIncludes.length];
        for (int i : filterColumns) {
          // filter columns may have -1 as index which could be partition column in SARG.
          // Note that index 0 is skipped by this check as well.
          if (i > 0) {
            sargColumns[i] = true;
          }
        }

        // If SARG is present, get relevant stripe metadata from cache or readers.
        stripeMetadatas = readStripesMetadata(globalIncludes, sargColumns);
      }

      // Now, apply SARG if any; w/o sarg, this will just initialize readState.
      boolean hasData = determineRgsToRead(globalIncludes, stride, stripeMetadatas);
      if (!hasData) {
        // NOTE(review): readStripesMetadata() may have called ensureMetadataReader() above,
        // but cleanupReaders() is not called on this exit - confirm nothing is left open.
        consumer.setDone();
        recordReaderTime(startTime);
        return null; // No data to read.
      }
    } catch (Throwable t) {
      cleanupReaders();
      consumer.setError(t);
      recordReaderTime(startTime);
      return null;
    }

    if (processStop()) {
      cleanupReaders();
      recordReaderTime(startTime);
      return null;
    }

    // 4. Get data from high-level cache.
    //    If some cols are fully in cache, this will also give us the modified list of columns to
    //    read for every stripe (null means read all of them - the usual path). In any case,
    //    readState will be modified for column x rgs that were fetched from high-level cache.
    List<Integer>[] stripeColsToRead = null;
    if (cache != null) {
      try {
        stripeColsToRead = produceDataFromCache(stride);
      } catch (Throwable t) {
        // produceDataFromCache handles its own cleanup.
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
    }

    // 5. Create encoded data reader.
    // If we have a high-level cache, we will intercept the data and add it there;
    // otherwise just pass the data directly to the consumer.
    Consumer<OrcEncodedColumnBatch> dataConsumer = (cache == null) ? this.consumer : this;
    try {
      ensureOrcReader();
      // Reader creating updates HDFS counters, don't do it here.
      DataWrapperForOrc dw = new DataWrapperForOrc();
      stripeReader = orcReader.encodedReader(fileId, dw, dw, POOL_FACTORY);
      stripeReader.setDebugTracing(DebugUtils.isTraceOrcEnabled());
    } catch (Throwable t) {
      consumer.setError(t);
      recordReaderTime(startTime);
      cleanupReaders();
      return null;
    }

    // 6. Read data.
    // TODO: I/O threadpool could be here - one thread per stripe; for now, linear.
    // The key is reused for cache gets and replaced whenever it has been put into the cache.
    OrcBatchKey stripeKey = new OrcBatchKey(fileId, -1, 0);
    for (int stripeIxMod = 0; stripeIxMod < readState.length; ++stripeIxMod) {
      if (processStop()) {
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
      // stripeIxMod indexes readState (relative to the split); stripeIx indexes the file.
      int stripeIx = stripeIxFrom + stripeIxMod;
      boolean[][] colRgs = null;
      boolean[] stripeIncludes = null;
      OrcStripeMetadata stripeMetadata = null;
      StripeInformation stripe;
      try {
        List<Integer> cols = stripeColsToRead == null ? null : stripeColsToRead[stripeIxMod];
        if (cols != null && cols.isEmpty()) continue; // No need to read this stripe.
        stripe = fileMetadata.getStripes().get(stripeIx);

        if (DebugUtils.isTraceOrcEnabled()) {
          LlapIoImpl.LOG.info(
              "Reading stripe " + stripeIx + ": " + stripe.getOffset() + ", " + stripe.getLength());
        }
        colRgs = readState[stripeIxMod];
        // We assume that NO_RGS value is only set from SARG filter and for all columns;
        // intermediate changes for individual columns will unset values in the array.
        // Skip this case for 0-column read. We could probably special-case it just like we do
        // in EncodedReaderImpl, but for now it's not that important.
        if (colRgs.length > 0 && colRgs[0] == SargApplier.READ_NO_RGS) continue;

        // 6.1. Determine the columns to read (usually the same as requested).
        if (cache == null || cols == null || cols.size() == colRgs.length) {
          cols = columnIds;
          stripeIncludes = globalIncludes;
        } else {
          // We are reading subset of the original columns, remove unnecessary bitmasks/etc.
          // This will never happen w/o high-level cache.
          stripeIncludes = OrcInputFormat.genIncludedColumns(fileMetadata.getTypes(), cols, true);
          colRgs = genStripeColRgs(cols, colRgs);
        }

        // 6.2. Ensure we have stripe metadata. We might have read it before for RG filtering.
        boolean isFoundInCache = false;
        if (stripeMetadatas != null) {
          stripeMetadata = stripeMetadatas.get(stripeIxMod);
        } else {
          stripeKey.stripeIx = stripeIx;
          stripeMetadata = metadataCache.getStripeMetadata(stripeKey);
          isFoundInCache = (stripeMetadata != null);
          if (!isFoundInCache) {
            counters.incrCounter(Counter.METADATA_CACHE_MISS);
            ensureMetadataReader();
            long startTimeHdfs = counters.startTimeCounter();
            stripeMetadata =
                new OrcStripeMetadata(
                    stripeKey, metadataReader, stripe, stripeIncludes, sargColumns);
            counters.incrTimeCounter(Counter.HDFS_TIME_US, startTimeHdfs);
            // Continue with whatever the cache returns from the put.
            stripeMetadata = metadataCache.putStripeMetadata(stripeMetadata);
            if (DebugUtils.isTraceOrcEnabled()) {
              LlapIoImpl.LOG.info(
                  "Caching stripe "
                      + stripeKey.stripeIx
                      + " metadata with includes: "
                      + DebugUtils.toString(stripeIncludes));
            }
            // The old key now belongs to the cache entry; create a new one for later gets.
            stripeKey = new OrcBatchKey(fileId, -1, 0);
          }
          consumer.setStripeMetadata(stripeMetadata);
        }
        if (!stripeMetadata.hasAllIndexes(stripeIncludes)) {
          // NOTE(review): stripeKey may have just been replaced by a fresh key (index -1)
          // above, or never been set when stripeMetadatas != null, so the index in this
          // message can differ from stripeIx.
          if (DebugUtils.isTraceOrcEnabled()) {
            LlapIoImpl.LOG.info(
                "Updating indexes in stripe "
                    + stripeKey.stripeIx
                    + " metadata for includes: "
                    + DebugUtils.toString(stripeIncludes));
          }
          // NOTE(review): isFoundInCache is false whenever stripeMetadatas != null, so this
          // assertion fails (with -ea) if pre-read metadata lacks indexes - confirm intent.
          assert isFoundInCache;
          counters.incrCounter(Counter.METADATA_CACHE_MISS);
          ensureMetadataReader();
          updateLoadedIndexes(stripeMetadata, stripe, stripeIncludes, sargColumns);
        } else if (isFoundInCache) {
          counters.incrCounter(Counter.METADATA_CACHE_HIT);
        }
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
      if (processStop()) {
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }

      // 6.3. Finally, hand off to the stripe reader to produce the data.
      //      This is a sync call that will feed data to the consumer.
      try {
        // TODO: readEncodedColumns is not supposed to throw; errors should be propagated thru
        // consumer. It is potentially holding locked buffers, and must perform its own cleanup.
        // Also, currently readEncodedColumns is not stoppable. The consumer will discard the
        // data it receives for one stripe. We could probably interrupt it, if it checked that.
        stripeReader.readEncodedColumns(
            stripeIx,
            stripe,
            stripeMetadata.getRowIndexes(),
            stripeMetadata.getEncodings(),
            stripeMetadata.getStreams(),
            stripeIncludes,
            colRgs,
            dataConsumer);
      } catch (Throwable t) {
        consumer.setError(t);
        cleanupReaders();
        recordReaderTime(startTime);
        return null;
      }
    }

    // Done with all the things.
    // Completion is signalled through dataConsumer, i.e. this object when a high-level cache
    // is in use, otherwise the original consumer.
    recordReaderTime(startTime);
    dataConsumer.setDone();
    if (DebugUtils.isTraceMttEnabled()) {
      LlapIoImpl.LOG.info("done processing " + split);
    }

    // Close the stripe reader, we are done reading.
    cleanupReaders();
    return null;
  }