/**
 * Validates the properties of the chosen cache pool. Throws if the target is already
 * cached in a different pool.
 */
public static void validateCachePool(THdfsCachingOp op, Long directiveId,
    TableName table, HdfsPartition partition) throws ImpalaRuntimeException {
  CacheDirectiveEntry entry = getDirective(directiveId);
  Preconditions.checkNotNull(entry);

  if (!op.getCache_pool_name().equals(entry.getInfo().getPool())) {
    throw new ImpalaRuntimeException(String.format("Cannot cache partition in " +
        "pool '%s' because it is already cached in '%s'. To change the cache " +
        "pool for this partition, first uncache using: ALTER TABLE %s.%s " +
        "%sSET UNCACHED", op.getCache_pool_name(), entry.getInfo().getPool(),
        table.getDb(), table.getTbl(),
        // Insert the partition spec if a partition was given.
        partition != null ? String.format(" PARTITION(%s) ",
            partition.getPartitionName().replaceAll("/", ", ")) : ""));
  }
}
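// Worked example (illustrative values): for a partition whose
// HdfsPartition.getPartitionName() is "year=2024/month=5", the
// replaceAll("/", ", ") above yields "year=2024, month=5", so the suggested
// statement in the error message reads:
//   ALTER TABLE db.tbl PARTITION(year=2024, month=5) SET UNCACHED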
/**
 * Returns true if the given Thrift caching operation would update an already
 * existing cache directive, i.e. it targets the same pool but changes the
 * replication factor.
 */
public static boolean isUpdateOp(THdfsCachingOp op, Map<String, String> params)
    throws ImpalaRuntimeException {
  Long directiveId = Long.parseLong(params.get(CACHE_DIR_ID_PROP_NAME));
  CacheDirectiveEntry entry = getDirective(directiveId);
  Preconditions.checkNotNull(entry);

  // Verify cache pool: a directive in a different pool is never an update.
  if (!op.getCache_pool_name().equals(entry.getInfo().getPool())) {
    return false;
  }

  // Check cache replication factor: the op is an update if it changes the
  // replication, either to an explicitly requested value or back to the default.
  if ((op.isSetReplication() &&
          op.getReplication() != entry.getInfo().getReplication()) ||
      (!op.isSetReplication() && entry.getInfo().getReplication() !=
          JniCatalogConstants.HDFS_DEFAULT_CACHE_REPLICATION_FACTOR)) {
    return true;
  }
  return false;
}
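// Illustrative outcomes of isUpdateOp() (values hypothetical): given an existing
// directive in pool "p1" with replication factor 2,
//   op{pool="p2"}                    -> false (different pool; caller must re-create)
//   op{pool="p1", replication=3}     -> true  (replication factor changes)
//   op{pool="p1", replication unset} -> true  iff 2 != HDFS_DEFAULT_CACHE_REPLICATION_FACTOR
//   op{pool="p1", replication=2}     -> false (nothing to update)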
/**
 * Returns true if the parameter map contains a cache directive ID that still exists
 * on the NameNode. If the cache directive no longer exists, removes the caching
 * properties from the parameter map, logs an error and returns false. As the change
 * is not written back to the Hive MS from this method, the result is only valid
 * until the next metadata fetch. Lastly, updates the cache replication factor in
 * the parameters with the value read from HDFS.
 */
public static boolean validateCacheParams(Map<String, String> params) {
  Long directiveId = getCacheDirectiveId(params);
  if (directiveId == null) return false;

  CacheDirectiveEntry entry = null;
  try {
    entry = getDirective(directiveId);
  } catch (ImpalaRuntimeException e) {
    if (e.getCause() != null && e.getCause() instanceof RemoteException) {
      // This exception signals that the cache directive no longer exists.
      LOG.error("Cache directive does not exist", e);
      params.remove(CACHE_DIR_ID_PROP_NAME);
      params.remove(CACHE_DIR_REPLICATION_PROP_NAME);
    } else {
      // This exception signals that there was a connection problem with HDFS.
      LOG.error("IO Exception, possible connectivity issues with HDFS", e);
    }
    return false;
  }
  Preconditions.checkNotNull(entry);

  // On the upgrade path the property might not exist; if it exists and differs
  // from the replication factor reported by HDFS, issue a warning.
  String replicationFactor = params.get(CACHE_DIR_REPLICATION_PROP_NAME);
  if (replicationFactor != null &&
      Short.parseShort(replicationFactor) != entry.getInfo().getReplication()) {
    LOG.info("Replication factor for entry in HDFS differs from value in Hive MS: " +
        entry.getInfo().getPath().toString() + " " +
        entry.getInfo().getReplication().toString() + " != " +
        params.get(CACHE_DIR_REPLICATION_PROP_NAME));
  }
  params.put(CACHE_DIR_REPLICATION_PROP_NAME,
      String.valueOf(entry.getInfo().getReplication()));
  return true;
}
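// A minimal, self-contained usage sketch (e.g. from a unit test). The directive ID
// "123" and the java.util.HashMap import are assumptions for illustration; in
// practice the map comes from the Hive MS table or partition parameters.
private static void validateCacheParamsSketch() {
  Map<String, String> params = new HashMap<>();
  params.put(CACHE_DIR_ID_PROP_NAME, "123"); // hypothetical directive ID
  params.put(CACHE_DIR_REPLICATION_PROP_NAME, "1"); // possibly stale MS-side value
  if (validateCacheParams(params)) {
    // The directive exists; the map now holds the replication factor read from HDFS.
  } else {
    // The directive is gone or HDFS was unreachable; the caching properties may
    // have been removed from the map, so treat the object as uncached.
  }
}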
/**
 * Waits for a cache directive to either complete or stop making progress. Progress
 * is checked by polling the HDFS caching stats every
 * DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS and verifying that the request's
 * "currentBytesCached" is increasing towards "bytesNeeded". Returns once
 * "currentBytesCached" == "bytesNeeded", or after
 * MAX_UNCHANGED_CACHING_REFRESH_INTERVALS consecutive polls without progress.
 */
public static void waitForDirective(long directiveId)
    throws ImpalaRuntimeException {
  long bytesNeeded = 0L;
  long currentBytesCached = 0L;
  CacheDirectiveEntry cacheDir = getDirective(directiveId);
  if (cacheDir == null) return;

  bytesNeeded = cacheDir.getStats().getBytesNeeded();
  currentBytesCached = cacheDir.getStats().getBytesCached();
  LOG.debug(String.format("Waiting on cache directive id: %d. Bytes " +
      "cached (%d) / needed (%d)", directiveId, currentBytesCached, bytesNeeded));
  // All the bytes are cached, just return.
  if (bytesNeeded == currentBytesCached) return;

  // The refresh interval is how often HDFS will update cache directive stats. We use
  // this value to determine how frequently we should poll for changes.
  long hdfsRefreshIntervalMs = FileSystemUtil.getConfiguration().getLong(
      DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS,
      DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS_DEFAULT);
  Preconditions.checkState(hdfsRefreshIntervalMs > 0);

  // Loop until either MAX_UNCHANGED_CACHING_REFRESH_INTERVALS have passed with no
  // changes or all required data is cached.
  int unchangedCounter = 0;
  while (unchangedCounter < MAX_UNCHANGED_CACHING_REFRESH_INTERVALS) {
    long previousBytesCached = currentBytesCached;
    cacheDir = getDirective(directiveId);
    if (cacheDir == null) return;
    currentBytesCached = cacheDir.getStats().getBytesCached();
    bytesNeeded = cacheDir.getStats().getBytesNeeded();
    if (currentBytesCached == bytesNeeded) {
      LOG.debug(String.format("Cache directive id: %d has completed. " +
          "Bytes cached (%d) / needed (%d)", directiveId, currentBytesCached,
          bytesNeeded));
      return;
    }

    if (currentBytesCached == previousBytesCached) {
      ++unchangedCounter;
    } else {
      unchangedCounter = 0;
    }
    try {
      // Sleep for the refresh interval + a little bit more to ensure a full interval
      // has completed. A value of 25% of the refresh interval was arbitrarily chosen.
      Thread.sleep((long) (hdfsRefreshIntervalMs * 1.25));
    } catch (InterruptedException e) { /* ignore */ }
  }
  LOG.warn(String.format("No changes in cached bytes in: %d(ms). All data may not " +
      "be cached. Final stats for cache directive id: %d. Bytes cached (%d)/needed " +
      "(%d)", hdfsRefreshIntervalMs * MAX_UNCHANGED_CACHING_REFRESH_INTERVALS,
      directiveId, currentBytesCached, bytesNeeded));
}
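// Worked timing example, assuming Hadoop's default refresh interval of 30,000 ms
// (DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS_DEFAULT) and, hypothetically,
// MAX_UNCHANGED_CACHING_REFRESH_INTERVALS == 5: each iteration sleeps
// 30,000 * 1.25 = 37,500 ms, so the loop gives up after about 5 * 37.5 s, roughly
// 3 minutes without progress, while the warning reports 5 * 30,000 = 150,000 ms.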
/**
 * Given a cache directive ID, returns the replication factor for the directive.
 * Returns null if no outstanding cache directives match this ID.
 */
public static Short getCacheReplication(long directiveId)
    throws ImpalaRuntimeException {
  CacheDirectiveEntry entry = getDirective(directiveId);
  return entry != null ? entry.getInfo().getReplication() : null;
}
/**
 * Given a cache directive ID, returns the pool the directive is cached in.
 * Returns null if no outstanding cache directives match this ID.
 */
public static String getCachePool(long directiveId)
    throws ImpalaRuntimeException {
  CacheDirectiveEntry entry = getDirective(directiveId);
  return entry == null ? null : entry.getInfo().getPool();
}