/**
 * Returns statistics on this table as a tabular result set. Used for the
 * SHOW TABLE STATS statement. The schema of the returned TResultSet is set
 * inside this method.
 */
public TResultSet getTableStats() {
  TResultSet result = new TResultSet();
  TResultSetMetadata resultSchema = new TResultSetMetadata();
  result.setSchema(resultSchema);

  for (int i = 0; i < numClusteringCols_; ++i) {
    // Add the partition-key values as strings for simplicity.
    Column partCol = colsByPos_.get(i);
    TColumn colDesc = new TColumn(partCol.getName(), partCol.getType().toThrift());
    resultSchema.addToColumns(colDesc);
  }
  resultSchema.addToColumns(new TColumn("#Rows", ColumnType.BIGINT.toThrift()));
  resultSchema.addToColumns(new TColumn("#Files", ColumnType.BIGINT.toThrift()));
  resultSchema.addToColumns(new TColumn("Size", ColumnType.STRING.toThrift()));
  resultSchema.addToColumns(new TColumn("Format", ColumnType.STRING.toThrift()));

  // Pretty print partitions and their stats.
  ArrayList<HdfsPartition> orderedPartitions = Lists.newArrayList(partitions_);
  Collections.sort(orderedPartitions);

  for (HdfsPartition p : orderedPartitions) {
    // Ignore dummy default partition.
    if (p.getId() == ImpalaInternalServiceConstants.DEFAULT_PARTITION_ID) continue;
    TResultRowBuilder rowBuilder = new TResultRowBuilder();

    // Add the partition-key values (as strings for simplicity).
    for (LiteralExpr expr : p.getPartitionValues()) {
      rowBuilder.add(expr.getStringValue());
    }

    // Add number of rows, files, bytes and the file format.
    rowBuilder.add(p.getNumRows())
        .add(p.getFileDescriptors().size())
        .addBytes(p.getSize())
        .add(p.getInputFormatDescriptor().getFileFormat().toString());
    result.addToRows(rowBuilder.get());
  }

  // For partitioned tables add a summary row at the bottom.
  if (numClusteringCols_ > 0) {
    TResultRowBuilder rowBuilder = new TResultRowBuilder();
    int numEmptyCells = numClusteringCols_ - 1;
    rowBuilder.add("Total");
    for (int i = 0; i < numEmptyCells; ++i) {
      rowBuilder.add("");
    }

    // Total num rows, files, and bytes (leave format empty).
    rowBuilder.add(numRows_).add(numHdfsFiles_).addBytes(totalHdfsBytes_).add("");
    result.addToRows(rowBuilder.get());
  }
  return result;
}
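// Hedged sketch, not part of the original class: getTableStats() above emits one
// row per partition plus, for partitioned tables, a trailing "Total" row whose
// first key cell holds "Total" and whose remaining key cells are blank. The
// stand-alone snippet below reproduces that row shape with plain JDK types so it
// can be compiled and run in isolation; StatsRowDemo, its hard-coded sample
// values, and the String-list rows are hypothetical stand-ins for the Thrift
// TResultSet / TResultRowBuilder machinery used in the real method.
class StatsRowDemo {
  public static void main(String[] args) {
    int numClusteringCols = 2;  // e.g. (year, month) partition keys
    java.util.List<java.util.List<String>> rows = new java.util.ArrayList<>();

    // One row per partition: partition-key values first, then the stats columns
    // (#Rows, #Files, Size, Format), mirroring the schema built above.
    rows.add(java.util.Arrays.asList("2013", "01", "1000", "3", "1.2MB", "TEXT"));
    rows.add(java.util.Arrays.asList("2013", "02", "2000", "5", "2.4MB", "TEXT"));

    // Summary row: "Total" in the first key cell, blanks for the remaining
    // numClusteringCols - 1 key cells, aggregated stats, and an empty Format cell.
    java.util.List<String> total = new java.util.ArrayList<>();
    total.add("Total");
    for (int i = 0; i < numClusteringCols - 1; ++i) total.add("");
    total.addAll(java.util.Arrays.asList("3000", "8", "3.6MB", ""));
    rows.add(total);

    for (java.util.List<String> row : rows) System.out.println(String.join("\t", row));
  }
}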
/**
 * Create HdfsPartition objects corresponding to 'partitions'.
 *
 * <p>If there are no partitions in the Hive metadata, a single partition is added
 * with no partition keys.
 *
 * <p>For files that have not been changed, reuses file descriptors from
 * oldFileDescMap.
 */
private void loadPartitions(
    List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
    org.apache.hadoop.hive.metastore.api.Table msTbl,
    Map<String, FileDescriptor> oldFileDescMap) throws IOException, CatalogException {
  partitions_.clear();
  hdfsBaseDir_ = msTbl.getSd().getLocation();
  List<FileDescriptor> newFileDescs = Lists.newArrayList();

  // INSERT statements need to refer to this if they try to write to new partitions.
  // Scans don't refer to this because by definition all partitions they refer to
  // exist.
  addDefaultPartition(msTbl.getSd());

  if (msTbl.getPartitionKeysSize() == 0) {
    Preconditions.checkArgument(msPartitions == null || msPartitions.isEmpty());
    // This table has no partition key, which means it has no declared partitions.
    // We model partitions slightly differently to Hive - every file must exist in a
    // partition, so add a single partition with no keys which will get all the
    // files in the table's root directory.
    addPartition(msTbl.getSd(), null, new ArrayList<LiteralExpr>(), oldFileDescMap,
        newFileDescs);
    Path location = new Path(hdfsBaseDir_);
    if (DFS.exists(location)) {
      accessLevel_ = getAvailableAccessLevel(location);
    }
  } else {
    // Keep track of distinct partition key values and how many nulls there are.
    Set<String>[] uniquePartitionKeys = new HashSet[numClusteringCols_];
    long[] numNullKeys = new long[numClusteringCols_];
    for (int i = 0; i < numClusteringCols_; ++i) {
      uniquePartitionKeys[i] = new HashSet<String>();
      numNullKeys[i] = 0;
    }

    for (org.apache.hadoop.hive.metastore.api.Partition msPartition : msPartitions) {
      // Load key values.
      List<LiteralExpr> keyValues = Lists.newArrayList();
      int i = 0;
      for (String partitionKey : msPartition.getValues()) {
        uniquePartitionKeys[i].add(partitionKey);
        // Deal with Hive's special NULL partition key.
        if (partitionKey.equals(nullPartitionKeyValue_)) {
          keyValues.add(new NullLiteral());
          ++numNullKeys[i];
        } else {
          ColumnType type = colsByPos_.get(keyValues.size()).getType();
          try {
            Expr expr = LiteralExpr.create(partitionKey, type);
            // Force the literal to be of type declared in the metadata.
            expr = expr.castTo(type);
            keyValues.add((LiteralExpr) expr);
          } catch (AnalysisException ex) {
            LOG.warn("Failed to create literal expression of type: " + type, ex);
            throw new InvalidStorageDescriptorException(ex);
          }
        }
        ++i;
      }
      HdfsPartition partition = addPartition(msPartition.getSd(), msPartition,
          keyValues, oldFileDescMap, newFileDescs);
      // If the partition is null, its HDFS path does not exist, and it was not
      // added to this table's partition list. Skip the partition.
      if (partition == null) continue;

      if (msPartition.getParameters() != null) {
        partition.setNumRows(getRowCount(msPartition.getParameters()));
      }
      if (!TAccessLevelUtil.impliesWriteAccess(partition.getAccessLevel())) {
        // TODO: READ_ONLY isn't exactly correct because it's possible the
        // partition does not have READ permissions either. When we start checking
        // whether we can READ from a table, this should be updated to set the
        // table's access level to the "lowest" effective level across all
        // partitions. That is, if one partition has READ_ONLY and another has
        // WRITE_ONLY the table's access level should be NONE.
        accessLevel_ = TAccessLevel.READ_ONLY;
      }
    }

    // Update column stats for partition key columns.
    for (int i = 0; i < numClusteringCols_; ++i) {
      ColumnStats stats = colsByPos_.get(i).getStats();
      stats.setNumNulls(numNullKeys[i]);
      stats.setNumDistinctValues(uniquePartitionKeys[i].size());
      LOG.debug("#col=" + Integer.toString(i) + " stats=" + stats.toString());
    }
  }

  if (newFileDescs.size() > 0) {
    loadBlockMd(newFileDescs);
  }
  uniqueHostPortsCount_ = countUniqueDataNetworkLocations(partitions_);
}
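// Hedged sketch, not part of the original class: loadPartitions() above derives
// stats for each partition-key column by collecting that column's raw values into
// a HashSet (distinct count) and separately tallying occurrences of the Hive NULL
// sentinel. The stand-alone snippet below shows just that counting technique with
// plain JDK types; PartitionKeyStatsDemo, the sample partitions, and the
// "__HIVE_DEFAULT_PARTITION__" literal are illustrative (the real sentinel comes
// from nullPartitionKeyValue_, which is read from the metastore configuration).
class PartitionKeyStatsDemo {
  public static void main(String[] args) {
    String nullPartitionKeyValue = "__HIVE_DEFAULT_PARTITION__";
    // Each inner array holds one partition's key values for columns (year, month).
    String[][] partitions = {
        {"2013", "01"}, {"2013", "02"}, {"2014", nullPartitionKeyValue}};
    int numClusteringCols = 2;

    // Same bookkeeping as above: one set and one null counter per key column.
    java.util.Set<String>[] uniqueKeys = new java.util.HashSet[numClusteringCols];
    long[] numNulls = new long[numClusteringCols];
    for (int i = 0; i < numClusteringCols; ++i) {
      uniqueKeys[i] = new java.util.HashSet<String>();
    }

    for (String[] keyValues : partitions) {
      for (int i = 0; i < numClusteringCols; ++i) {
        uniqueKeys[i].add(keyValues[i]);
        if (keyValues[i].equals(nullPartitionKeyValue)) ++numNulls[i];
      }
    }

    // Prints distinct-value and null counts per key column, analogous to the
    // setNumDistinctValues()/setNumNulls() calls in the method above.
    for (int i = 0; i < numClusteringCols; ++i) {
      System.out.println("#col=" + i + " distinct=" + uniqueKeys[i].size()
          + " nulls=" + numNulls[i]);
    }
  }
}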