/** * @param db * @param table * @param filter * @param jobConf * @return A list of locations */ public static List<String> getDataStorageLocation( String db, String table, String filter, JobConf jobConf) { Preconditions.checkNotNull(table, "Table name must not be null"); HiveMetaStoreClient client = null; List<String> locations = new ArrayList<String>(); try { client = getHiveMetaStoreClient(jobConf); Table hiveTable = HCatUtil.getTable(client, db, table); if (hiveTable.isPartitioned()) { List<Partition> parts = null; if (null != StringUtils.stripToNull(filter)) { parts = client.listPartitionsByFilter(db, table, filter, (short) -1); } else { parts = client.listPartitions(db, table, (short) -1); } if (parts.size() > 0) { // Return more than one partitions when filter is // something // like ds >= 1234 for (Partition part : parts) { locations.addAll(getFilesInHivePartition(part, jobConf)); } } else { logError( "Table " + hiveTable.getTableName() + " doesn't have the specified partition:" + filter, null); } } else { locations.add(hiveTable.getTTable().getSd().getLocation()); } } catch (IOException e) { logError("Error occured when getting hiveconf", e); } catch (MetaException e) { logError("Error occured when getting HiveMetaStoreClient", e); } catch (NoSuchObjectException e) { logError("Table doesn't exist in HCatalog: " + table, e); } catch (TException e) { logError("Error occured when getting Table", e); } finally { HCatUtil.closeHiveClientQuietly(client); } return locations; }
/**
 * Loads the table metadata, reusing cached metadata where possible to speed up loading. If
 * lastDdlTime has not changed, the Hive metastore metadata is assumed unchanged and the old
 * Hive partition metadata from cachedEntry is reused. To speed up HDFS metadata loading, if a
 * file's mtime has not changed, the old file block metadata is reused.
 *
 * <p>There are cases where cachedEntry might be reused incorrectly:
 * 1. An ALTER TABLE ADD PARTITION or a dynamic-partition insert executed through Hive — these
 *    do not update lastDdlTime.
 * 2. An HDFS rebalancer run — this changes block locations but not file mtimes.
 * If either occurs, the user must run "invalidate metadata" on the table to force a fresh load.
 *
 * @param cachedEntry previously cached table entry to reuse from, or null for a fresh load
 * @param client metastore client used for all metadata RPCs
 * @param msTbl the metastore Table object describing this table
 * @throws TableLoadingException if any step of metadata loading fails; all other exceptions
 *     are wrapped in TableLoadingException
 */
@Override
public void load(
    Table cachedEntry,
    HiveMetaStoreClient client,
    org.apache.hadoop.hive.metastore.api.Table msTbl)
    throws TableLoadingException {
  // Reset aggregate file stats; loadPartitions() repopulates them.
  numHdfsFiles_ = 0;
  totalHdfsBytes_ = 0;
  LOG.debug("load table: " + db_.getName() + "." + name_);
  // Turn all exceptions into TableLoadingException (see the catch blocks at the end).
  try {
    // Fetch the NULL-partition sentinel from the Hive conf, falling back to Hive's
    // standard default.
    nullPartitionKeyValue_ =
        client.getConfigValue("hive.exec.default.partition.name", "__HIVE_DEFAULT_PARTITION__");
    // NULL indicator string comes from table properties; default if unset.
    nullColumnValue_ = msTbl.getParameters().get(serdeConstants.SERIALIZATION_NULL_FORMAT);
    if (nullColumnValue_ == null) nullColumnValue_ = DEFAULT_NULL_COLUMN_VALUE;

    // Build the full column list: partition keys first, then regular columns.
    List<FieldSchema> partKeys = msTbl.getPartitionKeys();
    List<FieldSchema> tblFields = Lists.newArrayList();
    String inputFormat = msTbl.getSd().getInputFormat();
    if (HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO) {
      // For Avro tables the authoritative schema comes from the metastore's getFields(),
      // not the storage descriptor.
      tblFields.addAll(client.getFields(db_.getName(), name_));
    } else {
      tblFields.addAll(msTbl.getSd().getCols());
    }
    List<FieldSchema> fieldSchemas =
        new ArrayList<FieldSchema>(partKeys.size() + tblFields.size());
    fieldSchemas.addAll(partKeys);
    fieldSchemas.addAll(tblFields);
    // The number of clustering columns is the number of partition keys.
    numClusteringCols_ = partKeys.size();
    loadColumns(fieldSchemas, client);

    // Collect the list of partitions to use for the table. Partitions may be reused
    // from the existing cached table entry (if one exists), read from the metastore,
    // or a mix of both. Whether or not a partition is reused depends on whether
    // the table or partition has been modified.
    List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions = Lists.newArrayList();
    if (cachedEntry == null
        || !(cachedEntry instanceof HdfsTable)
        || cachedEntry.lastDdlTime_ != lastDdlTime_) {
      // No usable cache entry, or the table was modified (DDL time changed): load every
      // partition from the metastore.
      msPartitions.addAll(client.listPartitions(db_.getName(), name_, Short.MAX_VALUE));
    } else {
      // The table was already in the metadata cache and it has not been modified.
      Preconditions.checkArgument(cachedEntry instanceof HdfsTable);
      HdfsTable cachedHdfsTableEntry = (HdfsTable) cachedEntry;
      // Set of partition names that still need to be reloaded from the metastore.
      // NOTE: this starts as ALL partition names; cached, unmodified partitions are
      // removed below, leaving only the genuinely modified/new ones.
      Set<String> modifiedPartitionNames = Sets.newHashSet();
      // If these are not the exact same object, look up the set of partition names in
      // the metastore. This is to support the special case of CTAS which creates a
      // "temp" table that doesn't actually exist in the metastore.
      if (cachedEntry != this) {
        // Since the table has not been modified, we might be able to reuse some of the
        // old partition metadata if the individual partitions have not been modified.
        // First get a list of all the partition names for this table from the
        // metastore; this is much faster than listing all the Partition objects.
        modifiedPartitionNames.addAll(
            client.listPartitionNames(db_.getName(), name_, Short.MAX_VALUE));
      }
      int totalPartitions = modifiedPartitionNames.size();
      // Reuse every cached partition that has not been modified, removing it from the
      // to-reload set as we go.
      for (HdfsPartition cachedPart : cachedHdfsTableEntry.getPartitions()) {
        // Skip the default partition and any partitions that have been modified.
        if (cachedPart.isDirty()
            || cachedPart.getMetaStorePartition() == null
            || cachedPart.getId() == DEFAULT_PARTITION_ID) {
          continue;
        }
        org.apache.hadoop.hive.metastore.api.Partition cachedMsPart =
            cachedPart.getMetaStorePartition();
        Preconditions.checkNotNull(cachedMsPart);
        // This is a partition we already know about and it hasn't been modified.
        // No need to reload its metadata.
        String cachedPartName = cachedPart.getPartitionName();
        if (modifiedPartitionNames.contains(cachedPartName)) {
          msPartitions.add(cachedMsPart);
          modifiedPartitionNames.remove(cachedPartName);
        }
      }
      LOG.info(
          String.format(
              "Incrementally refreshing %d/%d partitions.",
              modifiedPartitionNames.size(), totalPartitions));
      // No need to make the metastore call if no partitions are to be updated.
      if (modifiedPartitionNames.size() > 0) {
        // Now reload the remaining (modified or new) partitions in one batched call.
        msPartitions.addAll(
            client.getPartitionsByNames(
                db_.getName(), name_, Lists.newArrayList(modifiedPartitionNames)));
      }
    }

    // Carry over the old file-descriptor map so unchanged files (same mtime) can reuse
    // their block metadata inside loadPartitions().
    Map<String, FileDescriptor> oldFileDescMap = null;
    if (cachedEntry != null && cachedEntry instanceof HdfsTable) {
      oldFileDescMap = ((HdfsTable) cachedEntry).fileDescMap_;
    }
    loadPartitions(msPartitions, msTbl, oldFileDescMap);

    // Load table-level stats.
    numRows_ = getRowCount(msTbl.getParameters());
    LOG.debug("table #rows=" + Long.toString(numRows_));

    // For unpartitioned tables, propagate the table's numRows to its partitions.
    if (numClusteringCols_ == 0 && !partitions_.isEmpty()) {
      // Unpartitioned tables have a 'dummy' partition and a default partition.
      // Temp tables used in CTAS statements have one partition.
      Preconditions.checkState(partitions_.size() == 2 || partitions_.size() == 1);
      for (HdfsPartition p : partitions_) {
        p.setNumRows(numRows_);
      }
    }

    // Populate the Avro schema if necessary.
    if (HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO) {
      // Look for the schema in TBLPROPERTIES and in SERDEPROPERTIES, with the latter
      // taking precedence.
      List<Map<String, String>> schemaSearchLocations = Lists.newArrayList();
      schemaSearchLocations.add(getMetaStoreTable().getSd().getSerdeInfo().getParameters());
      schemaSearchLocations.add(getMetaStoreTable().getParameters());
      avroSchema_ = HdfsTable.getAvroSchema(schemaSearchLocations, getFullName(), true);
    }
  } catch (TableLoadingException e) {
    // Already the right type; rethrow unchanged.
    throw e;
  } catch (Exception e) {
    // Wrap everything else so callers see a single exception type.
    throw new TableLoadingException("Failed to load metadata for table: " + name_, e);
  }
}