コード例 #1
0
ファイル: HdfsTable.java プロジェクト: nongli/Impala
  /**
   * Loads the metadata of this table, reusing metadata from a previously cached entry where
   * possible to speed up loading.
   *
   * <p>If the cached entry is an {@code HdfsTable} whose lastDdlTime equals this table's
   * lastDdlTime, the Hive metastore metadata is assumed to be unchanged and the metastore
   * partition objects of cached partitions that are not dirty are reused; only the remaining
   * partitions are fetched from the metastore. Otherwise all partitions are fetched. In both cases
   * the file descriptor map of the cached entry (if any) is handed to {@code loadPartitions} so
   * that file block metadata can be reused for files whose mtime has not changed.
   *
   * <p>There are several cases where the cachedEntry might be reused incorrectly:
   *
   * <ol>
   *   <li>An ALTER TABLE ADD PARTITION or dynamic partition insert is executed through Hive. This
   *       does not update the lastDdlTime.
   *   <li>The Hdfs rebalancer is executed. This changes the block locations but won't update the
   *       mtime (file modification time).
   * </ol>
   *
   * If any of these occurs, the user has to execute "invalidate metadata" to invalidate the
   * metadata cache of the table and trigger a fresh load.
   *
   * @param cachedEntry previously loaded version of this table, or null if there is none; entries
   *     that are not an {@code HdfsTable} are ignored
   * @param client metastore client used to read configuration, columns and partitions
   * @param msTbl metastore representation of the table being loaded
   * @throws TableLoadingException if any step of the load fails; every other exception type is
   *     wrapped into a TableLoadingException
   */
  @Override
  public void load(
      Table cachedEntry,
      HiveMetaStoreClient client,
      org.apache.hadoop.hive.metastore.api.Table msTbl)
      throws TableLoadingException {
    // The metastore treats a negative max_parts as "no limit". Passing Short.MAX_VALUE
    // would silently drop every partition beyond the first 32767.
    final short allPartitions = -1;

    numHdfsFiles_ = 0;
    totalHdfsBytes_ = 0;
    LOG.debug("load table: " + db_.getName() + "." + name_);
    // Turn all exceptions into TableLoadingException.
    try {
      // Set nullPartitionKeyValue from the hive conf.
      nullPartitionKeyValue_ =
          client.getConfigValue("hive.exec.default.partition.name", "__HIVE_DEFAULT_PARTITION__");

      // Set the NULL indicator string from the table properties, falling back to the default.
      nullColumnValue_ = msTbl.getParameters().get(serdeConstants.SERIALIZATION_NULL_FORMAT);
      if (nullColumnValue_ == null) {
        nullColumnValue_ = DEFAULT_NULL_COLUMN_VALUE;
      }

      // Populate the columns with both partition keys and regular columns. For Avro tables
      // the regular columns are requested from the metastore client instead of being read
      // from the storage descriptor.
      List<FieldSchema> partKeys = msTbl.getPartitionKeys();
      String inputFormat = msTbl.getSd().getInputFormat();
      boolean isAvro = HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO;
      List<FieldSchema> tblFields =
          isAvro ? client.getFields(db_.getName(), name_) : msTbl.getSd().getCols();
      List<FieldSchema> fieldSchemas =
          new ArrayList<FieldSchema>(partKeys.size() + tblFields.size());
      fieldSchemas.addAll(partKeys);
      fieldSchemas.addAll(tblFields);
      // The number of clustering columns is the number of partition keys.
      numClusteringCols_ = partKeys.size();
      loadColumns(fieldSchemas, client);

      // Collect the list of partitions to use for the table. Partitions may be reused
      // from the existing cached table entry (if one exists), read from the metastore,
      // or a mix of both. Whether or not a partition is reused depends on whether
      // the table or partition has been modified.
      List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions = Lists.newArrayList();
      // Null when there is no cached entry or it is not an HdfsTable (instanceof is false
      // for null, so no separate null check is needed).
      HdfsTable cachedHdfsTable =
          (cachedEntry instanceof HdfsTable) ? (HdfsTable) cachedEntry : null;
      if (cachedHdfsTable == null || cachedEntry.lastDdlTime_ != lastDdlTime_) {
        // Nothing reusable, or the table was modified: fetch every partition.
        msPartitions.addAll(client.listPartitions(db_.getName(), name_, allPartitions));
      } else {
        // The table was already in the metadata cache and it has not been modified.
        // Names of the partitions that still have to be loaded from the metastore. Starts
        // out as the full set of partition names and shrinks as cached partitions are reused.
        Set<String> partitionNamesToLoad = Sets.newHashSet();
        // If these are not the exact same object, look up the set of partition names in
        // the metastore. This is to support the special case of CTAS which creates a
        // "temp" table that doesn't actually exist in the metastore. When the cached entry
        // is this very object, the set stays empty, so no cached partition is reused and
        // nothing is fetched below.
        if (cachedEntry != this) {
          // Listing only the partition names is much faster than listing all the
          // Partition objects.
          partitionNamesToLoad.addAll(
              client.listPartitionNames(db_.getName(), name_, allPartitions));
        }

        int totalPartitions = partitionNamesToLoad.size();
        // Reuse all the partitions from the cached entry that have not been modified.
        for (HdfsPartition cachedPart : cachedHdfsTable.getPartitions()) {
          // Skip the default partition and any partitions that have been modified.
          if (cachedPart.isDirty() || cachedPart.getId() == DEFAULT_PARTITION_ID) {
            continue;
          }
          org.apache.hadoop.hive.metastore.api.Partition cachedMsPart =
              cachedPart.getMetaStorePartition();
          if (cachedMsPart == null) {
            continue;
          }

          // remove() returns true only if the metastore still lists this partition; in that
          // case it is a partition we already know about and there is no need to reload it.
          if (partitionNamesToLoad.remove(cachedPart.getPartitionName())) {
            msPartitions.add(cachedMsPart);
          }
        }
        LOG.info(
            String.format(
                "Incrementally refreshing %d/%d partitions.",
                partitionNamesToLoad.size(), totalPartitions));

        // No need to make the metastore call if no partitions are to be updated.
        if (!partitionNamesToLoad.isEmpty()) {
          // Now reload the remaining partitions.
          msPartitions.addAll(
              client.getPartitionsByNames(
                  db_.getName(), name_, Lists.newArrayList(partitionNamesToLoad)));
        }
      }

      // Hand over the old file descriptors (if any) so loadPartitions can reuse them.
      Map<String, FileDescriptor> oldFileDescMap =
          (cachedHdfsTable != null) ? cachedHdfsTable.fileDescMap_ : null;
      loadPartitions(msPartitions, msTbl, oldFileDescMap);

      // Load table stats.
      numRows_ = getRowCount(msTbl.getParameters());
      LOG.debug("table #rows=" + Long.toString(numRows_));

      // For unpartitioned tables set the numRows in its partitions
      // to the table's numRows.
      if (numClusteringCols_ == 0 && !partitions_.isEmpty()) {
        // Unpartitioned tables have a 'dummy' partition and a default partition.
        // Temp tables used in CTAS statements have one partition.
        Preconditions.checkState(partitions_.size() == 2 || partitions_.size() == 1);
        for (HdfsPartition p : partitions_) {
          p.setNumRows(numRows_);
        }
      }

      // Populate the Avro schema if necessary.
      if (isAvro) {
        // Look for the schema in TBLPROPERTIES and in SERDEPROPERTIES, with the latter
        // taking precedence (the SERDEPROPERTIES are added to the search list first).
        List<Map<String, String>> schemaSearchLocations = Lists.newArrayList();
        schemaSearchLocations.add(getMetaStoreTable().getSd().getSerdeInfo().getParameters());
        schemaSearchLocations.add(getMetaStoreTable().getParameters());
        avroSchema_ = HdfsTable.getAvroSchema(schemaSearchLocations, getFullName(), true);
      }
    } catch (TableLoadingException e) {
      // Already the right type; rethrow untouched so it is not wrapped a second time.
      throw e;
    } catch (Exception e) {
      throw new TableLoadingException("Failed to load metadata for table: " + name_, e);
    }
  }