@Override
public void preCreateTable(Table tbl) throws MetaException {
  boolean isExternal = MetaStoreUtils.isExternalTable(tbl);
  if (isExternal) {
    Log.info("Creating External table for Splice...");
  }
  String inputTableName = tbl.getParameters().get(MRConstants.SPLICE_TABLE_NAME);
  if (inputTableName == null)
    throw new MetaException(
        "Wrong param, you are missing " + MRConstants.SPLICE_TABLE_NAME + " ? ");

  // We could support a user-defined column mapping, but currently that does not seem necessary:
  // we map all columns from the Splice table to the Hive table.
  String connStr = tbl.getParameters().get(MRConstants.SPLICE_JDBC_STR);
  if (connStr == null)
    throw new MetaException("Wrong param, did you mean " + MRConstants.SPLICE_JDBC_STR + " ? ");
  if (sqlUtil == null) sqlUtil = SMSQLUtil.getInstance(connStr);
  if (inputTableName != null) {
    inputTableName = inputTableName.trim();
    checkTableExists(inputTableName);
  }
}
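// A minimal usage sketch (hypothetical caller, not part of the handler above): it shows how the
// two table parameters checked by preCreateTable might be supplied on the metastore Table object.
// The parameter keys come from MRConstants in the source; the database/table names and the JDBC
// URL below are made-up placeholders.
org.apache.hadoop.hive.metastore.api.Table tbl = new org.apache.hadoop.hive.metastore.api.Table();
tbl.setDbName("default");
tbl.setTableName("splice_backed_table");
tbl.setParameters(new java.util.HashMap<String, String>());
// Name of the existing Splice table to map, and the JDBC connection string used by SMSQLUtil.
tbl.putToParameters(MRConstants.SPLICE_TABLE_NAME, "SPLICE.MY_TABLE");
tbl.putToParameters(MRConstants.SPLICE_JDBC_STR,
    "jdbc:splice://localhost:1527/splicedb;user=app;password=app");
// With both parameters present, preCreateTable(tbl) should pass its validation checks.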
public static Table fromMetastoreApiTable(org.apache.hadoop.hive.metastore.api.Table table) {
  StorageDescriptor storageDescriptor = table.getSd();
  if (storageDescriptor == null) {
    throw new PrestoException(HIVE_INVALID_METADATA, "Table is missing storage descriptor");
  }

  Table.Builder tableBuilder =
      Table.builder()
          .setDatabaseName(table.getDbName())
          .setTableName(table.getTableName())
          .setOwner(nullToEmpty(table.getOwner()))
          .setTableType(table.getTableType())
          .setDataColumns(
              storageDescriptor.getCols().stream()
                  .map(MetastoreUtil::fromMetastoreApiFieldSchema)
                  .collect(toList()))
          .setPartitionColumns(
              table.getPartitionKeys().stream()
                  .map(MetastoreUtil::fromMetastoreApiFieldSchema)
                  .collect(toList()))
          .setParameters(
              table.getParameters() == null ? ImmutableMap.of() : table.getParameters())
          .setViewOriginalText(Optional.ofNullable(emptyToNull(table.getViewOriginalText())))
          .setViewExpandedText(Optional.ofNullable(emptyToNull(table.getViewExpandedText())));

  fromMetastoreApiStorageDescriptor(
      storageDescriptor, tableBuilder.getStorageBuilder(), table.getTableName());

  return tableBuilder.build();
}
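// Usage sketch (assumed surroundings, not Presto source): fetch the Thrift-level table with Hive's
// HiveMetaStoreClient and convert it via the method above. The HiveConf setup and the
// "web"/"clicks" names are placeholders; getTable(dbName, tableName) is the standard client call,
// and checked metastore exceptions are omitted for brevity.
HiveMetaStoreClient client = new HiveMetaStoreClient(new HiveConf());
org.apache.hadoop.hive.metastore.api.Table apiTable = client.getTable("web", "clicks");
Table prestoTable = MetastoreUtil.fromMetastoreApiTable(apiTable);
System.out.println(
    prestoTable.getTableName() + " has " + prestoTable.getDataColumns().size() + " data columns");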
/**
 * Removes the cache directive associated with the table from HDFS, uncaching all data. Also
 * updates the table's metadata. No-op if the table is not cached.
 */
public static void uncacheTbl(org.apache.hadoop.hive.metastore.api.Table table)
    throws ImpalaRuntimeException {
  Preconditions.checkNotNull(table);
  LOG.debug("Uncaching table: " + table.getDbName() + "." + table.getTableName());
  Long id = getCacheDirectiveId(table.getParameters());
  if (id == null) return;
  HdfsCachingUtil.removeDirective(id);
  table.getParameters().remove(CACHE_DIR_ID_PROP_NAME);
  table.getParameters().remove(CACHE_DIR_REPLICATION_PROP_NAME);
}
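// Sketch of the getCacheDirectiveId(...) helper referenced above (an assumption about its shape,
// not the Impala implementation): it reads the cache-directive id stored in the table parameters
// and returns null when the table is not cached or the value is unparsable.
private static Long getCacheDirectiveId(java.util.Map<String, String> params) {
  if (params == null) return null;
  String idStr = params.get(CACHE_DIR_ID_PROP_NAME);
  if (idStr == null) return null;
  try {
    return Long.parseLong(idStr);
  } catch (NumberFormatException e) {
    return null;
  }
}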
/**
 * Returns the table parameter 'transient_lastDdlTime', or -1 if it's not set.
 * TODO: move this to a metastore helper class.
 */
public static long getLastDdlTime(org.apache.hadoop.hive.metastore.api.Table msTbl) {
  Preconditions.checkNotNull(msTbl);
  Map<String, String> params = msTbl.getParameters();
  String lastDdlTimeStr = params.get("transient_lastDdlTime");
  if (lastDdlTimeStr != null) {
    try {
      return Long.parseLong(lastDdlTimeStr);
    } catch (NumberFormatException e) {
    }
  }
  return -1;
}
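// Usage sketch (hypothetical caller): compare the cached table's lastDdlTime against a freshly
// fetched metastore Table to decide whether cached metadata can be reused, which is the check the
// HdfsTable.load(...) method further below relies on. cachedMsTbl and freshMsTbl are placeholders.
long cachedDdlTime = getLastDdlTime(cachedMsTbl);
long currentDdlTime = getLastDdlTime(freshMsTbl);
boolean metadataUnchanged = cachedDdlTime != -1 && cachedDdlTime == currentDdlTime;
// If metadataUnchanged is true, cached partition metadata may be reused; otherwise reload it
// from the metastore.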
// This method is completely copied from Hive's HBaseStorageHandler.java.
private String getHBaseTableName(org.apache.hadoop.hive.metastore.api.Table tbl) {
  // Give preference to TBLPROPERTIES over SERDEPROPERTIES
  // (really we should only use TBLPROPERTIES, so this is just
  // for backwards compatibility with the original specs).
  String tableName = tbl.getParameters().get(HBaseSerDe.HBASE_TABLE_NAME);
  if (tableName == null) {
    tableName = tbl.getSd().getSerdeInfo().getParameters().get(HBaseSerDe.HBASE_TABLE_NAME);
  }
  if (tableName == null) {
    tableName = tbl.getDbName() + "." + tbl.getTableName();
    if (tableName.startsWith(DEFAULT_PREFIX)) {
      tableName = tableName.substring(DEFAULT_PREFIX.length());
    }
  }
  return tableName;
}
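// A small illustration (test-style, assumed to run inside the same handler class) of the lookup
// order above: an explicit hbase.table.name in TBLPROPERTIES wins over SERDEPROPERTIES, and with
// neither set the name falls back to "<db>.<table>" with the DEFAULT_PREFIX stripped. The
// "events" names are placeholders.
org.apache.hadoop.hive.metastore.api.Table tbl = new org.apache.hadoop.hive.metastore.api.Table();
tbl.setDbName("default");
tbl.setTableName("events");
tbl.setParameters(new java.util.HashMap<String, String>());
StorageDescriptor sd = new StorageDescriptor();
sd.setSerdeInfo(new SerDeInfo());
sd.getSerdeInfo().setParameters(new java.util.HashMap<String, String>());
tbl.setSd(sd);

// Neither property set: falls back to the db-qualified Hive name.
String fromFallback = getHBaseTableName(tbl);
// Property set only in SERDEPROPERTIES: used as-is.
sd.getSerdeInfo().putToParameters(HBaseSerDe.HBASE_TABLE_NAME, "legacy_events");
String fromSerde = getHBaseTableName(tbl);
// Property also set in TBLPROPERTIES: takes precedence.
tbl.putToParameters(HBaseSerDe.HBASE_TABLE_NAME, "events_v2");
String fromTblProps = getHBaseTableName(tbl);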
/**
 * Load the table metadata and reuse metadata to speed up metadata loading. If the lastDdlTime
 * has not been changed, the Hive metastore metadata has not been changed, so the old Hive
 * partition metadata from cachedEntry is reused. To speed up HDFS metadata loading, if a file's
 * mtime has not been changed, the old file block metadata is reused.
 *
 * <p>There are several cases where the cachedEntry might be reused incorrectly:
 * 1. An ALTER TABLE ADD PARTITION or a dynamic partition insert is executed through Hive. This
 *    does not update the lastDdlTime.
 * 2. The HDFS rebalancer is executed. This changes the block locations but won't update the
 *    mtime (file modification time).
 * If either of these occurs, the user has to execute "invalidate metadata" to invalidate the
 * metadata cache of the table and trigger a fresh load.
 */
@Override
public void load(
    Table cachedEntry,
    HiveMetaStoreClient client,
    org.apache.hadoop.hive.metastore.api.Table msTbl)
    throws TableLoadingException {
  numHdfsFiles_ = 0;
  totalHdfsBytes_ = 0;
  LOG.debug("load table: " + db_.getName() + "." + name_);
  // Turn all exceptions into TableLoadingException.
  try {
    // Set nullPartitionKeyValue from the Hive conf.
    nullPartitionKeyValue_ =
        client.getConfigValue("hive.exec.default.partition.name", "__HIVE_DEFAULT_PARTITION__");

    // Set the NULL indicator string from the table properties.
    nullColumnValue_ = msTbl.getParameters().get(serdeConstants.SERIALIZATION_NULL_FORMAT);
    if (nullColumnValue_ == null) nullColumnValue_ = DEFAULT_NULL_COLUMN_VALUE;

    // Populate with both partition keys and regular columns.
    List<FieldSchema> partKeys = msTbl.getPartitionKeys();
    List<FieldSchema> tblFields = Lists.newArrayList();
    String inputFormat = msTbl.getSd().getInputFormat();
    if (HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO) {
      tblFields.addAll(client.getFields(db_.getName(), name_));
    } else {
      tblFields.addAll(msTbl.getSd().getCols());
    }
    List<FieldSchema> fieldSchemas =
        new ArrayList<FieldSchema>(partKeys.size() + tblFields.size());
    fieldSchemas.addAll(partKeys);
    fieldSchemas.addAll(tblFields);
    // The number of clustering columns is the number of partition keys.
    numClusteringCols_ = partKeys.size();
    loadColumns(fieldSchemas, client);

    // Collect the list of partitions to use for the table. Partitions may be reused from the
    // existing cached table entry (if one exists), read from the metastore, or a mix of both.
    // Whether or not a partition is reused depends on whether the table or partition has been
    // modified.
    List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions = Lists.newArrayList();
    if (cachedEntry == null
        || !(cachedEntry instanceof HdfsTable)
        || cachedEntry.lastDdlTime_ != lastDdlTime_) {
      msPartitions.addAll(client.listPartitions(db_.getName(), name_, Short.MAX_VALUE));
    } else {
      // The table was already in the metadata cache and it has not been modified.
      Preconditions.checkArgument(cachedEntry instanceof HdfsTable);
      HdfsTable cachedHdfsTableEntry = (HdfsTable) cachedEntry;
      // Set of partition names that have been modified. Partitions in this Set need to be
      // reloaded from the metastore.
      Set<String> modifiedPartitionNames = Sets.newHashSet();
      // If these are not the exact same object, look up the set of partition names in the
      // metastore. This is to support the special case of CTAS, which creates a "temp" table
      // that doesn't actually exist in the metastore.
      if (cachedEntry != this) {
        // Since the table has not been modified, we might be able to reuse some of the old
        // partition metadata if the individual partitions have not been modified. First get a
        // list of all the partition names for this table from the metastore; this is much
        // faster than listing all the Partition objects.
        modifiedPartitionNames.addAll(
            client.listPartitionNames(db_.getName(), name_, Short.MAX_VALUE));
      }
      int totalPartitions = modifiedPartitionNames.size();
      // Get all the partitions from the cached entry that have not been modified.
      for (HdfsPartition cachedPart : cachedHdfsTableEntry.getPartitions()) {
        // Skip the default partition and any partitions that have been modified.
        if (cachedPart.isDirty()
            || cachedPart.getMetaStorePartition() == null
            || cachedPart.getId() == DEFAULT_PARTITION_ID) {
          continue;
        }
        org.apache.hadoop.hive.metastore.api.Partition cachedMsPart =
            cachedPart.getMetaStorePartition();
        Preconditions.checkNotNull(cachedMsPart);
        // This is a partition we already know about and it hasn't been modified.
        // No need to reload the metadata.
        String cachedPartName = cachedPart.getPartitionName();
        if (modifiedPartitionNames.contains(cachedPartName)) {
          msPartitions.add(cachedMsPart);
          modifiedPartitionNames.remove(cachedPartName);
        }
      }
      LOG.info(
          String.format(
              "Incrementally refreshing %d/%d partitions.",
              modifiedPartitionNames.size(), totalPartitions));
      // No need to make the metastore call if no partitions are to be updated.
      if (modifiedPartitionNames.size() > 0) {
        // Now reload the remaining partitions.
        msPartitions.addAll(
            client.getPartitionsByNames(
                db_.getName(), name_, Lists.newArrayList(modifiedPartitionNames)));
      }
    }

    Map<String, FileDescriptor> oldFileDescMap = null;
    if (cachedEntry != null && cachedEntry instanceof HdfsTable) {
      oldFileDescMap = ((HdfsTable) cachedEntry).fileDescMap_;
    }
    loadPartitions(msPartitions, msTbl, oldFileDescMap);

    // Load table stats.
    numRows_ = getRowCount(msTbl.getParameters());
    LOG.debug("table #rows=" + Long.toString(numRows_));

    // For unpartitioned tables, set the numRows in its partitions to the table's numRows.
    if (numClusteringCols_ == 0 && !partitions_.isEmpty()) {
      // Unpartitioned tables have a 'dummy' partition and a default partition.
      // Temp tables used in CTAS statements have one partition.
      Preconditions.checkState(partitions_.size() == 2 || partitions_.size() == 1);
      for (HdfsPartition p : partitions_) {
        p.setNumRows(numRows_);
      }
    }

    // Populate the Avro schema if necessary.
    if (HdfsFileFormat.fromJavaClassName(inputFormat) == HdfsFileFormat.AVRO) {
      // Look for the schema in TBLPROPERTIES and in SERDEPROPERTIES, with the latter taking
      // precedence.
      List<Map<String, String>> schemaSearchLocations = Lists.newArrayList();
      schemaSearchLocations.add(getMetaStoreTable().getSd().getSerdeInfo().getParameters());
      schemaSearchLocations.add(getMetaStoreTable().getParameters());
      avroSchema_ = HdfsTable.getAvroSchema(schemaSearchLocations, getFullName(), true);
    }
  } catch (TableLoadingException e) {
    throw e;
  } catch (Exception e) {
    throw new TableLoadingException("Failed to load metadata for table: " + name_, e);
  }
}
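// Sketch of the getRowCount(...) helper used above (an assumption about its shape, not the
// Impala implementation): it reads the row-count statistic that Hive stores in the table
// parameters under the "numRows" key and returns -1 when the statistic is missing or unparsable.
private static long getRowCount(Map<String, String> parameters) {
  if (parameters == null) return -1;
  String numRowsStr = parameters.get("numRows");
  if (numRowsStr == null) return -1;
  try {
    return Long.parseLong(numRowsStr);
  } catch (NumberFormatException e) {
    return -1;
  }
}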
/**
 * Run a compactor job.
 *
 * @param conf Hive configuration file
 * @param jobName name to run this job with
 * @param t metastore table
 * @param sd metastore storage descriptor
 * @param txns list of valid transactions
 * @param isMajor is this a major compaction?
 * @throws java.io.IOException if the job fails
 */
void run(
    HiveConf conf,
    String jobName,
    Table t,
    StorageDescriptor sd,
    ValidTxnList txns,
    boolean isMajor,
    Worker.StatsUpdater su)
    throws IOException {
  JobConf job = new JobConf(conf);
  job.setJobName(jobName);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setJarByClass(CompactorMR.class);
  LOG.debug("User jar set to " + job.getJar());
  job.setMapperClass(CompactorMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormat(CompactorInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setOutputCommitter(CompactorOutputCommitter.class);

  String queueName = conf.getVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE);
  if (queueName != null && queueName.length() > 0) {
    job.setQueueName(queueName);
  }

  job.set(FINAL_LOCATION, sd.getLocation());
  job.set(TMP_LOCATION, sd.getLocation() + "/" + TMPDIR + "_" + UUID.randomUUID().toString());
  job.set(INPUT_FORMAT_CLASS_NAME, sd.getInputFormat());
  job.set(OUTPUT_FORMAT_CLASS_NAME, sd.getOutputFormat());
  job.setBoolean(IS_MAJOR, isMajor);
  job.setBoolean(IS_COMPRESSED, sd.isCompressed());
  job.set(TABLE_PROPS, new StringableMap(t.getParameters()).toString());
  job.setInt(NUM_BUCKETS, sd.getNumBuckets());
  job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
  setColumnTypes(job, sd.getCols());

  // Figure out and encode what files we need to read. We do this here (rather than in getSplits
  // below) because as part of this we discover our minimum and maximum transactions, and
  // discovering that in getSplits is too late as we then have no way to pass it to our mapper.
  AcidUtils.Directory dir =
      AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false);
  StringableList dirsToSearch = new StringableList();
  Path baseDir = null;
  if (isMajor) {
    // There may not be a base dir if the partition was empty before inserts or if this
    // partition is just now being converted to ACID.
    baseDir = dir.getBaseDirectory();
    if (baseDir == null) {
      List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles();
      if (!(originalFiles == null) && !(originalFiles.size() == 0)) {
        // There are original-format files.
        for (HdfsFileStatusWithId stat : originalFiles) {
          Path path = stat.getFileStatus().getPath();
          dirsToSearch.add(path);
          LOG.debug("Adding original file " + path + " to dirs to search");
        }
        // Set base to the location so that the input format reads the original files.
        baseDir = new Path(sd.getLocation());
      }
    } else {
      // Add our base to the list of directories to search for files in.
      LOG.debug("Adding base directory " + baseDir + " to dirs to search");
      dirsToSearch.add(baseDir);
    }
  }

  List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();
  if (parsedDeltas == null || parsedDeltas.size() == 0) {
    // Seriously, no deltas? Can't compact that.
    LOG.error("No delta files found to compact in " + sd.getLocation());
    return;
  }

  StringableList deltaDirs = new StringableList();
  long minTxn = Long.MAX_VALUE;
  long maxTxn = Long.MIN_VALUE;
  for (AcidUtils.ParsedDelta delta : parsedDeltas) {
    LOG.debug("Adding delta " + delta.getPath() + " to directories to search");
    dirsToSearch.add(delta.getPath());
    deltaDirs.add(delta.getPath());
    minTxn = Math.min(minTxn, delta.getMinTransaction());
    maxTxn = Math.max(maxTxn, delta.getMaxTransaction());
  }

  if (baseDir != null) job.set(BASE_DIR, baseDir.toString());
  job.set(DELTA_DIRS, deltaDirs.toString());
  job.set(DIRS_TO_SEARCH, dirsToSearch.toString());
  job.setLong(MIN_TXN, minTxn);
  job.setLong(MAX_TXN, maxTxn);
  LOG.debug("Setting minimum transaction to " + minTxn);
  LOG.debug("Setting maximum transaction to " + maxTxn);

  RunningJob rj = JobClient.runJob(job);
  LOG.info(
      "Submitted "
          + (isMajor ? CompactionType.MAJOR : CompactionType.MINOR)
          + " compaction job '"
          + jobName
          + "' with jobID="
          + rj.getID()
          + " to "
          + job.getQueueName()
          + " queue. "
          + "(current delta dirs count="
          + dir.getCurrentDirectories().size()
          + ", obsolete delta dirs count="
          + dir.getObsolete());
  rj.waitForCompletion();
  su.gatherStats();
}
@Override
public final void createTable(final CatalogProtos.TableDescProto tableDescProto)
    throws CatalogException {
  HiveCatalogStoreClientPool.HiveCatalogStoreClient client = null;

  TableDesc tableDesc = new TableDesc(tableDescProto);
  String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName());
  String databaseName = splitted[0];
  String tableName = splitted[1];

  try {
    client = clientPool.getClient();

    org.apache.hadoop.hive.metastore.api.Table table =
        new org.apache.hadoop.hive.metastore.api.Table();
    table.setDbName(databaseName);
    table.setTableName(tableName);
    table.setParameters(
        new HashMap<String, String>(tableDesc.getMeta().getOptions().getAllKeyValus()));
    // TODO: set owner
    // table.setOwner();

    StorageDescriptor sd = new StorageDescriptor();
    sd.setSerdeInfo(new SerDeInfo());
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.getSerdeInfo().setName(table.getTableName());

    // If Tajo sets the location, the thrift client throws an exception as follows:
    //   Caused by: MetaException(message:java.lang.NullPointerException)
    // If you want to modify the table path, you have to do it from the Hive CLI.
    if (tableDesc.isExternal()) {
      table.setTableType(TableType.EXTERNAL_TABLE.name());
      table.putToParameters("EXTERNAL", "TRUE");

      Path tablePath = new Path(tableDesc.getUri());
      FileSystem fs = tablePath.getFileSystem(conf);
      if (fs.isFile(tablePath)) {
        LOG.warn("A table path is a file, but HiveCatalogStore does not allow a file path.");
        sd.setLocation(tablePath.getParent().toString());
      } else {
        sd.setLocation(tablePath.toString());
      }
    }

    // Set column information.
    List<Column> columns = tableDesc.getSchema().getRootColumns();
    ArrayList<FieldSchema> cols = new ArrayList<FieldSchema>(columns.size());
    for (Column eachField : columns) {
      cols.add(
          new FieldSchema(
              eachField.getSimpleName(),
              HiveCatalogUtil.getHiveFieldType(eachField.getDataType()),
              ""));
    }
    sd.setCols(cols);

    // Set partition keys.
    if (tableDesc.hasPartition()
        && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) {
      List<FieldSchema> partitionKeys = new ArrayList<FieldSchema>();
      for (Column eachPartitionKey :
          tableDesc.getPartitionMethod().getExpressionSchema().getRootColumns()) {
        partitionKeys.add(
            new FieldSchema(
                eachPartitionKey.getSimpleName(),
                HiveCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()),
                ""));
      }
      table.setPartitionKeys(partitionKeys);
    }

    if (tableDesc.getMeta().getStoreType().equalsIgnoreCase(BuiltinStorages.RCFILE)) {
      String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE);
      sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName());
      sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName());
      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo()
            .setSerializationLib(
                org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName());
      } else {
        sd.getSerdeInfo()
            .setSerializationLib(
                org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe.class.getName());
      }

      if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) {
        table.putToParameters(
            serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(
                tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL)));
      }
    } else if (tableDesc.getMeta().getStoreType().equals(BuiltinStorages.TEXT)) {
      sd.getSerdeInfo()
          .setSerializationLib(
              org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName());
      sd.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class.getName());
      sd.setOutputFormat(
          org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat.class.getName());

      String fieldDelimiter =
          tableDesc
              .getMeta()
              .getOption(
                  StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER);

      // Users can set a unicode character such as \u0001 or \001 as the field delimiter.
      // In that case the Java console converts the value into "\\u001", and Hive will
      // un-escape it again so the user ends up with the right delimiter. Therefore we have to
      // un-escape the value here.
      sd.getSerdeInfo()
          .putToParameters(
              serdeConstants.SERIALIZATION_FORMAT,
              StringEscapeUtils.unescapeJava(fieldDelimiter));
      sd.getSerdeInfo()
          .putToParameters(
              serdeConstants.FIELD_DELIM, StringEscapeUtils.unescapeJava(fieldDelimiter));
      table.getParameters().remove(StorageConstants.TEXT_DELIMITER);

      if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) {
        table.putToParameters(
            serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(
                tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL)));
        table.getParameters().remove(StorageConstants.TEXT_NULL);
      }
    } else if (tableDesc
        .getMeta()
        .getStoreType()
        .equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
      String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE);
      sd.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getName());
      sd.setOutputFormat(
          org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat.class.getName());

      if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) {
        sd.getSerdeInfo()
            .setSerializationLib(
                org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName());

        String fieldDelimiter =
            tableDesc
                .getMeta()
                .getOption(
                    StorageConstants.SEQUENCEFILE_DELIMITER,
                    StorageConstants.DEFAULT_FIELD_DELIMITER);

        // Users can set a unicode character such as \u0001 or \001 as the field delimiter.
        // In that case the Java console converts the value into "\\u001", and Hive will
        // un-escape it again so the user ends up with the right delimiter. Therefore we have to
        // un-escape the value here.
        sd.getSerdeInfo()
            .putToParameters(
                serdeConstants.SERIALIZATION_FORMAT,
                StringEscapeUtils.unescapeJava(fieldDelimiter));
        sd.getSerdeInfo()
            .putToParameters(
                serdeConstants.FIELD_DELIM, StringEscapeUtils.unescapeJava(fieldDelimiter));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER);
      } else {
        sd.getSerdeInfo()
            .setSerializationLib(
                org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class.getName());
      }

      if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) {
        table.putToParameters(
            serdeConstants.SERIALIZATION_NULL_FORMAT,
            StringEscapeUtils.unescapeJava(
                tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL)));
        table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL);
      }
    } else {
      if (tableDesc.getMeta().getStoreType().equalsIgnoreCase(BuiltinStorages.PARQUET)) {
        sd.setInputFormat(parquet.hive.DeprecatedParquetInputFormat.class.getName());
        sd.setOutputFormat(parquet.hive.DeprecatedParquetOutputFormat.class.getName());
        sd.getSerdeInfo()
            .setSerializationLib(parquet.hive.serde.ParquetHiveSerDe.class.getName());
      } else {
        throw new UnsupportedException(
            tableDesc.getMeta().getStoreType() + " in HiveCatalogStore");
      }
    }

    sd.setSortCols(new ArrayList<Order>());

    table.setSd(sd);
    client.getHiveClient().createTable(table);
  } catch (Throwable t) {
    throw new TajoInternalError(t);
  } finally {
    if (client != null) client.release();
  }
}
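// Usage sketch (hypothetical, not Tajo source): after createTable(...) returns, the table should
// be visible through a plain Hive metastore client. The HiveConf setup and the
// "default"/"orders_text" names are placeholders; getTable(db, table) and the "EXTERNAL"
// parameter are the standard metastore pieces the method above populates.
HiveMetaStoreClient verifyClient = new HiveMetaStoreClient(new HiveConf());
org.apache.hadoop.hive.metastore.api.Table created =
    verifyClient.getTable("default", "orders_text");
System.out.println("input format: " + created.getSd().getInputFormat());
System.out.println("field delimiter: "
    + created.getSd().getSerdeInfo().getParameters().get(serdeConstants.FIELD_DELIM));
System.out.println("external: " + created.getParameters().get("EXTERNAL"));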