public static Partition fromMetastoreApiPartition(
        org.apache.hadoop.hive.metastore.api.Partition partition)
{
    StorageDescriptor storageDescriptor = partition.getSd();
    if (storageDescriptor == null) {
        throw new PrestoException(
                HIVE_INVALID_METADATA,
                "Partition does not contain a storage descriptor: " + partition);
    }

    Partition.Builder partitionBuilder = Partition.builder()
            .setDatabaseName(partition.getDbName())
            .setTableName(partition.getTableName())
            .setValues(partition.getValues())
            .setColumns(storageDescriptor.getCols().stream()
                    .map(MetastoreUtil::fromMetastoreApiFieldSchema)
                    .collect(toList()))
            .setParameters(partition.getParameters());

    fromMetastoreApiStorageDescriptor(
            storageDescriptor,
            partitionBuilder.getStorageBuilder(),
            format("%s.%s", partition.getTableName(), partition.getValues()));

    return partitionBuilder.build();
}
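// Hypothetical usage sketch (not part of the original source): converting a
// partition fetched through a metastore client. The `client` handle and the
// database/table/partition-value names are illustrative assumptions.
org.apache.hadoop.hive.metastore.api.Partition apiPartition =
        client.getPartition("web", "page_views", ImmutableList.of("2016-08-09"));
Partition partition = MetastoreUtil.fromMetastoreApiPartition(apiPartition);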
public static Table fromMetastoreApiTable(org.apache.hadoop.hive.metastore.api.Table table)
{
    StorageDescriptor storageDescriptor = table.getSd();
    if (storageDescriptor == null) {
        throw new PrestoException(HIVE_INVALID_METADATA, "Table is missing storage descriptor");
    }

    Table.Builder tableBuilder = Table.builder()
            .setDatabaseName(table.getDbName())
            .setTableName(table.getTableName())
            .setOwner(nullToEmpty(table.getOwner()))
            .setTableType(table.getTableType())
            .setDataColumns(storageDescriptor.getCols().stream()
                    .map(MetastoreUtil::fromMetastoreApiFieldSchema)
                    .collect(toList()))
            .setPartitionColumns(table.getPartitionKeys().stream()
                    .map(MetastoreUtil::fromMetastoreApiFieldSchema)
                    .collect(toList()))
            .setParameters(table.getParameters() == null ? ImmutableMap.of() : table.getParameters())
            .setViewOriginalText(Optional.ofNullable(emptyToNull(table.getViewOriginalText())))
            .setViewExpandedText(Optional.ofNullable(emptyToNull(table.getViewExpandedText())));

    fromMetastoreApiStorageDescriptor(
            storageDescriptor,
            tableBuilder.getStorageBuilder(),
            table.getTableName());

    return tableBuilder.build();
}
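// Hypothetical usage sketch (illustrative names): converting a Thrift table.
// Note the normalization this converter performs: a null parameters map
// becomes an empty ImmutableMap, and empty view texts become Optional.empty(),
// so downstream callers never see nulls for these fields.
org.apache.hadoop.hive.metastore.api.Table apiTable = client.getTable("web", "page_views");
Table table = MetastoreUtil.fromMetastoreApiTable(apiTable);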
/**
 * Iterates over the indexes defined on the table and populates the
 * indexToKeys map for every index that satisfies the rewrite criteria.
 *
 * @param indexTables indexes defined on the table being queried
 * @return map from each usable index to the set of its key column names
 * @throws SemanticException if an index table cannot be located
 */
Map<Index, Set<String>> getIndexToKeysMap(List<Index> indexTables) throws SemanticException {
  Hive hiveInstance = hiveDb;
  Map<Index, Set<String>> indexToKeysMap = new LinkedHashMap<Index, Set<String>>();
  for (int idxCtr = 0; idxCtr < indexTables.size(); idxCtr++) {
    final Set<String> indexKeyNames = new LinkedHashSet<String>();
    Index index = indexTables.get(idxCtr);

    // Get the index key columns.
    StorageDescriptor sd = index.getSd();
    List<FieldSchema> idxColList = sd.getCols();
    for (FieldSchema fieldSchema : idxColList) {
      indexKeyNames.add(fieldSchema.getName());
    }
    assert indexKeyNames.size() == 1;

    // Check that the index schema is as expected. This code block should
    // catch problems of this rewrite breaking when the AggregateIndexHandler
    // index is changed.
    List<String> idxTblColNames = new ArrayList<String>();
    try {
      Table idxTbl = hiveInstance.getTable(index.getDbName(), index.getIndexTableName());
      for (FieldSchema idxTblCol : idxTbl.getCols()) {
        idxTblColNames.add(idxTblCol.getName());
      }
    } catch (HiveException e) {
      LOG.error("Got exception while locating index table, "
          + "skipping " + getName() + " optimization");
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }
    assert (idxTblColNames.contains(IDX_BUCKET_COL));
    assert (idxTblColNames.contains(IDX_OFFSETS_ARRAY_COL));

    // We add all index tables that can be used for the rewrite and defer the
    // decision of using a particular index until later; this allows choosing
    // an index if a better selection mechanism is designed later.
    indexToKeysMap.put(index, indexKeyNames);
  }
  return indexToKeysMap;
}
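// Hypothetical caller sketch (not from the original source): gather the
// indexes defined on the queried table, then compute the usable
// index-to-keys map. `baseTable` is an illustrative name, and
// Hive.getIndexes is assumed to be available with this signature on the
// Hive version this optimizer targets.
List<Index> tableIndexes =
    hiveDb.getIndexes(baseTable.getDbName(), baseTable.getTableName(), (short) -1);
Map<Index, Set<String>> indexToKeysMap = getIndexToKeysMap(tableIndexes);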
/**
 * Run a compactor job.
 *
 * @param conf Hive configuration file
 * @param jobName name to run this job with
 * @param t metastore table
 * @param sd metastore storage descriptor
 * @param txns list of valid transactions
 * @param isMajor is this a major compaction?
 * @throws java.io.IOException if the job fails
 */
void run(HiveConf conf, String jobName, Table t, StorageDescriptor sd, ValidTxnList txns,
         boolean isMajor, Worker.StatsUpdater su) throws IOException {
  JobConf job = new JobConf(conf);
  job.setJobName(jobName);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setJarByClass(CompactorMR.class);
  LOG.debug("User jar set to " + job.getJar());
  job.setMapperClass(CompactorMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormat(CompactorInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setOutputCommitter(CompactorOutputCommitter.class);

  String queueName = conf.getVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE);
  if (queueName != null && queueName.length() > 0) {
    job.setQueueName(queueName);
  }

  job.set(FINAL_LOCATION, sd.getLocation());
  job.set(TMP_LOCATION, sd.getLocation() + "/" + TMPDIR + "_" + UUID.randomUUID().toString());
  job.set(INPUT_FORMAT_CLASS_NAME, sd.getInputFormat());
  job.set(OUTPUT_FORMAT_CLASS_NAME, sd.getOutputFormat());
  job.setBoolean(IS_MAJOR, isMajor);
  job.setBoolean(IS_COMPRESSED, sd.isCompressed());
  job.set(TABLE_PROPS, new StringableMap(t.getParameters()).toString());
  job.setInt(NUM_BUCKETS, sd.getNumBuckets());
  job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
  setColumnTypes(job, sd.getCols());

  // Figure out and encode what files we need to read. We do this here (rather than in
  // getSplits below) because as part of this we discover our minimum and maximum transactions,
  // and discovering that in getSplits is too late as we then have no way to pass it to our
  // mapper.
  AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false);
  StringableList dirsToSearch = new StringableList();
  Path baseDir = null;
  if (isMajor) {
    // There may not be a base dir if the partition was empty before inserts or if this
    // partition is just now being converted to ACID.
    baseDir = dir.getBaseDirectory();
    if (baseDir == null) {
      List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles();
      if (originalFiles != null && !originalFiles.isEmpty()) {
        // There are original format files.
        for (HdfsFileStatusWithId stat : originalFiles) {
          Path path = stat.getFileStatus().getPath();
          dirsToSearch.add(path);
          LOG.debug("Adding original file " + path + " to dirs to search");
        }
        // Set base to the location so that the input format reads the original files.
        baseDir = new Path(sd.getLocation());
      }
    } else {
      // Add our base to the list of directories to search for files in.
      LOG.debug("Adding base directory " + baseDir + " to dirs to search");
      dirsToSearch.add(baseDir);
    }
  }

  List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();
  if (parsedDeltas == null || parsedDeltas.size() == 0) {
    // Seriously, no deltas? Can't compact that.
    LOG.error("No delta files found to compact in " + sd.getLocation());
    return;
  }

  StringableList deltaDirs = new StringableList();
  long minTxn = Long.MAX_VALUE;
  long maxTxn = Long.MIN_VALUE;
  for (AcidUtils.ParsedDelta delta : parsedDeltas) {
    LOG.debug("Adding delta " + delta.getPath() + " to directories to search");
    dirsToSearch.add(delta.getPath());
    deltaDirs.add(delta.getPath());
    minTxn = Math.min(minTxn, delta.getMinTransaction());
    maxTxn = Math.max(maxTxn, delta.getMaxTransaction());
  }

  if (baseDir != null) {
    job.set(BASE_DIR, baseDir.toString());
  }
  job.set(DELTA_DIRS, deltaDirs.toString());
  job.set(DIRS_TO_SEARCH, dirsToSearch.toString());
  job.setLong(MIN_TXN, minTxn);
  job.setLong(MAX_TXN, maxTxn);
  LOG.debug("Setting minimum transaction to " + minTxn);
  LOG.debug("Setting maximum transaction to " + maxTxn);

  RunningJob rj = JobClient.runJob(job);
  LOG.info("Submitted " + (isMajor ? CompactionType.MAJOR : CompactionType.MINOR)
      + " compaction job '" + jobName + "' with jobID=" + rj.getID() + " to " + job.getQueueName()
      + " queue. (current delta dirs count=" + dir.getCurrentDirectories().size()
      + ", obsolete delta dirs count=" + dir.getObsolete().size() + ")");
  rj.waitForCompletion();
  su.gatherStats();
}