@Override
public InputSplit[] getSplits(JobConf entries, int i) throws IOException {
  Path baseDir = null;
  if (entries.get(BASE_DIR) != null) baseDir = new Path(entries.get(BASE_DIR));
  StringableList tmpDeltaDirs = new StringableList(entries.get(DELTA_DIRS));
  Path[] deltaDirs = tmpDeltaDirs.toArray(new Path[tmpDeltaDirs.size()]);
  StringableList dirsToSearch = new StringableList(entries.get(DIRS_TO_SEARCH));
  Map<Integer, BucketTracker> splitToBucketMap = new HashMap<Integer, BucketTracker>();
  for (Path dir : dirsToSearch) {
    FileSystem fs = dir.getFileSystem(entries);
    // If this is a base or delta directory, then we need to be looking for the bucket files.
    // But if it's a legacy file then we need to add it directly.
    if (dir.getName().startsWith(AcidUtils.BASE_PREFIX) ||
        dir.getName().startsWith(AcidUtils.DELTA_PREFIX)) {
      boolean sawBase = dir.getName().startsWith(AcidUtils.BASE_PREFIX);
      FileStatus[] files = fs.listStatus(dir, AcidUtils.bucketFileFilter);
      for (FileStatus f : files) {
        // For each file, figure out which bucket it is.
        Matcher matcher = AcidUtils.BUCKET_DIGIT_PATTERN.matcher(f.getPath().getName());
        addFileToMap(matcher, f.getPath(), sawBase, splitToBucketMap);
      }
    } else {
      // Legacy file, see if it's a bucket file.
      Matcher matcher = AcidUtils.LEGACY_BUCKET_DIGIT_PATTERN.matcher(dir.getName());
      addFileToMap(matcher, dir, true, splitToBucketMap);
    }
  }

  List<InputSplit> splits = new ArrayList<InputSplit>(splitToBucketMap.size());
  for (Map.Entry<Integer, BucketTracker> e : splitToBucketMap.entrySet()) {
    BucketTracker bt = e.getValue();
    splits.add(new CompactorInputSplit(entries, e.getKey(), bt.buckets,
        bt.sawBase ? baseDir : null, deltaDirs));
  }
  LOG.debug("Returning " + splits.size() + " splits");
  return splits.toArray(new InputSplit[splits.size()]);
}
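// getSplits above relies on a BucketTracker holder and an addFileToMap helper that are not
// shown in this excerpt. The following is a minimal sketch of how they could work, assuming
// one BucketTracker per bucket number that collects every base/delta file for that bucket and
// remembers whether a base (or legacy/original) file was seen. It is an illustration of the
// grouping step, not necessarily the exact implementation.
private static class BucketTracker {
  BucketTracker() {
    sawBase = false;
    buckets = new ArrayList<Path>();
  }
  boolean sawBase;     // true if any base or legacy/original file was added for this bucket
  List<Path> buckets;  // all files belonging to this bucket, across base and delta dirs
}

private void addFileToMap(Matcher matcher, Path file, boolean sawBase,
                          Map<Integer, BucketTracker> splitToBucketMap) {
  if (!matcher.find()) {
    // File name did not contain a bucket number; skip it rather than fail the job.
    LOG.warn("Found a file that does not match the bucket pattern: " + file);
    return;
  }
  int bucketNum = Integer.parseInt(matcher.group());
  BucketTracker bt = splitToBucketMap.get(bucketNum);
  if (bt == null) {
    bt = new BucketTracker();
    splitToBucketMap.put(bucketNum, bt);
  }
  bt.buckets.add(file);
  bt.sawBase |= sawBase;
}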
/**
 * Run a compactor job.
 *
 * @param conf Hive configuration
 * @param jobName name to run this job with
 * @param t metastore table
 * @param sd metastore storage descriptor
 * @param txns list of valid transactions
 * @param isMajor is this a major compaction?
 * @param su stats updater to invoke once the job has completed
 * @throws java.io.IOException if the job fails
 */
void run(HiveConf conf, String jobName, Table t, StorageDescriptor sd, ValidTxnList txns,
         boolean isMajor, Worker.StatsUpdater su) throws IOException {
  JobConf job = new JobConf(conf);
  job.setJobName(jobName);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(NullWritable.class);
  job.setJarByClass(CompactorMR.class);
  LOG.debug("User jar set to " + job.getJar());
  job.setMapperClass(CompactorMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormat(CompactorInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setOutputCommitter(CompactorOutputCommitter.class);

  String queueName = conf.getVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE);
  if (queueName != null && queueName.length() > 0) {
    job.setQueueName(queueName);
  }

  job.set(FINAL_LOCATION, sd.getLocation());
  job.set(TMP_LOCATION, sd.getLocation() + "/" + TMPDIR + "_" + UUID.randomUUID().toString());
  job.set(INPUT_FORMAT_CLASS_NAME, sd.getInputFormat());
  job.set(OUTPUT_FORMAT_CLASS_NAME, sd.getOutputFormat());
  job.setBoolean(IS_MAJOR, isMajor);
  job.setBoolean(IS_COMPRESSED, sd.isCompressed());
  job.set(TABLE_PROPS, new StringableMap(t.getParameters()).toString());
  job.setInt(NUM_BUCKETS, sd.getNumBuckets());
  job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
  setColumnTypes(job, sd.getCols());

  // Figure out and encode what files we need to read.  We do this here (rather than in
  // getSplits below) because as part of this we discover our minimum and maximum transactions,
  // and discovering that in getSplits is too late as we then have no way to pass it to our
  // mapper.
  AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false);
  StringableList dirsToSearch = new StringableList();
  Path baseDir = null;
  if (isMajor) {
    // There may not be a base dir if the partition was empty before inserts or if this
    // partition is just now being converted to ACID.
    baseDir = dir.getBaseDirectory();
    if (baseDir == null) {
      List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles();
      if (originalFiles != null && originalFiles.size() > 0) {
        // There are original format files.
        for (HdfsFileStatusWithId stat : originalFiles) {
          Path path = stat.getFileStatus().getPath();
          dirsToSearch.add(path);
          LOG.debug("Adding original file " + path + " to dirs to search");
        }
        // Set base to the location so that the input format reads the original files.
        baseDir = new Path(sd.getLocation());
      }
    } else {
      // Add our base to the list of directories to search for files in.
      LOG.debug("Adding base directory " + baseDir + " to dirs to search");
      dirsToSearch.add(baseDir);
    }
  }

  List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories();
  if (parsedDeltas == null || parsedDeltas.size() == 0) {
    // Seriously, no deltas?  Can't compact that.
LOG.error("No delta files found to compact in " + sd.getLocation()); return; } StringableList deltaDirs = new StringableList(); long minTxn = Long.MAX_VALUE; long maxTxn = Long.MIN_VALUE; for (AcidUtils.ParsedDelta delta : parsedDeltas) { LOG.debug("Adding delta " + delta.getPath() + " to directories to search"); dirsToSearch.add(delta.getPath()); deltaDirs.add(delta.getPath()); minTxn = Math.min(minTxn, delta.getMinTransaction()); maxTxn = Math.max(maxTxn, delta.getMaxTransaction()); } if (baseDir != null) job.set(BASE_DIR, baseDir.toString()); job.set(DELTA_DIRS, deltaDirs.toString()); job.set(DIRS_TO_SEARCH, dirsToSearch.toString()); job.setLong(MIN_TXN, minTxn); job.setLong(MAX_TXN, maxTxn); LOG.debug("Setting minimum transaction to " + minTxn); LOG.debug("Setting maximume transaction to " + maxTxn); RunningJob rj = JobClient.runJob(job); LOG.info( "Submitted " + (isMajor ? CompactionType.MAJOR : CompactionType.MINOR) + " compaction job '" + jobName + "' with jobID=" + rj.getID() + " to " + job.getQueueName() + " queue. " + "(current delta dirs count=" + dir.getCurrentDirectories().size() + ", obsolete delta dirs count=" + dir.getObsolete()); rj.waitForCompletion(); su.gatherStats(); }