/** * Run a compactor job. * * @param conf Hive configuration file * @param jobName name to run this job with * @param t metastore table * @param sd metastore storage descriptor * @param txns list of valid transactions * @param isMajor is this a major compaction? * @throws java.io.IOException if the job fails */ void run( HiveConf conf, String jobName, Table t, StorageDescriptor sd, ValidTxnList txns, boolean isMajor, Worker.StatsUpdater su) throws IOException { JobConf job = new JobConf(conf); job.setJobName(jobName); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setJarByClass(CompactorMR.class); LOG.debug("User jar set to " + job.getJar()); job.setMapperClass(CompactorMap.class); job.setNumReduceTasks(0); job.setInputFormat(CompactorInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setOutputCommitter(CompactorOutputCommitter.class); String queueName = conf.getVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE); if (queueName != null && queueName.length() > 0) { job.setQueueName(queueName); } job.set(FINAL_LOCATION, sd.getLocation()); job.set(TMP_LOCATION, sd.getLocation() + "/" + TMPDIR + "_" + UUID.randomUUID().toString()); job.set(INPUT_FORMAT_CLASS_NAME, sd.getInputFormat()); job.set(OUTPUT_FORMAT_CLASS_NAME, sd.getOutputFormat()); job.setBoolean(IS_MAJOR, isMajor); job.setBoolean(IS_COMPRESSED, sd.isCompressed()); job.set(TABLE_PROPS, new StringableMap(t.getParameters()).toString()); job.setInt(NUM_BUCKETS, sd.getNumBuckets()); job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString()); setColumnTypes(job, sd.getCols()); // Figure out and encode what files we need to read. We do this here (rather than in // getSplits below) because as part of this we discover our minimum and maximum transactions, // and discovering that in getSplits is too late as we then have no way to pass it to our // mapper. AcidUtils.Directory dir = AcidUtils.getAcidState(new Path(sd.getLocation()), conf, txns, false); StringableList dirsToSearch = new StringableList(); Path baseDir = null; if (isMajor) { // There may not be a base dir if the partition was empty before inserts or if this // partition is just now being converted to ACID. baseDir = dir.getBaseDirectory(); if (baseDir == null) { List<HdfsFileStatusWithId> originalFiles = dir.getOriginalFiles(); if (!(originalFiles == null) && !(originalFiles.size() == 0)) { // There are original format files for (HdfsFileStatusWithId stat : originalFiles) { Path path = stat.getFileStatus().getPath(); dirsToSearch.add(path); LOG.debug("Adding original file " + path + " to dirs to search"); } // Set base to the location so that the input format reads the original files. baseDir = new Path(sd.getLocation()); } } else { // add our base to the list of directories to search for files in. LOG.debug("Adding base directory " + baseDir + " to dirs to search"); dirsToSearch.add(baseDir); } } List<AcidUtils.ParsedDelta> parsedDeltas = dir.getCurrentDirectories(); if (parsedDeltas == null || parsedDeltas.size() == 0) { // Seriously, no deltas? Can't compact that. LOG.error("No delta files found to compact in " + sd.getLocation()); return; } StringableList deltaDirs = new StringableList(); long minTxn = Long.MAX_VALUE; long maxTxn = Long.MIN_VALUE; for (AcidUtils.ParsedDelta delta : parsedDeltas) { LOG.debug("Adding delta " + delta.getPath() + " to directories to search"); dirsToSearch.add(delta.getPath()); deltaDirs.add(delta.getPath()); minTxn = Math.min(minTxn, delta.getMinTransaction()); maxTxn = Math.max(maxTxn, delta.getMaxTransaction()); } if (baseDir != null) job.set(BASE_DIR, baseDir.toString()); job.set(DELTA_DIRS, deltaDirs.toString()); job.set(DIRS_TO_SEARCH, dirsToSearch.toString()); job.setLong(MIN_TXN, minTxn); job.setLong(MAX_TXN, maxTxn); LOG.debug("Setting minimum transaction to " + minTxn); LOG.debug("Setting maximume transaction to " + maxTxn); RunningJob rj = JobClient.runJob(job); LOG.info( "Submitted " + (isMajor ? CompactionType.MAJOR : CompactionType.MINOR) + " compaction job '" + jobName + "' with jobID=" + rj.getID() + " to " + job.getQueueName() + " queue. " + "(current delta dirs count=" + dir.getCurrentDirectories().size() + ", obsolete delta dirs count=" + dir.getObsolete()); rj.waitForCompletion(); su.gatherStats(); }
/** Test if {@link CurrentJHParser} can read events from current JH files. */ @Test public void testCurrentJHParser() throws Exception { final Configuration conf = new Configuration(); final FileSystem lfs = FileSystem.getLocal(conf); final Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(lfs); final Path tempDir = new Path(rootTempDir, "TestCurrentJHParser"); lfs.delete(tempDir, true); String queueName = "testQueue"; // Run a MR job // create a MR cluster conf.setInt("mapred.tasktracker.map.tasks.maximum", 1); conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1); conf.set("mapred.queue.names", queueName); MiniMRCluster mrCluster = new MiniMRCluster(1, "file:///", 1, null, null, new JobConf(conf)); // run a job Path inDir = new Path(tempDir, "input"); Path outDir = new Path(tempDir, "output"); JobHistoryParser parser = null; RewindableInputStream ris = null; ArrayList<String> seenEvents = new ArrayList<String>(10); RunningJob rJob = null; try { JobConf jobConf = mrCluster.createJobConf(); jobConf.setQueueName(queueName); // construct a job with 1 map and 1 reduce task. rJob = UtilsForTests.runJob(jobConf, inDir, outDir, 1, 1); rJob.waitForCompletion(); assertTrue("Job failed", rJob.isSuccessful()); JobID id = rJob.getID(); // get the jobhistory filepath Path inputPath = new Path(JobHistory.getHistoryFilePath(org.apache.hadoop.mapred.JobID.downgrade(id))); // wait for 10 secs for the jobhistory file to move into the done folder for (int i = 0; i < 100; ++i) { if (lfs.exists(inputPath)) { break; } TimeUnit.MILLISECONDS.wait(100); } assertTrue("Missing job history file", lfs.exists(inputPath)); InputDemuxer inputDemuxer = new DefaultInputDemuxer(); inputDemuxer.bindTo(inputPath, conf); Pair<String, InputStream> filePair = inputDemuxer.getNext(); assertNotNull(filePair); ris = new RewindableInputStream(filePair.second()); // Test if the JobHistoryParserFactory can detect the parser correctly parser = JobHistoryParserFactory.getParser(ris); // Get ParsedJob String jobId = TraceBuilder.extractJobID(filePair.first()); JobBuilder builder = new JobBuilder(jobId); HistoryEvent e; while ((e = parser.nextEvent()) != null) { String eventString = e.getEventType().toString(); System.out.println(eventString); seenEvents.add(eventString); if (builder != null) { builder.process(e); } } ParsedJob parsedJob = builder.build(); // validate the obtainXXX api of ParsedJob, ParsedTask and // ParsedTaskAttempt. validateParsedJob(parsedJob, 1, 1, queueName); } finally { // stop the MR cluster mrCluster.shutdown(); if (ris != null) { ris.close(); } if (parser != null) { parser.close(); } // cleanup the filesystem lfs.delete(tempDir, true); } // Check against the gold standard System.out.println("testCurrentJHParser validating using gold std "); String[] goldLines = new String[] { "JOB_SUBMITTED", "JOB_PRIORITY_CHANGED", "JOB_STATUS_CHANGED", "JOB_INITED", "JOB_INFO_CHANGED", "TASK_STARTED", "MAP_ATTEMPT_STARTED", "MAP_ATTEMPT_FINISHED", "MAP_ATTEMPT_FINISHED", "TASK_UPDATED", "TASK_FINISHED", "JOB_STATUS_CHANGED", "TASK_STARTED", "MAP_ATTEMPT_STARTED", "MAP_ATTEMPT_FINISHED", "MAP_ATTEMPT_FINISHED", "TASK_UPDATED", "TASK_FINISHED", "TASK_STARTED", "MAP_ATTEMPT_STARTED", "MAP_ATTEMPT_FINISHED", "REDUCE_ATTEMPT_FINISHED", "TASK_UPDATED", "TASK_FINISHED", "TASK_STARTED", "MAP_ATTEMPT_STARTED", "MAP_ATTEMPT_FINISHED", "MAP_ATTEMPT_FINISHED", "TASK_UPDATED", "TASK_FINISHED", "JOB_STATUS_CHANGED", "JOB_FINISHED" }; // Check the output with gold std assertEquals("Size mismatch", goldLines.length, seenEvents.size()); int index = 0; for (String goldLine : goldLines) { assertEquals("Content mismatch", goldLine, seenEvents.get(index++)); } }