/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table. Additionally, it's
 * optimal to have only one HFile to load into each region, since a read from that region will
 * require reading from each HFile under management (until compaction happens and merges them all
 * back into one HFile).
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the records output
 * from the Mapper based on their rank in a total ordering of the keys. The <code>startKeys</code>
 * argument should contain a list of the first key in each of those partitions.
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions for the
 *     sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys)
    throws IOException {
  job.setPartitionerClass(TotalOrderPartitioner.class);

  LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
  job.setNumReduceTasks(startKeys.size());

  // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
  Path partitionFilePath =
      new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
  LOG.info("Writing partition information to " + partitionFilePath);

  final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
  partitionFilePath = partitionFilePath.makeQualified(fs);
  writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

  // Add it to the distributed cache.
  try {
    final URI cacheUri =
        new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
  } catch (URISyntaxException e) {
    throw new IOException(e);
  }
  DistributedCache.createSymlink(job.getConfiguration());
}
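The writePartitionFile helper called above is not shown here. A minimal sketch, assuming HFileKeyValue is a Writable key type, writes one SequenceFile record per partition start key so the TotalOrderPartitioner can load the boundaries at task setup; the real helper may differ.

// Hedged sketch only, not the project's actual implementation. Assumes HFileKeyValue
// implements Writable so it can be stored as the key of a SequenceFile record.
private static void writePartitionFile(
    Configuration conf, Path partitionFilePath, List<HFileKeyValue> startKeys)
    throws IOException {
  final SequenceFile.Writer writer =
      SequenceFile.createWriter(
          conf,
          SequenceFile.Writer.file(partitionFilePath),
          SequenceFile.Writer.keyClass(HFileKeyValue.class),
          SequenceFile.Writer.valueClass(NullWritable.class));
  try {
    for (HFileKeyValue startKey : startKeys) {
      // One record per partition boundary; the value is irrelevant to the partitioner.
      writer.append(startKey, NullWritable.get());
    }
  } finally {
    writer.close();
  }
}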
// jobSubmitDir is the /tmp staging dir. Copy libs, files, and so on into jobSubmitDir.
private void copyAndConfigureFiles(Job job, Path jobSubmitDir) throws IOException {
  Configuration conf = job.getConfiguration();
  short replication = (short) conf.getInt(Job.SUBMIT_REPLICATION, 10);
  copyAndConfigureFiles(job, jobSubmitDir, replication);

  // Set the working directory
  if (job.getWorkingDirectory() == null) {
    job.setWorkingDirectory(jtFs.getWorkingDirectory());
  }
}
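For context, the replication factor read above comes from the client configuration (Job.SUBMIT_REPLICATION, i.e. mapreduce.client.submit.file.replication, defaulting to 10). A hypothetical caller could raise it so the staging files are served from more datanodes:

// Illustrative usage sketch; the job name is a placeholder.
Configuration conf = new Configuration();
conf.setInt(Job.SUBMIT_REPLICATION, 20); // replicate staging files more widely
Job job = Job.getInstance(conf, "my-job");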
public static void cleanup(Job job) throws IOException {
  final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
  final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
  RuntimeException e = null;
  try {
    JobHelper.deleteWithRetry(fs, jobDir, true);
  } catch (RuntimeException ex) {
    e = ex;
  }
  try {
    JobHelper.deleteWithRetry(
        fs, getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
  } catch (RuntimeException ex) {
    if (e == null) {
      e = ex;
    } else {
      e.addSuppressed(ex);
    }
  }
  if (e != null) {
    throw e;
  }
}
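JobHelper.deleteWithRetry is not shown in this section. A minimal sketch of a retrying delete, assuming a small fixed retry budget and surfacing the last failure as the RuntimeException that cleanup() handles above, might be:

// Hypothetical sketch of a deleteWithRetry-style helper; the real one may back off,
// log, or use a different number of attempts.
static void deleteWithRetry(final FileSystem fs, final Path path, final boolean recursive) {
  final int maxAttempts = 3; // assumed retry budget
  for (int attempt = 1; ; attempt++) {
    try {
      fs.delete(path, recursive);
      return;
    } catch (IOException e) {
      if (attempt >= maxAttempts) {
        throw new RuntimeException("Failed to delete " + path, e);
      }
    }
  }
}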
/** Hadoop {@link Tool} implementation */
@Override
public int run(String[] args) throws Exception {
  Options options = new Options();
  configureOptions(options);
  CommandLineParser parser = new GnuParser();

  try {
    CommandLine commandLine = parser.parse(options, args);

    if (commandLine.hasOption(VERBOSE)) {
      Logger.getGlobal().setLevel(Level.FINEST);
    }
    if (commandLine.hasOption(QUIET)) {
      Logger.getGlobal().setLevel(Level.OFF);
    }

    String transformationLocation = commandLine.getOptionValue(TRANSFORMATION);
    String sourcemmLocation = commandLine.getOptionValue(SOURCE_PACKAGE);
    String targetmmLocation = commandLine.getOptionValue(TARGET_PACKAGE);
    String recordsLocation = commandLine.getOptionValue(RECORDS_FILE);
    String inputLocation = commandLine.getOptionValue(INPUT_MODEL);
    String outputLocation =
        commandLine.getOptionValue(
            OUTPUT_MODEL, new Path(inputLocation).suffix(".out.xmi").toString());

    int recommendedMappers = 1;
    if (commandLine.hasOption(RECOMMENDED_MAPPERS)) {
      recommendedMappers =
          ((Number) commandLine.getParsedOptionValue(RECOMMENDED_MAPPERS)).intValue();
    }

    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, JOB_NAME);

    // Configure classes
    job.setJarByClass(ATLMRMaster.class);
    job.setMapperClass(ATLMRMapper.class);
    job.setReducerClass(ATLMRReducer.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    // Configure MapReduce input/outputs
    Path recordsPath = new Path(recordsLocation);
    FileInputFormat.setInputPaths(job, recordsPath);
    String timestamp = new SimpleDateFormat("yyyyMMddhhmm").format(new Date());
    String outDirName = "atlmr-out-" + timestamp + "-" + UUID.randomUUID();
    FileOutputFormat.setOutputPath(
        job, new Path(job.getWorkingDirectory().suffix(Path.SEPARATOR + outDirName).toUri()));

    // Configure records per map
    FileSystem fileSystem = FileSystem.get(recordsPath.toUri(), conf);
    InputStream inputStream = fileSystem.open(recordsPath);
    long linesPerMap =
        (long) Math.ceil((double) countLines(inputStream) / (double) recommendedMappers);
    job.getConfiguration().setLong(NLineInputFormat.LINES_PER_MAP, linesPerMap);

    // Configure ATL related inputs/outputs
    job.getConfiguration().set(TRANSFORMATION, transformationLocation);
    job.getConfiguration().set(SOURCE_PACKAGE, sourcemmLocation);
    job.getConfiguration().set(TARGET_PACKAGE, targetmmLocation);
    job.getConfiguration().set(INPUT_MODEL, inputLocation);
    job.getConfiguration().set(OUTPUT_MODEL, outputLocation);

    Logger.getGlobal().log(Level.INFO, "Starting Job execution");
    long begin = System.currentTimeMillis();
    int returnValue = job.waitForCompletion(true) ? STATUS_OK : STATUS_ERROR;
    long end = System.currentTimeMillis();
    Logger.getGlobal()
        .log(
            Level.INFO,
            MessageFormat.format(
                "Job execution ended in {0}s with status code {1}",
                (end - begin) / 1000, returnValue));

    return returnValue;
  } catch (ParseException e) {
    System.err.println(e.getLocalizedMessage());
    HelpFormatter formatter = new HelpFormatter();
    formatter.setOptionComparator(new OptionComarator<>());
    try {
      formatter.setWidth(Math.max(Terminal.getTerminal().getTerminalWidth(), 80));
    } catch (Throwable t) {
      // Nothing to do...
    }
    formatter.printHelp("yarn jar <this-file.jar>", options, true);
    return STATUS_ERROR;
  }
}
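countLines(...) is referenced above to derive NLineInputFormat.LINES_PER_MAP but is not shown here. A minimal sketch that counts newline-delimited records in the opened stream could be:

// Hypothetical helper assumed by run() above; the real implementation may differ
// (e.g. in charset handling or stream ownership).
private static int countLines(InputStream inputStream) throws IOException {
  try (BufferedReader reader =
      new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
    int lines = 0;
    while (reader.readLine() != null) {
      lines++;
    }
    return lines;
  }
}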
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);

  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

  setJobName(jobConf, segments);

  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }

  final Job job = Job.getInstance(jobConf);

  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);

  JobHelper.setupClasspath(
      JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(
          getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
      job);

  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error(
              "Error in task [%s] : %s",
              report.getTaskId(),
              Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }
    final JobID jobID = job.getJobID();

    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }
    final List<DataSegment> returnList =
        ImmutableList.copyOf(
            Lists.transform(
                goodPaths,
                new Function<Path, DataSegment>() {
                  @Nullable
                  @Override
                  public DataSegment apply(final Path input) {
                    try {
                      if (!fs.exists(input)) {
                        throw new ISE(
                            "Somehow [%s] was found but [%s] is missing at [%s]",
                            ConvertingOutputFormat.DATA_SUCCESS_KEY,
                            ConvertingOutputFormat.DATA_FILE_KEY,
                            jobDir);
                      }
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                    try (final InputStream stream = fs.open(input)) {
                      return HadoopDruidConverterConfig.jsonMapper.readValue(
                          stream, DataSegment.class);
                    } catch (final IOException e) {
                      throw Throwables.propagate(e);
                    }
                  }
                }));
    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE(
          "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(),
          returnList.size(),
          jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}
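The getJobPath and getJobClassPathDir helpers used by run() and cleanup() are not shown in this section. Plausible sketches, assuming per-job subdirectories under the working directory (the distributed success cache), are:

// Hypothetical sketches only; the real helpers may lay out paths differently.
static Path getJobPath(final JobID jobID, final Path workingDirectory) {
  // One subdirectory per job, named after the JobID.
  return new Path(workingDirectory, jobID.toString());
}

static Path getJobClassPathDir(final String jobName, final Path workingDirectory) {
  // Colons are not allowed in HDFS path segments, so strip them from the job name.
  return new Path(workingDirectory, jobName.replace(":", ""));
}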