@Override public void configure(JobConf conf) { this.threshold = conf.getFloat(PARAM_APS_THRESHOLD, DEFAULT_THRESHOLD); int reducerID = conf.getInt("mapred.task.partition", -1); int max = conf.getInt(PARAM_APS_MAXKEY, 0); int nstripes = conf.getInt(PARAM_APS_STRIPES, 1); int spread = conf.getInt(PARAM_APS_REDUCER_PER_STRIPE, 1); if (reducerID < 0 || max == 0) { LOG.error("Could not find stripe ID, reverting to whole rest file loading"); LOG.debug("reducer = " + reducerID + "\t max = " + max + "\t nstripes = " + nstripes); // open the pruned part file in the DistrubutedCache haspruned = FileUtils.readRestFile(conf, pruned); } else { int stripe = GenericKey.StripePartitioner.findStripe(reducerID, spread); int from = GenericKey.StripePartitioner.minKeyInStripe(stripe, nstripes, max); int to = from + GenericKey.StripePartitioner.numKeysInStripe(stripe, nstripes, max); // read from 'from' included, to 'to' excluded LOG.info( "Reducer " + reducerID + " loading stripe " + stripe + " of " + nstripes + " (" + from + "," + (to - 1) + ")"); haspruned = FileUtils.readRestFile(conf, pruned, from, to); } if (!haspruned) LOG.warn("No pruned file provided in DistributedCache"); else LOG.info("Read " + pruned.size() + " entries from pruned file"); }
/** Save the values out of the configuaration that we need to write the data. */ @Override public void configure(JobConf job) { numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 20); minKeySize = job.getInt("test.randomwrite.min_key", 100); keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize; minValueSize = job.getInt("test.randomwrite.min_value", 0); valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize; }
@VisibleForTesting Fetcher( JobConf job, TaskAttemptID reduceId, ShuffleSchedulerImpl<K, V> scheduler, MergeManager<K, V> merger, Reporter reporter, ShuffleClientMetrics metrics, ExceptionReporter exceptionReporter, SecretKey shuffleKey, int id) { this.jobConf = job; this.reporter = reporter; this.scheduler = scheduler; this.merger = merger; this.metrics = metrics; this.exceptionReporter = exceptionReporter; this.id = id; this.reduce = reduceId.getTaskID().getId(); this.shuffleSecretKey = shuffleKey; ioErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.IO_ERROR.toString()); wrongLengthErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_LENGTH.toString()); badIdErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.BAD_ID.toString()); wrongMapErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_MAP.toString()); connectionErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.CONNECTION.toString()); wrongReduceErrs = reporter.getCounter(SHUFFLE_ERR_GRP_NAME, ShuffleErrors.WRONG_REDUCE.toString()); this.connectionTimeout = job.getInt(MRJobConfig.SHUFFLE_CONNECT_TIMEOUT, DEFAULT_STALLED_COPY_TIMEOUT); this.readTimeout = job.getInt(MRJobConfig.SHUFFLE_READ_TIMEOUT, DEFAULT_READ_TIMEOUT); setName("fetcher#" + id); setDaemon(true); synchronized (Fetcher.class) { sslShuffle = job.getBoolean(MRConfig.SHUFFLE_SSL_ENABLED_KEY, MRConfig.SHUFFLE_SSL_ENABLED_DEFAULT); if (sslShuffle && sslFactory == null) { sslFactory = new SSLFactory(SSLFactory.Mode.CLIENT, job); try { sslFactory.init(); } catch (Exception ex) { sslFactory.destroy(); throw new RuntimeException(ex); } } } }
/** * Generate the requested number of file splits, with the filename set to the filename of the * output file. */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { /** 设置输入分片的个数* */ JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; } System.out.println("numMaps-------" + numMaps); InputSplit[] result = new InputSplit[numMaps]; Path outDir = FileOutputFormat.getOutputPath(job); for (int i = 0; i < result.length; ++i) { result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null); } return result; }
/** * This is the main routine for launching a distributed random write job. It runs 10 maps/node and * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything. * * @throws IOException */ public int run(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: writer <out-dir>"); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path outDir = new Path(args[0]); JobConf job = new JobConf(getConf()); job.setJarByClass(RandomWriter.class); job.setJobName("random-writer"); FileOutputFormat.setOutputPath(job, outDir); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(BytesWritable.class); job.setInputFormat(RandomInputFormat.class); job.setMapperClass(Map.class); job.setReducerClass(IdentityReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); return -2; } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite); } job.setNumMapTasks(numMaps); /** 建议型的 * */ System.out.println("Running " + numMaps + " maps."); // reducer NONE job.setNumReduceTasks(0); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println( "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
/** * passing data and popsize so that they would be available before a call to the mapper * * @param job this mapreduce job */ @Override public void configure(JobConf job) { popsize = job.getInt("popsize", popsize); data = job.get("dataset", data); new Population(data, popsize); }
@Override public void configure(JobConf job) { numberOfNodes = job.getInt("numberOfNodes", 0); if (numberOfNodes == 0) { System.exit(0); } }
public void configure(JobConf job) { this.jobConf = job; urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT); interval = jobConf.getInt("db.fetch.interval.default", 2592000); filters = new URLFilters(jobConf); scfilters = new ScoringFilters(jobConf); scoreInjected = jobConf.getFloat("db.score.injected", 1.0f); curTime = job.getLong("injector.current.time", System.currentTimeMillis()); }
/** * Calculate how many maps to run. Number of maps is bounded by a minimum of the cumulative size * of the copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the command line) and at * most (distcp.max.map.tasks, default MAX_MAPS_PER_NODE * nodes in the cluster). * * @param totalBytes Count of total bytes for job * @param job The job to configure * @return Count of maps to run. */ private static void setMapCount(long totalBytes, JobConf job) throws IOException { int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP)); numMaps = Math.min( numMaps, job.getInt( MAX_MAPS_LABEL, MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers())); job.setNumMapTasks(Math.max(numMaps, 1)); }
public void configure(JobConf job) { _jobConf = job; crawlerCount = job.getInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, CrawlEnvironment.CRAWLERS.length); partitionNumber = job.getInt("mapred.task.partition", -1); try { FileSystem fs = FileSystem.get(job); Path workPath = FileOutputFormat.getOutputPath(job); debugURLStream = fs.create(new Path(workPath, "debugURLS-" + NUMBER_FORMAT.format(partitionNumber))); urlDebugURLWriter = new OutputStreamWriter(debugURLStream, Charset.forName("UTF-8")); _emittedURLSFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw new RuntimeException(e); } }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
public JobBuilder setAffinityNoBalancing(Path affinityPath, Set<String> exclusionSet) throws IOException { // set node affinity ... String affinityMask = NodeAffinityMaskBuilder.buildNodeAffinityMask( FileSystem.get(_jobConf), affinityPath, null, exclusionSet, _jobConf.getInt("mapred.tasktracker.reduce.tasks.maximum", -1), true); NodeAffinityMaskBuilder.setNodeAffinityMask(_jobConf, affinityMask); return this; }
/** * Mapper configuration. Extracts source and destination file system, as well as top-level paths * on source and destination directories. Gets the named file systems, to be used later in map. */ public void configure(JobConf job) { destPath = new Path(job.get(DST_DIR_LABEL, "/")); try { destFileSys = destPath.getFileSystem(job); } catch (IOException ex) { throw new RuntimeException("Unable to get the named file system.", ex); } sizeBuf = job.getInt("copy.buf.size", 128 * 1024); buffer = new byte[sizeBuf]; ignoreReadFailures = job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false); preserve_status = job.getBoolean(Options.PRESERVE_STATUS.propertyname, false); if (preserve_status) { preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL)); } update = job.getBoolean(Options.UPDATE.propertyname, false); overwrite = !update && job.getBoolean(Options.OVERWRITE.propertyname, false); this.job = job; }
@Override public void configure(JobConf job) { super.configure(job); this.totalBytes = 0; this.chunkId = job.getInt( "mapred.task.partition", -1); // http://www.mail-archive.com/[email protected]/msg05749.html if (this.chunkId < 0) { throw new RuntimeException("Incorrect chunk id "); } this.nodeId = this.chunkId / getNumChunks(); this.client = new AdminClient(getCluster(), new AdminClientConfig()); logger.info("Reducer for Node id - " + this.nodeId + " and chunk id - " + this.chunkId); }
public void configure(JobConf job) { // 'key' == sortInput for sort-input; key == sortOutput for sort-output key = deduceInputFile(job); if (key == sortOutput) { partitioner = new HashPartitioner<WritableComparable, Writable>(); // Figure the 'current' partition and no. of reduces of the 'sort' try { URI inputURI = new URI(job.get("map.input.file")); String inputFile = inputURI.getPath(); partition = Integer.valueOf(inputFile.substring(inputFile.lastIndexOf("part") + 5)).intValue(); noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1); } catch (Exception e) { System.err.println("Caught: " + e); System.exit(-1); } } }
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { final int srcCount = job.getInt(OP_COUNT_LABEL, -1); final int targetcount = srcCount / numSplits; String srclist = job.get(OP_LIST_LABEL, ""); if (srcCount < 0 || "".equals(srclist)) { throw new RuntimeException( "Invalid metadata: #files(" + srcCount + ") listuri(" + srclist + ")"); } Path srcs = new Path(srclist); FileSystem fs = srcs.getFileSystem(job); List<FileSplit> splits = new ArrayList<FileSplit>(numSplits); Text key = new Text(); PolicyInfo value = new PolicyInfo(); SequenceFile.Reader in = null; long prev = 0L; int count = 0; // count src try { for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value); ) { long curr = in.getPosition(); long delta = curr - prev; if (++count > targetcount) { count = 0; splits.add(new FileSplit(srcs, prev, delta, (String[]) null)); prev = curr; } } } finally { in.close(); } long remaining = fs.getFileStatus(srcs).getLen() - prev; if (remaining != 0) { splits.add(new FileSplit(srcs, prev, remaining, (String[]) null)); } LOG.info( "jobname= " + jobName + " numSplits=" + numSplits + ", splits.size()=" + splits.size()); return splits.toArray(new FileSplit[splits.size()]); }
public void configure(JobConf conf) { try { Path vInput; FileSystem fs; URI[] fvector; nsize = conf.getInt("DIMENTION", 0); sumVec = new double[nsize]; resVec = new double[nsize]; diaVec = new double[nsize]; Arrays.fill(sumVec, 0); Arrays.fill(resVec, 0); Arrays.fill(diaVec, 0); fvector = DistributedCache.getCacheFiles(conf); vInput = new Path(fvector[0].getPath()); fs = FileSystem.get(URI.create("hdfs://node17.cs.rochester.edu:9000"), conf); FSDataInputStream fdis = fs.open(vInput); String line; while ((line = fdis.readLine()) != null) { StringTokenizer tokenizer = new StringTokenizer(line); int rowIdx = Integer.parseInt(tokenizer.nextToken()); int colIdx = Integer.parseInt(tokenizer.nextToken()); double matVar = Double.parseDouble(tokenizer.nextToken()); if (rowIdx == colIdx) { diaVec[rowIdx] = matVar; } else if (colIdx == nsize) { resVec[rowIdx] = matVar; } else { sumVec[rowIdx] += matVar; } } } catch (IOException e) { e.printStackTrace(); } }
public void configure(JobConf job) { _attemptID = TaskAttemptID.forName(job.get("mapred.task.id")); _maxAttemptsPerTask = job.getInt("mapred.max.tracker.failures", 4); _splitDetails = job.get(ARCSplitReader.SPLIT_DETAILS, "Spit Details Unknown"); }
public void configure(JobConf job) { nSamples = job.getInt("nSamples", 100); ONE.set(1); }
public void configure(JobConf job) { mNodeCnt = job.getInt("NodeCount", 0); }
public int run(String[] args) throws Exception { if (args.length != 3) { System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>"); // ToolRunner.printGenericCommandUsage(System.err); return -1; } // Create a JobConf using the processed <code>conf</code> final JobConf jobConf = new JobConf(getConf(), getClass()); // Specify various job-specific parameters jobConf.setJobName(MapreduceStringFinder.class.getSimpleName()); // setting the input format jobConf.setInputFormat(Individuals.class); // setting the output ke and value class jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(BooleanWritable.class); // setting the mapper class jobConf.setMapperClass(CIMapper.class); jobConf.setNumMapTasks(3); // setting number of maptasks // setting the reducer class jobConf.setReducerClass(CIReducer.class); // setup input/output directories final String dataset = args[0]; FileInputFormat.setInputPaths(jobConf, new Path(dataset)); FileOutputFormat.setOutputPath(jobConf, new Path(args[1])); final int pop = Integer.parseInt(args[2]); // based on the configuration, make this job threadable if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) { jobConf.setMapRunnerClass(MultithreadedMapRunner.class); jobConf.setInt("mapred.map.multithreadedrunner.threads", 100); } jobConf.setInt("mapred.map.multithreadedrunner.threads", 100); // for computation intensive data, do not allow the job to fail if the task tracker does not // respond // with a heatbeat message before the timeout value final int timeout = 9000000; jobConf.setInt("mapred.task.timeout", timeout); // set the parameters to be available before a call to the mapper jobConf.setInt("popsize", pop); jobConf.setStrings("dataset", dataset); // int map = jobConf.getNumMapTasks(); // System.out.println("Number of Maps"+ map); // start the map/reduce job System.out.println("Starting Job"); // get the start time for this job final long startTime = System.currentTimeMillis(); // Submit the job, then poll for progress until the job is complete JobClient.runJob(jobConf); // get the end time for this job final long endTime = System.currentTimeMillis(); // get the duration of this job final double duration = (endTime - startTime) / 1000.0; // System.out.println("Job Finished in " + duration + " seconds"); // getElapsedTime(startTime - endTime); return 0; }
public void configure(JobConf conf) { delim = conf.get("delim", ","); keyFieldIndexTwo = conf.getInt("keyFieldIndexTwo", 0); }
/** Execute a query plan using Hadoop. */ @SuppressWarnings({"deprecation", "unchecked"}) @Override public int execute(DriverContext driverContext) { IOPrepareCache ioPrepareCache = IOPrepareCache.get(); ioPrepareCache.clear(); boolean success = true; Context ctx = driverContext.getCtx(); boolean ctxCreated = false; Path emptyScratchDir; MapWork mWork = work.getMapWork(); ReduceWork rWork = work.getReduceWork(); try { if (ctx == null) { ctx = new Context(job); ctxCreated = true; } emptyScratchDir = ctx.getMRTmpPath(); FileSystem fs = emptyScratchDir.getFileSystem(job); fs.mkdirs(emptyScratchDir); } catch (IOException e) { e.printStackTrace(); console.printError( "Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e)); return 5; } HiveFileFormatUtils.prepareJobOutput(job); // See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput() job.setOutputFormat(HiveOutputFormatImpl.class); job.setMapperClass(ExecMapper.class); job.setMapOutputKeyClass(HiveKey.class); job.setMapOutputValueClass(BytesWritable.class); try { String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER); job.setPartitionerClass(JavaUtils.loadClass(partitioner)); } catch (ClassNotFoundException e) { throw new RuntimeException(e.getMessage(), e); } if (mWork.getNumMapTasks() != null) { job.setNumMapTasks(mWork.getNumMapTasks().intValue()); } if (mWork.getMaxSplitSize() != null) { HiveConf.setLongVar( job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue()); } if (mWork.getMinSplitSize() != null) { HiveConf.setLongVar( job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue()); } if (mWork.getMinSplitSizePerNode() != null) { HiveConf.setLongVar( job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mWork.getMinSplitSizePerNode().longValue()); } if (mWork.getMinSplitSizePerRack() != null) { HiveConf.setLongVar( job, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mWork.getMinSplitSizePerRack().longValue()); } job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0); job.setReducerClass(ExecReducer.class); // set input format information if necessary setInputAttributes(job); // Turn on speculative execution for reducers boolean useSpeculativeExecReducers = HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS); HiveConf.setBoolVar( job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS, useSpeculativeExecReducers); String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT); if (mWork.isUseBucketizedHiveInputFormat()) { inpFormat = BucketizedHiveInputFormat.class.getName(); } LOG.info("Using " + inpFormat); try { job.setInputFormat(JavaUtils.loadClass(inpFormat)); } catch (ClassNotFoundException e) { throw new RuntimeException(e.getMessage(), e); } // No-Op - we don't really write anything here .. job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands // it String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS); String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS); if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) { String allJars = StringUtils.isNotBlank(auxJars) ? (StringUtils.isNotBlank(addedJars) ? addedJars + "," + auxJars : auxJars) : addedJars; LOG.info("adding libjars: " + allJars); initializeFiles("tmpjars", allJars); } // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES); if (StringUtils.isNotBlank(addedFiles)) { initializeFiles("tmpfiles", addedFiles); } int returnVal = 0; boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME)); if (noName) { // This is for a special case to ensure unit tests pass HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt()); } String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES); // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it if (StringUtils.isNotBlank(addedArchives)) { initializeFiles("tmparchives", addedArchives); } try { MapredLocalWork localwork = mWork.getMapRedLocalWork(); if (localwork != null && localwork.hasStagedAlias()) { if (!ShimLoader.getHadoopShims().isLocalMode(job)) { Path localPath = localwork.getTmpPath(); Path hdfsPath = mWork.getTmpHDFSPath(); FileSystem hdfs = hdfsPath.getFileSystem(job); FileSystem localFS = localPath.getFileSystem(job); FileStatus[] hashtableFiles = localFS.listStatus(localPath); int fileNumber = hashtableFiles.length; String[] fileNames = new String[fileNumber]; for (int i = 0; i < fileNumber; i++) { fileNames[i] = hashtableFiles[i].getPath().getName(); } // package and compress all the hashtable files to an archive file String stageId = this.getId(); String archiveFileName = Utilities.generateTarFileName(stageId); localwork.setStageID(stageId); CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName); Path archivePath = Utilities.generateTarPath(localPath, stageId); LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath); // upload archive file to hdfs Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId); short replication = (short) job.getInt("mapred.submit.replication", 10); hdfs.copyFromLocalFile(archivePath, hdfsFilePath); hdfs.setReplication(hdfsFilePath, replication); LOG.info("Upload 1 archive file from" + archivePath + " to: " + hdfsFilePath); // add the archive file to distributed cache DistributedCache.createSymlink(job); DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job); LOG.info( "Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri()); } } work.configureJobConf(job); List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false); Utilities.setInputPaths(job, inputPaths); Utilities.setMapRedWork(job, work, ctx.getMRTmpPath()); if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) { try { handleSampling(ctx, mWork, job); job.setPartitionerClass(HiveTotalOrderPartitioner.class); } catch (IllegalStateException e) { console.printInfo("Not enough sampling data.. Rolling back to single reducer task"); rWork.setNumReduceTasks(1); job.setNumReduceTasks(1); } catch (Exception e) { LOG.error("Sampling error", e); console.printError( e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e)); rWork.setNumReduceTasks(1); job.setNumReduceTasks(1); } } // remove the pwd from conf file so that job tracker doesn't show this // logs String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD); if (pwd != null) { HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE"); } JobClient jc = new JobClient(job); // make this client wait if job tracker is not behaving well. Throttle.checkJobTracker(job, LOG); if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) { // initialize stats publishing table StatsPublisher statsPublisher; StatsFactory factory = StatsFactory.newFactory(job); if (factory != null) { statsPublisher = factory.getStatsPublisher(); List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job); if (rWork != null) { statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job)); } StatsCollectionContext sc = new StatsCollectionContext(job); sc.setStatsTmpDirs(statsTmpDir); if (!statsPublisher.init(sc)) { // creating stats table if not exists if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) { throw new HiveException( ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg()); } } } } Utilities.createTmpDirs(job, mWork); Utilities.createTmpDirs(job, rWork); SessionState ss = SessionState.get(); if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez") && ss != null) { TezSessionState session = ss.getTezSession(); TezSessionPoolManager.getInstance().close(session, true); } // Finally SUBMIT the JOB! rj = jc.submitJob(job); // replace it back if (pwd != null) { HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd); } returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager()); success = (returnVal == 0); } catch (Exception e) { e.printStackTrace(); String mesg = " with exception '" + Utilities.getNameMessage(e) + "'"; if (rj != null) { mesg = "Ended Job = " + rj.getJobID() + mesg; } else { mesg = "Job Submission failed" + mesg; } // Has to use full name to make sure it does not conflict with // org.apache.commons.lang.StringUtils console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e)); success = false; returnVal = 1; } finally { Utilities.clearWork(job); try { if (ctxCreated) { ctx.clear(); } if (rj != null) { if (returnVal != 0) { rj.killJob(); } jobID = rj.getID().toString(); } } catch (Exception e) { LOG.warn("Failed while cleaning up ", e); } finally { HadoopJobExecHelper.runningJobs.remove(rj); } } // get the list of Dynamic partition paths try { if (rj != null) { if (mWork.getAliasToWork() != null) { for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) { op.jobClose(job, success); } } if (rWork != null) { rWork.getReducer().jobClose(job, success); } } } catch (Exception e) { // jobClose needs to execute successfully otherwise fail task if (success) { success = false; returnVal = 3; String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'"; console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e)); } } return (returnVal); }
public void configure(JobConf job) { int n = job.getInt("NodeCnt", 0); node.setType(PageRankNode.TYPE_COMPLETE); node.setPageRank((float) -StrictMath.log(n)); }
static int getMaxConcurrentSteps(JobConf jobConf) { return jobConf.getInt(MAX_CONCURRENT_STEPS, 0); }
public void configure(JobConf job) { memoryMXBean = ManagementFactory.getMemoryMXBean(); l4j.info("maximum memory = " + memoryMXBean.getHeapMemoryUsage().getMax()); try { l4j.info( "conf classpath = " + Arrays.asList(((URLClassLoader) job.getClassLoader()).getURLs())); l4j.info( "thread classpath = " + Arrays.asList( ((URLClassLoader) Thread.currentThread().getContextClassLoader()).getURLs())); } catch (Exception e) { l4j.info("cannot get classpath: " + e.getMessage()); } try { jc = job; int estCountBucketSize = jc.getInt("hive.exec.estdistinct.bucketsize.log", 15); int estCountBufferSize = jc.getInt("hive.exec.estdistinct.buffsize.log", 8); GenericUDAFCardinalityEstimation.initParams(estCountBucketSize, estCountBufferSize); boolean isChangSizeZero2Null = jc.getBoolean( HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_ZERO_SIZE_2_NULL.varname, HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_ZERO_SIZE_2_NULL.defaultBoolVal); boolean isChangeNull2Null = jc.getBoolean( HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_NULL_2_NULL.varname, HiveConf.ConfVars.HIVE_UDTF_EXPLODE_CHANGE_NULL_2_NULL.defaultBoolVal); GenericUDTFExplode.isChangSizeZero2Null = isChangSizeZero2Null; GenericUDTFExplode.isChangNull2Null = isChangeNull2Null; GenericUDTFPosExplode.isChangSizeZero2Null = isChangSizeZero2Null; GenericUDTFPosExplode.isChangNull2Null = isChangeNull2Null; mapredWork mrwork = Utilities.getMapRedWork(job); mo = new MapOperator(); mo.setConf(mrwork); mo.setChildren(job); l4j.info(mo.dump(0)); mo.initialize(jc, null); localWork = mrwork.getMapLocalWork(); if (localWork == null) { return; } fetchOperators = new HashMap<String, FetchOperator>(); for (Map.Entry<String, fetchWork> entry : localWork.getAliasToFetchWork().entrySet()) { fetchOperators.put(entry.getKey(), new FetchOperator(entry.getValue(), job)); l4j.info("fetchoperator for " + entry.getKey() + " created"); } for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) { Operator<? extends Serializable> forwardOp = localWork.getAliasToWork().get(entry.getKey()); forwardOp.initialize( jc, new ObjectInspector[] {entry.getValue().getOutputObjectInspector()}); l4j.info("fetchoperator for " + entry.getKey() + " initialized"); } } catch (Throwable e) { abort = true; if (e instanceof OutOfMemoryError) { throw (OutOfMemoryError) e; } else { throw new RuntimeException("Map operator initialization failed", e); } } }
public MergeManagerImpl( TaskAttemptID reduceId, JobConf jobConf, FileSystem localFS, LocalDirAllocator localDirAllocator, Reporter reporter, CompressionCodec codec, Class<? extends Reducer> combinerClass, CombineOutputCollector<K, V> combineCollector, Counters.Counter spilledRecordsCounter, Counters.Counter reduceCombineInputCounter, Counters.Counter mergedMapOutputsCounter, ExceptionReporter exceptionReporter, Progress mergePhase, MapOutputFile mapOutputFile) { this.reduceId = reduceId; this.jobConf = jobConf; this.localDirAllocator = localDirAllocator; this.exceptionReporter = exceptionReporter; this.reporter = reporter; this.codec = codec; this.combinerClass = combinerClass; this.combineCollector = combineCollector; this.reduceCombineInputCounter = reduceCombineInputCounter; this.spilledRecordsCounter = spilledRecordsCounter; this.mergedMapOutputsCounter = mergedMapOutputsCounter; this.mapOutputFile = mapOutputFile; this.mapOutputFile.setConf(jobConf); this.localFS = localFS; this.rfs = ((LocalFileSystem) localFS).getRaw(); final float maxInMemCopyUse = jobConf.getFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.90f); if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0) { throw new IllegalArgumentException( "Invalid value for " + MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT + ": " + maxInMemCopyUse); } // Allow unit tests to fix Runtime memory this.memoryLimit = (long) (jobConf.getLong( MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES, Math.min(Runtime.getRuntime().maxMemory(), Integer.MAX_VALUE)) * maxInMemCopyUse); this.ioSortFactor = jobConf.getInt(MRJobConfig.IO_SORT_FACTOR, 100); final float singleShuffleMemoryLimitPercent = jobConf.getFloat( MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, DEFAULT_SHUFFLE_MEMORY_LIMIT_PERCENT); if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f) { throw new IllegalArgumentException( "Invalid value for " + MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT + ": " + singleShuffleMemoryLimitPercent); } usedMemory = 0L; commitMemory = 0L; this.maxSingleShuffleLimit = (long) (memoryLimit * singleShuffleMemoryLimitPercent); this.memToMemMergeOutputsThreshold = jobConf.getInt(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, ioSortFactor); this.mergeThreshold = (long) (this.memoryLimit * jobConf.getFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.90f)); LOG.info( "MergerManager: memoryLimit=" + memoryLimit + ", " + "maxSingleShuffleLimit=" + maxSingleShuffleLimit + ", " + "mergeThreshold=" + mergeThreshold + ", " + "ioSortFactor=" + ioSortFactor + ", " + "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold); if (this.maxSingleShuffleLimit >= this.mergeThreshold) { throw new RuntimeException( "Invlaid configuration: " + "maxSingleShuffleLimit should be less than mergeThreshold" + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit + "mergeThreshold: " + this.mergeThreshold); } boolean allowMemToMemMerge = jobConf.getBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false); if (allowMemToMemMerge) { this.memToMemMerger = new IntermediateMemoryToMemoryMerger(this, memToMemMergeOutputsThreshold); this.memToMemMerger.start(); } else { this.memToMemMerger = null; } this.inMemoryMerger = createInMemoryMerger(); this.inMemoryMerger.start(); this.onDiskMerger = new OnDiskMerger(this); this.onDiskMerger.start(); this.mergePhase = mergePhase; }
@Override public void configure(JobConf conf) { nstripes = conf.getInt(PARAM_APS_STRIPES, 1); }
public void configure(JobConf job) { // get buckets per crawler ... bucketsPerCrawler = job.getInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, 8); }
/** * Copy a file to a destination. * * @param srcstat src path and metadata * @param dstpath dst path * @param reporter */ private void copy( FileStatus srcstat, Path relativedst, OutputCollector<WritableComparable<?>, Text> outc, Reporter reporter) throws IOException { Path absdst = new Path(destPath, relativedst); int totfiles = job.getInt(SRC_COUNT_LABEL, -1); assert totfiles >= 0 : "Invalid file count " + totfiles; // if a directory, ensure created even if empty if (srcstat.isDir()) { if (destFileSys.exists(absdst)) { if (!destFileSys.getFileStatus(absdst).isDir()) { throw new IOException("Failed to mkdirs: " + absdst + " is a file."); } } else if (!destFileSys.mkdirs(absdst)) { throw new IOException("Failed to mkdirs " + absdst); } // TODO: when modification times can be set, directories should be // emitted to reducers so they might be preserved. Also, mkdirs does // not currently return an error when the directory already exists; // if this changes, all directory work might as well be done in reduce return; } if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) { outc.collect(null, new Text("SKIP: " + srcstat.getPath())); ++skipcount; reporter.incrCounter(Counter.SKIP, 1); updateStatus(reporter); return; } Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst); long cbcopied = 0L; FSDataInputStream in = null; FSDataOutputStream out = null; try { // open src file try { in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath()); } catch (IOException e) { LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return"); in = null; return; } reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen()); // open tmp file out = create(tmpfile, reporter, srcstat); // copy file for (int cbread; (cbread = in.read(buffer)) >= 0; ) { out.write(buffer, 0, cbread); cbcopied += cbread; reporter.setStatus( String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen()) + absdst + " [ " + StringUtils.humanReadableInt(cbcopied) + " / " + StringUtils.humanReadableInt(srcstat.getLen()) + " ]"); } } finally { checkAndClose(in); checkAndClose(out); } if (cbcopied != srcstat.getLen()) { if (srcstat.getLen() == 0 && cbcopied > 0) { LOG.info("most likely see a WAL file corruption: " + srcstat.getPath()); } else { throw new IOException( "File size not matched: copied " + bytesString(cbcopied) + " to tmpfile (=" + tmpfile + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } } else { if (totfiles == 1) { // Copying a single file; use dst path provided by user as destination // rather than destination directory, if a file Path dstparent = absdst.getParent(); if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) { absdst = dstparent; } } if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) { throw new IOException(absdst + " is a directory"); } if (!destFileSys.mkdirs(absdst.getParent())) { throw new IOException("Failed to craete parent dir: " + absdst.getParent()); } rename(tmpfile, absdst); FileStatus dststat = destFileSys.getFileStatus(absdst); if (dststat.getLen() != srcstat.getLen()) { destFileSys.delete(absdst, false); throw new IOException( "File size not matched: copied " + bytesString(dststat.getLen()) + " to dst (=" + absdst + ") but expected " + bytesString(srcstat.getLen()) + " from " + srcstat.getPath()); } updatePermissions(srcstat, dststat); } // report at least once for each file ++copycount; reporter.incrCounter(Counter.BYTESCOPIED, cbcopied); reporter.incrCounter(Counter.COPY, 1); updateStatus(reporter); }