@Override
public int run(String[] args) throws Exception {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  SequenceFileOutputFormat.setCompressOutput(conf, true);
  SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

  conf.setPartitionerClass(TotalOrderPartitioner.class);

  InputSampler.Sampler<IntWritable, Text> sampler =
      new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);

  Path input = FileInputFormat.getInputPaths(conf)[0];
  input = input.makeQualified(input.getFileSystem(conf));

  Path partitionFile = new Path(input, "_partitions");
  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  InputSampler.writePartitionFile(conf, sampler);

  // Add to DistributedCache
  URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
  DistributedCache.addCacheFile(partitionUri, conf);
  DistributedCache.createSymlink(conf);

  JobClient.runJob(conf);
  return 0;
}
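One detail the snippet leaves implicit: InputSampler.writePartitionFile derives the number of split points from the job's reduce-task count, so the reducer count has to be fixed (here presumably by JobBuilder or the command line) before the sampler runs. A minimal sketch of that ordering, using an illustrative reducer count that is not taken from the original:

  conf.setNumReduceTasks(30);                      // illustrative value; yields 29 split keys
  TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
  InputSampler.writePartitionFile(conf, sampler);  // samples the input, writes the split keys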
protected void configJob(JobConf conf) {
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setPartitionerClass(PKPartitioner.class);
  conf.setOutputValueGroupingComparator(PVComparator.class);
}
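PKPartitioner and PVComparator themselves are not part of this listing. As a rough illustration of what a custom partitioner plugged in through setPartitionerClass typically looks like under the old org.apache.hadoop.mapred API, here is a hypothetical sketch; the tab-separated primary-key prefix is an assumption made for the example, not taken from the original source:

  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapred.JobConf;
  import org.apache.hadoop.mapred.Partitioner;

  // Hypothetical partitioner: routes records by the primary-key prefix of the
  // map output key so that all values for one primary key reach the same reducer.
  public class PKPartitioner implements Partitioner<Text, Text> {
    @Override
    public void configure(JobConf job) {
      // no per-job configuration needed in this sketch
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
      // assume the key is "<primaryKey>\t<secondaryKey>"; partition on the primary key only
      String pk = key.toString().split("\t", 2)[0];
      return (pk.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }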
@Override
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), getClass());
  conf.setMapperClass(SortByDepartmentAndAgeMapper.class);
  conf.setMapOutputKeyClass(IntPair.class);
  conf.setPartitionerClass(FirstPartitioner.class);
  conf.setOutputValueGroupingComparator(FirstGroupingComparator.class);
  conf.setReducerClass(SortByDepartmentAndAgeReducer.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.addInputPath(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
  return 0;
}
@Override
public int run(String[] args) throws IOException {
  JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
  if (conf == null) {
    return -1;
  }

  conf.setMapperClass(MaxTemperatureMapper.class);
  /*[*/ conf.setPartitionerClass(FirstPartitioner.class); /*]*/
  /*[*/ conf.setOutputKeyComparatorClass(KeyComparator.class); /*]*/
  /*[*/ conf.setOutputValueGroupingComparator(GroupComparator.class); /*]*/
  conf.setReducerClass(MaxTemperatureReducer.class);
  conf.setOutputKeyClass(IntPair.class);
  conf.setOutputValueClass(NullWritable.class);

  JobClient.runJob(conf);
  return 0;
}
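The secondary-sort pattern in the two examples above relies on FirstPartitioner partitioning only on the first half of the composite IntPair key, while the key and grouping comparators control ordering within each partition. A typical implementation is sketched below under the assumption that IntPair exposes getFirst(); the actual class body is not part of this listing:

  public static class FirstPartitioner implements Partitioner<IntPair, NullWritable> {
    @Override
    public void configure(JobConf job) {}

    @Override
    public int getPartition(IntPair key, NullWritable value, int numPartitions) {
      // partition on the first (natural) field only, so every record sharing that
      // field reaches the same reducer; the comparators then sort on the second field
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }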
@Override
public int run(String[] args) throws Exception {
  JobConf jobConf = new JobConf(getConf(), DailyDedupMr.class);
  jobConf.setInputFormat(SequenceFileInputFormat.class);
  jobConf.setMapperClass(IdentityMapper.class);
  jobConf.setReducerClass(UniqueKeyReduce.class);
  jobConf.setJobName("DailyDedup");
  jobConf.setPartitionerClass(DailyPartitioner.class);
  jobConf.setOutputFormat(org.apache.hadoop.mapred.DailyOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(LogRecord.class);
  jobConf.setMapOutputValueClass(LogRecord.class);
  jobConf.setMapOutputKeyClass(Text.class);
  jobConf.setNumReduceTasks(3);

  FileInputFormat.setInputPaths(jobConf, args[0]);
  FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

  JobClient.runJob(jobConf);
  return 0;
}
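DailyPartitioner and DailyOutputFormat are project-specific classes that are not shown here. As a hypothetical sketch of a date-based partitioner compatible with the Text/LogRecord map output types used above, assuming keys carry a yyyy-MM-dd day prefix (an assumption made purely for illustration):

  // Hypothetical: group all records for the same day into the same reduce partition.
  public class DailyPartitioner implements Partitioner<Text, LogRecord> {
    @Override
    public void configure(JobConf job) {}

    @Override
    public int getPartition(Text key, LogRecord value, int numPartitions) {
      // assume the key starts with "yyyy-MM-dd"; hash only the day portion
      String day = key.toString().substring(0, 10);
      return (day.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }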
private static void compactASinglePartition(
    String existingInputPath,
    String deltaInputPath,
    String primaryKeyList,
    String maxColumns,
    String outputPath,
    String numberOfReducers) throws IOException {
  JobConf conf = new JobConf(new Configuration(), PartitionCompactor.class); // hadoop
  conf.setJobName("PartitionCompactor");

  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(BytesRefArrayWritable.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(BytesRefArrayWritable.class);

  conf.setMapperClass(CustomMapper.class);
  conf.setReducerClass(CustomReducer.class);
  conf.setPartitionerClass(CustomPartitioner.class);

  conf.setInputFormat(RCFileInputFormat.class);
  conf.setNumReduceTasks(Integer.parseInt(numberOfReducers));

  conf.set(EXISTING_FILE_PATH_CONF, existingInputPath);
  conf.set(DELTA_FILE_PATH_CONF, deltaInputPath);
  conf.set(PRIMARY_KEYS_CONF, primaryKeyList);
  conf.set(RCFile.COLUMN_NUMBER_CONF_STR, maxColumns);

  RCFileInputFormat.addInputPath(conf, new Path(existingInputPath));
  RCFileInputFormat.addInputPath(conf, new Path(deltaInputPath));

  conf.setOutputFormat(RCFileOutputFormat.class);
  RCFileOutputFormat.setOutputPath(conf, new Path(outputPath));

  RunningJob job = JobClient.runJob(conf);
  job.waitForCompletion();
}
public static void main(String[] args) throws IOException {
  if (args.length != 3) {
    System.out.println("Parameters: inputDir outputDir parallel");
    System.exit(1);
  }
  String inputDir = args[0];
  String outputDir = args[1];
  String parallel = args[2];

  JobConf lp = new JobConf(L10.class);
  lp.setJobName("L10 Load Page Views");
  lp.setInputFormat(TextInputFormat.class);
  lp.setOutputKeyClass(MyType.class);
  lp.setOutputValueClass(Text.class);
  lp.setMapperClass(ReadPageViews.class);
  lp.setReducerClass(Group.class);
  lp.setPartitionerClass(MyPartitioner.class);
  Properties props = System.getProperties();
  for (Map.Entry<Object, Object> entry : props.entrySet()) {
    lp.set((String) entry.getKey(), (String) entry.getValue());
  }
  FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
  FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L10out"));
  // Hardcode the parallel to 40 since MyPartitioner assumes it
  lp.setNumReduceTasks(40);
  Job group = new Job(lp);

  JobControl jc = new JobControl("L10 join");
  jc.addJob(group);
  new Thread(jc).start();

  int i = 0;
  while (!jc.allFinished()) {
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
      for (Job failure : failures) {
        System.err.println(failure.getMessage());
      }
      break;
    }

    try {
      Thread.sleep(5000);
    } catch (InterruptedException e) {
    }

    if (i % 10000 == 0) {
      System.out.println("Running jobs");
      ArrayList<Job> running = jc.getRunningJobs();
      if (running != null && running.size() > 0) {
        for (Job r : running) {
          System.out.println(r.getJobName());
        }
      }

      System.out.println("Ready jobs");
      ArrayList<Job> ready = jc.getReadyJobs();
      if (ready != null && ready.size() > 0) {
        for (Job r : ready) {
          System.out.println(r.getJobName());
        }
      }

      System.out.println("Waiting jobs");
      ArrayList<Job> waiting = jc.getWaitingJobs();
      if (waiting != null && waiting.size() > 0) {
        for (Job r : waiting) {
          System.out.println(r.getJobName());
        }
      }

      System.out.println("Successful jobs");
      ArrayList<Job> success = jc.getSuccessfulJobs();
      if (success != null && success.size() > 0) {
        for (Job r : success) {
          System.out.println(r.getJobName());
        }
      }
    }
    i++;
  }

  ArrayList<Job> failures = jc.getFailedJobs();
  if (failures != null && failures.size() > 0) {
    for (Job failure : failures) {
      System.err.println(failure.getMessage());
    }
  }
  jc.stop();
}
/** Execute a query plan using Hadoop. */
@SuppressWarnings({"deprecation", "unchecked"})
@Override
public int execute(DriverContext driverContext) {
  IOPrepareCache ioPrepareCache = IOPrepareCache.get();
  ioPrepareCache.clear();

  boolean success = true;

  Context ctx = driverContext.getCtx();
  boolean ctxCreated = false;
  Path emptyScratchDir;
  MapWork mWork = work.getMapWork();
  ReduceWork rWork = work.getReduceWork();

  try {
    if (ctx == null) {
      ctx = new Context(job);
      ctxCreated = true;
    }
    emptyScratchDir = ctx.getMRTmpPath();
    FileSystem fs = emptyScratchDir.getFileSystem(job);
    fs.mkdirs(emptyScratchDir);
  } catch (IOException e) {
    e.printStackTrace();
    console.printError(
        "Error launching map-reduce job",
        "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
    return 5;
  }

  HiveFileFormatUtils.prepareJobOutput(job);
  // See the javadoc on HiveOutputFormatImpl and HadoopShims.prepareJobOutput()
  job.setOutputFormat(HiveOutputFormatImpl.class);

  job.setMapperClass(ExecMapper.class);

  job.setMapOutputKeyClass(HiveKey.class);
  job.setMapOutputValueClass(BytesWritable.class);

  try {
    String partitioner = HiveConf.getVar(job, ConfVars.HIVEPARTITIONER);
    job.setPartitionerClass(JavaUtils.loadClass(partitioner));
  } catch (ClassNotFoundException e) {
    throw new RuntimeException(e.getMessage(), e);
  }

  if (mWork.getNumMapTasks() != null) {
    job.setNumMapTasks(mWork.getNumMapTasks().intValue());
  }

  if (mWork.getMaxSplitSize() != null) {
    HiveConf.setLongVar(
        job, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mWork.getMaxSplitSize().longValue());
  }

  if (mWork.getMinSplitSize() != null) {
    HiveConf.setLongVar(
        job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mWork.getMinSplitSize().longValue());
  }

  if (mWork.getMinSplitSizePerNode() != null) {
    HiveConf.setLongVar(
        job,
        HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE,
        mWork.getMinSplitSizePerNode().longValue());
  }

  if (mWork.getMinSplitSizePerRack() != null) {
    HiveConf.setLongVar(
        job,
        HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK,
        mWork.getMinSplitSizePerRack().longValue());
  }

  job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
  job.setReducerClass(ExecReducer.class);

  // set input format information if necessary
  setInputAttributes(job);

  // Turn on speculative execution for reducers
  boolean useSpeculativeExecReducers =
      HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVESPECULATIVEEXECREDUCERS);
  HiveConf.setBoolVar(
      job, HiveConf.ConfVars.HADOOPSPECULATIVEEXECREDUCERS, useSpeculativeExecReducers);

  String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);

  if (mWork.isUseBucketizedHiveInputFormat()) {
    inpFormat = BucketizedHiveInputFormat.class.getName();
  }

  LOG.info("Using " + inpFormat);

  try {
    job.setInputFormat(JavaUtils.loadClass(inpFormat));
  } catch (ClassNotFoundException e) {
    throw new RuntimeException(e.getMessage(), e);
  }

  // No-Op - we don't really write anything here ..
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  // Transfer HIVEAUXJARS and HIVEADDEDJARS to "tmpjars" so hadoop understands it
  String auxJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEAUXJARS);
  String addedJars = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDJARS);
  if (StringUtils.isNotBlank(auxJars) || StringUtils.isNotBlank(addedJars)) {
    String allJars =
        StringUtils.isNotBlank(auxJars)
            ? (StringUtils.isNotBlank(addedJars) ? addedJars + "," + auxJars : auxJars)
            : addedJars;
    LOG.info("adding libjars: " + allJars);
    initializeFiles("tmpjars", allJars);
  }

  // Transfer HIVEADDEDFILES to "tmpfiles" so hadoop understands it
  String addedFiles = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDFILES);
  if (StringUtils.isNotBlank(addedFiles)) {
    initializeFiles("tmpfiles", addedFiles);
  }

  int returnVal = 0;
  boolean noName = StringUtils.isEmpty(HiveConf.getVar(job, HiveConf.ConfVars.HADOOPJOBNAME));

  if (noName) {
    // This is for a special case to ensure unit tests pass
    HiveConf.setVar(job, HiveConf.ConfVars.HADOOPJOBNAME, "JOB" + Utilities.randGen.nextInt());
  }

  String addedArchives = HiveConf.getVar(job, HiveConf.ConfVars.HIVEADDEDARCHIVES);
  // Transfer HIVEADDEDARCHIVES to "tmparchives" so hadoop understands it
  if (StringUtils.isNotBlank(addedArchives)) {
    initializeFiles("tmparchives", addedArchives);
  }

  try {
    MapredLocalWork localwork = mWork.getMapRedLocalWork();
    if (localwork != null && localwork.hasStagedAlias()) {
      if (!ShimLoader.getHadoopShims().isLocalMode(job)) {
        Path localPath = localwork.getTmpPath();
        Path hdfsPath = mWork.getTmpHDFSPath();

        FileSystem hdfs = hdfsPath.getFileSystem(job);
        FileSystem localFS = localPath.getFileSystem(job);
        FileStatus[] hashtableFiles = localFS.listStatus(localPath);
        int fileNumber = hashtableFiles.length;
        String[] fileNames = new String[fileNumber];

        for (int i = 0; i < fileNumber; i++) {
          fileNames[i] = hashtableFiles[i].getPath().getName();
        }

        // package and compress all the hashtable files to an archive file
        String stageId = this.getId();
        String archiveFileName = Utilities.generateTarFileName(stageId);
        localwork.setStageID(stageId);

        CompressionUtils.tar(localPath.toUri().getPath(), fileNames, archiveFileName);
        Path archivePath = Utilities.generateTarPath(localPath, stageId);
        LOG.info("Archive " + hashtableFiles.length + " hash table files to " + archivePath);

        // upload archive file to hdfs
        Path hdfsFilePath = Utilities.generateTarPath(hdfsPath, stageId);
        short replication = (short) job.getInt("mapred.submit.replication", 10);
        hdfs.copyFromLocalFile(archivePath, hdfsFilePath);
        hdfs.setReplication(hdfsFilePath, replication);
        LOG.info("Upload 1 archive file from" + archivePath + " to: " + hdfsFilePath);

        // add the archive file to distributed cache
        DistributedCache.createSymlink(job);
        DistributedCache.addCacheArchive(hdfsFilePath.toUri(), job);
        LOG.info(
            "Add 1 archive file to distributed cache. Archive file: " + hdfsFilePath.toUri());
      }
    }
    work.configureJobConf(job);
    List<Path> inputPaths = Utilities.getInputPaths(job, mWork, emptyScratchDir, ctx, false);
    Utilities.setInputPaths(job, inputPaths);

    Utilities.setMapRedWork(job, work, ctx.getMRTmpPath());

    if (mWork.getSamplingType() > 0 && rWork != null && job.getNumReduceTasks() > 1) {
      try {
        handleSampling(ctx, mWork, job);
        job.setPartitionerClass(HiveTotalOrderPartitioner.class);
      } catch (IllegalStateException e) {
        console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
        rWork.setNumReduceTasks(1);
        job.setNumReduceTasks(1);
      } catch (Exception e) {
        LOG.error("Sampling error", e);
        console.printError(
            e.toString(), "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        rWork.setNumReduceTasks(1);
        job.setNumReduceTasks(1);
      }
    }

    // remove the pwd from conf file so that job tracker doesn't show this logs
    String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
    if (pwd != null) {
      HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
    }
    JobClient jc = new JobClient(job);
    // make this client wait if job tracker is not behaving well.
    Throttle.checkJobTracker(job, LOG);

    if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
      // initialize stats publishing table
      StatsPublisher statsPublisher;
      StatsFactory factory = StatsFactory.newFactory(job);
      if (factory != null) {
        statsPublisher = factory.getStatsPublisher();
        List<String> statsTmpDir = Utilities.getStatsTmpDirs(mWork, job);
        if (rWork != null) {
          statsTmpDir.addAll(Utilities.getStatsTmpDirs(rWork, job));
        }
        StatsCollectionContext sc = new StatsCollectionContext(job);
        sc.setStatsTmpDirs(statsTmpDir);
        if (!statsPublisher.init(sc)) { // creating stats table if not exists
          if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
            throw new HiveException(
                ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
          }
        }
      }
    }

    Utilities.createTmpDirs(job, mWork);
    Utilities.createTmpDirs(job, rWork);

    SessionState ss = SessionState.get();
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")
        && ss != null) {
      TezSessionState session = ss.getTezSession();
      TezSessionPoolManager.getInstance().close(session, true);
    }

    // Finally SUBMIT the JOB!
    rj = jc.submitJob(job);
    // replace it back
    if (pwd != null) {
      HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, pwd);
    }

    returnVal = jobExecHelper.progress(rj, jc, ctx.getHiveTxnManager());
    success = (returnVal == 0);
  } catch (Exception e) {
    e.printStackTrace();
    String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
    if (rj != null) {
      mesg = "Ended Job = " + rj.getJobID() + mesg;
    } else {
      mesg = "Job Submission failed" + mesg;
    }

    // Has to use full name to make sure it does not conflict with
    // org.apache.commons.lang.StringUtils
    console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));

    success = false;
    returnVal = 1;
  } finally {
    Utilities.clearWork(job);
    try {
      if (ctxCreated) {
        ctx.clear();
      }

      if (rj != null) {
        if (returnVal != 0) {
          rj.killJob();
        }
        jobID = rj.getID().toString();
      }
    } catch (Exception e) {
      LOG.warn("Failed while cleaning up ", e);
    } finally {
      HadoopJobExecHelper.runningJobs.remove(rj);
    }
  }

  // get the list of Dynamic partition paths
  try {
    if (rj != null) {
      if (mWork.getAliasToWork() != null) {
        for (Operator<? extends OperatorDesc> op : mWork.getAliasToWork().values()) {
          op.jobClose(job, success);
        }
      }
      if (rWork != null) {
        rWork.getReducer().jobClose(job, success);
      }
    }
  } catch (Exception e) {
    // jobClose needs to execute successfully otherwise fail task
    if (success) {
      success = false;
      returnVal = 3;
      String mesg = "Job Commit failed with exception '" + Utilities.getNameMessage(e) + "'";
      console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
  }

  return (returnVal);
}
@Override
public int run(String[] args) throws IOException {
  OptionParser p = new OptionParser();
  OptionSpec<String> maxwiOpt =
      p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
          .withRequiredArg()
          .ofType(String.class);
  OptionSpec<Float> thresholdOpt =
      p.accepts(thresholdOptName, "similarity threshold")
          .withRequiredArg()
          .ofType(Float.class)
          .defaultsTo(DEFAULT_THRESHOLD);
  OptionSpec<Integer> stripesOpt =
      p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(1);
  OptionSpec<Integer> spreadOpt =
      p.accepts(spreadOptName, "number of reducers per stripe")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(DEFAULT_SPREAD);
  OptionSpec<Integer> factorOpt =
      p.accepts(factorOptName, "number of mappers per reducer")
          .withRequiredArg()
          .ofType(Integer.class)
          .defaultsTo(DEFAULT_FACTOR);
  OptionSpec<Integer> maxVectorIDOpt =
      p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
  p.acceptsAll(Arrays.asList("h", "?"), "show help");

  OptionSet options = parseOptions(p, args);

  // to distinguish indexes built in successive runs
  DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
  Date date = new Date();

  float threshold = options.valueOf(thresholdOpt); // threshold
  if (threshold < 0 || threshold >= 1) {
    System.err.println(thresholdOptName + " should be between 0 and 1");
    System.exit(1);
  }

  int numStripes = options.valueOf(stripesOpt); // number of stripes
  if (numStripes < 1) {
    System.err.println(stripesOptName + " should be > 0");
    System.exit(1);
  }

  // MapReduce parameters
  int spread = options.valueOf(spreadOpt); // how many reducers per stripe
  if (spread < 1) {
    System.err.println(spreadOptName + " should be > 0");
    System.exit(1);
  }

  int factor = options.valueOf(factorOpt); // how many mappers per reducer
  if (factor < 1) {
    System.err.println(factorOptName + " should be > 0");
    System.exit(1);
  }

  int maxKey = 0;
  if (options.has(maxVectorIDOpt)) {
    maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
    if (maxKey < 1) {
      System.err.println(maxVectorIDOptName + " should be > 0");
      System.exit(1);
    }
  }

  int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
  int numMappers = numReducers * factor;
  int numBuckets = numMappers;

  // pick the file with max weights from command line
  String maxWiDir = options.valueOf(maxwiOpt);
  List<String> nonOptArgs = options.nonOptionArguments();

  LOG.info("Threshold set to " + threshold);
  LOG.info(
      String.format(
          "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
          numBuckets, factor, numStripes, spread, numReducers));

  // start building the jobs
  JobConf conf1 = new JobConf(getConf(), Similarity.class);
  conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf1.setInt(PARAM_APS_STRIPES, numStripes);
  DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

  Path inputPath = new Path(nonOptArgs.get(0));
  Path indexPath =
      new Path(
          nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
  // index filtering pruned nested directory
  Path indexOnlyPath = new Path(indexPath, "part*");
  Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);
  FileInputFormat.setInputPaths(conf1, inputPath);
  FileOutputFormat.setOutputPath(conf1, indexPath);

  conf1.setInputFormat(SequenceFileInputFormat.class);
  conf1.setOutputFormat(SequenceFileOutputFormat.class);
  conf1.setMapOutputKeyClass(LongWritable.class);
  conf1.setMapOutputValueClass(IndexItem.class);
  conf1.setOutputKeyClass(LongWritable.class);
  conf1.setOutputValueClass(IndexItemArrayWritable.class);
  conf1.setMapperClass(IndexerMapper.class);
  conf1.setReducerClass(IndexerReducer.class);

  // assuming input is sorted according to the key (vectorID) so that the
  // part files are locally sorted
  MultipleOutputs.addNamedOutput(
      conf1,
      PRUNED,
      SequenceFileOutputFormat.class,
      IntWritable.class,
      VectorComponentArrayWritable.class);

  // remove the stuff we added from the job name
  conf1.set(
      "mapred.job.name",
      "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
  conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf1.setSpeculativeExecution(false);
  conf1.setCompressMapOutput(true);
  // hash the posting lists in different buckets to distribute the load
  conf1.setNumReduceTasks(numBuckets);

  RunningJob job1 = JobClient.runJob(conf1);

  // part 2
  JobConf conf2 = new JobConf(getConf(), Similarity.class);

  if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

  MultipleInputs.addInputPath(
      conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
  MultipleInputs.addInputPath(
      conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
  FileOutputFormat.setOutputPath(conf2, outputPath);

  conf2.setCombinerClass(SimilarityCombiner.class);
  conf2.setReducerClass(SimilarityReducer.class);
  conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
  conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
  conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
  conf2.setMapOutputKeyClass(GenericKey.class);
  conf2.setMapOutputValueClass(GenericValue.class);
  conf2.setOutputKeyClass(VectorPair.class);
  conf2.setOutputValueClass(NullWritable.class);

  Counter numDocs =
      job1.getCounters()
          .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
  maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
  LOG.info("Setting max key value in input to " + maxKey);
  conf2.setInt(PARAM_APS_MAXKEY, maxKey);
  conf2.setInt(PARAM_APS_STRIPES, numStripes);
  conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
  conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
  conf2.set("mapred.job.name", "APS-" + outputPath.getName());
  conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
  conf2.setSpeculativeExecution(false);
  conf2.setCompressMapOutput(true);
  conf2.setNumReduceTasks(numReducers);

  JobClient.runJob(conf2);
  return 0;
}
@Override
public int run(String[] args) throws Exception {
  if (!createJobConfAndParseArgs(args)) {
    return 0;
  }

  setFileSystem(FileSystem.get(job));

  FileStatus status = fs.getFileStatus(srcDir);

  if (null == status || !status.isDir()) {
    throw new IllegalArgumentException("No such directory: " + srcDir);
  }

  if (Mode.STAND_ALONE == mode) {
    standAlone();
  } else {
    writeDirs();

    MultipleInputs.addInputPath(
        job, bucketFiles, SequenceFileInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(job, counters, CountersInputFormat.class, CountersMapper.class);

    job.setPartitionerClass(CrushPartitioner.class);
    job.setReducerClass(CrushReducer.class);

    job.setOutputKeyComparatorClass(Text.Comparator.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, bucketFiles);
    FileOutputFormat.setOutputPath(job, outDir);

    job.set("crush.partition.map", partitionMap.toString());

    if (0 != nBuckets) {
      print(Verbosity.INFO, "\n\nInvoking map reduce\n\n");

      RunningJob completed = JobClient.runJob(job);
      jobCounters = completed.getCounters();
    }

    long eligible = jobCounters.getCounter(MapperCounter.FILES_ELIGIBLE);
    long crushed = jobCounters.getCounter(ReducerCounter.FILES_CRUSHED);

    /*
     * There's no way this cannot hold true if Hadoop is working correctly.
     */
    if (eligible != crushed) {
      throw new AssertionError(
          format("Files eligible (%d) != files crushed (%d)", eligible, crushed));
    }

    if (Mode.CLONE == mode) {
      cloneOutput();
    } else {
      moveOutput();
    }
  }

  print(Verbosity.INFO, "\n\nDeleting temporary directory");

  fs.delete(tmpDir, true);

  /*
   * If we have printed anything to the console at all, then add a line wrap to
   * bring the cursor back to the beginning.
   */
  print(Verbosity.INFO, "\n\n");

  return 0;
}
public static void main(String[] args) throws Exception {
  if (!validArgs(args)) {
    printUsage();
    return;
  }

  // These are the temp paths that are created on HDFS
  String dir1 = "/user/miyuru/csrconverter-output";
  String dir2 = "/user/miyuru/csrconverter-output-sorted";

  // We first delete the temporary directories if they exist on the HDFS
  FileSystem fs1 = FileSystem.get(new JobConf());

  System.out.println("Deleting the dir : " + dir1);
  if (fs1.exists(new Path(dir1))) {
    fs1.delete(new Path(dir1), true);
  }
  System.out.println("Done deleting the dir : " + dir1);

  System.out.println("Deleting the dir : " + dir2);
  if (fs1.exists(new Path(dir2))) {
    fs1.delete(new Path(dir2), true);
  }

  Path notinPath = new Path("/user/miyuru/notinverts/notinverts");
  if (!fs1.exists(notinPath)) {
    fs1.create(notinPath);
  }
  System.out.println("Done deleting the dir : " + dir2);

  // Note on Aug 23 2014: Sometimes after this the mapReduce job hangs. need to see why.
  VertexCounterClient.setDefaultGraphID(args[3], args[2]);

  // First job creates the inverted index
  JobConf conf = new JobConf(CSRConverter.class);
  conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
  conf.set("org.acacia.partitioner.hbase.table", args[2]);
  conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  // conf.setMapperClass(InvertedMapper.class);
  conf.setReducerClass(InvertedReducer.class);
  // conf.setInputFormat(TextInputFormat.class);
  conf.setInputFormat(NLinesInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  // FileInputFormat.setInputPaths(conf, new Path(args[0]));
  MultipleInputs.addInputPath(
      conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
  MultipleInputs.addInputPath(
      conf,
      new Path("/user/miyuru/notinverts/notinverts"),
      TextInputFormat.class,
      InvertedMapper.class);
  FileOutputFormat.setOutputPath(conf, new Path(dir1));

  // Also for the moment we turn-off the speculative execution
  conf.setBoolean("mapred.map.tasks.speculative.execution", false);
  conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
  conf.setNumMapTasks(96);
  conf.setNumReduceTasks(96);
  conf.setPartitionerClass(VertexPartitioner.class);
  conf.set("vertex-count", args[4]);
  conf.set("zero-flag", args[5]);

  Job job = new Job(conf, "csr_inverter");
  job.setSortComparatorClass(SortComparator.class);
  job.waitForCompletion(true);
}
public JobBuilder partition(Class<? extends Partitioner> partitioner) throws IOException {
  _jobConf.setPartitionerClass(partitioner);
  return this;
}
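Only this one fluent setter of the JobBuilder class appears in the listing. A hypothetical usage sketch follows; the builder's constructor and the choice of HashPartitioner (the default hash partitioner shipped in org.apache.hadoop.mapred.lib) are assumptions made for illustration, not taken from the original source:

  // hypothetical: wrap an existing JobConf and install a partitioner via the builder
  JobBuilder builder = new JobBuilder(new JobConf(MyJob.class));
  builder.partition(org.apache.hadoop.mapred.lib.HashPartitioner.class);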