/**
 * Configure the job
 *
 * @param conf Job to configure
 * @param rules classification rules to evaluate
 * @param target label value to evaluate the rules for
 * @param inpath input path (the dataset)
 * @param outpath output <code>Path</code>
 * @param split DatasetSplit used to separate training and testing input
 */
private static void configureJob(
    JobConf conf,
    List<? extends Rule> rules,
    int target,
    Path inpath,
    Path outpath,
    DatasetSplit split) {
  split.storeJobParameters(conf);

  FileInputFormat.setInputPaths(conf, inpath);
  FileOutputFormat.setOutputPath(conf, outpath);

  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(CDFitness.class);

  conf.setMapperClass(CDMapper.class);
  conf.setCombinerClass(CDReducer.class);
  conf.setReducerClass(CDReducer.class);

  conf.setInputFormat(DatasetTextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  // store the parameters
  conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
  conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
  conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
}
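// Hedged usage sketch: the three CLASSDISCOVERY_* values stored above travel as
// plain strings in the JobConf, so the mapper side can read them back in its
// configure() hook. The fromString(...) decode below is an assumption that mirrors
// the StringUtils.toString(...) encode; it is not a confirmed excerpt of the real
// CDMapper.
public void configure(JobConf conf) {
  String rulesEncoded = conf.get(CDMapper.CLASSDISCOVERY_RULES);
  String datasetEncoded = conf.get(CDMapper.CLASSDISCOVERY_DATASET);
  int target = conf.getInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, -1);
  // e.g. List<Rule> rules = StringUtils.fromString(rulesEncoded);
}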
/**
 * Set up the control file which lists the input files.
 *
 * @return boolean
 * @throws IOException
 */
private boolean setup() throws IOException {
  estimateSavings();

  final String randomId = getRandomId();
  JobClient jClient = new JobClient(jobconf);
  Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

  LOG.info(JOB_DIR_LABEL + "=" + jobdir);
  jobconf.set(JOB_DIR_LABEL, jobdir.toString());
  Path log = new Path(jobdir, "_logs");

  // The control file should have small size blocks. This helps
  // in spreading out the load from mappers that will be spawned.
  jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

  FileOutputFormat.setOutputPath(jobconf, log);
  LOG.info("log=" + log);

  // create operation list
  FileSystem fs = jobdir.getFileSystem(jobconf);
  Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
  jobconf.set(OP_LIST_LABEL, opList.toString());
  int opCount = 0, synCount = 0;
  SequenceFile.Writer opWriter = null;

  try {
    opWriter = SequenceFile.createWriter(
        fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
    for (RaidPolicyPathPair p : raidPolicyPathPairList) {
      // If a large set of files are Raided for the first time, files
      // in the same directory that tend to have the same size will end up
      // with the same map. This shuffle mixes things up, allowing a better
      // mix of files.
      java.util.Collections.shuffle(p.srcPaths);
      for (FileStatus st : p.srcPaths) {
        opWriter.append(new Text(st.getPath().toString()), p.policy);
        opCount++;
        if (++synCount > SYNC_FILE_MAX) {
          opWriter.sync();
          synCount = 0;
        }
      }
    }
  } finally {
    if (opWriter != null) {
      opWriter.close();
    }
    fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
  }
  raidPolicyPathPairList.clear();

  jobconf.setInt(OP_COUNT_LABEL, opCount);
  LOG.info("Number of files=" + opCount);
  jobconf.setNumMapTasks(
      getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
  LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
  return opCount != 0;
}
/**
 * Driver to copy srcPath to destPath depending on required protocol.
 *
 * @param args arguments
 */
static void copy(final Configuration conf, final Arguments args) throws IOException {
  LOG.info("srcPaths=" + args.srcs);
  LOG.info("destPath=" + args.dst);
  checkSrcPath(conf, args.srcs);

  JobConf job = createJobConf(conf);
  if (args.preservedAttributes != null) {
    job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes);
  }
  if (args.mapredSslConf != null) {
    job.set("dfs.https.client.keystore.resource", args.mapredSslConf);
  }

  // Initialize the mapper
  try {
    setup(conf, job, args);
    JobClient.runJob(job);
    finalize(conf, job, args.dst, args.preservedAttributes);
  } finally {
    // delete tmp
    fullyDelete(job.get(TMP_DIR_LABEL), job);
    // delete jobDirectory
    fullyDelete(job.get(JOB_DIR_LABEL), job);
  }
}
/**
 * Run the job
 *
 * @param params The Job parameters containing the gramSize, input output folders, defaultCat,
 *     encoding
 */
public static void runJob(Parameters params) throws IOException {
  Configurable client = new JobClient();
  JobConf conf = new JobConf(BayesClassifierDriver.class);
  conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));

  conf.setOutputKeyClass(StringTuple.class);
  conf.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
  Path outPath = new Path(params.get("testDirPath") + "-output");
  FileOutputFormat.setOutputPath(conf, outPath);

  conf.setInputFormat(KeyValueTextInputFormat.class);
  conf.setMapperClass(BayesClassifierMapper.class);
  conf.setCombinerClass(BayesClassifierReducer.class);
  conf.setReducerClass(BayesClassifierReducer.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  conf.set(
      "io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
          + "org.apache.hadoop.io.serializer.WritableSerialization");

  HadoopUtil.overwriteOutput(outPath);
  conf.set("bayes.parameters", params.toString());

  client.setConf(conf);
  JobClient.runJob(conf);

  Path outputFiles = new Path(outPath, "part*");
  FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
  ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
  log.info("{}", matrix.summarize());
}
public void runParseTest(
    String fieldTerminator,
    String lineTerminator,
    String encloser,
    String escape,
    boolean encloseRequired)
    throws IOException {
  ClassLoader prevClassLoader = null;

  String[] argv =
      getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
  runImport(argv);
  try {
    String tableClassName = getTableName();

    argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
    SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

    CompilationManager compileMgr = new CompilationManager(opts);
    String jarFileName = compileMgr.getJarFilename();

    // Make sure the user's class is loaded into our address space.
    prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

    JobConf job = new JobConf();
    job.setJar(jarFileName);

    // Tell the job what class we're testing.
    job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

    // use local mode in the same JVM.
    ConfigurationHelper.setJobtrackerAddr(job, "local");
    if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
      job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
    }
    String warehouseDir = getWarehouseDir();
    Path warehousePath = new Path(warehouseDir);
    Path inputPath = new Path(warehousePath, getTableName());
    Path outputPath = new Path(warehousePath, getTableName() + "-out");

    job.setMapperClass(ReparseMapper.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    JobClient.runJob(job);
  } catch (InvalidOptionsException ioe) {
    fail(ioe.toString());
  } catch (ParseException pe) {
    fail(pe.toString());
  } finally {
    if (null != prevClassLoader) {
      ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
    }
  }
}
public void runMR(String myMultiLocs, String sortKey)
    throws ParseException, IOException, Exception, org.apache.hadoop.zebra.parser.ParseException {
  JobConf jobConf = new JobConf(conf);
  jobConf.setJobName("TestMultipleOutputs4");
  jobConf.setJarByClass(TestMultipleOutputs4.class);
  jobConf.set("table.output.tfile.compression", "gz");
  jobConf.set("sortKey", sortKey);

  // input settings
  jobConf.setInputFormat(TextInputFormat.class);
  jobConf.setMapperClass(TestMultipleOutputs4.MapClass.class);
  jobConf.setMapOutputKeyClass(BytesWritable.class);
  jobConf.setMapOutputValueClass(ZebraTuple.class);
  FileInputFormat.setInputPaths(jobConf, inputPath);
  jobConf.setNumMapTasks(1);

  // output settings
  jobConf.setOutputFormat(BasicTableOutputFormat.class);
  BasicTableOutputFormat.setMultipleOutputs(
      jobConf, myMultiLocs, TestMultipleOutputs4.OutputPartitionerClass.class);

  // set the logical schema with 2 columns
  BasicTableOutputFormat.setSchema(jobConf, "word:string, count:int");
  // for demo purposes, create 2 physical column groups
  BasicTableOutputFormat.setStorageHint(jobConf, "[word];[count]");
  BasicTableOutputFormat.setSortInfo(jobConf, sortKey);
  System.out.println("in runMR, sortKey: " + sortKey);

  // run with a single reduce task
  jobConf.setNumReduceTasks(1);
  JobClient.runJob(jobConf);
  BasicTableOutputFormat.close(jobConf);
}
/**
 * run a distributed job and verify that TokenCache is available
 *
 * @throws IOException
 */
@Test
public void testTokenCache() throws IOException {
  System.out.println("running dist job");

  // make sure JT starts
  jConf = mrCluster.createJobConf();

  // provide namenodes names for the job to get the delegation tokens for
  String nnUri = dfsCluster.getURI(0).toString();
  jConf.set(MRJobConfig.JOB_NAMENODES, nnUri + "," + nnUri);
  // job tracker principal id
  jConf.set(JTConfig.JT_USER_NAME, "jt_id/foo@BAR");

  // using argument to pass the file name
  String[] args = {
    "-tokenCacheFile", tokenFileName.toString(), "-m", "1", "-r", "1", "-mt", "1", "-rt", "1"
  };

  int res = -1;
  try {
    res = ToolRunner.run(jConf, new MySleepJob(), args);
  } catch (Exception e) {
    System.out.println("Job failed with " + e.getLocalizedMessage());
    e.printStackTrace(System.out);
    fail("Job failed");
  }
  assertEquals("dist job res is not 0", 0, res);
}
/**
 * The main driver for the word count map/reduce program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(getConf(), WordCountSeqOutput.class);
  conf.setJobName("wordcount_seqOF");

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);
  // the keys are words (strings)
  conf.setOutputKeyClass(Text.class);
  // the values are counts (ints)
  // conf.setOutputValueClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(MapClass.class);
  conf.setCombinerClass(Combiner.class);
  conf.setReducerClass(Reduce.class);

  conf.setOutputFormat(SequenceFileOutputFormat.class);

  // // compress Mapper output
  // conf.setCompressMapOutput(true);
  // conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class);

  // compress final output, keeping any value the user already set as the default
  conf.set("mapred.output.compress", conf.get("mapred.output.compress", "true"));
  conf.set(
      "mapred.output.compression.type", conf.get("mapred.output.compression.type", "BLOCK"));
  conf.set(
      "mapred.output.compression.codec",
      conf.get("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"));

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-m".equals(args[i])) {
        conf.setNumMapTasks(Integer.parseInt(args[++i]));
      } else if ("-r".equals(args[i])) {
        conf.setNumReduceTasks(Integer.parseInt(args[++i]));
      } else {
        other_args.add(args[i]);
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + args[i]);
      return printUsage();
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }
  // Make sure there are exactly 2 parameters left.
  if (other_args.size() != 2) {
    System.out.println(
        "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
    return printUsage();
  }
  FileInputFormat.setInputPaths(conf, other_args.get(0));
  FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

  JobClient.runJob(conf);
  return 0;
}
private RunningJob _test(String... arg) throws Exception {
  Path actionDir = getFsTestCaseDir();

  File jar =
      IOUtils.createJar(
          new File(getTestCaseDir()),
          "launcher.jar",
          LauncherMapper.class,
          LauncherSecurityManager.class,
          LauncherException.class,
          LauncherMainTester.class);

  FileSystem fs = getFileSystem();

  Path launcherJar = new Path(actionDir, "launcher.jar");
  fs.copyFromLocalFile(new Path(jar.toString()), launcherJar);

  JobConf jobConf = new JobConf();
  jobConf.set("user.name", getTestUser());
  jobConf.set("group.name", getTestGroup());
  jobConf.setInt("mapred.map.tasks", 1);
  jobConf.setInt("mapred.map.max.attempts", 1);
  jobConf.setInt("mapred.reduce.max.attempts", 1);
  jobConf.set("mapred.job.tracker", getJobTrackerUri());
  jobConf.set("fs.default.name", getNameNodeUri());
  injectKerberosInfo(jobConf);

  LauncherMapper lm = new LauncherMapper();
  lm.setupMainClass(jobConf, LauncherMainTester.class.getName());
  lm.setupMainArguments(jobConf, arg);

  Configuration actionConf = new XConfiguration();
  lm.setupLauncherInfo(jobConf, "1", "1@a", actionDir, "1@a-0", actionConf);

  assertEquals("1", actionConf.get("oozie.job.id"));
  assertEquals("1@a", actionConf.get("oozie.action.id"));

  DistributedCache.addFileToClassPath(new Path(launcherJar.toUri().getPath()), jobConf);

  JobClient jobClient = createJobClient();
  final RunningJob runningJob = jobClient.submitJob(jobConf);

  System.out.println("Action Dir: " + actionDir);
  System.out.println("LauncherMapper ID: " + runningJob.getJobID().toString());

  waitFor(
      180 * 1000,
      new Predicate() {
        public boolean evaluate() throws Exception {
          return runningJob.isComplete();
        }
      });

  return runningJob;
}
public void setUpJobConf(JobConf job) {
  job.set(TezRuntimeFrameworkConfigs.LOCAL_DIRS, workDir.toString());
  job.set(MRConfig.LOCAL_DIR, workDir.toString());
  job.setClass(
      Constants.TEZ_RUNTIME_TASK_OUTPUT_MANAGER,
      TezLocalTaskOutputFiles.class,
      TezTaskOutput.class);
  job.set(TezJobConfig.TEZ_RUNTIME_PARTITIONER_CLASS, MRPartitioner.class.getName());
  job.setNumReduceTasks(1);
}
public MiniMrShim(Configuration conf, int numberOfTaskTrackers, String nameNode, int numDir)
    throws IOException {
  this.conf = conf;

  JobConf jConf = new JobConf(conf);
  jConf.set("yarn.scheduler.capacity.root.queues", "default");
  jConf.set("yarn.scheduler.capacity.root.default.capacity", "100");

  mr = new MiniMRCluster(numberOfTaskTrackers, nameNode, numDir, null, null, jConf);
}
@Override
protected void parseArgs(JobConf conf, String[] args) {
  if (args.length != 1) {
    throw new RuntimeException("Required argument: <inputFileName>");
  }
  String inFileName = args[0];
  LOGGER.info("inFileName: %s", inFileName);

  conf.set("in", inFileName);
  conf.set(ImportMRMapper.CONFIG_SOURCE_FILE_NAME, new File(inFileName).getName());
}
@Override
public int run(String[] args) throws Exception {
  System.out.println("\n\nConvolutionJob\n");
  JobConf conf = new JobConf(getConf(), ConvolutionJob.class);
  conf.setJobName("ConvolutionJob");

  this.cacheKernel(conf);
  this.CreateRats(conf);
  conf.setMapperClass(ConvolutionMapper.class);

  List<String> other_args = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-m".equals(args[i])) {
        conf.setNumMapTasks(Integer.parseInt(args[++i]));
      } else if ("-r".equals(args[i])) {
        conf.setNumReduceTasks(Integer.parseInt(args[++i]));
      } else {
        other_args.add(args[i]);
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + args[i]);
      return printUsage();
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage();
    }
  }
  // Make sure there are exactly 2 parameters left.
  if (other_args.size() != 2) {
    System.out.println(
        "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
    return printUsage();
  }

  conf.setNumReduceTasks(0);
  conf.setInputFormat(NonSplittableTextInputFormat.class);
  conf.setOutputFormat(MultiFileOutput.class);

  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setCompressMapOutput(true);
  conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
  conf.set("mapred.output.compression.type", "BLOCK");

  FileInputFormat.setInputPaths(conf, other_args.get(0));
  FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
  // FileOutputFormat.setCompressOutput(conf, true);

  JobClient.runJob(conf);

  return 0;
}
protected void initConfig(Map<Object, Object> properties, JobConf parentConfig) {
  if (properties != null)
    parentConfig = createConfig(properties, parentConfig);

  if (parentConfig == null) // this is ok, getJobConf will pass a default parent in
    return;

  jobConf = HadoopUtil.copyJobConf(parentConfig); // prevent local values from being shared

  jobConf.set("fs.http.impl", HttpFileSystem.class.getName());
  jobConf.set("fs.https.impl", HttpFileSystem.class.getName());

  syncPaths = HadoopUtil.addToClassPath(jobConf, getClassPath());
}
public int run(String[] args) throws Exception {
  JobConf conf = new JobConf(this.getClass());
  conf.setJobName("Domain-MR2");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(Map.class);
  conf.setReducerClass(Reduce.class);
  // conf.setReducerClass(IdentityReducer.class);

  conf.setInputFormat(TextInputFormat.class);
  // conf.setOutputFormat(TextOutputFormat.class);
  conf.setOutputFormat(MultiFileOutput.class);

  FileSystem fs = FileSystem.get(conf);
  fs.delete(new Path(args[1]), true); // delete output dir

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  int reducers = 272;
  int mappers = 272;
  conf.setNumMapTasks(mappers);
  conf.setNumReduceTasks(reducers);

  // set parameters
  conf.set("k", "" + k);
  conf.set("r", "" + r);
  conf.set("parts", "" + parts); // number of partitions per dimension

  System.out.println(
      "running DOMAIN with k=" + k + " r=" + r + " parts=" + parts + " "
          + "useCellBasedAlgo=" + useCellBasedAlgo + " reducers=" + reducers
          + " mappers=" + mappers);
  JobClient.runJob(conf);
  return 0;
}
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(AccessProcessJob.class);
  conf.set(nameNode, hdfsURL);
  conf.setJobName("AccessProcessJob");

  // delete any previous output before the job runs
  new Path(outputPath).getFileSystem(conf).delete(new Path(outputPath), true);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(AccessProcessMap.class);
  conf.setReducerClass(AccessProcessReduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  // set the paths on the JobConf that JobClient.runJob actually submits
  FileInputFormat.setInputPaths(conf, new Path(inputPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(1);

  JobClient.runJob(conf);
}
public JobBuilder compressor(CompressionType type, Class<? extends CompressionCodec> codec)
    throws IOException {
  _jobConf.setBoolean("mapred.output.compress", true);
  _jobConf.set("mapred.output.compression.type", type.toString());
  _jobConf.setClass("mapred.output.compression.codec", codec, CompressionCodec.class);
  return this;
}
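// Hedged usage sketch of the builder method above. JobBuilder's no-arg constructor
// and a terminal jobConf() accessor are assumptions for illustration; only
// compressor(...) appears in the source. CompressionType is assumed to be
// SequenceFile.CompressionType, whose toString() yields "NONE", "RECORD", or "BLOCK".
JobConf conf = new JobBuilder()
    .compressor(SequenceFile.CompressionType.BLOCK, GzipCodec.class)
    .jobConf();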
@Override
protected void setConfigProperty(JobConf config, Object key, Object value) {
  // don't let these objects pass, even though toString is called below.
  if (value instanceof Class || value instanceof JobConf) {
    return;
  }

  config.set(key.toString(), value.toString());
}
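// Hedged illustration of why the guard above exists: Class.toString() returns a
// string such as "class com.example.Foo" (note the "class " prefix), which is not
// a loadable class name, and JobConf.toString() dumps a resource summary rather
// than a usable value. Dropping both avoids storing garbage in the configuration.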
public void run() throws Exception {
  long startTime = System.currentTimeMillis();
  JobConf conf = new JobConf(ItemCFJob.class);
  conf.setJobName("ItemCF" + System.currentTimeMillis());
  conf.setNumMapTasks(10);
  conf.set(
      "io.serializations",
      "org.apache.hadoop.io.serializer.JavaSerialization,"
          + "org.apache.hadoop.io.serializer.WritableSerialization");

  StringBuilder sb = new StringBuilder();
  sb.append("--input ").append(input);
  sb.append(" --output ").append(output);
  if (flag) {
    sb.append(" --booleanData true");
  } else {
    sb.append(" --booleanData false");
  }
  sb.append(" --similarityClassname " + Constants.mahout_similarityclassname);
  sb.append(" --tempDir ").append(tmp);
  String[] args = sb.toString().split(" ");

  RecommenderJob job = new RecommenderJob();
  job.setConf(conf);
  job.run(args);

  long endTime = System.currentTimeMillis();
  logger.info(
      "recommendation job ["
          + conf.getJobName()
          + "] finished; it took "
          + (endTime - startTime) / 1000
          + "s.");
}
private void setPageRankLinksOptions(JobConf job) throws URISyntaxException {
  job.setLong("pages", options.getNumPages());
  job.setLong("slotpages", options.getNumSlotPages());
  job.set("delimiter", cdelim);
  Utils.shareLinkZipfCore(options, job);
}
public static void getData(CloudataConf conf, Path keyPath) throws IOException {
  JobConf jobConf = new JobConf(TeraReadJob.class);
  jobConf.set("user.name", conf.getUserId());
  String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

  Path tempOutputPath = new Path("ManyTableJob_Get_" + System.currentTimeMillis());

  jobConf.setJobName("ManyTableJob_Get_" + "(" + new Date() + ")");
  TextOutputFormat.setOutputPath(jobConf, tempOutputPath);

  // <MAP>
  jobConf.setMapperClass(ManyTableGetMap.class);
  jobConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(jobConf, keyPath);
  jobConf.setMapSpeculativeExecution(false);
  jobConf.setMaxMapAttempts(0);
  // </MAP>

  // <REDUCE>
  jobConf.setNumReduceTasks(0);
  // </REDUCE>

  try {
    // Run Job
    JobClient.runJob(jobConf);
  } finally {
    // delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    FileUtil.delete(fs, tempOutputPath, true);
    CloudataMapReduceUtil.clearMapReduce(libDir);
  }
}
public RunningJob run(String inputPath, String outputPath) throws Exception {
  JobConf conf = new JobConf(BuildIndex.class);
  conf.setJobName("BuildIndex");

  FileInputFormat.addInputPath(conf, new Path(inputPath)); // multiple paths can be added
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setMapOutputKeyClass(LongWritable.class);
  conf.setMapOutputValueClass(LongWritable.class);
  conf.set("delim", delim);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(LongWritable.class);
  conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo);

  conf.setMapperClass(BuildIndexMapper.class);
  conf.setNumReduceTasks(1);
  conf.setReducerClass(BuildIndexReducer.class);

  conf.setInputFormat(TextInputFormat.class);
  // conf.setInputFormat(CustomInputFormat.class);
  // FileOutputFormat.setCompressOutput(conf, true);

  // delete the output directory if it exists already
  FileSystem.get(conf).delete(new Path(outputPath), true);

  return JobClient.runJob(conf);
}
/*
 * Creates the configuration object necessary to run a specific vertex from
 * map work. This includes input formats, input processor, etc.
 */
private JobConf initializeVertexConf(JobConf baseConf, MapWork mapWork) {
  JobConf conf = new JobConf(baseConf);

  if (mapWork.getNumMapTasks() != null) {
    conf.setInt(MRJobConfig.NUM_MAPS, mapWork.getNumMapTasks().intValue());
  }

  if (mapWork.getMaxSplitSize() != null) {
    HiveConf.setLongVar(
        conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize().longValue());
  }

  if (mapWork.getMinSplitSize() != null) {
    HiveConf.setLongVar(
        conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize().longValue());
  }

  if (mapWork.getMinSplitSizePerNode() != null) {
    HiveConf.setLongVar(
        conf,
        HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE,
        mapWork.getMinSplitSizePerNode().longValue());
  }

  if (mapWork.getMinSplitSizePerRack() != null) {
    HiveConf.setLongVar(
        conf,
        HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK,
        mapWork.getMinSplitSizePerRack().longValue());
  }

  Utilities.setInputAttributes(conf, mapWork);

  String inpFormat = HiveConf.getVar(conf, HiveConf.ConfVars.HIVETEZINPUTFORMAT);
  if (StringUtils.isBlank(inpFormat)) {
    inpFormat = ShimLoader.getHadoopShims().getInputFormatClassName();
  }

  if (mapWork.isUseBucketizedHiveInputFormat()) {
    inpFormat = BucketizedHiveInputFormat.class.getName();
  }

  conf.set("mapred.mapper.class", ExecMapper.class.getName());
  conf.set("mapred.input.format.class", inpFormat);

  return conf;
}
@Override
public int run(String[] args) throws Exception {
  final int ret = parseArgs(args);
  if (ret < 0) {
    return ret;
  }

  JobConf config = new JobConf(getConf(), TfIdfNovelty.class);
  config.setJobName("Influence-TfIdfNovelty");

  config.set(Fields.BASIS.get(), basisPath);
  if (datesPath != null) {
    config.set(Fields.DOC_DATES.get(), datesPath);
  }
  config.setBoolean(Fields.IGNORE.get(), ignoreDocs);
  if (bands > 0) {
    config.setInt(Fields.BANDS.get(), bands);
  }
  if (rows > 0) {
    config.setInt(Fields.ROWS.get(), rows);
  }

  SetupHelper.getInstance()
      .setSequenceInput(config, inputPath)
      .setSequenceOutput(config, outputPath);

  config.setMapOutputKeyClass(HashBandWritable.class);
  config.setMapOutputValueClass(DocumentWithVectorWritable.class);
  config.setMapperClass(TfIdfNoveltyLshMapper.class);

  if (outputBuckets) {
    config.setOutputKeyClass(HashBandWritable.class);
    config.setOutputValueClass(IntArrayWritable.class);
    config.setReducerClass(TfIdfNoveltyIdentityReducer.class);
  } else {
    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(VectorWritable.class);
    config.setReducerClass(TfIdfNoveltyReducer.class);
  }

  // Delete the output directory if it exists already.
  FileSystem.get(getConf()).delete(new Path(outputPath), true);

  JobClient.runJob(config);

  return 0;
}
@SuppressWarnings({"deprecation", "null"}) public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output) throws IOException { sLogger.setLevel(Level.INFO); JobConf conf = new JobConf(c); conf.setJobName("bitext.compile"); boolean useVocabServer = false; Thread vst1 = null; Thread vst2 = null; VocabServer vocabServer1 = null; VocabServer vocabServer2 = null; try { // inputPaths = bi-text given as input in main method of HadoopAlign conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PhrasePair.class); conf.setMapperClass(BitextCompilerMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPaths); conf.set("stream.recordreader.begin", "<pchunk"); conf.set("stream.recordreader.end", "</pchunk>"); conf.set("stream.recordreader.slowmatch", "false"); conf.set("stream.recordreader.maxrec", "100000"); conf.setInputFormat(XMLInput.class); FileOutputFormat.setOutputPath(conf, output); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setJar("/chomes/fture/jars/ivory.jar"); conf.set("mapred.child.java.opts", "-Xmx2048m"); System.out.println("Running job " + conf.getJobName()); System.out.println("Input: " + inputPaths); System.out.println("Output: " + output); JobClient.runJob(conf); } finally { try { if (vst1 != null) vocabServer1.stopServer(); if (vst2 != null) vocabServer2.stopServer(); if (vst1 != null) vst1.join(); if (vst2 != null) vst2.join(); } catch (InterruptedException e) { } } }
@Before
public void createMockKeyValues() throws Exception {
  // Make a MockInstance here, by setting the instance name to be the same as this mock instance
  // we can "trick" the InputFormat into using a MockInstance
  mockInstance = new MockInstance(test.getMethodName());
  inputformat = new HiveAccumuloTableInputFormat();
  conf = new JobConf();
  conf.set(AccumuloSerDeParameters.TABLE_NAME, TEST_TABLE);
  conf.set(AccumuloSerDeParameters.USE_MOCK_INSTANCE, "true");
  conf.set(AccumuloSerDeParameters.INSTANCE_NAME, test.getMethodName());
  conf.set(AccumuloSerDeParameters.USER_NAME, USER);
  conf.set(AccumuloSerDeParameters.USER_PASS, PASS);
  // not used for mock, but required by the input format
  conf.set(AccumuloSerDeParameters.ZOOKEEPERS, "localhost:2181");

  columnNames = Arrays.asList("name", "sid", "dgrs", "mills");
  columnTypes =
      Arrays.<TypeInfo>asList(
          TypeInfoFactory.stringTypeInfo,
          TypeInfoFactory.intTypeInfo,
          TypeInfoFactory.doubleTypeInfo,
          TypeInfoFactory.longTypeInfo);
  conf.set(AccumuloSerDeParameters.COLUMN_MAPPINGS, "cf:name,cf:sid,cf:dgrs,cf:mills");
  conf.set(serdeConstants.LIST_COLUMNS, "name,sid,dgrs,mills");
  conf.set(serdeConstants.LIST_COLUMN_TYPES, "string,int,double,bigint");

  con = mockInstance.getConnector(USER, new PasswordToken(PASS.getBytes()));
  con.tableOperations().create(TEST_TABLE);
  con.securityOperations().changeUserAuthorizations(USER, new Authorizations("blah"));

  BatchWriterConfig writerConf = new BatchWriterConfig();
  BatchWriter writer = con.createBatchWriter(TEST_TABLE, writerConf);

  Mutation m1 = new Mutation(new Text("r1"));
  m1.put(COLUMN_FAMILY, NAME, new Value("brian".getBytes()));
  m1.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("1")));
  m1.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("44.5")));
  m1.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("555")));

  Mutation m2 = new Mutation(new Text("r2"));
  m2.put(COLUMN_FAMILY, NAME, new Value("mark".getBytes()));
  m2.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("2")));
  m2.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("55.5")));
  m2.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("666")));

  Mutation m3 = new Mutation(new Text("r3"));
  m3.put(COLUMN_FAMILY, NAME, new Value("dennis".getBytes()));
  m3.put(COLUMN_FAMILY, SID, new Value(parseIntBytes("3")));
  m3.put(COLUMN_FAMILY, DEGREES, new Value(parseDoubleBytes("65.5")));
  m3.put(COLUMN_FAMILY, MILLIS, new Value(parseLongBytes("777")));

  writer.addMutation(m1);
  writer.addMutation(m2);
  writer.addMutation(m3);
  writer.close();
}
public static void setAggregatorDescriptors(
    JobConf job, Class<? extends ValueAggregatorDescriptor>[] descriptors) {
  job.setInt("aggregator.descriptor.num", descriptors.length);
  // specify the aggregator descriptors
  for (int i = 0; i < descriptors.length; i++) {
    job.set("aggregator.descriptor." + i, "UserDefined," + descriptors[i].getName());
  }
}
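// Hedged illustration of the entries the loop above writes: for two descriptor
// classes (names hypothetical) the JobConf would contain
//   aggregator.descriptor.num = 2
//   aggregator.descriptor.0   = UserDefined,com.example.MyCountDescriptor
//   aggregator.descriptor.1   = UserDefined,com.example.MySumDescriptor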
@SuppressWarnings("rawtypes") @Override public void sinkConfInit( FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); }
/**
 * Set the column names and types into the job conf for the input format to use.
 *
 * @param job the job to update
 * @param cols the columns of the table
 */
private void setColumnTypes(JobConf job, List<FieldSchema> cols) {
  StringBuilder colNames = new StringBuilder();
  StringBuilder colTypes = new StringBuilder();
  boolean isFirst = true;

  for (FieldSchema col : cols) {
    if (isFirst) {
      isFirst = false;
    } else {
      colNames.append(',');
      colTypes.append(',');
    }

    colNames.append(col.getName());
    colTypes.append(col.getType());
  }
  job.set(serdeConstants.LIST_COLUMNS, colNames.toString());
  job.set(serdeConstants.LIST_COLUMN_TYPES, colTypes.toString());
}
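// Hedged illustration: for a table with columns (name string, sid int) the method
// above leaves the JobConf with
//   serdeConstants.LIST_COLUMNS      -> "name,sid"
//   serdeConstants.LIST_COLUMN_TYPES -> "string,int"
// (the example column names are hypothetical).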
private void createPageRankLinksDirectly() throws IOException, URISyntaxException {
  log.info("Creating PageRank links", null);

  JobConf job = new JobConf(PagerankData.class);
  String jobname = "Create pagerank links";

  Path fout = new Path(options.getResultPath(), EDGES_DIR_NAME);

  job.setJobName(jobname);
  setPageRankLinksOptions(job);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(Text.class);
  // job.setMapOutputKeyClass(LongWritable.class);
  // job.setMapOutputValueClass(Text.class);

  job.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(job, dummy.getPath());
  job.setInputFormat(NLineInputFormat.class);

  job.setMapperClass(DummyToPageRankLinksMapper.class);

  if (options.isSequenceOut()) {
    job.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    job.setOutputFormat(TextOutputFormat.class);
  }

  if (null != options.getCodecClass()) {
    // set both the old and the new property names for block compression
    job.set("mapred.output.compression.type", "BLOCK");
    job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
  }

  FileOutputFormat.setOutputPath(job, fout);

  log.info("Running Job: " + jobname);
  log.info("Dummy file " + dummy.getPath() + " as input");
  log.info("Edges file " + fout + " as output");
  JobClient.runJob(job);
  log.info("Finished Running Job: " + jobname);
}