private String[] getActiveServersList(JobContext context) { String[] servers = null; try { JobClient jc = new JobClient((JobConf) context.getConfiguration()); ClusterStatus status = jc.getClusterStatus(true); Collection<String> atc = status.getActiveTrackerNames(); servers = new String[atc.size()]; int s = 0; for (String serverInfo : atc) { // System.out.println("serverInfo:" + serverInfo); StringTokenizer st = new StringTokenizer(serverInfo, ":"); String trackerName = st.nextToken(); // System.out.println("trackerName:" + trackerName); StringTokenizer st1 = new StringTokenizer(trackerName, "_"); st1.nextToken(); servers[s++] = st1.nextToken(); } } catch (IOException e) { e.printStackTrace(); } return servers; }
/** * set up input file which has the list of input files. * * @return boolean * @throws IOException */ private boolean setup() throws IOException { estimateSavings(); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobconf); Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId); LOG.info(JOB_DIR_LABEL + "=" + jobdir); jobconf.set(JOB_DIR_LABEL, jobdir.toString()); Path log = new Path(jobdir, "_logs"); // The control file should have small size blocks. This helps // in spreading out the load from mappers that will be spawned. jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE); FileOutputFormat.setOutputPath(jobconf, log); LOG.info("log=" + log); // create operation list FileSystem fs = jobdir.getFileSystem(jobconf); Path opList = new Path(jobdir, "_" + OP_LIST_LABEL); jobconf.set(OP_LIST_LABEL, opList.toString()); int opCount = 0, synCount = 0; SequenceFile.Writer opWriter = null; try { opWriter = SequenceFile.createWriter( fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE); for (RaidPolicyPathPair p : raidPolicyPathPairList) { // If a large set of files are Raided for the first time, files // in the same directory that tend to have the same size will end up // with the same map. This shuffle mixes things up, allowing a better // mix of files. java.util.Collections.shuffle(p.srcPaths); for (FileStatus st : p.srcPaths) { opWriter.append(new Text(st.getPath().toString()), p.policy); opCount++; if (++synCount > SYNC_FILE_MAX) { opWriter.sync(); synCount = 0; } } } } finally { if (opWriter != null) { opWriter.close(); } fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file } raidPolicyPathPairList.clear(); jobconf.setInt(OP_COUNT_LABEL, opCount); LOG.info("Number of files=" + opCount); jobconf.setNumMapTasks( getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers())); LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks()); return opCount != 0; }
public static void runSortJob(String... args) throws Exception { Path input = new Path(args[0]); Path output = new Path(args[1]); JobConf job = new JobConf(); job.setNumReduceTasks(2); job.setInputFormat(KeyValueTextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(SampleJob.class); output.getFileSystem(job).delete(output, true); JobClient jc = new JobClient(job); JobClient.setTaskOutputFilter(job, JobClient.TaskStatusFilter.ALL); RunningJob rj = jc.submitJob(job); try { if (!jc.monitorAndPrintJob(job, rj)) { System.out.println("Job Failed: " + rj.getFailureInfo()); throw new IOException("Job failed!"); } } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } }
private RunningJob submitAction(Context context, Namespace ns) throws Exception { Hive2ActionExecutor ae = new Hive2ActionExecutor(); WorkflowAction action = context.getAction(); ae.prepareActionDir(getFileSystem(), context); ae.submitLauncher(getFileSystem(), context, action); String jobId = action.getExternalId(); String jobTracker = action.getTrackerUri(); String consoleUrl = action.getConsoleUrl(); assertNotNull(jobId); assertNotNull(jobTracker); assertNotNull(consoleUrl); Element e = XmlUtils.parseXml(action.getConf()); XConfiguration conf = new XConfiguration( new StringReader(XmlUtils.prettyPrint(e.getChild("configuration", ns)).toString())); conf.set("mapred.job.tracker", e.getChildTextTrim("job-tracker", ns)); conf.set("fs.default.name", e.getChildTextTrim("name-node", ns)); conf.set("user.name", context.getProtoActionConf().get("user.name")); conf.set("group.name", getTestGroup()); JobConf jobConf = Services.get().get(HadoopAccessorService.class).createJobConf(jobTracker); XConfiguration.copy(conf, jobConf); String user = jobConf.get("user.name"); JobClient jobClient = Services.get().get(HadoopAccessorService.class).createJobClient(user, jobConf); final RunningJob runningJob = jobClient.getJob(JobID.forName(jobId)); assertNotNull(runningJob); return runningJob; }
/** * Generate the requested number of file splits, with the filename set to the filename of the * output file. */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { /** 设置输入分片的个数* */ JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; } System.out.println("numMaps-------" + numMaps); InputSplit[] result = new InputSplit[numMaps]; Path outDir = FileOutputFormat.getOutputPath(job); for (int i = 0; i < result.length; ++i) { result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, (String[]) null); } return result; }
public void testActionCheck() throws Exception { JPAService jpaService = Services.get().get(JPAService.class); WorkflowJobBean job = this.addRecordToWfJobTable(WorkflowJob.Status.RUNNING, WorkflowInstance.Status.RUNNING); WorkflowActionBean action = this.addRecordToWfActionTable(job.getId(), "1", WorkflowAction.Status.PREP); WorkflowActionGetJPAExecutor wfActionGetCmd = new WorkflowActionGetJPAExecutor(action.getId()); new ActionStartXCommand(action.getId(), "map-reduce").call(); action = jpaService.execute(wfActionGetCmd); ActionExecutorContext context = new ActionXCommand.ActionExecutorContext(job, action, false, false); MapReduceActionExecutor actionExecutor = new MapReduceActionExecutor(); JobConf conf = actionExecutor.createBaseHadoopConf(context, XmlUtils.parseXml(action.getConf())); String user = conf.get("user.name"); JobClient jobClient = Services.get().get(HadoopAccessorService.class).createJobClient(user, conf); String launcherId = action.getExternalId(); final RunningJob launcherJob = jobClient.getJob(JobID.forName(launcherId)); waitFor( 120 * 1000, new Predicate() { public boolean evaluate() throws Exception { return launcherJob.isComplete(); } }); assertTrue(launcherJob.isSuccessful()); Map<String, String> actionData = LauncherMapperHelper.getActionData(getFileSystem(), context.getActionDir(), conf); assertTrue(LauncherMapperHelper.hasIdSwap(actionData)); new ActionCheckXCommand(action.getId()).call(); action = jpaService.execute(wfActionGetCmd); String mapperId = action.getExternalId(); String childId = action.getExternalChildIDs(); assertTrue(launcherId.equals(mapperId)); final RunningJob mrJob = jobClient.getJob(JobID.forName(childId)); waitFor( 120 * 1000, new Predicate() { public boolean evaluate() throws Exception { return mrJob.isComplete(); } }); assertTrue(mrJob.isSuccessful()); new ActionCheckXCommand(action.getId()).call(); action = jpaService.execute(wfActionGetCmd); assertEquals("SUCCEEDED", action.getExternalStatus()); }
public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
/** * This is the main routine for launching a distributed random write job. It runs 10 maps/node and * each node writes 1 gig of data to a DFS file. The reduce doesn't do anything. * * @throws IOException */ public int run(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: writer <out-dir>"); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path outDir = new Path(args[0]); JobConf job = new JobConf(getConf()); job.setJarByClass(RandomWriter.class); job.setJobName("random-writer"); FileOutputFormat.setOutputPath(job, outDir); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(BytesWritable.class); job.setInputFormat(RandomInputFormat.class); job.setMapperClass(Map.class); job.setReducerClass(IdentityReducer.class); job.setOutputFormat(SequenceFileOutputFormat.class); JobClient client = new JobClient(job); ClusterStatus cluster = client.getClusterStatus(); /** 如果属性不存在 则返回默认的值 * */ int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10); long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024); if (numBytesToWritePerMap == 0) { System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0"); return -2; } long totalBytesToWrite = job.getLong( "test.randomwrite.total_bytes", numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers()); int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap); if (numMaps == 0 && totalBytesToWrite > 0) { numMaps = 1; job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite); } job.setNumMapTasks(numMaps); /** 建议型的 * */ System.out.println("Running " + numMaps + " maps."); // reducer NONE job.setNumReduceTasks(0); Date startTime = new Date(); System.out.println("Job started: " + startTime); JobClient.runJob(job); Date endTime = new Date(); System.out.println("Job ended: " + endTime); System.out.println( "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
private RunningJob _test(String... arg) throws Exception { Path actionDir = getFsTestCaseDir(); File jar = IOUtils.createJar( new File(getTestCaseDir()), "launcher.jar", LauncherMapper.class, LauncherSecurityManager.class, LauncherException.class, LauncherMainTester.class); FileSystem fs = getFileSystem(); Path launcherJar = new Path(actionDir, "launcher.jar"); fs.copyFromLocalFile(new Path(jar.toString()), launcherJar); JobConf jobConf = new JobConf(); jobConf.set("user.name", getTestUser()); jobConf.set("group.name", getTestGroup()); jobConf.setInt("mapred.map.tasks", 1); jobConf.setInt("mapred.map.max.attempts", 1); jobConf.setInt("mapred.reduce.max.attempts", 1); jobConf.set("mapred.job.tracker", getJobTrackerUri()); jobConf.set("fs.default.name", getNameNodeUri()); injectKerberosInfo(jobConf); LauncherMapper lm = new LauncherMapper(); lm.setupMainClass(jobConf, LauncherMainTester.class.getName()); lm.setupMainArguments(jobConf, arg); Configuration actionConf = new XConfiguration(); lm.setupLauncherInfo(jobConf, "1", "1@a", actionDir, "1@a-0", actionConf); assertEquals("1", actionConf.get("oozie.job.id")); assertEquals("1@a", actionConf.get("oozie.action.id")); DistributedCache.addFileToClassPath(new Path(launcherJar.toUri().getPath()), jobConf); JobClient jobClient = createJobClient(); final RunningJob runningJob = jobClient.submitJob(jobConf); System.out.println("Action Dir: " + actionDir); System.out.println("LauncherMapper ID: " + runningJob.getJobID().toString()); waitFor( 180 * 1000, new Predicate() { public boolean evaluate() throws Exception { return runningJob.isComplete(); } }); return runningJob; }
public static Token< org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier> getJobTrackerDelegationToken(Configuration conf, String userName) throws Exception { // LOG.info("getJobTrackerDelegationToken("+conf+","+userName+")"); JobClient jcl = new JobClient(new JobConf(conf, HCatOutputFormat.class)); Token<org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier> t = jcl.getDelegationToken(new Text(userName)); // LOG.info("got "+t); return t; // return null; }
private void prepareDeleteIndex(String indexerName) { // We do not have to take a lock on the indexer, since once in delete state the indexer cannot // be modified anymore by ordinary users. boolean canBeDeleted = false; try { // Read current situation of record and assure it is still actual IndexerDefinition indexer = indexerModel.getFreshIndexer(indexerName); if (indexer.getLifecycleState() == IndexerDefinition.LifecycleState.DELETE_REQUESTED) { canBeDeleted = true; String queueSubscriptionId = indexer.getSubscriptionId(); if (queueSubscriptionId != null) { sepModel.removeSubscription(indexer.getSubscriptionId()); // We leave the subscription ID in the indexer definition FYI } if (indexer.getActiveBatchBuildInfo() != null) { JobClient jobClient = getJobClient(); String jobId = indexer.getActiveBatchBuildInfo().getJobId(); RunningJob job = jobClient.getJob(jobId); if (job != null) { job.killJob(); log.info("Kill indexer build job for indexer " + indexerName + ", job ID = " + jobId); } // Just to be sure... jobStatusWatcher.assureWatching( indexer.getName(), indexer.getActiveBatchBuildInfo().getJobId()); canBeDeleted = false; } if (!canBeDeleted) { indexer = new IndexerDefinitionBuilder() .startFrom(indexer) .lifecycleState(IndexerDefinition.LifecycleState.DELETING) .build(); indexerModel.updateIndexerInternal(indexer); } } else if (indexer.getLifecycleState() == IndexerDefinition.LifecycleState.DELETING) { // Check if the build job is already finished, if so, allow delete if (indexer.getActiveBatchBuildInfo() == null) { canBeDeleted = true; } } } catch (Throwable t) { log.error("Error preparing deletion of indexer " + indexerName, t); } if (canBeDeleted) { deleteIndexer(indexerName); } }
private void checkGeneratedDataAndJobStatus(long inputSize) throws IOException { LOG.info("Verify the generated data size."); long dataSizeInMB = getDataSizeInMB(new Path(gridmixDir, "input")); Assert.assertTrue( "Generate data has not matched with given size", dataSizeInMB + 0.1 > inputSize || dataSizeInMB - 0.1 < inputSize); JobClient jobClient = jtClient.getClient(); int len = jobClient.getAllJobs().length; LOG.info("Verify the job status after completion of job."); Assert.assertEquals( "Job has not succeeded.", JobStatus.SUCCEEDED, jobClient.getAllJobs()[len - 1].getRunState()); }
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf( "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName()); ToolRunner.printGenericCommandUsage(System.err); return -1; } JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("Max temperature"); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MaxTemperatureMapper.class); conf.setCombinerClass(MaxTemperatureReducer.class); conf.setReducerClass(MaxTemperatureReducer.class); // vv MaxTemperatureDriverV6 conf.setProfileEnabled(true); conf.setProfileParams( "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s"); conf.setProfileTaskRange(true, "0-2"); // ^^ MaxTemperatureDriverV6 JobClient.runJob(conf); return 0; }
@Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), getClass()); conf.setJobName("UFO count"); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: avro UFO counter <in> <out>"); System.exit(2); } FileInputFormat.addInputPath(conf, new Path(otherArgs[0])); Path outputPath = new Path(otherArgs[1]); FileOutputFormat.setOutputPath(conf, outputPath); outputPath.getFileSystem(conf).delete(outputPath); Schema input_schema = Schema.parse(getClass().getResourceAsStream("ufo.avsc")); AvroJob.setInputSchema(conf, input_schema); AvroJob.setMapOutputSchema( conf, Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG))); AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA); AvroJob.setMapperClass(conf, AvroRecordMapper.class); AvroJob.setReducerClass(conf, AvroRecordReducer.class); conf.setInputFormat(AvroInputFormat.class); JobClient.runJob(conf); return 0; }
/** * Run the job * * @param params The Job parameters containing the gramSize, input output folders, defaultCat, * encoding */ public static void runJob(Parameters params) throws IOException { Configurable client = new JobClient(); JobConf conf = new JobConf(BayesClassifierDriver.class); conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath")); conf.setOutputKeyClass(StringTuple.class); conf.setOutputValueClass(DoubleWritable.class); FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath"))); Path outPath = new Path(params.get("testDirPath") + "-output"); FileOutputFormat.setOutputPath(conf, outPath); conf.setInputFormat(KeyValueTextInputFormat.class); conf.setMapperClass(BayesClassifierMapper.class); conf.setCombinerClass(BayesClassifierReducer.class); conf.setReducerClass(BayesClassifierReducer.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.set( "io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); HadoopUtil.overwriteOutput(outPath); conf.set("bayes.parameters", params.toString()); client.setConf(conf); JobClient.runJob(conf); Path outputFiles = new Path(outPath, "part*"); FileSystem dfs = FileSystem.get(outPath.toUri(), conf); ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params); log.info("{}", matrix.summarize()); }
/** * Driver to copy srcPath to destPath depending on required protocol. * * @param args arguments */ static void copy(final Configuration conf, final Arguments args) throws IOException { LOG.info("srcPaths=" + args.srcs); LOG.info("destPath=" + args.dst); checkSrcPath(conf, args.srcs); JobConf job = createJobConf(conf); if (args.preservedAttributes != null) { job.set(PRESERVE_STATUS_LABEL, args.preservedAttributes); } if (args.mapredSslConf != null) { job.set("dfs.https.client.keystore.resource", args.mapredSslConf); } // Initialize the mapper try { setup(conf, job, args); JobClient.runJob(job); finalize(conf, job, args.dst, args.preservedAttributes); } finally { // delete tmp fullyDelete(job.get(TMP_DIR_LABEL), job); // delete jobDirectory fullyDelete(job.get(JOB_DIR_LABEL), job); } }
public static void main(String[] args) throws Exception { String input = "hdfs://centos:9000/access.log.10"; String output = "hdfs://centos:9000/out_kpitime"; JobConf conf = new JobConf(KPITime.class); conf.setJobName("KPITime"); // conf.addResource("classpath:/hadoop/core-site.xml"); // conf.addResource("classpath:/hadoop/hdfs-site.xml"); // conf.addResource("classpath:/hadoop/mapred-site.xml"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(KPITimeMapper.class); conf.setCombinerClass(KPITimeReducer.class); conf.setReducerClass(KPITimeReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }
public static void run(Map<String, String> path) throws IOException { JobConf conf = Recommend.config(); String input = path.get("Step2Input"); String output = path.get("Step2Output"); HdfsDAO hdfs = new HdfsDAO(Recommend.HDFS, conf); hdfs.rmr(output); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Step2_UserVectorToCooccurrenceMapper.class); // conf.setCombinerClass(Step2_UserVectorToConoccurrenceReducer.class); // conf.setReducerClass(Step2_UserVectorToConoccurrenceReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); RunningJob job = JobClient.runJob(conf); while (!job.isComplete()) { job.waitForCompletion(); } }
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: BuildGraph"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(BuildGraph.class); conf.setJobName("BuildGraph " + inputPath + " " + ContrailConfig.K); ContrailConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(BuildGraphMapper.class); conf.setReducerClass(BuildGraphReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
public void runParseTest( String fieldTerminator, String lineTerminator, String encloser, String escape, boolean encloseRequired) throws IOException { ClassLoader prevClassLoader = null; String[] argv = getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired); runImport(argv); try { String tableClassName = getTableName(); argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired); SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true); CompilationManager compileMgr = new CompilationManager(opts); String jarFileName = compileMgr.getJarFilename(); // Make sure the user's class is loaded into our address space. prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName); JobConf job = new JobConf(); job.setJar(jarFileName); // Tell the job what class we're testing. job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName); // use local mode in the same JVM. ConfigurationHelper.setJobtrackerAddr(job, "local"); if (!BaseSqoopTestCase.isOnPhysicalCluster()) { job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS); } String warehouseDir = getWarehouseDir(); Path warehousePath = new Path(warehouseDir); Path inputPath = new Path(warehousePath, getTableName()); Path outputPath = new Path(warehousePath, getTableName() + "-out"); job.setMapperClass(ReparseMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); JobClient.runJob(job); } catch (InvalidOptionsException ioe) { fail(ioe.toString()); } catch (ParseException pe) { fail(pe.toString()); } finally { if (null != prevClassLoader) { ClassLoaderStack.setCurrentClassLoader(prevClassLoader); } } }
public static void main(String[] args) throws Exception { String input = "hdfs://192.168.0.110:9000/input/access.log"; String output = "hdfs://192.168.0.110:9000/user/hdfs/pv"; JobConf conf = new JobConf(KPIPV.class); conf.setJobName("KPIPV"); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(KPIPVMapper.class); conf.setCombinerClass(KPIPVReducer.class); conf.setReducerClass(KPIPVReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf); System.exit(0); }
public static void runJob(String[] args) { JobConf conf = new JobConf(CassandraBulkLoader.class); if (args.length >= 4) { conf.setNumReduceTasks(new Integer(args[3])); } try { // We store the cassandra storage-conf.xml on the HDFS cluster DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf); } catch (URISyntaxException e) { throw new RuntimeException(e); } conf.setInputFormat(KeyValueTextInputFormat.class); conf.setJobName("CassandraBulkLoader_v2"); conf.setMapperClass(Map.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(conf, new Path(args[1])); FileOutputFormat.setOutputPath(conf, new Path(args[2])); try { JobClient.runJob(conf); } catch (IOException e) { throw new RuntimeException(e); } }
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage(); return 1; } JobConf job = new JobConf(getConf(), MultiFileWordCount.class); job.setJobName("MultiFileWordCount"); // set the InputFormat of the job to our InputFormat job.setInputFormat(MyInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(IntWritable.class); // use the defined mapper job.setMapperClass(MapClass.class); // use the WordCount Reducer job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); JobClient.runJob(job); return 0; }
@Override public int run(String[] args) throws Exception { JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args); if (conf == null) { return -1; } conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK); conf.setPartitionerClass(TotalOrderPartitioner.class); InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10); Path input = FileInputFormat.getInputPaths(conf)[0]; input = input.makeQualified(input.getFileSystem(conf)); Path partitionFile = new Path(input, "_partitions"); TotalOrderPartitioner.setPartitionFile(conf, partitionFile); InputSampler.writePartitionFile(conf, sampler); // Add to DistributedCache URI partitionUri = new URI(partitionFile.toString() + "#_partitions"); DistributedCache.addCacheFile(partitionUri, conf); DistributedCache.createSymlink(conf); JobClient.runJob(conf); return 0; }
public RunningJob run(String inputPath, String outputPath) throws Exception { JobConf conf = new JobConf(BuildIndex.class); conf.setJobName("BuildIndex"); FileInputFormat.addInputPath(conf, new Path(inputPath)); // multiple path FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(LongWritable.class); conf.setMapOutputValueClass(LongWritable.class); conf.set("delim", delim); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(LongWritable.class); conf.setInt("keyFieldIndexTwo", keyFieldIndexTwo); conf.setMapperClass(BuildIndexMapper.class); conf.setNumReduceTasks(1); conf.setReducerClass(BuildIndexReducer.class); conf.setInputFormat(TextInputFormat.class); // conf.setInputFormat(CustomInputFormat.class); // FileOutputFormat.setCompressOutput(conf,true); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
public void runMR(String myMultiLocs, String sortKey) throws ParseException, IOException, Exception, org.apache.hadoop.zebra.parser.ParseException { JobConf jobConf = new JobConf(conf); jobConf.setJobName("TestMultipleOutputs4"); jobConf.setJarByClass(TestMultipleOutputs4.class); jobConf.set("table.output.tfile.compression", "gz"); jobConf.set("sortKey", sortKey); // input settings jobConf.setInputFormat(TextInputFormat.class); jobConf.setMapperClass(TestMultipleOutputs4.MapClass.class); jobConf.setMapOutputKeyClass(BytesWritable.class); jobConf.setMapOutputValueClass(ZebraTuple.class); FileInputFormat.setInputPaths(jobConf, inputPath); jobConf.setNumMapTasks(1); // output settings jobConf.setOutputFormat(BasicTableOutputFormat.class); BasicTableOutputFormat.setMultipleOutputs( jobConf, myMultiLocs, TestMultipleOutputs4.OutputPartitionerClass.class); // set the logical schema with 2 columns BasicTableOutputFormat.setSchema(jobConf, "word:string, count:int"); // for demo purposes, create 2 physical column groups BasicTableOutputFormat.setStorageHint(jobConf, "[word];[count]"); BasicTableOutputFormat.setSortInfo(jobConf, sortKey); System.out.println("in runMR, sortkey: " + sortKey); // set map-only job. jobConf.setNumReduceTasks(1); JobClient.runJob(jobConf); BasicTableOutputFormat.close(jobConf); }
/** * {@inheritDoc} * * @see org.apache.hadoop.util.Tool#run(java.lang.String[]) */ @Override public int run(String[] args) throws Exception { JobConf configuration = new JobConf(getConf(), WordCountExtended.class); configuration.setJobName(JOB_NAME); configuration.setOutputKeyClass(Text.class); configuration.setOutputValueClass(IntWritable.class); configuration.setMapperClass(Map.class); configuration.setCombinerClass(Reduce.class); configuration.setReducerClass(Reduce.class); configuration.setInputFormat(TextInputFormat.class); configuration.setOutputFormat(TextOutputFormat.class); List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { if (JOB_SKIP_ARGUMENT.equals(args[i])) { DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration); configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true); } else { otherArgs.add(args[i]); } } FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0))); FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1))); JobClient.runJob(configuration); return 0; }
@Override public void runStep(Path outputPathLocation) throws IOException { ImmutableList<Path> pathList = ImmutableList.of( getOutputDirForStep(PartitionCrawlDBStep.class), getOutputDirForStep(GenFeedUrlsStep.class), getOutputDirForStep(GenHomepageUrlsStep.class), getOutputDirForStep(GenBlogPlatformUrlsStep.class), getOutputDirForStep(PartitionCrawlStatsStep.class), getOutputDirForStep(PartitionRedirectDataStep.class)); JobConf jobConf = new JobBuilder("BundleWriter Step", getConf()) .inputs(pathList) .inputFormat(MultiFileMergeInputFormat.class) .mapperKeyValue(IntWritable.class, Text.class) .outputKeyValue(SegmentGeneratorBundleKey.class, SegmentGeneratorItemBundle.class) .outputFormat(SequenceFileOutputFormat.class) .reducer(GenBundlesStep.class, false) .partition(MultiFileMergePartitioner.class) .numReducers(CrawlListGeneratorTask.NUM_SHARDS) .speculativeExecution(false) .output(outputPathLocation) .compressMapOutput(false) .compressor(CompressionType.BLOCK, SnappyCodec.class) .build(); LOG.info("Starting JOB"); JobClient.runJob(jobConf); LOG.info("Finsihed JOB"); }
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: Compressible"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); // JobConf conf = new JobConf(Stats.class); JobConf conf = new JobConf(Compressible.class); conf.setJobName("Compressible " + inputPath); BrushConfig.initializeConfiguration(conf); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CompressibleMapper.class); conf.setReducerClass(CompressibleReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
public static void main(String[] args) throws Exception { JobConf conf = new JobConf(AccessProcessJob.class); conf.set(nameNode, hdfsURL); conf.setJobName("AccessProcessJob"); Job job = Job.getInstance(conf, "AccessProcessJob"); new Path(outputPath).getFileSystem(conf).delete(new Path(outputPath), true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(AccessProcessMap.class); conf.setReducerClass(AccessProcessReduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); JobClient.runJob(conf); }