/** * Instantiates a new DistCp. * * @param configuration the hadoop configuration * @param user the user */ public DistCp(Configuration configuration, String user) { Assert.notNull(configuration, "configuration required"); this.configuration = ConfigurationUtils.createFrom(configuration, null); // disable GenericOptionsParser this.configuration.setBoolean("mapred.used.genericoptionsparser", true); this.configuration.setBoolean("mapreduce.client.genericoptionsparser.used", true); this.user = user; }
@SuppressWarnings("rawtypes") public void run() { TextLine scheme = new TextLine(new Fields("line")); // Tap input = inputPath.matches("^[^:]+://.*") ? new Hfs(scheme, inputPath) : new Lfs(scheme, // inputPath); Tap input = new Hfs(scheme, inputPath); // extract the tags through regex and save content in group 1 -> as fields tags String tagJsonRegex = "\"tags\":\\[([^\\]]*)"; Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1}); // for each line get the tags using a regex Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS); // split "tags" into "tag" Function split = new RegexSplitGenerator(new Fields("tag"), ","); assembly = new Each(assembly, new Fields("tags"), split); assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+")); // group each tag by name assembly = new GroupBy(assembly, new Fields("tag")); // count each tag under "count" field Aggregator count = new Count(new Fields("count")); assembly = new Every(assembly, count); // create a SINK tap to write to the default filesystem // by default, TextLine writes all fields out new TextLine(new Fields("tag", "count")); // Tap output = outputPath.matches("^[^:]+://.*") ? new Hfs(sinkScheme, outputPath, // SinkMode.REPLACE) : new Lfs( // sinkScheme, outputPath, SinkMode.REPLACE); Tap output = new Lfs(scheme, outputPath, SinkMode.REPLACE); // wire the existing Hadoop config into HadoopFlow Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration); FlowConnector flowConnector = new HadoopFlowConnector(properties); FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE); Flow flow = flowConnector.connect("hashtagcount", input, output, assembly); flow.start(); flow.complete(); }
@SuppressWarnings("rawtypes") public void afterPropertiesSet() throws Exception { final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties); buildGenericOptions(cfg); if (StringUtils.hasText(user)) { UserGroupInformation ugi = UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser()); ugi.doAs( new PrivilegedExceptionAction<Void>() { @Override public Void run() throws Exception { job = new Job(cfg); return null; } }); } else { job = new Job(cfg); } ClassLoader loader = (beanClassLoader != null ? beanClassLoader : org.springframework.util.ClassUtils.getDefaultClassLoader()); if (jar != null) { JobConf conf = (JobConf) job.getConfiguration(); conf.setJar(jar.getURI().toString()); loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg); conf.setClassLoader(loader); } // set first to enable auto-detection of K/V to skip the key/value types to be specified if (mapper != null) { Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class); job.setMapperClass(mapperClass); configureMapperTypesIfPossible(job, mapperClass); } if (reducer != null) { Class<? 
extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class); job.setReducerClass(reducerClass); configureReducerTypesIfPossible(job, reducerClass); } if (StringUtils.hasText(name)) { job.setJobName(name); } if (combiner != null) { job.setCombinerClass(resolveClass(combiner, loader, Reducer.class)); } if (groupingComparator != null) { job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class)); } if (inputFormat != null) { job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class)); } if (mapKey != null) { job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class)); } if (mapValue != null) { job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class)); } if (numReduceTasks != null) { job.setNumReduceTasks(numReduceTasks); } if (key != null) { job.setOutputKeyClass(resolveClass(key, loader, Object.class)); } if (value != null) { job.setOutputValueClass(resolveClass(value, loader, Object.class)); } if (outputFormat != null) { job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class)); } if (partitioner != null) { job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class)); } if (sortComparator != null) { job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class)); } if (StringUtils.hasText(workingDir)) { job.setWorkingDirectory(new Path(workingDir)); } if (jarClass != null) { job.setJarByClass(jarClass); } if (!CollectionUtils.isEmpty(inputPaths)) { for (String path : inputPaths) { FileInputFormat.addInputPath(job, new Path(path)); } } if (StringUtils.hasText(outputPath)) { FileOutputFormat.setOutputPath(job, new Path(outputPath)); } if (compressOutput != null) { FileOutputFormat.setCompressOutput(job, compressOutput); } if (codecClass != null) { FileOutputFormat.setOutputCompressorClass( job, resolveClass(codecClass, loader, CompressionCodec.class)); } processJob(job); }
/**
 * CLI command that lists the Hadoop properties.
 *
 * @return the string form of the properties obtained from the current Hadoop configuration
 */
@CliCommand(
    value = {PREFIX + "props list"},
    help = "Returns (all) the Hadoop properties")
public String listProps() {
  java.util.Properties hadoopProps = ConfigurationUtils.asProperties(hadoopConfiguration);
  return hadoopProps.toString();
}
@Override HadoopFlow createFlow() throws IOException { // copy flowDef FlowDef def = FlowDef.flowDef(); if (flowDef != null) { def.addSinks(flowDef.getSinksCopy()) .addSources(flowDef.getSourcesCopy()) .addTraps(flowDef.getTrapsCopy()) .addTails(flowDef.getTailsArray()) .setAssertionLevel(flowDef.getAssertionLevel()) .setDebugLevel(flowDef.getDebugLevel()) .addCheckpoints(flowDef.getCheckpointsCopy()) .addTags(flowDef.getTags()) .setName(flowDef.getName()); } Set<Pipe> heads = new LinkedHashSet<Pipe>(); if (tails != null) { for (Pipe pipe : tails) { Collections.addAll(heads, pipe.getHeads()); } } Pipe pipe = null; if (heads.size() == 1) { pipe = heads.iterator().next(); } if (sources != null && sources.size() == 1) { Tap tap = sources.remove(MARKER); if (tap != null) { sources.put(pipe.getName(), tap); } } if (sinks != null && sinks.size() == 1) { Tap tap = sinks.remove(MARKER); if (tap != null) { sinks.put(pipe.getName(), tap); } } def.addSources(sources).addSinks(sinks).addTraps(traps); if (tails != null) { def.addTails(tails); } if (StringUtils.hasText(beanName)) { def.addTag(beanName); if (!StringUtils.hasText(def.getName())) { def.setName(beanName); } } Configuration cfg = ConfigurationUtils.createFrom(configuration, properties); Properties props = ConfigurationUtils.asProperties(cfg); if (jarSetup) { if (jar != null) { AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString())); } else if (jarClass != null) { AppProps.setApplicationJarClass(props, jarClass); } else { // auto-detection based on the classpath ClassLoader cascadingCL = Cascade.class.getClassLoader(); Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class); Resource cascadingHadoop = ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class"); // find jgrapht Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class"); Assert.notNull(cascadingCore, "Cannot find cascading-core.jar"); 
Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar"); Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar"); if (log.isDebugEnabled()) { log.debug( "Auto-detecting Cascading Libs [" + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht}) + "]"); } ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht); // config changed, reinit properties props = ConfigurationUtils.asProperties(cfg); } } if (jobPoolingInterval != null) { FlowProps.setJobPollingInterval(props, jobPoolingInterval); } if (maxConcurrentSteps != null) { FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps); } HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def); return flow; }