@SuppressWarnings("rawtypes") public void run() { TextLine scheme = new TextLine(new Fields("line")); // Tap input = inputPath.matches("^[^:]+://.*") ? new Hfs(scheme, inputPath) : new Lfs(scheme, // inputPath); Tap input = new Hfs(scheme, inputPath); // extract the tags through regex and save content in group 1 -> as fields tags String tagJsonRegex = "\"tags\":\\[([^\\]]*)"; Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1}); // for each line get the tags using a regex Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS); // split "tags" into "tag" Function split = new RegexSplitGenerator(new Fields("tag"), ","); assembly = new Each(assembly, new Fields("tags"), split); assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+")); // group each tag by name assembly = new GroupBy(assembly, new Fields("tag")); // count each tag under "count" field Aggregator count = new Count(new Fields("count")); assembly = new Every(assembly, count); // create a SINK tap to write to the default filesystem // by default, TextLine writes all fields out new TextLine(new Fields("tag", "count")); // Tap output = outputPath.matches("^[^:]+://.*") ? new Hfs(sinkScheme, outputPath, // SinkMode.REPLACE) : new Lfs( // sinkScheme, outputPath, SinkMode.REPLACE); Tap output = new Lfs(scheme, outputPath, SinkMode.REPLACE); // wire the existing Hadoop config into HadoopFlow Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration); FlowConnector flowConnector = new HadoopFlowConnector(properties); FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE); Flow flow = flowConnector.connect("hashtagcount", input, output, assembly); flow.start(); flow.complete(); }
@CliCommand(value = { PREFIX + "props list" }, help = "Returns (all) the Hadoop properties")
public String listProps() {
    return ConfigurationUtils.asProperties(hadoopConfiguration).toString();
}
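A hypothetical companion command could follow the same pattern to look up a single property instead of dumping the whole configuration; the "props get" command name and its parameter are assumptions, not part of the source:

// Hypothetical companion command, sketched under the same conventions as listProps()
@CliCommand(value = { PREFIX + "props get" }, help = "Returns the value of the given Hadoop property")
public String getProp(
        @CliOption(key = { "key" }, mandatory = true, help = "The property name") String key) {
    // Configuration.get(String) returns null if the property is not set
    return hadoopConfiguration.get(key);
}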
@Override
HadoopFlow createFlow() throws IOException {
    // copy the flowDef so the original remains untouched
    FlowDef def = FlowDef.flowDef();

    if (flowDef != null) {
        def.addSinks(flowDef.getSinksCopy())
           .addSources(flowDef.getSourcesCopy())
           .addTraps(flowDef.getTrapsCopy())
           .addTails(flowDef.getTailsArray())
           .setAssertionLevel(flowDef.getAssertionLevel())
           .setDebugLevel(flowDef.getDebugLevel())
           .addCheckpoints(flowDef.getCheckpointsCopy())
           .addTags(flowDef.getTags())
           .setName(flowDef.getName());
    }

    // collect the head pipes reachable from the tails
    Set<Pipe> heads = new LinkedHashSet<Pipe>();
    if (tails != null) {
        for (Pipe pipe : tails) {
            Collections.addAll(heads, pipe.getHeads());
        }
    }

    Pipe pipe = null;
    if (heads.size() == 1) {
        pipe = heads.iterator().next();
    }

    // a single, unnamed source is keyed by MARKER; rebind it to the head pipe's name
    if (sources != null && sources.size() == 1) {
        Tap tap = sources.remove(MARKER);
        if (tap != null) {
            sources.put(pipe.getName(), tap);
        }
    }

    // same rebinding for a single, unnamed sink
    if (sinks != null && sinks.size() == 1) {
        Tap tap = sinks.remove(MARKER);
        if (tap != null) {
            sinks.put(pipe.getName(), tap);
        }
    }

    def.addSources(sources).addSinks(sinks).addTraps(traps);

    if (tails != null) {
        def.addTails(tails);
    }

    if (StringUtils.hasText(beanName)) {
        def.addTag(beanName);
        if (!StringUtils.hasText(def.getName())) {
            def.setName(beanName);
        }
    }

    Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
    Properties props = ConfigurationUtils.asProperties(cfg);

    if (jarSetup) {
        if (jar != null) {
            AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
        }
        else if (jarClass != null) {
            AppProps.setApplicationJarClass(props, jarClass);
        }
        else {
            // auto-detection based on the classpath
            ClassLoader cascadingCL = Cascade.class.getClassLoader();
            Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
            Resource cascadingHadoop = ResourceUtils.findContainingJar(cascadingCL,
                    "cascading/flow/hadoop/HadoopFlow.class");
            // find jgrapht
            Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

            Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
            Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
            Assert.notNull(jgrapht, "Cannot find jgrapht-jdk.jar");

            if (log.isDebugEnabled()) {
                log.debug("Auto-detecting Cascading libs ["
                        + Arrays.toString(new Resource[] { cascadingCore, cascadingHadoop, jgrapht }) + "]");
            }

            ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

            // config changed, reinitialize the properties
            props = ConfigurationUtils.asProperties(cfg);
        }
    }

    if (jobPoolingInterval != null) {
        FlowProps.setJobPollingInterval(props, jobPoolingInterval);
    }

    if (maxConcurrentSteps != null) {
        FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
    }

    HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);
    return flow;
}
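To see what this factory method saves the caller, here is the direct Cascading wiring it automates, as a hedged sketch: the paths, the "copy" name, and the ManualFlowDemo class are invented for illustration.

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.hadoop.TextLine;
import cascading.tap.SinkMode;
import cascading.tap.hadoop.Hfs;

// Hand-built equivalent of the FlowDef that createFlow() assembles.
public class ManualFlowDemo {

    public static void main(String[] args) {
        Pipe copy = new Pipe("copy");

        // one source and one tail sink, bound explicitly instead of via the MARKER rebinding
        FlowDef def = FlowDef.flowDef()
                .setName("copy")
                .addSource(copy, new Hfs(new TextLine(), "/demo/input"))
                .addTailSink(copy, new Hfs(new TextLine(), "/demo/output", SinkMode.REPLACE));

        Properties props = new Properties();
        // stands in for the jar/jarClass/auto-detection handling above
        AppProps.setApplicationJarClass(props, ManualFlowDemo.class);

        Flow<?> flow = new HadoopFlowConnector(props).connect(def);
        flow.complete();   // runs the flow and blocks until all steps finish
    }
}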