public static void main(String[] args) {
  String docPath = args[0];
  String wcPath = args[1];

  Properties properties = new Properties();
  AppProps.setApplicationJarClass(properties, Main.class);
  HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

  // create source and sink taps
  Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
  Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);

  // specify a regex operation to split the "document" text lines into a token stream
  Fields token = new Fields("token");
  Fields text = new Fields("text");
  RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
  // only returns "token"
  Pipe docPipe = new Each("token", text, splitter, Fields.RESULTS);

  // determine the word counts
  Pipe wcPipe = new Pipe("wc", docPipe);
  wcPipe = new GroupBy(wcPipe, token);
  wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

  // connect the taps, pipes, etc., into a flow
  FlowDef flowDef = FlowDef.flowDef()
      .setName("wc")
      .addSource(docPipe, docTap)
      .addTailSink(wcPipe, wcTap);

  // write a DOT file and run the flow
  Flow wcFlow = flowConnector.connect(flowDef);
  wcFlow.writeDOT("dot/wc.dot");
  wcFlow.complete();
}
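// The same word-count assembly can also be planned without a Hadoop cluster by using
// Cascading's local platform. The sketch below is a hedged variant, not part of the
// original example: it swaps Hfs/HadoopFlowConnector for FileTap/LocalFlowConnector,
// reuses the field names and regex from the flow above, and the class name is made up.
import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.local.LocalFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.local.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.local.FileTap;
import cascading.tuple.Fields;

public class LocalWordCount { // hypothetical class name

  public static void main(String[] args) {
    String docPath = args[0];
    String wcPath = args[1];

    // local-mode taps: plain files on the local filesystem, tab-delimited with headers
    Tap docTap = new FileTap(new TextDelimited(true, "\t"), docPath);
    Tap wcTap = new FileTap(new TextDelimited(true, "\t"), wcPath);

    // same tokenizer and count assembly as the Hadoop version above
    Fields token = new Fields("token");
    Fields text = new Fields("text");
    RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
    Pipe docPipe = new Each("token", text, splitter, Fields.RESULTS);

    Pipe wcPipe = new Pipe("wc", docPipe);
    wcPipe = new GroupBy(wcPipe, token);
    wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

    FlowDef flowDef = FlowDef.flowDef()
        .setName("wc-local")
        .addSource(docPipe, docTap)
        .addTailSink(wcPipe, wcTap);

    // plan against the in-process local platform instead of MapReduce
    Flow wcFlow = new LocalFlowConnector(new Properties()).connect(flowDef);
    wcFlow.complete();
  }
}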
@Test
public void testWriteToES() throws Exception {
  Tap in = sourceTap();
  Tap out = new EsTap("cascading-hadoop/artists", new Fields("name", "url", "picture"));
  Pipe pipe = new Pipe("copy");

  FlowDef flowDef = FlowDef.flowDef().addSource(pipe, in).addTailSink(pipe, out);
  StatsUtils.proxy(
          new HadoopFlowConnector(HdpBootstrap.asProperties(CascadingHadoopSuite.configuration))
              .connect(flowDef))
      .complete();
}
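// The test above calls a sourceTap() helper that is not shown here. A plausible sketch of
// such a helper, as a method on the same test class, is given below; the input path and
// field list are illustrative assumptions, not the actual test fixture.
private Tap sourceTap() {
  // a tab-delimited HDFS input whose records carry at least the fields the test
  // later routes into Elasticsearch ("name", "url", "picture")
  Fields artistFields = new Fields("id", "name", "url", "picture");
  return new Hfs(new TextDelimited(artistFields), "/tmp/cascading-hadoop/artists.dat");
}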
/** Reads mainframe (copybook) data from HDFS and writes it back out as CSV. */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("first argument is mainframe data file, second is output dir");
  }

  Properties properties = new Properties();
  AppProps.addApplicationTag(properties, "tutorials");
  AppProps.addApplicationTag(properties, "cluster:development");
  AppProps.setApplicationName(properties, "Cascading-Copybook Hadoop");

  String inPath = args[0];
  String outPath = args[1];

  Pipe copyPipe = new Pipe("testPipe");

  // the source turns the mainframe data into a flat Tuple
  Scheme sourceScheme = new CustdatScheme();
  Tap inTap = new Hfs(sourceScheme, inPath);

  // tuples are written to CSV
  Scheme sinkScheme = new TextDelimited(new Custdat(), false, ",");
  Tap outTap = new Hfs(sinkScheme, outPath, SinkMode.REPLACE);

  FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap)
      .setDebugLevel(DebugLevel.VERBOSE)
      .setName("Cascading Cobol");

  AppProps.setApplicationJarClass(properties, Main.class);
  FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
  flowConnector.connect(flowDef).complete();
}
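// Note: the FlowDef above sets DebugLevel.VERBOSE, but the assembly contains no Debug
// operation, so nothing extra is logged. A minimal, optional sketch (the same pattern as
// the TF-IDF example further down) would attach one to copyPipe before the FlowDef is built:
copyPipe = new Each(copyPipe, DebugLevel.VERBOSE, new Debug(true)); // the planner strips this when the debug level is NONE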
public static void main(String[] args) {
  String dataset = args[0];
  String outPath = args[1];

  Properties properties = new Properties();
  properties.setProperty("mapred.max.map.failures.percent", "5");
  AppProps.setApplicationJarClass(properties, Main.class);
  FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);

  Fields fmFields =
      new Fields(
          "loanid",
          "monthly_period",
          "servicer_name",
          "curr_interest_rate",
          "unpaid_principal_balance",
          "loan_age",
          "remaining_months_legal_maturity",
          "adjusted_remaining_months_maturity",
          "maturity_date",
          "metropolitan_statistical_area",
          "current_loan_delinq_status",
          "mod_flag",
          "zero_bal_code",
          "zero_bal_effective_date",
          "repurchase_indicator");

  // create the source tap
  Tap inTap = new Hfs(new TextDelimited(fmFields, false, "|"), dataset);

  // create the sink tap
  Tap outTap =
      new Hfs(
          new TextDelimited(
              new Fields("monthly_period", "avg-unpaid_principal_balance"), false, "\t"),
          outPath);

  // specify a new Pipe
  Pipe copyPipe = new Pipe("unique copy");

  // deal with duplicates in the data set
  Pipe mypipe =
      new Unique(
          copyPipe,
          new Fields("loanid", "monthly_period", "unpaid_principal_balance"),
          Unique.Include.NO_NULLS);

  // arguments passed to the month-conversion function
  Fields monthArguments = new Fields("loanid", "monthly_period", "unpaid_principal_balance");
  // mypipe = new Each( mypipe, monthArguments, new getMonth( monthArguments ), Fields.RESULTS );

  // keep only rows with a numeric balance, i.e. discard nulls before calculating the average
  Filter filter = new RegexFilter("(\\d+)\\.(\\d+)");
  mypipe = new Each(mypipe, filter);

  // return the month in [January - December], given a number [1 - 12]
  mypipe = new Each(mypipe, monthArguments, new getMonth(monthArguments));

  // group by month
  mypipe = new GroupBy(mypipe, new Fields("monthly_period"), new Fields("monthly_period"));

  Fields groupingFields = new Fields("monthly_period");
  Fields valueField = new Fields("unpaid_principal_balance");
  Fields avgField = new Fields("avg-unpaid_principal_balance");
  mypipe = new AverageBy(mypipe, groupingFields, valueField, avgField);

  // connect the taps, pipes, etc., into a flow
  FlowDef flowDef = FlowDef.flowDef().addSource(copyPipe, inTap).addTailSink(mypipe, outTap);

  // run the flow
  flowConnector.connect(flowDef).complete();
}
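// The getMonth operation used above is referenced but never defined. Below is a minimal
// sketch of what such a Function could look like; the assumption that "monthly_period"
// starts with a 1-12 month number (e.g. "01/2000") and that the other two arguments pass
// through unchanged is illustrative, not taken from the original code.
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

public class getMonth extends BaseOperation implements Function {

  private static final String[] MONTHS = {
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
  };

  public getMonth(Fields fieldDeclaration) {
    super(3, fieldDeclaration); // expects loanid, monthly_period, unpaid_principal_balance
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    TupleEntry arguments = functionCall.getArguments();

    String loanId = arguments.getString("loanid");
    String monthlyPeriod = arguments.getString("monthly_period");
    String balance = arguments.getString("unpaid_principal_balance");

    // assumed layout "MM/yyyy": map the leading 1-12 value to its month name
    int month = Integer.parseInt(monthlyPeriod.split("/")[0]);
    String monthName = MONTHS[month - 1];

    functionCall.getOutputCollector().add(new Tuple(loanId, monthName, balance));
  }
}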
public static void main(String[] args) {
  String docPath = args[0];
  String wcPath = args[1];
  String stopPath = args[2];
  String tfidfPath = args[3];
  String trapPath = args[4];
  String checkPath = args[5];

  Properties properties = new Properties();
  AppProps.setApplicationJarClass(properties, Main.class);
  HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

  // create source and sink taps
  Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
  Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);
  Fields stop = new Fields("stop");
  Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
  Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);
  Tap trapTap = new Hfs(new TextDelimited(true, "\t"), trapPath);
  Tap checkTap = new Hfs(new TextDelimited(true, "\t"), checkPath);

  // use a stream assertion to validate the input data
  Pipe docPipe = new Pipe("token");
  AssertMatches assertMatches = new AssertMatches("doc\\d+\\s.*");
  docPipe = new Each(docPipe, AssertionLevel.STRICT, assertMatches);

  // specify a regex operation to split the "document" text lines into a token stream
  Fields token = new Fields("token");
  Fields text = new Fields("text");
  RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
  Fields fieldSelector = new Fields("doc_id", "token");
  docPipe = new Each(docPipe, text, splitter, fieldSelector);

  // define "ScrubFunction" to clean up the token stream
  Fields scrubArguments = new Fields("doc_id", "token");
  docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);

  // perform a left join to remove stop words, discarding the rows
  // which joined with stop words, i.e., were non-null after left join
  Pipe stopPipe = new Pipe("stop");
  Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
  tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
  tokenPipe = new Retain(tokenPipe, fieldSelector);

  // one branch of the flow tallies the token counts for term frequency (TF)
  Pipe tfPipe = new Pipe("TF", tokenPipe);
  Fields tf_count = new Fields("tf_count");
  tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);
  Fields tf_token = new Fields("tf_token");
  tfPipe = new Rename(tfPipe, token, tf_token);

  // one branch counts the number of documents (D)
  Fields doc_id = new Fields("doc_id");
  Fields tally = new Fields("tally");
  Fields rhs_join = new Fields("rhs_join");
  Fields n_docs = new Fields("n_docs");
  Pipe dPipe = new Unique("D", tokenPipe, doc_id);
  dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
  dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
  dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);

  // one branch tallies the token counts for document frequency (DF)
  Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
  Fields df_count = new Fields("df_count");
  dfPipe = new CountBy(dfPipe, token, df_count);
  Fields df_token = new Fields("df_token");
  Fields lhs_join = new Fields("lhs_join");
  dfPipe = new Rename(dfPipe, token, df_token);
  dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);

  // example use of a debug, to observe the tuple stream; turn off below
  dfPipe = new Each(dfPipe, DebugLevel.VERBOSE, new Debug(true));

  // join to bring together all the components for calculating TF-IDF
  // the D side of the join is smaller, so it goes on the RHS
  Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);

  // create a checkpoint, to observe the intermediate data in the DF stream
  Checkpoint idfCheck = new Checkpoint("checkpoint", idfPipe);

  // the IDF side of the join is smaller, so it goes on the RHS
  Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfCheck, df_token);

  // calculate the TF-IDF weights, per token, per document
  Fields tfidf = new Fields("tfidf");
  String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
  ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
  Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
  tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);
  fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
  tfidfPipe = new Retain(tfidfPipe, fieldSelector);
  tfidfPipe = new Rename(tfidfPipe, tf_token, token);

  // keep track of the word counts, which are useful for QA
  Pipe wcPipe = new Pipe("wc", tfPipe);
  Fields count = new Fields("count");
  wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
  wcPipe = new Rename(wcPipe, tf_token, token);

  // additionally, sort by count
  wcPipe = new GroupBy(wcPipe, count, count);

  // connect the taps, pipes, traps, checkpoints, etc., into a flow
  FlowDef flowDef = FlowDef.flowDef()
      .setName("tfidf")
      .addSource(docPipe, docTap)
      .addSource(stopPipe, stopTap)
      .addTailSink(tfidfPipe, tfidfTap)
      .addTailSink(wcPipe, wcTap)
      .addTrap(docPipe, trapTap)
      .addCheckpoint(idfCheck, checkTap);

  // set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
  flowDef.setDebugLevel(DebugLevel.VERBOSE);

  // set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
  flowDef.setAssertionLevel(AssertionLevel.STRICT);

  // write a DOT file and run the flow
  Flow tfidfFlow = flowConnector.connect(flowDef);
  tfidfFlow.writeDOT("dot/tfidf.dot");
  tfidfFlow.complete();
}
@Override
HadoopFlow createFlow() throws IOException {
  // copy flowDef
  FlowDef def = FlowDef.flowDef();

  if (flowDef != null) {
    def.addSinks(flowDef.getSinksCopy())
        .addSources(flowDef.getSourcesCopy())
        .addTraps(flowDef.getTrapsCopy())
        .addTails(flowDef.getTailsArray())
        .setAssertionLevel(flowDef.getAssertionLevel())
        .setDebugLevel(flowDef.getDebugLevel())
        .addCheckpoints(flowDef.getCheckpointsCopy())
        .addTags(flowDef.getTags())
        .setName(flowDef.getName());
  }

  Set<Pipe> heads = new LinkedHashSet<Pipe>();

  if (tails != null) {
    for (Pipe pipe : tails) {
      Collections.addAll(heads, pipe.getHeads());
    }
  }

  Pipe pipe = null;

  if (heads.size() == 1) {
    pipe = heads.iterator().next();
  }

  if (sources != null && sources.size() == 1) {
    Tap tap = sources.remove(MARKER);
    if (tap != null) {
      sources.put(pipe.getName(), tap);
    }
  }

  if (sinks != null && sinks.size() == 1) {
    Tap tap = sinks.remove(MARKER);
    if (tap != null) {
      sinks.put(pipe.getName(), tap);
    }
  }

  def.addSources(sources).addSinks(sinks).addTraps(traps);

  if (tails != null) {
    def.addTails(tails);
  }

  if (StringUtils.hasText(beanName)) {
    def.addTag(beanName);

    if (!StringUtils.hasText(def.getName())) {
      def.setName(beanName);
    }
  }

  Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
  Properties props = ConfigurationUtils.asProperties(cfg);

  if (jarSetup) {
    if (jar != null) {
      AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
    } else if (jarClass != null) {
      AppProps.setApplicationJarClass(props, jarClass);
    } else {
      // auto-detection based on the classpath
      ClassLoader cascadingCL = Cascade.class.getClassLoader();
      Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
      Resource cascadingHadoop =
          ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
      // find jgrapht
      Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

      Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
      Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
      Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar");

      if (log.isDebugEnabled()) {
        log.debug(
            "Auto-detecting Cascading Libs ["
                + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht})
                + "]");
      }

      ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

      // config changed, reinit properties
      props = ConfigurationUtils.asProperties(cfg);
    }
  }

  if (jobPoolingInterval != null) {
    FlowProps.setJobPollingInterval(props, jobPoolingInterval);
  }

  if (maxConcurrentSteps != null) {
    FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
  }

  HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

  return flow;
}