@Test
public void testJSONFlatten() {
  Scheme sourceScheme = new TextLine(new Fields("line"));
  Tap input = new Hfs(sourceScheme, "data/input1.json");

  Pipe assembly = new Pipe("json_flatten");

  // split each JSON line into its top-level fields
  JSONSplitter jsonSplitter =
      new JSONSplitter(new Fields("name", "age", "phones"), "name", "age", "phones");
  assembly =
      new Each(assembly, new Fields("line"), jsonSplitter, new Fields("name", "age", "phones"));

  // flatten the nested "phones" array into one tuple per phone entry
  JSONFlatten jsonFlatten =
      new JSONFlatten(new Fields("phone_number", "phone_type"), "number", "type");
  assembly =
      new Each(
          assembly,
          new Fields("phones"),
          jsonFlatten,
          new Fields("name", "age", "phone_number", "phone_type"));

  assembly = new Each(assembly, AssertionLevel.STRICT, new AssertNotNull());

  Tap output = new Hfs(new TextLine(), "output/flatten", true);

  FlowConnector connector = new HadoopFlowConnector();
  Flow flow = connector.connect(input, output, assembly);
  flow.complete();
}
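// For reference, a record shape that would satisfy the splitter/flatten fields above.
// The actual contents of data/input1.json are not shown here, so this is an assumed example:
//
//   {"name": "Alice", "age": 30,
//    "phones": [{"number": "555-1234", "type": "home"},
//               {"number": "555-5678", "type": "work"}]}
//
// After JSONSplitter and JSONFlatten, that single record would yield one tuple per phone:
//   ("Alice", 30, "555-1234", "home") and ("Alice", 30, "555-5678", "work").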
@Test
public void testCascadeConnector() {
  Pipe copy = new Pipe("copy");
  Properties cfg = HdpBootstrap.asProperties(CascadingHadoopSuite.configuration);

  // copy the source data into the Elasticsearch resource "cascading-hadoop/cascade-connector"
  FlowDef flow =
      new FlowDef()
          .addSource(copy, sourceTap())
          .addTailSink(copy, new EsTap("cascading-hadoop/cascade-connector"));

  FlowConnector connector = new HadoopFlowConnector(cfg);
  Flow[] flows = new Flow[] {connector.connect(flow)};

  CascadeConnector cascadeConnector = new CascadeConnector(cfg);
  cascadeConnector.connect(flows).complete();
}
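// A minimal sketch of the sourceTap() helper used above; the real test fixture is not shown
// in this snippet, so the scheme, field names, and input path below are assumptions.
private static Tap sourceTap() {
  // tab-delimited test records that the flow will index into Elasticsearch via EsTap
  Fields fields = new Fields("id", "name", "url", "picture");
  return new Hfs(new TextDelimited(fields, "\t"), "src/test/resources/artists.dat");
}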
/**
 * Create a Cascading Flow that will parse a set of mbox files and emit a tab-separated text file
 * with fields for the msgId, author, email address, etc.
 *
 * <p>Note this Flow will only run locally, since we're using the cascading.utils LocalPlatform.
 *
 * @param options Settings for the flow
 * @return Flow suitable for execution
 * @throws Exception if the flow can't be created
 */
public static Flow createFlow(ParseEmailArchivesOptions options) throws Exception {
  BasePlatform platform = new LocalPlatform(ParseEmailArchivesWorkflow.class);

  // We'll read individual file paths from the input file.
  BasePath inputPath = platform.makePath(options.getFileList());
  Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);

  Pipe emailPipe = new Pipe("emails");
  emailPipe = new Each(emailPipe, new Fields("line"), new MboxSplitterFunction());
  emailPipe = new Each(emailPipe, new ParseEmail());

  BasePath outputPath = platform.makePath(options.getOutputDir());
  TextLineScheme scheme = new TextLineScheme(false);
  Tap sinkTap = platform.makeTap(scheme, outputPath, SinkMode.REPLACE);

  FlowConnector flowConnector = platform.makeFlowConnector();
  Flow flow = flowConnector.connect(sourceTap, sinkTap, emailPipe);
  return flow;
}
@SuppressWarnings("rawtypes") public void run() { TextLine scheme = new TextLine(new Fields("line")); // Tap input = inputPath.matches("^[^:]+://.*") ? new Hfs(scheme, inputPath) : new Lfs(scheme, // inputPath); Tap input = new Hfs(scheme, inputPath); // extract the tags through regex and save content in group 1 -> as fields tags String tagJsonRegex = "\"tags\":\\[([^\\]]*)"; Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1}); // for each line get the tags using a regex Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS); // split "tags" into "tag" Function split = new RegexSplitGenerator(new Fields("tag"), ","); assembly = new Each(assembly, new Fields("tags"), split); assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+")); // group each tag by name assembly = new GroupBy(assembly, new Fields("tag")); // count each tag under "count" field Aggregator count = new Count(new Fields("count")); assembly = new Every(assembly, count); // create a SINK tap to write to the default filesystem // by default, TextLine writes all fields out new TextLine(new Fields("tag", "count")); // Tap output = outputPath.matches("^[^:]+://.*") ? new Hfs(sinkScheme, outputPath, // SinkMode.REPLACE) : new Lfs( // sinkScheme, outputPath, SinkMode.REPLACE); Tap output = new Lfs(scheme, outputPath, SinkMode.REPLACE); // wire the existing Hadoop config into HadoopFlow Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration); FlowConnector flowConnector = new HadoopFlowConnector(properties); FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE); Flow flow = flowConnector.connect("hashtagcount", input, output, assembly); flow.start(); flow.complete(); }
/** Run a simple flow that reads mainframe copybook data and writes it out as CSV. */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("first argument is mainframe data file, second is output dir");
  }

  Properties properties = new Properties();
  AppProps.addApplicationTag(properties, "tutorials");
  AppProps.addApplicationTag(properties, "cluster:development");
  AppProps.setApplicationName(properties, "Cascading-Copybook Hadoop");

  String inPath = args[0];
  String outPath = args[1];

  Pipe copyPipe = new Pipe("testPipe");

  // Source turns the mainframe data into a flat Tuple
  Scheme sourceScheme = new CustdatScheme();
  Tap inTap = new Hfs(sourceScheme, inPath);

  // Tuples are written to a CSV file
  Scheme sinkScheme = new TextDelimited(new Custdat(), false, ",");
  Tap outTap = new Hfs(sinkScheme, outPath, SinkMode.REPLACE);

  FlowDef flowDef =
      FlowDef.flowDef()
          .addSource(copyPipe, inTap)
          .addTailSink(copyPipe, outTap)
          .setDebugLevel(DebugLevel.VERBOSE)
          .setName("Cascading Cobol");

  AppProps.setApplicationJarClass(properties, Main.class);
  FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
  flowConnector.connect(flowDef).complete();
}
public static void main(String[] args) {
  String dataset = args[0];
  String outPath = args[1];

  Properties properties = new Properties();
  properties.setProperty("mapred.max.map.failures.percent", "5");
  AppProps.setApplicationJarClass(properties, Main.class);
  FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);

  Fields fmFields =
      new Fields(
          "loanid",
          "monthly_period",
          "servicer_name",
          "curr_interest_rate",
          "unpaid_principal_balance",
          "loan_age",
          "remaining_months_legal_maturity",
          "adjusted_remaining_months_maturity",
          "maturity_date",
          "metropolitan_statistical_area",
          "current_loan_delinq_status",
          "mod_flag",
          "zero_bal_code",
          "zero_bal_effective_date",
          "repurchase_indicator");

  // create the source tap
  Tap inTap = new Hfs(new TextDelimited(fmFields, false, "|"), dataset);

  // create the sink tap
  Tap outTap =
      new Hfs(
          new TextDelimited(
              new Fields("monthly_period", "avg-unpaid_principal_balance"), false, "\t"),
          outPath);

  // specify a new Pipe
  Pipe copyPipe = new Pipe("unique copy");

  // deal with duplicates in the data set
  Pipe mypipe =
      new Unique(
          copyPipe,
          new Fields("loanid", "monthly_period", "unpaid_principal_balance"),
          Unique.Include.NO_NULLS);

  Fields monthArguments = new Fields("loanid", "monthly_period", "unpaid_principal_balance");
  // mypipe = new Each(mypipe, monthArguments, new getMonth(monthArguments), Fields.RESULTS);

  // discard null and non-numeric balances so the average is computed over valid values only
  Filter filter = new RegexFilter("(\\d+)\\.(\\d+)");
  mypipe = new Each(mypipe, filter);

  // return the month name [January - December], given a number [1 - 12]
  mypipe = new Each(mypipe, monthArguments, new getMonth(monthArguments));

  // group (and sort) by month
  mypipe = new GroupBy(mypipe, new Fields("monthly_period"), new Fields("monthly_period"));

  // average the unpaid principal balance per month
  Fields groupingFields = new Fields("monthly_period");
  Fields valueField = new Fields("unpaid_principal_balance");
  Fields avgField = new Fields("avg-unpaid_principal_balance");
  mypipe = new AverageBy(mypipe, groupingFields, valueField, avgField);

  // connect the taps, pipes, etc., into a flow
  FlowDef flowDef = FlowDef.flowDef().addSource(copyPipe, inTap).addTailSink(mypipe, outTap);

  // run the flow
  flowConnector.connect(flowDef).complete();
}
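// A minimal sketch of what the getMonth Function referenced above could look like; the real
// class is not shown in this snippet, so the class name, context type, and the assumption that
// "monthly_period" arrives as a plain 1-12 month number are all illustrative.
@SuppressWarnings("serial")
public static class GetMonthName extends BaseOperation<Void> implements Function<Void> {
  private static final String[] MONTHS = {
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
  };

  public GetMonthName(Fields fieldDeclaration) {
    // declares the same three fields it consumes: loanid, monthly_period, unpaid_principal_balance
    super(fieldDeclaration.size(), fieldDeclaration);
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall<Void> functionCall) {
    TupleEntry args = functionCall.getArguments();
    int month = Integer.parseInt(args.getString("monthly_period").trim());
    // pass loanid and the balance through, replacing the month number with its name
    functionCall
        .getOutputCollector()
        .add(
            new Tuple(
                args.getObject("loanid"),
                MONTHS[month - 1],
                args.getObject("unpaid_principal_balance")));
  }
}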