@Test
  public void testJSONFlatten() {
    Scheme sourceScheme = new TextLine(new Fields("line"));
    Tap input = new Hfs(sourceScheme, "data/input1.json");
    Pipe assembly = new Pipe("json_flatten");

    // split each raw JSON line into its top-level fields
    JSONSplitter jsonSplitter =
        new JSONSplitter(new Fields("name", "age", "phones"), "name", "age", "phones");
    assembly =
        new Each(assembly, new Fields("line"), jsonSplitter, new Fields("name", "age", "phones"));

    // expand each element of the "phones" array into its own tuple
    JSONFlatten jsonFlatten =
        new JSONFlatten(new Fields("phone_number", "phone_type"), "number", "type");
    assembly =
        new Each(
            assembly,
            new Fields("phones"),
            jsonFlatten,
            new Fields("name", "age", "phone_number", "phone_type"));

    // fail the flow if any resulting value is null
    assembly = new Each(assembly, AssertionLevel.STRICT, new AssertNotNull());

    Tap output = new Hfs(new TextLine(), "output/flatten", true);
    FlowConnector connector = new HadoopFlowConnector();
    Flow flow = connector.connect(input, output, assembly);
    flow.complete();
  }
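  // For reference, a hypothetical line from data/input1.json (the fixture itself is not shown):
  //   {"name":"Alice","age":30,"phones":[{"number":"555-0100","type":"home"}]}
  // JSONSplitter emits one tuple of (name, age, phones); JSONFlatten then expands each element
  // of the "phones" array into its own tuple, e.g. ("Alice", 30, "555-0100", "home").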
  @Test
  public void testCascadeConnector() {
    Pipe copy = new Pipe("copy");
    Properties cfg = HdpBootstrap.asProperties(CascadingHadoopSuite.configuration);

    FlowDef flowDef =
        new FlowDef()
            .addSource(copy, sourceTap())
            .addTailSink(copy, new EsTap("cascading-hadoop/cascade-connector"));

    FlowConnector connector = new HadoopFlowConnector(cfg);
    Flow[] flows = new Flow[] {connector.connect(flowDef)};

    CascadeConnector cascadeConnector = new CascadeConnector(cfg);
    cascadeConnector.connect(flows).complete();
  }
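  // Hypothetical sketch of the sourceTap() helper used above; the real helper lives elsewhere
  // in the test suite, so the scheme and path here are assumptions, not the actual fixture.
  private Tap sourceTap() {
    return new Hfs(new TextLine(new Fields("line")), "src/test/resources/artists.dat");
  }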
  /**
   * Create a Cascading Flow that will parse a set of mbox files and emit a tab-separated text file
   * with fields for the msgId, author, email address, etc.
   *
   * <p>Note this Flow will only run locally, since we're using the cascading.utils LocalPlatform.
   *
   * @param options Settings for the flow
   * @return Flow suitable for execution
   * @throws Exception if the flow cannot be constructed
   */
  public static Flow createFlow(ParseEmailArchivesOptions options) throws Exception {
    BasePlatform platform = new LocalPlatform(ParseEmailArchivesWorkflow.class);

    // We'll read individual file paths from the input file.
    BasePath inputPath = platform.makePath(options.getFileList());
    Tap sourceTap = platform.makeTap(platform.makeTextScheme(), inputPath);

    Pipe emailPipe = new Pipe("emails");
    emailPipe = new Each(emailPipe, new Fields("line"), new MboxSplitterFunction());
    emailPipe = new Each(emailPipe, new ParseEmail());

    BasePath outputPath = platform.makePath(options.getOutputDir());
    TextLineScheme scheme = new TextLineScheme(false);
    Tap sinkTap = platform.makeTap(scheme, outputPath, SinkMode.REPLACE);

    FlowConnector flowConnector = platform.makeFlowConnector();
    Flow flow = flowConnector.connect(sourceTap, sinkTap, emailPipe);
    return flow;
  }
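  // A minimal usage sketch; the setter names on ParseEmailArchivesOptions are assumptions
  // mirroring the getFileList()/getOutputDir() calls above.
  public static void main(String[] args) throws Exception {
    ParseEmailArchivesOptions options = new ParseEmailArchivesOptions();
    options.setFileList(args[0]); // hypothetical setter
    options.setOutputDir(args[1]); // hypothetical setter
    createFlow(options).complete();
  }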
  @SuppressWarnings("rawtypes")
  public void run() {
    TextLine scheme = new TextLine(new Fields("line"));
    // If the path may be a fully-qualified URI, a tap could be chosen accordingly, e.g.:
    // Tap input = inputPath.matches("^[^:]+://.*")
    //     ? new Hfs(scheme, inputPath) : new Lfs(scheme, inputPath);
    Tap input = new Hfs(scheme, inputPath);

    // extract the tags through regex and save content in group 1 -> as fields tags
    String tagJsonRegex = "\"tags\":\\[([^\\]]*)";
    Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1});
    // for each line get the tags using a regex
    Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS);

    // split "tags" into "tag"
    Function split = new RegexSplitGenerator(new Fields("tag"), ",");
    assembly = new Each(assembly, new Fields("tags"), split);
    assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+"));
    // group each tag by name
    assembly = new GroupBy(assembly, new Fields("tag"));
    // count each tag under "count" field
    Aggregator count = new Count(new Fields("count"));
    assembly = new Every(assembly, count);

    // create a SINK tap to write to the default filesystem;
    // by default, TextLine writes all fields out
    TextLine sinkScheme = new TextLine(new Fields("tag", "count"));
    // As with the source, a URI check could pick Hfs over Lfs, e.g.:
    // Tap output = outputPath.matches("^[^:]+://.*")
    //     ? new Hfs(sinkScheme, outputPath, SinkMode.REPLACE)
    //     : new Lfs(sinkScheme, outputPath, SinkMode.REPLACE);

    Tap output = new Lfs(sinkScheme, outputPath, SinkMode.REPLACE);

    // wire the existing Hadoop config into HadoopFlow
    Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration);

    // set the debug level before handing the properties to the connector
    FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE);
    FlowConnector flowConnector = new HadoopFlowConnector(properties);
    Flow flow = flowConnector.connect("hashtagcount", input, output, assembly);

    // complete() starts the flow and blocks until it finishes
    flow.complete();
  }
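  // A minimal wiring sketch for run(); inputPath, outputPath and hadoopConfiguration are
  // instance fields of the enclosing class, so the names below are assumptions from usage:
  //   HashtagCountJob job = new HashtagCountJob();   // hypothetical class name
  //   job.inputPath = "hdfs:///tweets/part-00000";   // hypothetical paths
  //   job.outputPath = "/tmp/hashtag-counts";
  //   job.hadoopConfiguration = new Configuration(); // org.apache.hadoop.conf.Configuration
  //   job.run();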
  /** Read mainframe records with a copybook-based scheme and write them out as delimited text. */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      throw new Exception("first argument is mainframe data file, second is output dir");
    }

    Properties properties = new Properties();

    AppProps.addApplicationTag(properties, "tutorials");
    AppProps.addApplicationTag(properties, "cluster:development");
    AppProps.setApplicationName(properties, "Cascading-Copybook Hadoop");

    String inPath = args[0];
    String outPath = args[1];

    Pipe copyPipe = new Pipe("testPipe");

    // Source turns the mainframe data into a flat Tuple
    Scheme sourceScheme = new CustdatScheme();
    Tap inTap = new Hfs(sourceScheme, inPath);

    // Tuples are written to a CSV file
    Scheme sinkScheme = new TextDelimited(new Custdat(), false, ",");
    Tap outTap = new Hfs(sinkScheme, outPath, SinkMode.REPLACE);

    FlowDef flowDef =
        FlowDef.flowDef()
            .addSource(copyPipe, inTap)
            .addTailSink(copyPipe, outTap)
            .setDebugLevel(DebugLevel.VERBOSE)
            .setName("Cascading Cobol");

    AppProps.setApplicationJarClass(properties, Main.class);
    FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
    flowConnector.connect(flowDef).complete();
  }
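  // Custdat and CustdatScheme come from the copybook tutorial and are not shown here.
  // A minimal sketch of what Custdat might look like, assuming it simply declares the
  // flattened copybook columns (field names below are illustrative, not the real layout):
  public static class Custdat extends Fields {
    public Custdat() {
      super("cust_id", "cust_name", "cust_address"); // assumed column names
    }
  }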
  public static void main(String[] args) {
    String dataset = args[0];
    String outPath = args[1];

    Properties properties = new Properties();
    properties.setProperty("mapred.max.map.failures.percent", "5");

    AppProps.setApplicationJarClass(properties, Main.class);
    FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);

    Fields fmFields =
        new Fields(
            "loanid",
            "monthly_period",
            "servicer_name",
            "curr_interest_rate",
            "unpaid_principal_balance",
            "loan_age",
            "remaining_months_legal_maturity",
            "adjusted_remaining_months_maturity",
            "maturity_date",
            "metropolitan_statistical_area",
            "current_loan_delinq_status",
            "mod_flag",
            "zero_bal_code",
            "zero_bal_effective_date",
            "repurchase_indicator");

    // create the source tap
    Tap inTap = new Hfs(new TextDelimited(fmFields, false, "|"), dataset);

    // create the sink tap
    Tap outTap =
        new Hfs(
            new TextDelimited(
                new Fields("monthly_period", "avg-unpaid_principal_balance"), false, "\t"),
            outPath);

    // specify a new Pipe
    Pipe copyPipe = new Pipe("unique copy");

    // Deal with duplicates in the data set
    Pipe mypipe =
        new Unique(
            copyPipe,
            new Fields("loanid", "monthly_period", "unpaid_principal_balance"),
            Unique.Include.NO_NULLS);

    // define "ScrubFunction" to clean up the token stream
    Fields monthArguments = new Fields("loanid", "monthly_period", "unpaid_principal_balance");
    // mypipe = new Each( mypipe, monthArguments, new getMonth( monthArguments ), Fields.RESULTS );

    // Keep only rows with a decimal balance, i.e. discard nulls before computing the average
    Filter filter = new RegexFilter("(\\d+)\\.(\\d+)");
    mypipe = new Each(mypipe, filter);

    // Return the month name [January - December], given a number [1 - 12]
    mypipe = new Each(mypipe, monthArguments, new getMonth(monthArguments));

    // Group by month
    mypipe = new GroupBy(mypipe, new Fields("monthly_period"));

    Fields groupingFields = new Fields("monthly_period");
    Fields valueField = new Fields("unpaid_principal_balance");
    Fields avgField = new Fields("avg-unpaid_principal_balance");

    mypipe = new AverageBy(mypipe, groupingFields, valueField, avgField);

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef().addSource(copyPipe, inTap).addTailSink(mypipe, outTap);

    // run the flow
    flowConnector.connect(flowDef).complete();
  }
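  // The getMonth Function used above is project-specific and not shown in this example.
  // A minimal sketch of what it might look like, assuming "monthly_period" is laid out as
  // "MM/YYYY" (an assumption; the real Fannie Mae format may differ). It replaces the numeric
  // month with its name while passing loanid and the balance through. Uses
  // cascading.operation.BaseOperation/Function and java.text.DateFormatSymbols.
  public static class getMonth extends BaseOperation<Void> implements Function<Void> {
    public getMonth(Fields fieldDeclaration) {
      super(3, fieldDeclaration); // expects loanid, monthly_period, unpaid_principal_balance
    }

    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<Void> functionCall) {
      TupleEntry args = functionCall.getArguments();
      int month = Integer.parseInt(args.getString("monthly_period").substring(0, 2));
      String monthName = new DateFormatSymbols().getMonths()[month - 1]; // 1 -> January
      functionCall.getOutputCollector().add(
          new Tuple(
              args.getString("loanid"), monthName, args.getString("unpaid_principal_balance")));
    }
  }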