Example #1
  public static void main(String[] args) {
    String docPath = args[0];
    String wcPath = args[1];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass(properties, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

    // create source and sink taps
    Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
    Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields("token");
    Fields text = new Fields("text");
    RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
    // only returns "token"
    Pipe docPipe = new Each("token", text, splitter, Fields.RESULTS);

    // determine the word counts
    Pipe wcPipe = new Pipe("wc", docPipe);
    wcPipe = new GroupBy(wcPipe, token);
    wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef =
        FlowDef.flowDef().setName("wc").addSource(docPipe, docTap).addTailSink(wcPipe, wcTap);

    // write a DOT file and run the flow
    Flow wcFlow = flowConnector.connect(flowDef);
    wcFlow.writeDOT("dot/wc.dot");
    wcFlow.complete();
  }
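The flow above only runs against a Hadoop installation. For quick experimentation the same pipe assembly can be wired to Cascading's local mode instead; the following is a minimal sketch, assuming the cascading-local artifact is on the classpath, with placeholder file paths.

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.local.LocalFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexSplitGenerator;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.local.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.local.FileTap;
import cascading.tuple.Fields;

public class LocalWordCount {
  public static void main(String[] args) {
    // local-filesystem taps instead of Hfs; the paths are placeholders
    Tap docTap = new FileTap(new TextDelimited(true, "\t"), "data/docs.tsv");
    Tap wcTap = new FileTap(new TextDelimited(true, "\t"), "output/wc.tsv");

    // same tokenizing and counting assembly as in the example above
    Fields token = new Fields("token");
    Fields text = new Fields("text");
    Pipe docPipe = new Each("token", text,
        new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]"), Fields.RESULTS);

    Pipe wcPipe = new Pipe("wc", docPipe);
    wcPipe = new GroupBy(wcPipe, token);
    wcPipe = new Every(wcPipe, Fields.ALL, new Count(), Fields.ALL);

    FlowDef flowDef = FlowDef.flowDef()
        .setName("wc-local")
        .addSource(docPipe, docTap)
        .addTailSink(wcPipe, wcTap);

    // run in-process, no Hadoop cluster required
    Flow flow = new LocalFlowConnector().connect(flowDef);
    flow.complete();
  }
}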
Example #2
  @Test
  public void testWriteToES() throws Exception {
    Tap in = sourceTap();
    Tap out = new EsTap("cascading-hadoop/artists", new Fields("name", "url", "picture"));
    Pipe pipe = new Pipe("copy");

    // build the flow: copy every tuple from the source tap into the "cascading-hadoop/artists" index/type, then run it
    FlowDef flowDef = FlowDef.flowDef().addSource(pipe, in).addTailSink(pipe, out);
    StatsUtils.proxy(
            new HadoopFlowConnector(HdpBootstrap.asProperties(CascadingHadoopSuite.configuration))
                .connect(flowDef))
        .complete();
  }
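sourceTap() is provided by the surrounding test suite and is not shown here. A hypothetical stand-in reading tab-separated artist records from HDFS could look like the following; the field names mirror the EsTap sink above, while the path is a made-up placeholder.

  private Tap sourceTap() {
    // hypothetical source: one tab-delimited artist record per line
    Fields artistFields = new Fields("name", "url", "picture");
    return new Hfs(new TextDelimited(artistFields, "\t"), "/tmp/artists.dat");
  }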
Example #3
  /** Copy mainframe (copybook) records from a source tap to a comma-delimited sink using a Hadoop flow. */
  public static void main(String[] args) throws Exception {

    if (args.length != 2) {
      throw new Exception("first argument is mainframe data file, second is output dir");
    }

    Properties properties = new Properties();

    AppProps.addApplicationTag(properties, "tutorials");
    AppProps.addApplicationTag(properties, "cluster:development");
    AppProps.setApplicationName(properties, "Cascading-Copybook Hadoop");

    String inPath = args[0];
    String outPath = args[1];

    Pipe copyPipe = new Pipe("testPipe");

    // Source turns the mainframe data into a flat Tuple
    Scheme sourceScheme = new CustdatScheme();
    Tap inTap = new Hfs(sourceScheme, inPath);

    // Tuples are written to a csv
    Scheme sinkScheme = new TextDelimited(new Custdat(), false, ",");
    Tap outTap = new Hfs(sinkScheme, outPath, SinkMode.REPLACE);

    FlowDef flowDef =
        FlowDef.flowDef()
            .addSource(copyPipe, inTap)
            .addTailSink(copyPipe, outTap)
            .setDebugLevel(DebugLevel.VERBOSE)
            .setName("Cascading Cobol");

    AppProps.setApplicationJarClass(properties, Main.class);
    FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
    flowConnector.connect(flowDef).complete();
  }
Example #4
  public static void main(String[] args) {
    String dataset = args[0];
    String outPath = args[1];

    Properties properties = new Properties();
    properties.setProperty("mapred.max.map.failures.percent", "5");

    AppProps.setApplicationJarClass(properties, Main.class);
    FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);

    Fields fmFields =
        new Fields(
            "loanid",
            "monthly_period",
            "servicer_name",
            "curr_interest_rate",
            "unpaid_principal_balance",
            "loan_age",
            "remaining_months_legal_maturity",
            "adjusted_remaining_months_maturity",
            "maturity_date",
            "metropolitan_statistical_area",
            "current_loan_delinq_status",
            "mod_flag",
            "zero_bal_code",
            "zero_bal_effective_date",
            "repurchase_indicator");

    // create the source tap
    Tap inTap = new Hfs(new TextDelimited(fmFields, false, "|"), dataset);

    // create the sink tap
    Tap outTap =
        new Hfs(
            new TextDelimited(
                new Fields("monthly_period", "avg-unpaid_principal_balance"), false, "\t"),
            outPath);

    /** Specify a new Pipe */
    Pipe copyPipe = new Pipe("unique copy");

    // Deal with duplicates in data set
    Pipe mypipe =
        new Unique(
            copyPipe,
            new Fields("loanid", "monthly_period", "unpaid_principal_balance"),
            Unique.Include.NO_NULLS);

    // define "ScrubFunction" to clean up the token stream
    Fields monthArguments = new Fields("loanid", "monthly_period", "unpaid_principal_balance");
    // mypipe = new Each( mypipe, monthArguments, new getMonth( monthArguments ), Fields.RESULTS );

    // Remove nulls, i.e., we discard nulls to calculate the average
    Filter filter = new RegexFilter("(\\d+)\\.(\\d+)");
    mypipe = new Each(mypipe, filter);

    // Return the month name [January - December], given a number [1 - 12]
    mypipe = new Each(mypipe, monthArguments, new getMonth(monthArguments));

    // Group by month (monthly_period)
    mypipe = new GroupBy(mypipe, new Fields("monthly_period"), new Fields("monthly_period"));

    Fields groupingFields = new Fields("monthly_period");
    Fields valueField = new Fields("unpaid_principal_balance");
    Fields avgField = new Fields("avg-unpaid_principal_balance");

    mypipe = new AverageBy(mypipe, groupingFields, valueField, avgField);

    /** connect the taps, pipes, etc., into a flow */
    FlowDef flowDef = FlowDef.flowDef().addSource(copyPipe, inTap).addTailSink(mypipe, outTap);

    /** run the flow */
    flowConnector.connect(flowDef).complete();
  }
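The getMonth function referenced above is a custom operation that is not part of this listing. Below is a minimal sketch of what it might look like as a Cascading Function; the "MM/YYYY" format of monthly_period and the emitted tuple layout are assumptions, not taken from the original example.

import java.text.DateFormatSymbols;
import java.util.Locale;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

// class name kept lowercase to match the usage above; Java convention would be GetMonth
public class getMonth extends BaseOperation implements Function {

  public getMonth(Fields fieldDeclaration) {
    // three incoming arguments, declaring the same fields on output
    super(3, fieldDeclaration);
  }

  @Override
  public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
    TupleEntry arguments = functionCall.getArguments();

    String loanId = arguments.getString("loanid");
    String period = arguments.getString("monthly_period"); // assumed "MM/YYYY"
    String balance = arguments.getString("unpaid_principal_balance");

    // map the numeric month [1 - 12] to its English name [January - December]
    int month = Integer.parseInt(period.split("/")[0]);
    String monthName = new DateFormatSymbols(Locale.US).getMonths()[month - 1];

    functionCall.getOutputCollector().add(new Tuple(loanId, monthName, balance));
  }
}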
Example #5
  public static void main(String[] args) {
    String docPath = args[0];
    String wcPath = args[1];
    String stopPath = args[2];
    String tfidfPath = args[3];
    String trapPath = args[4];
    String checkPath = args[5];

    Properties properties = new Properties();
    AppProps.setApplicationJarClass(properties, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

    // create source and sink taps
    Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
    Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);

    Fields stop = new Fields("stop");
    Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
    Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);

    Tap trapTap = new Hfs(new TextDelimited(true, "\t"), trapPath);
    Tap checkTap = new Hfs(new TextDelimited(true, "\t"), checkPath);

    // use a stream assertion to validate the input data
    Pipe docPipe = new Pipe("token");
    AssertMatches assertMatches = new AssertMatches("doc\\d+\\s.*");
    docPipe = new Each(docPipe, AssertionLevel.STRICT, assertMatches);

    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields("token");
    Fields text = new Fields("text");
    RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
    Fields fieldSelector = new Fields("doc_id", "token");
    docPipe = new Each(docPipe, text, splitter, fieldSelector);

    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields("doc_id", "token");
    docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);

    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe("stop");
    Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
    tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
    tokenPipe = new Retain(tokenPipe, fieldSelector);

    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe("TF", tokenPipe);
    Fields tf_count = new Fields("tf_count");
    tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);

    Fields tf_token = new Fields("tf_token");
    tfPipe = new Rename(tfPipe, token, tf_token);

    // one branch counts the number of documents (D)
    Fields doc_id = new Fields("doc_id");
    Fields tally = new Fields("tally");
    Fields rhs_join = new Fields("rhs_join");
    Fields n_docs = new Fields("n_docs");
    Pipe dPipe = new Unique("D", tokenPipe, doc_id);
    dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
    dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
    dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);

    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
    Fields df_count = new Fields("df_count");
    dfPipe = new CountBy(dfPipe, token, df_count);

    Fields df_token = new Fields("df_token");
    Fields lhs_join = new Fields("lhs_join");
    dfPipe = new Rename(dfPipe, token, df_token);
    dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);

    // example use of a debug, to observe tuple stream; turn off below
    dfPipe = new Each(dfPipe, DebugLevel.VERBOSE, new Debug(true));

    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);

    // create a checkpoint, to observe the intermediate data in DF stream
    Checkpoint idfCheck = new Checkpoint("checkpoint", idfPipe);

    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfCheck, df_token);

    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields("tfidf");
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
    Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
    tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);

    fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
    tfidfPipe = new Retain(tfidfPipe, fieldSelector);
    tfidfPipe = new Rename(tfidfPipe, tf_token, token);

    // keep track of the word counts, which are useful for QA
    Pipe wcPipe = new Pipe("wc", tfPipe);

    Fields count = new Fields("count");
    wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
    wcPipe = new Rename(wcPipe, tf_token, token);

    // additionally, sort by count
    wcPipe = new GroupBy(wcPipe, count, count);

    // connect the taps, pipes, traps, checkpoints, etc., into a flow
    FlowDef flowDef =
        FlowDef.flowDef()
            .setName("tfidf")
            .addSource(docPipe, docTap)
            .addSource(stopPipe, stopTap)
            .addTailSink(tfidfPipe, tfidfTap)
            .addTailSink(wcPipe, wcTap)
            .addTrap(docPipe, trapTap)
            .addCheckpoint(idfCheck, checkTap);

    // set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
    flowDef.setDebugLevel(DebugLevel.VERBOSE);

    // set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
    flowDef.setAssertionLevel(AssertionLevel.STRICT);

    // write a DOT file and run the flow
    Flow tfidfFlow = flowConnector.connect(flowDef);
    tfidfFlow.writeDOT("dot/tfidf.dot");
    tfidfFlow.complete();
  }
Example #6
  @Override
  HadoopFlow createFlow() throws IOException {
    // copy flowDef
    FlowDef def = FlowDef.flowDef();

    if (flowDef != null) {
      def.addSinks(flowDef.getSinksCopy())
          .addSources(flowDef.getSourcesCopy())
          .addTraps(flowDef.getTrapsCopy())
          .addTails(flowDef.getTailsArray())
          .setAssertionLevel(flowDef.getAssertionLevel())
          .setDebugLevel(flowDef.getDebugLevel())
          .addCheckpoints(flowDef.getCheckpointsCopy())
          .addTags(flowDef.getTags())
          .setName(flowDef.getName());
    }

    Set<Pipe> heads = new LinkedHashSet<Pipe>();

    if (tails != null) {
      for (Pipe pipe : tails) {
        Collections.addAll(heads, pipe.getHeads());
      }
    }

    Pipe pipe = null;

    if (heads.size() == 1) {
      pipe = heads.iterator().next();
    }

    if (sources != null && sources.size() == 1) {
      Tap tap = sources.remove(MARKER);
      if (tap != null) {
        sources.put(pipe.getName(), tap);
      }
    }

    if (sinks != null && sinks.size() == 1) {
      Tap tap = sinks.remove(MARKER);
      if (tap != null) {
        sinks.put(pipe.getName(), tap);
      }
    }

    def.addSources(sources).addSinks(sinks).addTraps(traps);

    if (tails != null) {
      def.addTails(tails);
    }

    if (StringUtils.hasText(beanName)) {
      def.addTag(beanName);

      if (!StringUtils.hasText(def.getName())) {
        def.setName(beanName);
      }
    }

    Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
    Properties props = ConfigurationUtils.asProperties(cfg);

    if (jarSetup) {
      if (jar != null) {
        AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
      } else if (jarClass != null) {
        AppProps.setApplicationJarClass(props, jarClass);
      } else {
        // auto-detection based on the classpath
        ClassLoader cascadingCL = Cascade.class.getClassLoader();
        Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
        Resource cascadingHadoop =
            ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
        // find jgrapht
        Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

        Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
        Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
        Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar");

        if (log.isDebugEnabled()) {
          log.debug(
              "Auto-detecting Cascading Libs ["
                  + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht})
                  + "]");
        }

        ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

        // config changed, reinit properties
        props = ConfigurationUtils.asProperties(cfg);
      }
    }

    if (jobPoolingInterval != null) {
      FlowProps.setJobPollingInterval(props, jobPoolingInterval);
    }

    if (maxConcurrentSteps != null) {
      FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
    }

    HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

    return flow;
  }
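createFlow() only assembles the HadoopFlow. A sketch of how a caller might run it and verify the outcome; the wrapper method below is assumed, not part of the original factory bean.

  void runFlow() throws IOException {
    HadoopFlow flow = createFlow();

    // block until the underlying MapReduce jobs finish
    flow.complete();

    if (!flow.getFlowStats().isSuccessful()) {
      throw new IllegalStateException("Flow '" + flow.getName() + "' did not complete successfully");
    }
  }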