Code Example #1
A Cascading flow that counts tag occurrences: it extracts the "tags" JSON array from each input line with a regex, splits it into individual tags, groups and counts them, and passes the existing Hadoop configuration to the HadoopFlowConnector via ConfigurationUtils.asProperties.
  @SuppressWarnings("rawtypes")
  public void run() {
    TextLine scheme = new TextLine(new Fields("line"));
    // use Hfs for paths with an explicit scheme (e.g. hdfs://), Lfs otherwise:
    // Tap input = inputPath.matches("^[^:]+://.*")
    //     ? new Hfs(scheme, inputPath) : new Lfs(scheme, inputPath);
    Tap input = new Hfs(scheme, inputPath);

    // extract the tags through regex and save content in group 1 -> as fields tags
    String tagJsonRegex = "\"tags\":\\[([^\\]]*)";
    Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1});
    // for each line get the tags using a regex
    Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS);

    // split "tags" into "tag"
    Function split = new RegexSplitGenerator(new Fields("tag"), ",");
    assembly = new Each(assembly, new Fields("tags"), split);
    assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+"));
    // group each tag by name
    assembly = new GroupBy(assembly, new Fields("tag"));
    // count each tag under "count" field
    Aggregator count = new Count(new Fields("count"));
    assembly = new Every(assembly, count);

    // create a SINK tap to write to the default filesystem;
    // by default, TextLine writes all fields out
    TextLine sinkScheme = new TextLine(new Fields("tag", "count"));
    // Tap output = outputPath.matches("^[^:]+://.*")
    //     ? new Hfs(sinkScheme, outputPath, SinkMode.REPLACE)
    //     : new Lfs(sinkScheme, outputPath, SinkMode.REPLACE);
    Tap output = new Lfs(sinkScheme, outputPath, SinkMode.REPLACE);

    // wire the existing Hadoop config into HadoopFlow
    Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration);

    FlowConnector flowConnector = new HadoopFlowConnector(properties);
    FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE);
    Flow flow = flowConnector.connect("hashtagcount", input, output, assembly);

    // complete() starts the flow if it has not been started and blocks until it finishes
    flow.complete();
  }
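
To see what the RegexParser step above actually captures, here is a small standalone demo using the same pattern (the sample input line is made up for illustration):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TagRegexDemo {
  public static void main(String[] args) {
    // the same pattern the flow feeds to RegexParser: capture group 1 holds
    // the raw contents of the "tags" JSON array
    Pattern tagJson = Pattern.compile("\"tags\":\\[([^\\]]*)");
    String line = "{\"id\":1,\"tags\":[\"hadoop\",\"cascading\"]}"; // sample input
    Matcher m = tagJson.matcher(line);
    if (m.find()) {
      // prints "hadoop","cascading" -- the flow then splits this on ','
      System.out.println(m.group(1));
    }
  }
}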
Code Example #2
A Spring Shell command that lists the current Hadoop configuration by converting it to Properties via ConfigurationUtils.asProperties and printing the result.
  @CliCommand(
      value = {PREFIX + "props list"},
      help = "Returns (all) the Hadoop properties")
  public String listProps() {
    return ConfigurationUtils.asProperties(hadoopConfiguration).toString();
  }
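
For reference, ConfigurationUtils.asProperties flattens a Hadoop Configuration into a plain java.util.Properties object. A minimal sketch of the idea, not the actual spring-hadoop implementation, relying on the fact that Configuration is Iterable over its key/value entries:

import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;

// Sketch only: copies the resolved entries of a Configuration into Properties.
public final class ConfigurationToProperties {

  public static Properties asProperties(Configuration configuration) {
    Properties props = new Properties();
    if (configuration != null) {
      // Configuration implements Iterable<Map.Entry<String, String>>
      for (Map.Entry<String, String> entry : configuration) {
        props.setProperty(entry.getKey(), entry.getValue());
      }
    }
    return props;
  }
}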
Code Example #3
A createFlow() factory method that copies an optional template FlowDef, resolves placeholder (MARKER) sources and sinks against the single head pipe, optionally sets up the application jar (with classpath auto-detection of the Cascading libraries), and connects a HadoopFlow from the resulting properties.
  @Override
  HadoopFlow createFlow() throws IOException {
    // start from a fresh FlowDef and copy settings from the optional template flowDef
    FlowDef def = FlowDef.flowDef();

    if (flowDef != null) {
      def.addSinks(flowDef.getSinksCopy())
          .addSources(flowDef.getSourcesCopy())
          .addTraps(flowDef.getTrapsCopy())
          .addTails(flowDef.getTailsArray())
          .setAssertionLevel(flowDef.getAssertionLevel())
          .setDebugLevel(flowDef.getDebugLevel())
          .addCheckpoints(flowDef.getCheckpointsCopy())
          .addTags(flowDef.getTags())
          .setName(flowDef.getName());
    }

    Set<Pipe> heads = new LinkedHashSet<Pipe>();

    if (tails != null) {
      for (Pipe pipe : tails) {
        Collections.addAll(heads, pipe.getHeads());
      }
    }

    // when the assembly has a single head, its name is used below to
    // resolve the MARKER placeholder for anonymous sources/sinks
    Pipe pipe = null;

    if (heads.size() == 1) {
      pipe = heads.iterator().next();
    }

    if (sources != null && sources.size() == 1) {
      Tap tap = sources.remove(MARKER);
      if (tap != null) {
        sources.put(pipe.getName(), tap);
      }
    }

    if (sinks != null && sinks.size() == 1) {
      Tap tap = sinks.remove(MARKER);
      if (tap != null) {
        sinks.put(pipe.getName(), tap);
      }
    }

    def.addSources(sources).addSinks(sinks).addTraps(traps);

    if (tails != null) {
      def.addTails(tails);
    }

    if (StringUtils.hasText(beanName)) {
      def.addTag(beanName);

      if (!StringUtils.hasText(def.getName())) {
        def.setName(beanName);
      }
    }

    Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
    Properties props = ConfigurationUtils.asProperties(cfg);

    if (jarSetup) {
      if (jar != null) {
        AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
      } else if (jarClass != null) {
        AppProps.setApplicationJarClass(props, jarClass);
      } else {
        // auto-detection based on the classpath
        ClassLoader cascadingCL = Cascade.class.getClassLoader();
        Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
        Resource cascadingHadoop =
            ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
        // find jgrapht
        Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

        Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
        Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
        Assert.notNull(jgrapht, "Cannot find jgrapht-jdk.jar");

        if (log.isDebugEnabled()) {
          log.debug(
              "Auto-detecting Cascading Libs ["
                  + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht})
                  + "]");
        }

        ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

        // config changed, reinit properties
        props = ConfigurationUtils.asProperties(cfg);
      }
    }

    if (jobPoolingInterval != null) {
      FlowProps.setJobPollingInterval(props, jobPoolingInterval);
    }

    if (maxConcurrentSteps != null) {
      FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
    }

    HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

    return flow;
  }
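
For comparison, the same connection can be made by hand against Cascading's fluent FlowDef API. A minimal sketch assuming a single source, a single sink, and an already assembled tail pipe; the method and flow names here are illustrative:

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.pipe.Pipe;
import cascading.tap.Tap;

// Illustrative only: hand-wiring a FlowDef similar to what createFlow() assembles.
public final class DirectFlowDefExample {

  @SuppressWarnings("rawtypes")
  public static Flow connect(String headName, Tap source, Pipe tail, Tap sink, Properties props) {
    FlowDef def = FlowDef.flowDef()
        .setName("direct-flow")       // assumed flow name
        .addSource(headName, source)  // bind the source to the head pipe's name
        .addTailSink(tail, sink);     // register the tail and bind its sink
    return new HadoopFlowConnector(props).connect(def);
  }
}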