Example #1
0
  /**
   * Instantiates a new DistCp.
   *
   * @param configuration the hadoop configuration
   * @param user the user
   */
  public DistCp(Configuration configuration, String user) {
    Assert.notNull(configuration, "configuration required");
    this.configuration = ConfigurationUtils.createFrom(configuration, null);
    // disable GenericOptionsParser
    this.configuration.setBoolean("mapred.used.genericoptionsparser", true);
    this.configuration.setBoolean("mapreduce.client.genericoptionsparser.used", true);

    this.user = user;
  }
  @SuppressWarnings("rawtypes")
  public void run() {
    TextLine scheme = new TextLine(new Fields("line"));
    // Tap input = inputPath.matches("^[^:]+://.*") ? new Hfs(scheme, inputPath) : new Lfs(scheme,
    // inputPath);
    Tap input = new Hfs(scheme, inputPath);

    // extract the tags through regex and save content in group 1 -> as fields tags
    String tagJsonRegex = "\"tags\":\\[([^\\]]*)";
    Function parse = new RegexParser(new Fields("tags"), tagJsonRegex, new int[] {1});
    // for each line get the tags using a regex
    Pipe assembly = new Each("import", new Fields("line"), parse, Fields.RESULTS);

    // split "tags" into "tag"
    Function split = new RegexSplitGenerator(new Fields("tag"), ",");
    assembly = new Each(assembly, new Fields("tags"), split);
    assembly = new Each(assembly, new Fields("tag"), new RegexFilter(".+"));
    // group each tag by name
    assembly = new GroupBy(assembly, new Fields("tag"));
    // count each tag under "count" field
    Aggregator count = new Count(new Fields("count"));
    assembly = new Every(assembly, count);

    // create a SINK tap to write to the default filesystem
    // by default, TextLine writes all fields out
    new TextLine(new Fields("tag", "count"));
    //		Tap output = outputPath.matches("^[^:]+://.*") ? new Hfs(sinkScheme, outputPath,
    // SinkMode.REPLACE) : new Lfs(
    //				sinkScheme, outputPath, SinkMode.REPLACE);

    Tap output = new Lfs(scheme, outputPath, SinkMode.REPLACE);

    // wire the existing Hadoop config into HadoopFlow
    Properties properties = ConfigurationUtils.asProperties(hadoopConfiguration);

    FlowConnector flowConnector = new HadoopFlowConnector(properties);
    FlowConnectorProps.setDebugLevel(properties, DebugLevel.VERBOSE);
    Flow flow = flowConnector.connect("hashtagcount", input, output, assembly);

    flow.start();
    flow.complete();
  }
  @SuppressWarnings("rawtypes")
  public void afterPropertiesSet() throws Exception {
    final Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);

    buildGenericOptions(cfg);

    if (StringUtils.hasText(user)) {
      UserGroupInformation ugi =
          UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
      ugi.doAs(
          new PrivilegedExceptionAction<Void>() {

            @Override
            public Void run() throws Exception {
              job = new Job(cfg);
              return null;
            }
          });
    } else {
      job = new Job(cfg);
    }

    ClassLoader loader =
        (beanClassLoader != null
            ? beanClassLoader
            : org.springframework.util.ClassUtils.getDefaultClassLoader());

    if (jar != null) {
      JobConf conf = (JobConf) job.getConfiguration();
      conf.setJar(jar.getURI().toString());
      loader = ExecutionUtils.createParentLastClassLoader(jar, beanClassLoader, cfg);
      conf.setClassLoader(loader);
    }

    // set first to enable auto-detection of K/V to skip the key/value types to be specified
    if (mapper != null) {
      Class<? extends Mapper> mapperClass = resolveClass(mapper, loader, Mapper.class);
      job.setMapperClass(mapperClass);
      configureMapperTypesIfPossible(job, mapperClass);
    }

    if (reducer != null) {
      Class<? extends Reducer> reducerClass = resolveClass(reducer, loader, Reducer.class);
      job.setReducerClass(reducerClass);
      configureReducerTypesIfPossible(job, reducerClass);
    }

    if (StringUtils.hasText(name)) {
      job.setJobName(name);
    }
    if (combiner != null) {
      job.setCombinerClass(resolveClass(combiner, loader, Reducer.class));
    }
    if (groupingComparator != null) {
      job.setGroupingComparatorClass(resolveClass(groupingComparator, loader, RawComparator.class));
    }
    if (inputFormat != null) {
      job.setInputFormatClass(resolveClass(inputFormat, loader, InputFormat.class));
    }
    if (mapKey != null) {
      job.setMapOutputKeyClass(resolveClass(mapKey, loader, Object.class));
    }
    if (mapValue != null) {
      job.setMapOutputValueClass(resolveClass(mapValue, loader, Object.class));
    }
    if (numReduceTasks != null) {
      job.setNumReduceTasks(numReduceTasks);
    }
    if (key != null) {
      job.setOutputKeyClass(resolveClass(key, loader, Object.class));
    }
    if (value != null) {
      job.setOutputValueClass(resolveClass(value, loader, Object.class));
    }
    if (outputFormat != null) {
      job.setOutputFormatClass(resolveClass(outputFormat, loader, OutputFormat.class));
    }
    if (partitioner != null) {
      job.setPartitionerClass(resolveClass(partitioner, loader, Partitioner.class));
    }
    if (sortComparator != null) {
      job.setSortComparatorClass(resolveClass(sortComparator, loader, RawComparator.class));
    }
    if (StringUtils.hasText(workingDir)) {
      job.setWorkingDirectory(new Path(workingDir));
    }
    if (jarClass != null) {
      job.setJarByClass(jarClass);
    }

    if (!CollectionUtils.isEmpty(inputPaths)) {
      for (String path : inputPaths) {
        FileInputFormat.addInputPath(job, new Path(path));
      }
    }

    if (StringUtils.hasText(outputPath)) {
      FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    if (compressOutput != null) {
      FileOutputFormat.setCompressOutput(job, compressOutput);
    }

    if (codecClass != null) {
      FileOutputFormat.setOutputCompressorClass(
          job, resolveClass(codecClass, loader, CompressionCodec.class));
    }

    processJob(job);
  }
 @CliCommand(
     value = {PREFIX + "props list"},
     help = "Returns (all) the Hadoop properties")
 public String listProps() {
   return ConfigurationUtils.asProperties(hadoopConfiguration).toString();
 }
  @Override
  HadoopFlow createFlow() throws IOException {
    // copy flowDef
    FlowDef def = FlowDef.flowDef();

    if (flowDef != null) {
      def.addSinks(flowDef.getSinksCopy())
          .addSources(flowDef.getSourcesCopy())
          .addTraps(flowDef.getTrapsCopy())
          .addTails(flowDef.getTailsArray())
          .setAssertionLevel(flowDef.getAssertionLevel())
          .setDebugLevel(flowDef.getDebugLevel())
          .addCheckpoints(flowDef.getCheckpointsCopy())
          .addTags(flowDef.getTags())
          .setName(flowDef.getName());
    }

    Set<Pipe> heads = new LinkedHashSet<Pipe>();

    if (tails != null) {
      for (Pipe pipe : tails) {
        Collections.addAll(heads, pipe.getHeads());
      }
    }

    Pipe pipe = null;

    if (heads.size() == 1) {
      pipe = heads.iterator().next();
    }

    if (sources != null && sources.size() == 1) {
      Tap tap = sources.remove(MARKER);
      if (tap != null) {
        sources.put(pipe.getName(), tap);
      }
    }

    if (sinks != null && sinks.size() == 1) {
      Tap tap = sinks.remove(MARKER);
      if (tap != null) {
        sinks.put(pipe.getName(), tap);
      }
    }

    def.addSources(sources).addSinks(sinks).addTraps(traps);

    if (tails != null) {
      def.addTails(tails);
    }

    if (StringUtils.hasText(beanName)) {
      def.addTag(beanName);

      if (!StringUtils.hasText(def.getName())) {
        def.setName(beanName);
      }
    }

    Configuration cfg = ConfigurationUtils.createFrom(configuration, properties);
    Properties props = ConfigurationUtils.asProperties(cfg);

    if (jarSetup) {
      if (jar != null) {
        AppProps.setApplicationJarPath(props, ResourceUtils.decode(jar.getURI().toString()));
      } else if (jarClass != null) {
        AppProps.setApplicationJarClass(props, jarClass);
      } else {
        // auto-detection based on the classpath
        ClassLoader cascadingCL = Cascade.class.getClassLoader();
        Resource cascadingCore = ResourceUtils.findContainingJar(Cascade.class);
        Resource cascadingHadoop =
            ResourceUtils.findContainingJar(cascadingCL, "cascading/flow/hadoop/HadoopFlow.class");
        // find jgrapht
        Resource jgrapht = ResourceUtils.findContainingJar(cascadingCL, "org/jgrapht/Graph.class");

        Assert.notNull(cascadingCore, "Cannot find cascading-core.jar");
        Assert.notNull(cascadingHadoop, "Cannot find cascading-hadoop.jar");
        Assert.notNull(jgrapht, "Cannot find jgraphts-jdk.jar");

        if (log.isDebugEnabled()) {
          log.debug(
              "Auto-detecting Cascading Libs ["
                  + Arrays.toString(new Resource[] {cascadingCore, cascadingHadoop, jgrapht})
                  + "]");
        }

        ConfigurationUtils.addLibs(cfg, cascadingCore, cascadingHadoop, jgrapht);

        // config changed, reinit properties
        props = ConfigurationUtils.asProperties(cfg);
      }
    }

    if (jobPoolingInterval != null) {
      FlowProps.setJobPollingInterval(props, jobPoolingInterval);
    }

    if (maxConcurrentSteps != null) {
      FlowProps.setMaxConcurrentSteps(props, maxConcurrentSteps);
    }

    HadoopFlow flow = (HadoopFlow) new HadoopFlowConnector(props).connect(def);

    return flow;
  }