Example #1
  public static void main(String[] args) throws Exception {
    String input = "hdfs://192.168.0.110:9000/input/access.log";
    String output = "hdfs://192.168.0.110:9000/user/hdfs/pv";

    JobConf conf = new JobConf(KPIPV.class);
    conf.setJobName("KPIPV");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPIPVMapper.class);
    conf.setCombinerClass(KPIPVReducer.class);
    conf.setReducerClass(KPIPVReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }
Example #2
  public static void main(String[] args) throws Exception {
    String input = "hdfs://centos:9000/access.log.10";
    String output = "hdfs://centos:9000/out_kpitime";

    JobConf conf = new JobConf(KPITime.class);
    conf.setJobName("KPITime");
    //        conf.addResource("classpath:/hadoop/core-site.xml");
    //        conf.addResource("classpath:/hadoop/hdfs-site.xml");
    //        conf.addResource("classpath:/hadoop/mapred-site.xml");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(KPITimeMapper.class);
    conf.setCombinerClass(KPITimeReducer.class);
    conf.setReducerClass(KPITimeReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
    System.exit(0);
  }
  /**
   * Run the job
   *
   * @param params the job parameters, containing the gramSize, the input and output folders, the
   *     defaultCat, and the encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
  }
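A hedged usage sketch, not part of the listing: assuming the Parameters class used above exposes simple set(key, value) accessors, a caller might configure and submit this job roughly as follows (all paths and values are placeholders).

  // Hypothetical invocation of the Bayes classifier driver job shown above.
  Parameters params = new Parameters();
  params.set("testDirPath", "/user/hadoop/bayes/test"); // placeholder HDFS path
  params.set("gramSize", "1");
  params.set("defaultCat", "unknown");
  params.set("encoding", "UTF-8");
  runJob(params);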
Example #4
  public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";
    // First delete the temporary output directory if it already exists on HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
      fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));
    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    //   job.setMapperClass(TokenizerMapper.class);
    //   job.setCombinerClass(IntSumReducer.class);
    //   job.setReducerClass(IntSumReducer.class);
    //   job.setOutputKeyClass(LongWritable.class);
    //   job.setOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
  }
  public int run(String[] args) throws Exception {

    if (args.length < 2) {
      printUsage();
      return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
  }
 public JobBuilder reducer(Class<? extends Reducer> reducer, boolean hasCombiner)
     throws IOException {
   if (reducer != IdentityReducer.class) _jobConf.setReducerClass(reducer);
   if (hasCombiner) _jobConf.setCombinerClass(reducer);
   _jobConf.setJarByClass(reducer);
   return this;
 }
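A hedged illustration of the fluent pattern above; the jobBuilder instance and MyReducer class are placeholders, not part of the source.

   // Registers MyReducer as both reducer and combiner on the wrapped JobConf,
   // and sets the job jar from the reducer's class.
   jobBuilder.reducer(MyReducer.class, true);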
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    // vv MaxTemperatureDriverV6
    conf.setProfileEnabled(true);
    conf.setProfileParams(
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6," + "force=n,thread=y,verbose=n,file=%s");
    conf.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(conf);
    return 0;
  }
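A hedged sketch, not shown in the source: a Tool-based driver like this one is normally launched through ToolRunner, which parses the generic options mentioned in the usage message before run() is invoked. The class name MaxTemperatureDriver is assumed here.

  public static void main(String[] args) throws Exception {
    // Parse generic options (e.g. -conf, -D) and delegate to run().
    int exitCode = ToolRunner.run(new MaxTemperatureDriver(), args);
    System.exit(exitCode);
  }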
  /**
   * Configure the job
   *
   * @param conf Job to configure
   * @param rules classification rules to evaluate
   * @param target label value to evaluate the rules for
   * @param inpath input path (the dataset)
   * @param outpath output <code>Path</code>
   * @param split DatasetSplit used to separate training and testing input
   */
  private static void configureJob(
      JobConf conf,
      List<? extends Rule> rules,
      int target,
      Path inpath,
      Path outpath,
      DatasetSplit split) {
    split.storeJobParameters(conf);

    FileInputFormat.setInputPaths(conf, inpath);
    FileOutputFormat.setOutputPath(conf, outpath);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(CDFitness.class);

    conf.setMapperClass(CDMapper.class);
    conf.setCombinerClass(CDReducer.class);
    conf.setReducerClass(CDReducer.class);

    conf.setInputFormat(DatasetTextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // store the parameters
    conf.set(CDMapper.CLASSDISCOVERY_RULES, StringUtils.toString(rules));
    conf.set(CDMapper.CLASSDISCOVERY_DATASET, StringUtils.toString(DataSet.getDataSet()));
    conf.setInt(CDMapper.CLASSDISCOVERY_TARGET_LABEL, target);
  }
  /**
   * {@inheritDoc}
   *
   * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf configuration = new JobConf(getConf(), WordCountExtended.class);
    configuration.setJobName(JOB_NAME);

    configuration.setOutputKeyClass(Text.class);
    configuration.setOutputValueClass(IntWritable.class);

    configuration.setMapperClass(Map.class);
    configuration.setCombinerClass(Reduce.class);
    configuration.setReducerClass(Reduce.class);

    configuration.setInputFormat(TextInputFormat.class);
    configuration.setOutputFormat(TextOutputFormat.class);

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      if (JOB_SKIP_ARGUMENT.equals(args[i])) {
        DistributedCache.addCacheFile(new Path(args[++i]).toUri(), configuration);
        configuration.setBoolean(JOB_PARAMETER_SKIP_PATTERNS, true);
      } else {
        otherArgs.add(args[i]);
      }
    }

    FileInputFormat.setInputPaths(configuration, new Path(otherArgs.get(0)));
    FileOutputFormat.setOutputPath(configuration, new Path(otherArgs.get(1)));

    JobClient.runJob(configuration);
    return 0;
  }
  /**
   * The main driver for the word count map/reduce program. Invoke this method to submit the map/reduce
   * job.
   *
   * @throws IOException When there are communication problems with the job tracker.
   */
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCountSeqOutput.class);
    conf.setJobName("wordcount_seqOF");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);
    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    // conf.setOutputValueClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Combiner.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputFormat(SequenceFileOutputFormat.class);

    //      // compress Mapper output
    //      conf.setCompressMapOutput(true);
    //      conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.GzipCodec.class);

    // compress final output
    conf.set("mapred.output.compress", conf.get("mapred.output.compress", "true"));
    conf.set("mapred.output.compression.type", conf.get("mapred.output.compression.type", "BLOCK"));
    conf.set(
        "mapred.output.compression.codec",
        conf.get("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"));

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
  }
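The compression settings above are written with raw property names from the old mapred configuration. A hedged equivalent using the typed helpers of the org.apache.hadoop.mapred API (with the usual org.apache.hadoop.io imports) is sketched below; unlike the conf.get(..., default) pattern above, it applies the defaults unconditionally instead of preserving values already present in the configuration.

    // Same effect as the string-based defaults above, expressed with typed setters.
    FileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.GzipCodec.class);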
Example #11
  public int run(String[] args) throws Exception {

    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    String stoplistPath = args[4];

    sLogger.info("Tool: AFormatter");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(AFormatterWG.class);
    conf.setJobName("Authority Formatter -- Web Graph");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setCompressMapOutput(true);
    conf.setSpeculativeExecution(false);
    // InputSampler.Sampler<IntWritable, Text> sampler = new
    // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
    // InputSampler.writePartitionFile(conf, sampler);
    // conf.setPartitionerClass(TotalOrderPartitioner.class);
    conf.setMapperClass(AFormatMapperIMC.class);
    conf.setCombinerClass(AFormatReducer.class);
    conf.setReducerClass(AFormatReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    Path stopList = new Path(stoplistPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    sLogger.info("Starting job");
    DistributedCache.addCacheFile(stopList.toUri(), conf);
    JobClient.runJob(conf);
    sLogger.info(
        "Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }
Example #12
  public static void main(String[] args) throws IOException {

    /*JobConf conf = new JobConf();

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(IpCounterMapper.class);
    conf.setCombinerClass(IpCounterReducer.class);
    conf.setReducerClass(IpCounterReducer.class);


    String inputDir = args[0];
    String outputDir = args[1];

    FileInputFormat.setInputPaths(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, new Path(outputDir));

    boolean flag = JobClient.runJob(conf).isSuccessful();

    System.out.println(args.length);*/

    if (args.length < 2) {
      System.out.println("args not right!");
      return;
    }

    JobConf conf = new JobConf(IpCount1.class);

    // set output key class
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // set mapper & reducer class
    conf.setMapperClass(IpCounterMapper.class);
    conf.setCombinerClass(IpCounterReducer.class);
    conf.setReducerClass(IpCounterReducer.class);

    // set format
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    String inputDir = args[0];
    String outputDir = args[1];

    // FileInputFormat.setInputPaths(conf, "/user/hadoop/rongxin/locationinput/");
    FileInputFormat.setInputPaths(conf, inputDir);
    FileOutputFormat.setOutputPath(conf, new Path(outputDir));

    boolean flag = JobClient.runJob(conf).isSuccessful();
  }
  public static void run(Map<String, String> path) throws IOException {
    JobConf conf = Recommend.config();
    conf.setJarByClass(Step1.class);

    String input = "hdfs://192.168.201.11:9000/user/hdfs/recommend/data1"; // directory of the training data set submitted for processing
    String input1 = path.get("Step1Input");
    String output = path.get("Step1Output");

    HdfsDAO hdfs = new HdfsDAO(Recommend.HDFS, conf);
    hdfs.rmr(output);
    hdfs.rmr(input1);
    hdfs.mkdirs(input1);
    //       hdfs.copyFile(path.get("data"), input);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Step1_ToUserPreMapper.class);
    conf.setCombinerClass(Step1_ToItemVectorReducer.class);
    conf.setReducerClass(Step1_ToItemVectorReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //        int maxCurrentReduceTasks = conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 1);
    //        int ReduceTasks = (int) (2 * maxCurrentReduceTasks * 0.95);
    //        conf.setNumReduceTasks(ReduceTasks);
    /*
     * The number of reducers is best set to 0.95 or 1.75 times the number of worker nodes,
     * multiplied by mapred.tasktracker.reduce.tasks.maximum (usually the number of CPUs per node).
     * With a factor of 0.95, all reduce tasks start immediately and begin copying map outputs as
     * soon as the maps finish. With a factor of 1.75, some reduce tasks start immediately while
     * the others wait; faster nodes finish their first round of reduces early and start a second
     * round, and the slowest nodes have no reduce work left in the second round, which gives
     * better load balancing. (A worked sketch of this heuristic follows this method.)
     */
    //        conf.setNumReduceTasks(3);  // testing on a VM here, so just set it to the number of nodes

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    RunningJob job = JobClient.runJob(conf);
    while (!job.isComplete()) {
      job.waitForCompletion();
    }
  }
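The commented-out lines above sketch the reducer-count heuristic described in the comment. Spelled out, under stated assumptions (a hypothetical 4-node cluster and the 0.95 factor), it would look like this; the job above does not actually set it.

    // Reducer-count heuristic: factor (0.95 or 1.75) * worker nodes * reduce slots per node.
    int slotsPerNode = conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 1);
    int workerNodes = 4; // hypothetical cluster size
    int reduceTasks = (int) (0.95 * workerNodes * slotsPerNode);
    conf.setNumReduceTasks(reduceTasks); // e.g. 0.95 * 4 * 2 = 7.6 -> 7 reducers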
  // Main function
  public static void main(String[] args) throws Exception {
    // TODO Auto-generated method stub

    JobConf conf = new JobConf(ProcessUnits.class);
    conf.setJobName("max_eletricityunits");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(EE_Mapper.class);
    conf.setCombinerClass(EE_Reducer.class);
    conf.setReducerClass(EE_Reducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
  public static void main(String args[]) throws IOException {
    JobConf job = new JobConf(WordCountJob.class);
    job.setJobName("Word Count Example");

    FileInputFormat.setInputPaths(job, args[0]);
    job.setInputFormat(TextInputFormat.class);

    job.setMapperClass(MapTask.class);
    job.setCombinerClass(ReduceTask.class);
    job.setReducerClass(ReduceTask.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);
  }
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Example #17
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(MapReduceFour.class);
    conf.setJobName("step-Four");
    conf.setMapOutputKeyClass(VarLongWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);

    conf.setOutputKeyClass(RecommendResult.class);
    conf.setOutputValueClass(NullWritable.class);

    FileInputFormat.setInputPaths(
        conf, new Path("/Users/bai/Documents/mapreduce/output-third/part-00000"));
    //		DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
    // "jdbc:mysql://localhost:3306/hotelrecommend","root","root");
    //		DBOutputFormat.setOutput(conf,"recommend","uid","hid","recommendValue");
    conf.setMapperClass(PartialMultiplyMapper.class);
    conf.setCombinerClass(AggregateCombiner.class);
    conf.setReducerClass(AggregateAndRecommendReducer.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    //		conf.setOutputFormat(DBOutputFormat.class);
    JobClient.runJob(conf);
  }
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(Main.class);
    conf.setJobName("feels-analysis");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TheOutputClass.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setNumReduceTasks(1);
    conf.setInputFormat(CSVTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // TODO: determine whether we need extra output
    MultipleOutputs.addMultiNamedOutput(
        conf, SECOND_OUTPUT, TextOutputFormat.class, Text.class, TheOutputClass.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
  public int run(String[] args) throws Exception {
    String driverClassName = "org.hsqldb.jdbcDriver";
    String url = "jdbc:hsqldb:hsql://localhost/URLAccess";

    if (args.length > 1) {
      driverClassName = args[0];
      url = args[1];
    }

    initialize(driverClassName, url);

    JobConf job = new JobConf(getConf(), DBCountPageView.class);

    job.setJobName("Count Pageviews of URLs");

    job.setMapperClass(PageviewMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(PageviewReducer.class);

    DBConfiguration.configureDB(job, driverClassName, url);

    DBInputFormat.setInput(job, AccessRecord.class, "Access", null, "url", AccessFieldNames);

    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(PageviewRecord.class);
    job.setOutputValueClass(NullWritable.class);
    try {
      JobClient.runJob(job);

      boolean correct = verify();
      if (!correct) throw new RuntimeException("Evaluation was not correct!");
    } finally {
      shutdown();
    }
    return 0;
  }
Example #20
  public static void main(String[] args) throws IOException {
    String inpath = "hdfs://localhost:9000/user/hdfs/in/";
    String outpath = "hdfs://localhost:9000/user/hdfs/pv_out/";
    JobConf conf = new JobConf(StatPV.class);
    conf.setJobName("StatPV");
    conf.setMapperClass(PvMapper.class);
    conf.setCombinerClass(PvReducer.class);
    conf.setReducerClass(PvReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inpath));
    FileOutputFormat.setOutputPath(conf, new Path(outpath));
    JobClient.runJob(conf);
    System.out.println("finish");
    System.exit(0);
  }
  @Override
  public int run(String[] args) throws IOException {
    OptionParser p = new OptionParser();
    OptionSpec<String> maxwiOpt =
        p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
            .withRequiredArg()
            .ofType(String.class);
    OptionSpec<Float> thresholdOpt =
        p.accepts(thresholdOptName, "similarity threshold")
            .withRequiredArg()
            .ofType(Float.class)
            .defaultsTo(DEFAULT_THRESHOLD);
    OptionSpec<Integer> stripesOpt =
        p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(1);
    OptionSpec<Integer> spreadOpt =
        p.accepts(spreadOptName, "number of reducers per stripe")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_SPREAD);
    OptionSpec<Integer> factorOpt =
        p.accepts(factorOptName, "number of mappers per reducer")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_FACTOR);
    OptionSpec<Integer> maxVectorIDOpt =
        p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
    p.acceptsAll(Arrays.asList("h", "?"), "show help");

    OptionSet options = parseOptions(p, args);

    // to distinguish indexes built in successive runs
    DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
    Date date = new Date();

    float threshold = options.valueOf(thresholdOpt); // threshold
    if (threshold < 0 || threshold >= 1) {
      System.err.println(thresholdOptName + " should be between 0 and 1");
      System.exit(1);
    }

    int numStripes = options.valueOf(stripesOpt); // number of stripes
    if (numStripes < 1) {
      System.err.println(stripesOptName + " should be > 0");
      System.exit(1);
    }

    // MapReduce parameters
    int spread = options.valueOf(spreadOpt); // how many reducers per stripe
    if (spread < 1) {
      System.err.println(spreadOptName + " should be > 0");
      System.exit(1);
    }

    int factor = options.valueOf(factorOpt); // how many mappers per reducer
    if (factor < 1) {
      System.err.println(factorOptName + " should be > 0");
      System.exit(1);
    }

    int maxKey = 0;
    if (options.has(maxVectorIDOpt)) {
      maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
      if (maxKey < 1) {
        System.err.println(maxVectorIDOptName + " should be > 0");
        System.exit(1);
      }
    }

    int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
    int numMappers = numReducers * factor;
    int numBuckets = numMappers;

    // pick the file with max weights from command line
    String maxWiDir = options.valueOf(maxwiOpt);
    List<String> nonOptArgs = options.nonOptionArguments();

    LOG.info("Threshold set to " + threshold);
    LOG.info(
        String.format(
            "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
            numBuckets, factor, numStripes, spread, numReducers));

    // start building the jobs
    JobConf conf1 = new JobConf(getConf(), Similarity.class);
    conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf1.setInt(PARAM_APS_STRIPES, numStripes);
    DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

    Path inputPath = new Path(nonOptArgs.get(0));
    Path indexPath =
        new Path(
            nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
    // index filtering pruned nested directory
    Path indexOnlyPath = new Path(indexPath, "part*");
    Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);
    FileInputFormat.setInputPaths(conf1, inputPath);
    FileOutputFormat.setOutputPath(conf1, indexPath);

    conf1.setInputFormat(SequenceFileInputFormat.class);
    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setMapOutputKeyClass(LongWritable.class);
    conf1.setMapOutputValueClass(IndexItem.class);
    conf1.setOutputKeyClass(LongWritable.class);
    conf1.setOutputValueClass(IndexItemArrayWritable.class);
    conf1.setMapperClass(IndexerMapper.class);
    conf1.setReducerClass(IndexerReducer.class);

    // assuming input is sorted according to the key (vectorID) so that the
    // part files are locally sorted
    MultipleOutputs.addNamedOutput(
        conf1,
        PRUNED,
        SequenceFileOutputFormat.class,
        IntWritable.class,
        VectorComponentArrayWritable.class);

    // remove the stuff we added from the job name
    conf1.set(
        "mapred.job.name",
        "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
    conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf1.setSpeculativeExecution(false);
    conf1.setCompressMapOutput(true);
    // hash the posting lists in different buckets to distribute the load
    conf1.setNumReduceTasks(numBuckets);

    RunningJob job1 = JobClient.runJob(conf1);

    // part 2
    JobConf conf2 = new JobConf(getConf(), Similarity.class);

    if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

    MultipleInputs.addInputPath(
        conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
    MultipleInputs.addInputPath(
        conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
    FileOutputFormat.setOutputPath(conf2, outputPath);
    conf2.setCombinerClass(SimilarityCombiner.class);
    conf2.setReducerClass(SimilarityReducer.class);
    conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
    conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
    conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
    conf2.setMapOutputKeyClass(GenericKey.class);
    conf2.setMapOutputValueClass(GenericValue.class);
    conf2.setOutputKeyClass(VectorPair.class);
    conf2.setOutputValueClass(NullWritable.class);

    Counter numDocs =
        job1.getCounters()
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
    maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
    LOG.info("Setting max key value in input to " + maxKey);
    conf2.setInt(PARAM_APS_MAXKEY, maxKey);

    conf2.setInt(PARAM_APS_STRIPES, numStripes);
    conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
    conf2.set("mapred.job.name", "APS-" + outputPath.getName());

    conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf2.setSpeculativeExecution(false);
    conf2.setCompressMapOutput(true);
    conf2.setNumReduceTasks(numReducers);

    JobClient.runJob(conf2);

    return 0;
  }
Example #22
  public static void main(String[] args) throws IOException {

    JobConf lp = new JobConf(L4.class);
    lp.setJobName("Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setCombinerClass(Combiner.class);
    lp.setReducerClass(Group.class);
    Properties props = System.getProperties();
    String dataDir = props.getProperty("PIGMIX_DIR", "/user/pig/tests/data/pigmix");
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
      lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(dataDir, "page_views"));
    FileOutputFormat.setOutputPath(
        lp, new Path("/user/" + System.getProperty("user.name") + "/L4out"));
    lp.setNumReduceTasks(40);
    Job group = new Job(lp);

    JobControl jc = new JobControl("L4 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
      ArrayList<Job> failures = jc.getFailedJobs();
      if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
          System.err.println(failure.getMessage());
        }
        break;
      }

      try {
        Thread.sleep(5000);
      } catch (InterruptedException e) {
      }

      if (i % 10000 == 0) {
        System.out.println("Running jobs");
        ArrayList<Job> running = jc.getRunningJobs();
        if (running != null && running.size() > 0) {
          for (Job r : running) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Ready jobs");
        ArrayList<Job> ready = jc.getReadyJobs();
        if (ready != null && ready.size() > 0) {
          for (Job r : ready) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Waiting jobs");
        ArrayList<Job> waiting = jc.getWaitingJobs();
        if (waiting != null && waiting.size() > 0) {
          for (Job r : waiting) {
            System.out.println(r.getJobName());
          }
        }
        System.out.println("Successful jobs");
        ArrayList<Job> success = jc.getSuccessfulJobs();
        if (success != null && success.size() > 0) {
          for (Job r : success) {
            System.out.println(r.getJobName());
          }
        }
      }
      i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
      for (Job failure : failures) {
        System.err.println(failure.getMessage());
      }
    }
    jc.stop();
  }
  /**
   * Starts a DataGen MapReduce job which produces one or more random or sequence objects.
   *
   * @param inst the MR job instruction
   * @param dataGenInstructions data generation instruction for each object
   * @param instructionsInMapper unary instructions to execute in the mapper
   * @param aggInstructionsInReducer aggregate instructions to execute in the combiner and reducer
   * @param otherInstructionsInReducer instructions to execute in the reducer after the aggregation
   * @param numReducers number of reducers
   * @param replication file replication factor for the results
   * @param resultIndexes result indexes for each output
   * @param dimsUnknownFilePrefix prefix of the file collecting dimensions unknown at compile time
   * @param outputs output file for each object
   * @param outputInfos output information for each object
   * @return matrix characteristics for each output
   * @throws Exception if an error occurred in the MapReduce phase
   */
  public static JobReturn runJob(
      MRJobInstruction inst,
      String[] dataGenInstructions,
      String instructionsInMapper,
      String aggInstructionsInReducer,
      String otherInstructionsInReducer,
      int numReducers,
      int replication,
      byte[] resultIndexes,
      String dimsUnknownFilePrefix,
      String[] outputs,
      OutputInfo[] outputInfos)
      throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    // whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
      dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

      MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
      MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
      DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

      rlens[i] = genInst.getRows();
      clens[i] = genInst.getCols();
      brlens[i] = genInst.getRowsInBlock();
      bclens[i] = genInst.getColsInBlock();

      maxbrlen = Math.max(maxbrlen, brlens[i]);
      maxbclen = Math.max(maxbclen, bclens[i]);

      if (mrtype == MRINSTRUCTION_TYPE.Rand) {
        RandInstruction randInst = (RandInstruction) mrins;
        inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
        maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);

        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        // seed generation
        Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
        long[] nnz =
            LibMatrixDatagen.computeNNZperBlock(
                rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
        int nnzIx = 0;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
          for (long c = 0; c < clens[i]; c += bclens[i]) {
            long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

            sb.append((r / brlens[i]) + 1);
            sb.append(',');
            sb.append((c / bclens[i]) + 1);
            sb.append(',');
            sb.append(curBlockRowSize);
            sb.append(',');
            sb.append(curBlockColSize);
            sb.append(',');
            sb.append(nnz[nnzIx++]);
            sb.append(',');
            sb.append(bigrand.nextLong());
            pw.println(sb.toString());
            sb.setLength(0);
            numblocks++;
          }
        }
        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
        SeqInstruction seqInst = (SeqInstruction) mrins;
        inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
        maxsparsity = 1.0; // always dense

        double from = seqInst.fromValue;
        double to = seqInst.toValue;
        double incr = seqInst.incrValue;

        // handle default 1 to -1 for special case of from>to
        incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

        // Correctness checks on (from, to, incr)
        boolean neg = (from > to);
        if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

        if (neg != (incr < 0))
          throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

        // Compute the number of rows in the sequence
        long numrows = 1 + (long) Math.floor((to - from) / incr);
        if (rlens[i] > 0) {
          if (numrows != rlens[i])
            throw new DMLRuntimeException(
                "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                    + rlens[i]
                    + " != "
                    + numrows);
        } else {
          rlens[i] = numrows;
        }

        if (clens[i] > 0 && clens[i] != 1)
          throw new DMLRuntimeException(
              "Unexpected error while processing sequence instruction. Number of columns ("
                  + clens[i]
                  + ") must be equal to 1.");
        else clens[i] = 1;

        FSDataOutputStream fsOut = fs.create(new Path(inputs[i]));
        PrintWriter pw = new PrintWriter(fsOut);
        StringBuilder sb = new StringBuilder();

        double temp = from;
        double block_from, block_to;
        for (long r = 0; r < rlens[i]; r += brlens[i]) {
          long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

          // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to]
          // (inclusive of both end points of the interval)
          long bid_i = ((r / brlens[i]) + 1);
          long bid_j = 1;
          block_from = temp;
          block_to = temp + (curBlockRowSize - 1) * incr;
          temp = block_to + incr; // next block starts from here

          sb.append(bid_i);
          sb.append(',');
          sb.append(bid_j);
          sb.append(',');
          /*
          // Need not include block size while generating seq()
          sb.append(curBlockRowSize);
          sb.append(',');
          sb.append(1);
          sb.append(',');*/
          sb.append(block_from);
          sb.append(',');
          sb.append(block_to);
          sb.append(',');
          sb.append(incr);

          pw.println(sb.toString());
          // System.out.println("MapTask " + r + ": " + sb.toString());
          sb.setLength(0);
          numblocks++;
        }

        pw.close();
        fsOut.close();
        inputInfos[i] = InputInfo.TextCellInputInfo;
      } else {
        throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
      }
    }
    dataGenInsStr = dataGenInsStr.substring(1); // remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
      // set up the block size
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the input files and their format information
      MRJobConfiguration.setUpMultipleInputs(
          job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);

      // set up the dimensions of input matrices
      MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
      MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

      // set up the block size
      MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

      // set up the rand Instructions
      MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

      // set up unary instructions that will perform in the mapper
      MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

      // set up the aggregate instructions that will happen in the combiner and reducer
      MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

      // set up the instructions that will happen in the reducer, after the aggregation instructions
      MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

      // set up the replication factor for the results
      job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

      // set up map/reduce memory configurations (if in AM context)
      DMLConfig config = ConfigurationManager.getDMLConfig();
      DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

      // set up custom map/reduce configurations
      MRJobConfiguration.setupCustomMRConfigurations(job, config);

      // determine degree of parallelism (nmappers: 1<=n<=capacity)
      // TODO use maxsparsity whenever we have a way of generating sparse rand data
      int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
      long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
      // correct the max number of mappers on YARN clusters
      if (InfrastructureAnalyzer.isYarnEnabled())
        capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
      int nmappers =
          Math.max(
              Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity),
              1);
      job.setNumMapTasks(nmappers);

      // set up what matrices are needed to pass from the mapper to reducer
      HashSet<Byte> mapoutputIndexes =
          MRJobConfiguration.setUpOutputIndexesForMapper(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              otherInstructionsInReducer,
              resultIndexes);

      MatrixChar_N_ReducerGroups ret =
          MRJobConfiguration.computeMatrixCharacteristics(
              job,
              realIndexes,
              dataGenInsStr,
              instructionsInMapper,
              null,
              aggInstructionsInReducer,
              null,
              otherInstructionsInReducer,
              resultIndexes,
              mapoutputIndexes,
              false);
      stats = ret.stats;

      // set up the number of reducers
      MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

      // print the complete MRJob instruction
      if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats);

      // Update resultDimsUnknown based on computed "stats"
      byte[] resultDimsUnknown = new byte[resultIndexes.length];
      for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
          resultDimsUnknown[i] = (byte) 1;
        } else {
          resultDimsUnknown[i] = (byte) 0;
        }
      }

      boolean mayContainCtable =
          instructionsInMapper.contains("ctabletransform")
              || instructionsInMapper.contains("groupedagg");

      // set up the multiple output files, and their format information
      MRJobConfiguration.setUpMultipleOutputs(
          job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);

      // configure mapper and the mapper output key value pairs
      job.setMapperClass(DataGenMapper.class);
      if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
      } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
      }

      // set up combiner
      if (numReducers != 0
          && aggInstructionsInReducer != null
          && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class);

      // configure reducer
      job.setReducerClass(GMRReducer.class);
      // job.setReducerClass(PassThroughReducer.class);

      // By default, the job executes in "cluster" mode.
      // Determine if we can optimize and run it in "local" mode.
      MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
      for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
      }

      // set unique working dir
      MRJobConfiguration.setUniqueWorkingDir(job);

      runjob = JobClient.runJob(job);

      /* Process different counters */

      Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
      for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
      }

      String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
      stats = MapReduceTool.processDimsFiles(dir, stats);
      MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
      for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
  }
  /**
   * Create an Aggregate based map/reduce job.
   *
   * @param args the arguments used for job creation. Generic hadoop arguments are accepted.
   * @return a JobConf object ready for submission.
   * @throws IOException
   * @see GenericOptionsParser
   */
  public static JobConf createValueAggregatorJob(String args[]) throws IOException {

    Configuration conf = new Configuration();

    GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
    args = genericParser.getRemainingArgs();

    if (args.length < 2) {
      System.out.println(
          "usage: inputDirs outDir " + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
      GenericOptionsParser.printGenericCommandUsage(System.out);
      System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;
    if (args.length > 2) {
      numOfReducers = Integer.parseInt(args[2]);
    }

    Class<? extends InputFormat> theInputFormat = TextInputFormat.class;
    if (args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0) {
      theInputFormat = TextInputFormat.class;
    } else {
      theInputFormat = SequenceFileInputFormat.class;
    }

    Path specFile = null;

    if (args.length > 4) {
      specFile = new Path(args[4]);
    }

    String jobName = "";

    if (args.length > 5) {
      jobName = args[5];
    }

    JobConf theJob = new JobConf(conf);
    if (specFile != null) {
      theJob.addResource(specFile);
    }
    String userJarFile = theJob.get("user.jar.file");
    if (userJarFile == null) {
      theJob.setJarByClass(ValueAggregator.class);
    } else {
      theJob.setJar(userJarFile);
    }
    theJob.setJobName("ValueAggregatorJob: " + jobName);

    FileInputFormat.addInputPaths(theJob, inputDir);

    theJob.setInputFormat(theInputFormat);

    theJob.setMapperClass(ValueAggregatorMapper.class);
    FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
    theJob.setOutputFormat(TextOutputFormat.class);
    theJob.setMapOutputKeyClass(Text.class);
    theJob.setMapOutputValueClass(Text.class);
    theJob.setOutputKeyClass(Text.class);
    theJob.setOutputValueClass(Text.class);
    theJob.setReducerClass(ValueAggregatorReducer.class);
    theJob.setCombinerClass(ValueAggregatorCombiner.class);
    theJob.setNumMapTasks(1);
    theJob.setNumReduceTasks(numOfReducers);
    return theJob;
  }
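Since the returned JobConf is described as ready for submission, a hedged driver sketch (this main method is not part of the listing) would simply build the job from the command-line arguments and submit it:

  public static void main(String[] args) throws IOException {
    JobConf job = createValueAggregatorJob(args);
    JobClient.runJob(job);
  }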
Example #25
  public static void main(String[] args) throws Exception {

    String inputPath = args[0];
    String outputPath = args[1];
    int specIteration = Integer.parseInt(args[2]);
    // set number of reducers
    int numReducers = 2 * 15;

    // step 1
    JobConf conf0 = new JobConf(Hashmin.class);
    conf0.setJobName("hashmin Step1");

    conf0.setOutputKeyClass(IntWritable.class);
    conf0.setOutputValueClass(Text.class);
    conf0.setMapOutputKeyClass(IntWritable.class);
    conf0.setMapOutputValueClass(IntWritable.class);

    conf0.setMapperClass(FirstMapper.class);
    conf0.setReducerClass(FirstReducer.class);
    conf0.setInputFormat(TextInputFormat.class);
    conf0.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.addInputPath(conf0, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf0, new Path(args[1] + "/i-1"));

    conf0.setNumReduceTasks(numReducers);
    JobClient.runJob(conf0);

    // loop

    JobConf conf1 = new JobConf();
    conf1.setOutputKeyClass(IntWritable.class);
    conf1.setOutputValueClass(IntWritable.class);
    conf1.setMapOutputKeyClass(IntWritable.class);
    conf1.setMapOutputValueClass(Text.class);
    conf1.setMapperClass(JoinMapper.class);
    conf1.setReducerClass(JoinReducer.class);
    conf1.setInputFormat(TextInputFormat.class);
    conf1.setOutputFormat(TextOutputFormat.class);

    JobConf conf2 = new JobConf();
    conf2.setOutputKeyClass(IntWritable.class);
    conf2.setOutputValueClass(Text.class);
    conf2.setMapOutputKeyClass(IntWritable.class);
    conf2.setMapOutputValueClass(Text.class);
    conf2.setMapperClass(AggMapper.class);
    conf2.setReducerClass(AggReducer.class);
    // combiner
    conf2.setCombinerClass(AggCombiner.class);
    conf2.setInputFormat(TextInputFormat.class);
    conf2.setOutputFormat(TextOutputFormat.class);

    JobConf conf = new JobConf(Hashmin.class);
    conf.setJobName("Hashmin");
    conf.setLoopInputOutput(HashminLoopInputOutput.class);
    // testing
    // conf.setLoopInputOutput(RankLoopInputOutputReuse.class);
    conf.setLoopReduceCacheSwitch(HashminReduceCacheSwitch.class);
    conf.setLoopReduceCacheFilter(HashminReduceCacheFilter.class);
    conf.setLoopStepHook(HashminStepHook.class);
    conf.setInputPath(inputPath);
    conf.setOutputPath(outputPath);

    conf.setStepConf(0, conf1);
    conf.setStepConf(1, conf2);
    conf.setIterative(true);
    conf.setNumIterations(specIteration);
    conf.setNumReduceTasks(numReducers);

    JobClient.runJob(conf);
  }
Example #26
 @Override
 protected void beforeRun(String[] args, JobConf config) {
   config.setCombinerClass(LongSumReducer.class);
 }
Example #27
    static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
        throws IOException {
      FileSystem inputfs = sortInput.getFileSystem(defaults);
      FileSystem outputfs = sortOutput.getFileSystem(defaults);
      FileSystem defaultfs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
      int noSortInputpaths = inputfs.listStatus(sortInput).length;

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);

      jobConf.setNumMapTasks(noSortReduceTasks);
      jobConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (defaultfs.exists(outputPath)) {
        defaultfs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // jobConf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordStatsChecker: Validate sort "
              + "from "
              + inputPaths[0]
              + " ("
              + noSortInputpaths
              + " files), "
              + inputPaths[1]
              + " ("
              + noSortReduceTasks
              + " files) into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date end_time = new Date();
      System.out.println("Job ended: " + end_time);
      System.out.println(
          "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");

      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      SequenceFile.Reader stats =
          new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      if (!stats.next(k1, v1)) {
        throw new IOException("Failed to read record #1 from reduce's output");
      }
      if (!stats.next(k2, v2)) {
        throw new IOException("Failed to read record #2 from reduce's output");
      }

      if ((v1.getBytes() != v2.getBytes())
          || (v1.getRecords() != v2.getRecords())
          || v1.getChecksum() != v2.getChecksum()) {
        throw new IOException(
            "("
                + v1.getBytes()
                + ", "
                + v1.getRecords()
                + ", "
                + v1.getChecksum()
                + ") v/s ("
                + v2.getBytes()
                + ", "
                + v2.getRecords()
                + ", "
                + v2.getChecksum()
                + ")");
      }
    }