/**
 * Configures the "Max temperature" job with task profiling switched on and runs it.
 *
 * @param args the input path followed by the output path
 * @return 0 after the job has been run, or -1 when the argument count is not exactly two
 * @throws Exception propagated from job configuration or {@code JobClient.runJob}
 */
@Override
  public int run(String[] args) throws Exception {
    final boolean wrongArgumentCount = args.length != 2;
    if (wrongArgumentCount) {
      System.err.printf(
          "Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    final JobConf job = new JobConf(getConf(), getClass());
    job.setJobName("Max temperature");

    // Where the data is read from and written to.
    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Processing classes; the reducer doubles as the combiner.
    job.setMapperClass(MaxTemperatureMapper.class);
    job.setCombinerClass(MaxTemperatureReducer.class);
    job.setReducerClass(MaxTemperatureReducer.class);

    // Types of the records the job emits.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // vv MaxTemperatureDriverV6
    final String hprofOptions =
        "-agentlib:hprof=cpu=samples,heap=sites,depth=6,force=n,thread=y,verbose=n,file=%s";
    job.setProfileEnabled(true);
    job.setProfileParams(hprofOptions);
    job.setProfileTaskRange(true, "0-2");
    // ^^ MaxTemperatureDriverV6

    JobClient.runJob(job);
    return 0;
  }
Пример #2
0
  /**
   * Runs an identity-map job whose name contains a backslash sequence and a closing bracket, then
   * checks the job output.
   *
   * <p>The input is a single line {@code "b a"}; the test expects exactly one output file whose
   * only line is {@code "0<TAB>b a"}.
   *
   * @throws Exception if the input cannot be written, the job fails, or the output cannot be read
   */
  public void testComplexNameWithRegex() throws Exception {
    // Write the one-line input file; the writer is closed even if the write itself fails.
    OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
    Writer wr = new OutputStreamWriter(os);
    try {
      wr.write("b a\n");
    } finally {
      wr.close();
    }

    JobConf conf = createJobConf();
    conf.setJobName("name \\Evalue]");

    conf.setInputFormat(TextInputFormat.class);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);

    FileInputFormat.setInputPaths(conf, getInputDir());

    FileOutputFormat.setOutputPath(conf, getOutputDir());

    JobClient.runJob(conf);

    // OutputLogFilter is applied to the listing so only the files it accepts are counted.
    Path[] outputFiles =
        FileUtil.stat2Paths(getFileSystem().listStatus(getOutputDir(), new OutputLogFilter()));
    assertEquals(1, outputFiles.length);

    InputStream is = getFileSystem().open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    try {
      assertEquals("0\tb a", reader.readLine());
      assertNull(reader.readLine());
    } finally {
      // Release the stream even when one of the assertions above fails.
      reader.close();
    }
  }
  /**
   * Configures and runs the "CassandraBulkLoader_v2" job.
   *
   * <p>Argument layout: {@code args[1]} is the input path, {@code args[2]} the output path and the
   * optional {@code args[3]} the number of reduce tasks. {@code args[0]} is not read by this
   * method.
   *
   * @param args command-line arguments laid out as described above
   * @throws IllegalArgumentException if fewer than three arguments are supplied
   * @throws NumberFormatException if {@code args[3]} is present but is not an integer
   * @throws RuntimeException wrapping a {@link URISyntaxException} from the cache-file URI or an
   *     {@link IOException} raised while running the job
   */
  public static void runJob(String[] args) {
    // Fail with a descriptive message instead of an ArrayIndexOutOfBoundsException further down.
    if (args == null || args.length < 3) {
      throw new IllegalArgumentException(
          "Expected at least 3 arguments (<unused> <input> <output> [numReduceTasks]) but got "
              + (args == null ? 0 : args.length));
    }

    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
      // parseInt avoids the deprecated boxing constructor new Integer(String).
      conf.setNumReduceTasks(Integer.parseInt(args[3]));
    }

    try {
      // We store the cassandra storage-conf.xml on the HDFS cluster
      DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));
    try {
      JobClient.runJob(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
Пример #4
0
  /**
   * Writes a three-line input file, runs an identity map/reduce job named "mr" at HIGH priority
   * over it, and returns the id of the job that was run.
   *
   * @return the string form of the job id
   * @throws Exception if the input cannot be written or the job fails
   */
  private String runJob() throws Exception {
    // Create the input; the writer is closed even if one of the writes fails.
    OutputStream os = getFileSystem().create(new Path(getInputDir(), "text.txt"));
    Writer wr = new OutputStreamWriter(os);
    try {
      wr.write("hello1\n");
      wr.write("hello2\n");
      wr.write("hello3\n");
    } finally {
      wr.close();
    }

    JobConf conf = createJobConf();
    conf.setJobName("mr");
    conf.setJobPriority(JobPriority.HIGH);

    conf.setInputFormat(TextInputFormat.class);

    conf.setMapOutputKeyClass(LongWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
    conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

    FileInputFormat.setInputPaths(conf, getInputDir());
    FileOutputFormat.setOutputPath(conf, getOutputDir());

    return JobClient.runJob(conf).getID().toString();
  }
  /**
   * Configures and runs the "UFO count" Avro job.
   *
   * <p>Any existing output directory is deleted before the job starts. The input schema is read
   * from the {@code ufo.avsc} resource located relative to this class.
   *
   * @param args command-line arguments; after generic options are stripped, exactly two must
   *     remain: the input path and the output path
   * @return 0 after the job has been run
   * @throws IllegalStateException if the {@code ufo.avsc} resource cannot be found
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  @Override
  public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("UFO count");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: avro UFO counter <in> <out>");
      // NOTE(review): this terminates the JVM from inside run(); kept unchanged because existing
      // launchers may depend on exit status 2.
      System.exit(2);
    }

    FileInputFormat.addInputPath(conf, new Path(otherArgs[0]));
    Path outputPath = new Path(otherArgs[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);
    // Remove previous output recursively; the single-argument delete(Path) is deprecated.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    // Load the input schema, failing clearly when the resource is missing and always releasing
    // the stream.
    java.io.InputStream schemaStream = getClass().getResourceAsStream("ufo.avsc");
    if (schemaStream == null) {
      throw new IllegalStateException(
          "Schema resource ufo.avsc not found relative to " + getClass().getName());
    }
    Schema inputSchema;
    try {
      inputSchema = Schema.parse(schemaStream);
    } finally {
      schemaStream.close();
    }
    AvroJob.setInputSchema(conf, inputSchema);
    AvroJob.setMapOutputSchema(
        conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG)));

    AvroJob.setOutputSchema(conf, OUTPUT_SCHEMA);
    AvroJob.setMapperClass(conf, AvroRecordMapper.class);
    AvroJob.setReducerClass(conf, AvroRecordReducer.class);
    conf.setInputFormat(AvroInputFormat.class);
    JobClient.runJob(conf);

    return 0;
  }
Пример #6
0
  /**
   * Injects the entries found under {@code urlDir} into the crawl db at {@code crawlDb}.
   *
   * <p>Two jobs run in sequence: the first maps the input with {@code InjectMapper} into a
   * temporary directory, the second merges that directory into the crawl db with
   * {@code InjectReducer}. The temporary directory is removed afterwards, including when one of
   * the jobs fails.
   *
   * @param crawlDb crawl db that the injected entries are merged into
   * @param urlDir input path of the first job
   * @throws IOException if either job, the crawl db installation, or the cleanup fails
   */
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    // Randomised name so that separate runs do not share an intermediate directory.
    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    FileSystem fs = FileSystem.get(getConf());
    try {
      // map text input file to a <url,CrawlDatum> file
      if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected urls to crawl db entries.");
      }
      JobConf sortJob = new NutchJob(getConf());
      sortJob.setJobName("inject " + urlDir);
      FileInputFormat.addInputPath(sortJob, urlDir);
      sortJob.setMapperClass(InjectMapper.class);

      FileOutputFormat.setOutputPath(sortJob, tempDir);
      sortJob.setOutputFormat(SequenceFileOutputFormat.class);
      sortJob.setOutputKeyClass(Text.class);
      sortJob.setOutputValueClass(CrawlDatum.class);
      sortJob.setLong("injector.current.time", System.currentTimeMillis());
      RunningJob mapJob = JobClient.runJob(sortJob);

      long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
      long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
      LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
      LOG.info(
          "Injector: total number of urls injected after normalization and filtering: "
              + urlsInjected);

      // merge with existing crawl db
      if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Merging injected urls into crawl db.");
      }
      JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
      FileInputFormat.addInputPath(mergeJob, tempDir);
      mergeJob.setReducerClass(InjectReducer.class);
      JobClient.runJob(mergeJob);
      CrawlDb.install(mergeJob, crawlDb);
    } finally {
      // clean up: runs on failure too, so a failed job does not leave the temp dir behind
      fs.delete(tempDir, true);
    }

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Пример #7
0
  /**
   * Builds the job configuration for DistCp: a job with zero reduce tasks that reads through
   * {@code CopyInputFormat} and maps with {@code CopyFilesMapper}.
   *
   * @param conf base configuration the job settings are layered on top of
   * @return the configured {@code JobConf}
   */
  private static JobConf createJobConf(Configuration conf) {
    final JobConf job = new JobConf(conf, DistCp.class);
    job.setJobName(NAME);

    // Input side and the mapper; no reduce tasks are scheduled.
    job.setInputFormat(CopyInputFormat.class);
    job.setMapperClass(CopyFilesMapper.class);
    job.setNumReduceTasks(0);

    // Both keys and values are emitted as Text.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // DFS does not handle several writers to the same file, so speculative map attempts are
    // switched off.
    job.setMapSpeculativeExecution(false);

    return job;
  }
Пример #8
0
  /**
   * Configures and runs the "sumar1" job.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path; with fewer
   *     than two arguments a usage line is printed and the JVM exits with status 2
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public static void main(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: Add1 <input path> <output path>");
      System.exit(2);
    }

    JobConf conf = new JobConf(Add1.class);
    conf.setJobName("sumar1");

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);

    // Only a mapper is configured; no reducer class is set here.
    conf.setMapperClass(Map.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Пример #9
0
  /**
   * Configures and runs the "max_eletricityunits" job, using {@code EE_Reducer} both as the
   * combiner and as the reducer.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path; with fewer
   *     than two arguments a usage line is printed and the JVM exits with status 2
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public static void main(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: ProcessUnits <input path> <output path>");
      System.exit(2);
    }

    JobConf conf = new JobConf(ProcessUnits.class);
    conf.setJobName("max_eletricityunits");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(EE_Mapper.class);
    conf.setCombinerClass(EE_Reducer.class);
    conf.setReducerClass(EE_Reducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Пример #10
0
  /**
   * Configures the "File Filter" job and runs it unless the output path already exists.
   *
   * @param args {@code input output type splitChar}, where {@code type} selects text input when
   *     it equals {@code "text"} and sequence-file input otherwise, and {@code splitChar} is
   *     stored in the job configuration under {@code user.split}
   * @return -1 when fewer than four arguments are given; otherwise {@code JobClient.SUCCESS},
   *     both after running the job and when the job is skipped because the output path exists
   * @throws Exception propagated from job configuration, the filesystem, or
   *     {@code JobClient.runJob}
   */
  public int run(String[] args) throws Exception {
    if (args.length < 4) {
      System.out.println("ERROR: Please Enter args : input output type(text|seq) splitChar(9=\t)");
      // A usage error must not be reported as success to the caller.
      return -1;
    }
    String input = args[0];
    String output = args[1];
    String type = args[2];
    String splitChar = args[3];

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.split", splitChar);

    config.setJobName("File Filter -" + System.currentTimeMillis());
    config.setNumReduceTasks(10);
    config.setReducerClass(IdentityReducer.class);
    config.setMapperClass(FileTestMapper.class);
    // Any type other than "text" is treated as a sequence file.
    if ("text".equals(type)) {
      config.setInputFormat(TextInputFormat.class);
      TextInputFormat.addInputPath(config, new Path(input));
    } else {
      config.setInputFormat(SequenceFileInputFormat.class);
      SequenceFileInputFormat.addInputPath(config, new Path(input));
    }
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return
    FileSystem fs = FileSystem.get(config);
    Path outputPath = new Path(output);
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }
  /**
   * Configures and runs the "Word Count Example" job, using {@code ReduceTask} both as the
   * combiner and as the reducer.
   *
   * @param args {@code args[0]} is passed as the input path string and {@code args[1]} is the
   *     output path; with fewer than two arguments a usage line is printed and the JVM exits
   *     with status 2
   * @throws IOException if configuring or running the job fails with an I/O error
   */
  public static void main(String[] args) throws IOException {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: WordCountJob <input path> <output path>");
      System.exit(2);
    }

    JobConf job = new JobConf(WordCountJob.class);
    job.setJobName("Word Count Example");

    FileInputFormat.setInputPaths(job, args[0]);
    job.setInputFormat(TextInputFormat.class);

    job.setMapperClass(MapTask.class);
    job.setCombinerClass(ReduceTask.class);
    job.setReducerClass(ReduceTask.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);
  }
  /**
   * Configures a job that uses this class as both mapper and reducer and runs it unless the
   * output path already exists.
   *
   * <p>When no argument is supplied, the value of {@code DateStringUtils.now()} is used as the
   * single argument. Input paths come from {@code getInputPath(args)} and the output path from
   * {@code getOutputPath(args)}.
   *
   * @param args job arguments; stored in the job configuration under {@code user.args}
   * @return {@code JobClient.SUCCESS}, both after running the job and when it is skipped
   * @throws Exception propagated from job configuration, the filesystem, or
   *     {@code JobClient.runJob}
   */
  public int run(String[] args) throws Exception {
    if (args.length < 1) {
      // Resolve the default once so that the value reported is exactly the value that is used.
      String defaultDate = DateStringUtils.now();
      args = new String[] {defaultDate};
      System.out.println(
          "ERROR: Please Enter Date , eg. 20101010 ! now use default => " + defaultDate);
    }

    JobConf config = new JobConf(getConf(), getClass());
    config.set("user.args", Utils.asString(args));

    config.setJobName(getClass() + "-" + System.currentTimeMillis());
    config.setNumReduceTasks(100);
    config.setMapperClass(getClass());
    config.setReducerClass(getClass());
    config.setInputFormat(getInputFormat());
    config.setMapOutputKeyClass(Text.class);
    config.setMapOutputValueClass(Text.class);

    // add input paths; a path is only added when the input format is one of the two below
    for (String path : getInputPath(args)) {
      if (TextInputFormat.class.equals(getInputFormat())) {
        TextInputFormat.addInputPath(config, new Path(path));
      } else if (SequenceFileInputFormat.class.equals(getInputFormat())) {
        SequenceFileInputFormat.addInputPath(config, new Path(path));
      }
    }

    config.setOutputKeyClass(Text.class);
    config.setOutputValueClass(Text.class);

    // if output path exists then return
    FileSystem fs = FileSystem.get(config);
    Path outputPath = new Path(getOutputPath(args));
    FileOutputFormat.setOutputPath(config, outputPath);

    if (!fs.exists(outputPath)) {
      JobClient.runJob(config);
    } else {
      System.out.println("You has finished this job today ! " + outputPath);
    }

    return JobClient.SUCCESS;
  }
Пример #13
0
  /**
   * Configures and runs the "wordcount" job, using {@code Reduce} both as the combiner and as the
   * reducer.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path; with fewer
   *     than two arguments a usage line is printed and the JVM exits with status 2
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public static void main(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: WordCount <input path> <output path>");
      System.exit(2);
    }

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Пример #14
0
  /**
   * Entry point: configures the "Max temperature" job through {@code JobConf} and runs it with
   * {@code JobClient}.
   *
   * @param args exactly two values, the input path followed by the output path; any other count
   *     prints a usage line to standard error and exits with status -1
   * @throws IOException if configuring or running the job fails with an I/O error
   */
  public static void main(String[] args) throws IOException {
    final boolean argumentCountOk = args.length == 2;
    if (!argumentCountOk) {
      System.err.println("Usage: OldMaxTemperature <input path> <output path>");
      System.exit(-1);
    }

    /*[*/ JobConf conf = new JobConf(OldMaxTemperature.class); /*]*/
    /*[*/ conf /*]*/.setJobName("Max temperature");

    // Record types the job emits.
    /*[*/ conf /*]*/.setOutputKeyClass(Text.class);
    /*[*/ conf /*]*/.setOutputValueClass(IntWritable.class);

    // Classes that do the processing.
    /*[*/ conf /*]*/.setMapperClass(OldMaxTemperatureMapper.class);
    /*[*/ conf /*]*/.setReducerClass(OldMaxTemperatureReducer.class);

    // Locations the job reads from and writes to.
    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);
    FileInputFormat.addInputPath(/*[*/ conf /*]*/, inputPath);
    FileOutputFormat.setOutputPath(/*[*/ conf /*]*/, outputPath);

    /*[*/ JobClient.runJob(conf); /*]*/
  }
Пример #15
0
  /**
   * Configures and runs the "xiangli1_exercise3" job with zero reduce tasks.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path
   * @return 0 after the job has been run, or -1 when fewer than two arguments are given
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public int run(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: XiangLi1_exercise3 <input path> <output path>");
      return -1;
    }

    JobConf conf = new JobConf(getConf(), XiangLi1_exercise3.class);
    conf.setJobName("xiangli1_exercise3");
    // Zero reduce tasks: only the mapper below is configured, no combiner or reducer.
    conf.setNumReduceTasks(0);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
  }
Пример #16
0
  /**
   * Configures and runs the "sqrt2" job, which emits {@code DoubleWritable} keys and values.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path
   * @return 0 after the job has been run, or -1 when fewer than two arguments are given
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  @Override
  public int run(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: Sqrt2 <input path> <output path>");
      return -1;
    }

    JobConf conf = new JobConf(getConf(), Sqrt2.class);
    conf.setJobName("sqrt2");

    conf.setOutputKeyClass(DoubleWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    // The combiner is left disabled:
    /*conf.setCombinerClass(Reduce.class);*/
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
  }
Пример #17
0
  /**
   * Configures and runs the "feels-analysis" job with a single reduce task and an additional
   * multi named output registered under {@code SECOND_OUTPUT}.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path; with fewer
   *     than two arguments a usage line is printed and the JVM exits with status 2
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public static void main(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: Main <input path> <output path>");
      System.exit(2);
    }

    JobConf conf = new JobConf(Main.class);
    conf.setJobName("feels-analysis");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TheOutputClass.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setNumReduceTasks(1);
    conf.setInputFormat(CSVTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // TODO: determine whether we need extra output
    MultipleOutputs.addMultiNamedOutput(
        conf, SECOND_OUTPUT, TextOutputFormat.class, Text.class, TheOutputClass.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Пример #18
0
  /**
   * Configures and runs the "star searching" job, which reads through {@code StarInputFormat},
   * writes through {@code StarOutputFormat} and emits {@code BlockIDWritable} keys with
   * {@code PairWritable} values.
   *
   * @param args {@code args[0]} is the input path and {@code args[1]} the output path; with fewer
   *     than two arguments a usage line is printed and the JVM exits with status 2
   * @throws Exception propagated from job configuration or {@code JobClient.runJob}
   */
  public static void main(String[] args) throws Exception {
    // Report a usage error instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: NeighborSearch <input path> <output path>");
      System.exit(2);
    }

    JobConf conf = new JobConf(NeighborSearch.class);
    conf.setJobName("star searching");

    conf.setOutputKeyClass(BlockIDWritable.class);
    conf.setOutputValueClass(PairWritable.class);

    conf.setMapperClass(Map.class);
    // Combiner, custom partitioner and reduce slow-start override are currently disabled:
    // conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    // conf.setPartitionerClass(BlockPartitioner.class);
    // conf.setFloat("mapred.reduce.slowstart.completed.maps", (float) 1.0);

    conf.setInputFormat(StarInputFormat.class);
    conf.setOutputFormat(StarOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
  }
Пример #19
0
    /**
     * Runs the "sortvalidate-record-checker" job over both the sort input and the sort output,
     * writing to {@code /tmp/sortvalidate/recordchecker} (deleted first if it exists) and
     * printing start time, end time and elapsed seconds.
     *
     * @param defaults base configuration for the job and for resolving the filesystem
     * @param noMaps number of map tasks, or -1 to derive it from the number of task trackers and
     *     {@code test.sortvalidate.maps_per_host} (default 10)
     * @param noReduces number of reduce tasks, or -1 to derive it from
     *     {@code test.sortvalidate.reduces_per_host} when set, else 90% of the cluster's maximum
     *     reduce tasks
     * @param sortInput path that was the input of the sort
     * @param sortOutput path that holds the output of the sort
     * @throws IOException if the cluster cannot be queried, the filesystem fails, or the job fails
     */
    static void checkRecords(
        Configuration defaults, int noMaps, int noReduces, Path sortInput, Path sortOutput)
        throws IOException {
      final JobConf checkerJob = new JobConf(defaults, RecordChecker.class);
      checkerJob.setJobName("sortvalidate-record-checker");

      checkerJob.setMapperClass(Map.class);
      checkerJob.setReducerClass(Reduce.class);

      checkerJob.setInputFormat(SequenceFileInputFormat.class);
      checkerJob.setOutputFormat(SequenceFileOutputFormat.class);

      checkerJob.setOutputKeyClass(BytesWritable.class);
      checkerJob.setOutputValueClass(IntWritable.class);

      // Size the job from the cluster when the caller passed -1 for a task count.
      final ClusterStatus cluster = new JobClient(checkerJob).getClusterStatus();
      int mapTasks = noMaps;
      if (mapTasks == -1) {
        mapTasks =
            cluster.getTaskTrackers() * checkerJob.getInt("test.sortvalidate.maps_per_host", 10);
      }
      int reduceTasks = noReduces;
      if (reduceTasks == -1) {
        final String reducesPerHost = checkerJob.get("test.sortvalidate.reduces_per_host");
        if (reducesPerHost == null) {
          reduceTasks = (int) (cluster.getMaxReduceTasks() * 0.9);
        } else {
          reduceTasks = cluster.getTaskTrackers() * Integer.parseInt(reducesPerHost);
        }
      }
      checkerJob.setNumMapTasks(mapTasks);
      checkerJob.setNumReduceTasks(reduceTasks);

      // Both the original input and the sorted output are fed to the checker.
      FileInputFormat.setInputPaths(checkerJob, sortInput);
      FileInputFormat.addInputPath(checkerJob, sortOutput);

      // Start from an empty output directory.
      final Path checkerOutput = new Path("/tmp/sortvalidate/recordchecker");
      final FileSystem fileSystem = FileSystem.get(defaults);
      if (fileSystem.exists(checkerOutput)) {
        fileSystem.delete(checkerOutput, true);
      }
      FileOutputFormat.setOutputPath(checkerJob, checkerOutput);

      // Uncomment to run locally in a single process
      // checkerJob.set("mapred.job.tracker", "local");
      final Path[] configuredInputs = FileInputFormat.getInputPaths(checkerJob);
      final String banner =
          "\nSortValidator.RecordChecker: Running on "
              + cluster.getTaskTrackers()
              + " nodes to validate sort from "
              + configuredInputs[0]
              + ", "
              + configuredInputs[1]
              + " into "
              + FileOutputFormat.getOutputPath(checkerJob)
              + " with "
              + reduceTasks
              + " reduces.";
      System.out.println(banner);

      final Date started = new Date();
      System.out.println("Job started: " + started);
      JobClient.runJob(checkerJob);
      final Date finished = new Date();
      System.out.println("Job ended: " + finished);

      final long elapsedSeconds = (finished.getTime() - started.getTime()) / 1000;
      System.out.println("The job took " + elapsedSeconds + " seconds.");
    }
  /**
   * Verifies decommissioning of a blacklisted node: starts a two-tracker mini cluster, runs a
   * failing job so that one tracker is blacklisted, decommissions that tracker's host, and checks
   * the active and blacklisted tracker counts at each stage.
   */
  public void testBlacklistedNodeDecommissioning() throws Exception {
    LOG.info("Testing blacklisted node decommissioning");
    MiniMRCluster cluster = null;

    try {
      // Bring up the mini MR cluster.
      JobConf trackerConf = new JobConf();
      trackerConf.set("mapred.max.tracker.blacklists", "1");
      cluster = new MiniMRCluster(0, 0, 2, "file:///", 1, null, null, null, trackerConf);
      JobTracker tracker = cluster.getJobTrackerRunner().getJobTracker();

      // Baseline: two trackers, both active, none blacklisted.
      assertEquals("Trackers not up", 2, tracker.taskTrackers().size());
      int activeAtStart = tracker.getClusterStatus(false).getTaskTrackers();
      assertEquals("Active tracker count mismatch", 2, activeAtStart);
      int blacklistedAtStart = tracker.getClusterStatus(false).getBlacklistedTrackers();
      assertEquals("Blacklisted tracker count mismatch", 0, blacklistedAtStart);

      // Run a job that fails once so that a tracker ends up blacklisted.
      JobConf failingJobConf = cluster.createJobConf();
      failingJobConf.set("mapred.max.tracker.failures", "1");
      failingJobConf.setJobName("test-job-fail-once");
      failingJobConf.setMapperClass(FailOnceMapper.class);
      failingJobConf.setReducerClass(IdentityReducer.class);
      failingJobConf.setNumMapTasks(1);
      failingJobConf.setNumReduceTasks(0);

      Path jobInput = new Path(TEST_DIR, "in");
      Path jobOutput = new Path(TEST_DIR, "out");
      RunningJob failingJob = UtilsForTests.runJob(failingJobConf, jobInput, jobOutput);
      failingJob.waitForCompletion();

      // One tracker should now be blacklisted and one still active.
      int activeAfterJob = tracker.getClusterStatus(false).getTaskTrackers();
      assertEquals("Active tracker count mismatch", 1, activeAfterJob);
      int blacklistedAfterJob = tracker.getClusterStatus(false).getBlacklistedTrackers();
      assertEquals("Blacklisted tracker count mismatch", 1, blacklistedAfterJob);

      // Locate the first blacklisted tracker.
      String blacklistedTracker = null;
      for (TaskTrackerStatus candidate : tracker.taskTrackers()) {
        String candidateName = candidate.getTrackerName();
        if (tracker.isBlacklisted(candidateName)) {
          blacklistedTracker = candidate.getTrackerName();
          break;
        }
      }

      // Translate the tracker name into its host and decommission that host.
      String hostToDecommission = JobInProgress.convertTrackerNameToHostName(blacklistedTracker);
      LOG.info("Decommissioning tracker " + hostToDecommission);

      HashSet<String> hostsToDecommission = new HashSet<String>(1);
      hostsToDecommission.add(hostToDecommission);
      tracker.decommissionNodes(hostsToDecommission);

      // After decommissioning: one tracker left, nothing blacklisted.
      int activeAfterDecommission = tracker.getClusterStatus(false).getTaskTrackers();
      assertEquals("Tracker is not lost upon host decommissioning", 1, activeAfterDecommission);
      int blacklistedAfterDecommission =
          tracker.getClusterStatus(false).getBlacklistedTrackers();
      assertEquals(
          "Blacklisted tracker count incorrect in cluster status after decommissioning",
          0,
          blacklistedAfterDecommission);
      assertEquals(
          "Tracker is not lost upon host decommissioning", 1, tracker.taskTrackers().size());
    } finally {
      // Tear down only if the cluster actually came up.
      if (cluster != null) {
        cluster.shutdown();
        FileUtil.fullyDelete(new File(TEST_DIR.toString()));
      }
    }
  }
Пример #21
0
  /**
   * Runs the word count job (configured through {@code JobConf}) three times over input files
   * produced by {@code createWordsFile} and validates the job counters after each run: first with
   * 3 input files, 3 map tasks and 1 reduce task; then with a 4th input file and 4 map tasks; and
   * finally with the same input and 0 reduce tasks.
   *
   * <p>The test directory is deleted in a {@code finally} block whether or not the checks pass.
   *
   * @throws Exception if the test directory cannot be prepared, a job fails, or a validation
   *     helper throws
   */
  public void testOldJobWithMapAndReducers() throws Exception {
    JobConf conf = new JobConf(TestJobCounters.class);
    conf.setJobName("wordcount-map-reducers");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(WordCount.MapClass.class);
    conf.setCombinerClass(WordCount.Reduce.class);
    conf.setReducerClass(WordCount.Reduce.class);

    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(1);
    // NOTE(review): small sort buffer and merge factor, presumably chosen to force the
    // multi-level spills that the expected counter values below are derived from; confirm.
    conf.setInt("io.sort.mb", 1);
    conf.setInt("io.sort.factor", 2);
    conf.set("io.sort.record.percent", "0.05");
    conf.set("io.sort.spill.percent", "0.80");

    FileSystem fs = FileSystem.get(conf);
    Path testDir = new Path(TEST_ROOT_DIR, "countertest");
    conf.set("test.build.data", testDir.toString());
    try {
      // start from an empty test directory
      if (fs.exists(testDir)) {
        fs.delete(testDir, true);
      }
      if (!fs.mkdirs(testDir)) {
        throw new IOException("Mkdirs failed to create " + testDir.toString());
      }

      String inDir = testDir + File.separator + "genins" + File.separator;
      String outDir = testDir + File.separator;
      Path wordsIns = new Path(inDir);
      if (!fs.mkdirs(wordsIns)) {
        throw new IOException("Mkdirs failed to create " + wordsIns.toString());
      }

      // running total of the sizes of all input files created so far
      long inputSize = 0;
      // create 3 input files each with 5*2k words
      File inpFile = new File(inDir + "input5_2k_1");
      createWordsFile(inpFile);
      inputSize += inpFile.length();
      inpFile = new File(inDir + "input5_2k_2");
      createWordsFile(inpFile);
      inputSize += inpFile.length();
      inpFile = new File(inDir + "input5_2k_3");
      createWordsFile(inpFile);
      inputSize += inpFile.length();

      // the whole input directory is the job input, so files added to it later are picked up
      // by subsequent runs without changing the input paths
      FileInputFormat.setInputPaths(conf, inDir);
      Path outputPath1 = new Path(outDir, "output5_2k_3");
      FileOutputFormat.setOutputPath(conf, outputPath1);

      RunningJob myJob = JobClient.runJob(conf);
      Counters c1 = myJob.getCounters();
      // 3maps & in each map, 4 first level spills --- So total 12.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 3 Maps, total = 3*18=54k
      // Reduce: each of the 3 map o/p's(2k each) will be spilled in shuffleToDisk()
      //         So 3*2k=6k in 1st level; 2nd level:4k(2k+2k);
      //         3rd level directly given to reduce(4k+2k --- combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 6k+4k=10k
      // Total job counter will be 54k+10k = 64k

      // 3 maps and 2.5k lines --- So total 7.5k map input records
      // 3 maps and 10k words in each --- So total of 30k map output recs
      validateMapredCounters(c1, 64000, 7500, 30000);
      // NOTE(review): the meaning of the trailing arguments of validateMapredFileCounters is not
      // visible in this method; confirm against the helper.
      validateMapredFileCounters(c1, inputSize, inputSize, 0, 0, 0);

      // create a 4th input file with 5*2k words and test with 4 maps
      inpFile = new File(inDir + "input5_2k_4");
      createWordsFile(inpFile);
      inputSize += inpFile.length();
      conf.setNumMapTasks(4);
      Path outputPath2 = new Path(outDir, "output5_2k_4");
      FileOutputFormat.setOutputPath(conf, outputPath2);

      myJob = JobClient.runJob(conf);
      c1 = myJob.getCounters();
      // 4maps & in each map 4 first level spills --- So total 16.
      // spilled records count:
      // Each Map: 1st level:2k+2k+2k+2k=8k;2ndlevel=4k+4k=8k;
      //           3rd level=2k(4k from 1st level & 4k from 2nd level & combineAndSpill)
      //           So total 8k+8k+2k=18k
      // For 4 Maps, total = 4*18=72k
      // Reduce: each of the 4 map o/p's(2k each) will be spilled in shuffleToDisk()
      //         So 4*2k=8k in 1st level; 2nd level:4k+4k=8k;
      //         3rd level directly given to reduce(4k+4k --- combineAndSpill => 2k.
      //         So 0 records spilled to disk in 3rd level)
      //         So total of 8k+8k=16k
      // Total job counter will be 72k+16k = 88k

      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      validateMapredCounters(c1, 88000, 10000, 40000);
      validateMapredFileCounters(c1, inputSize, inputSize, 0, 0, 0);

      // check for a map only job
      conf.setNumReduceTasks(0);
      Path outputPath3 = new Path(outDir, "output5_2k_5");
      FileOutputFormat.setOutputPath(conf, outputPath3);

      myJob = JobClient.runJob(conf);
      c1 = myJob.getCounters();
      // 4 maps and 2.5k lines in each --- So 10k map input records
      // 4 maps and 10k words in each --- So 40k map output records
      // the first expected value (64k and 88k in the earlier runs) is 0 for this run
      validateMapredCounters(c1, 0, 10000, 40000);
      validateMapredFileCounters(c1, inputSize, inputSize, 0, -1, -1);
    } finally {
      // clean up the input and output files
      if (fs.exists(testDir)) {
        fs.delete(testDir, true);
      }
    }
  }
Пример #22
0
    /**
     * Runs the "sortvalidate-recordstats-checker" job over the sort input and the sort output and
     * compares the two statistics records it produces.
     *
     * <p>The job writes to {@code /tmp/sortvalidate/recordstatschecker} on the default filesystem
     * (deleted first if it exists) with a single reducer. Afterwards the first two records of
     * {@code part-00000} are read and their byte counts, record counts and checksums are compared.
     *
     * @param defaults base configuration for the job and for resolving filesystems
     * @param sortInput path that was the input of the sort
     * @param sortOutput path that holds the output of the sort
     * @throws IOException if the job or a filesystem operation fails, if fewer than two records
     *     can be read from the reducer output, or if the two records' statistics differ
     */
    static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
        throws IOException {
      FileSystem inputfs = sortInput.getFileSystem(defaults);
      FileSystem outputfs = sortOutput.getFileSystem(defaults);
      FileSystem defaultfs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      // Number of entries under the sort output accepted by sortPathsFilter; it is published to
      // the job and also used as the number of map tasks.
      int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
      int noSortInputpaths = inputfs.listStatus(sortInput).length;

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);

      jobConf.setNumMapTasks(noSortReduceTasks);
      jobConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (defaultfs.exists(outputPath)) {
        defaultfs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // jobConf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordStatsChecker: Validate sort "
              + "from "
              + inputPaths[0]
              + " ("
              + noSortInputpaths
              + " files), "
              + inputPaths[1]
              + " ("
              + noSortReduceTasks
              + " files) into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date endTime = new Date();
      System.out.println("Job ended: " + endTime);
      System.out.println(
          "The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      SequenceFile.Reader stats =
          new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
      try {
        if (!stats.next(k1, v1)) {
          throw new IOException("Failed to read record #1 from reduce's output");
        }
        if (!stats.next(k2, v2)) {
          throw new IOException("Failed to read record #2 from reduce's output");
        }
      } finally {
        // Release the reader whether or not both records could be read.
        stats.close();
      }

      if ((v1.getBytes() != v2.getBytes())
          || (v1.getRecords() != v2.getRecords())
          || v1.getChecksum() != v2.getChecksum()) {
        throw new IOException(
            "("
                + v1.getBytes()
                + ", "
                + v1.getRecords()
                + ", "
                + v1.getChecksum()
                + ") v/s ("
                + v2.getBytes()
                + ", "
                + v2.getRecords()
                + ", "
                + v2.getChecksum()
                + ")");
      }
    }