public int run(String[] args) throws Exception {

    if (args.length < 2) {
      printUsage();
      return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
  }
示例#2
0
  /**
   * Create an Aggregate based map/reduce job.
   *
   * @param args the arguments used for job creation. Generic hadoop arguments are accepted.
   * @return a JobConf object ready for submission.
   * @throws IOException
   * @see GenericOptionsParser
   */
  public static JobConf createValueAggregatorJob(String args[]) throws IOException {

    Configuration conf = new Configuration();

    GenericOptionsParser genericParser = new GenericOptionsParser(conf, args);
    args = genericParser.getRemainingArgs();

    if (args.length < 2) {
      System.out.println(
          "usage: inputDirs outDir " + "[numOfReducer [textinputformat|seq [specfile [jobName]]]]");
      GenericOptionsParser.printGenericCommandUsage(System.out);
      System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;
    if (args.length > 2) {
      numOfReducers = Integer.parseInt(args[2]);
    }

    Class<? extends InputFormat> theInputFormat = TextInputFormat.class;
    if (args.length > 3 && args[3].compareToIgnoreCase("textinputformat") == 0) {
      theInputFormat = TextInputFormat.class;
    } else {
      theInputFormat = SequenceFileInputFormat.class;
    }

    Path specFile = null;

    if (args.length > 4) {
      specFile = new Path(args[4]);
    }

    String jobName = "";

    if (args.length > 5) {
      jobName = args[5];
    }

    JobConf theJob = new JobConf(conf);
    if (specFile != null) {
      theJob.addResource(specFile);
    }
    String userJarFile = theJob.get("user.jar.file");
    if (userJarFile == null) {
      theJob.setJarByClass(ValueAggregator.class);
    } else {
      theJob.setJar(userJarFile);
    }
    theJob.setJobName("ValueAggregatorJob: " + jobName);

    FileInputFormat.addInputPaths(theJob, inputDir);

    theJob.setInputFormat(theInputFormat);

    theJob.setMapperClass(ValueAggregatorMapper.class);
    FileOutputFormat.setOutputPath(theJob, new Path(outputDir));
    theJob.setOutputFormat(TextOutputFormat.class);
    theJob.setMapOutputKeyClass(Text.class);
    theJob.setMapOutputValueClass(Text.class);
    theJob.setOutputKeyClass(Text.class);
    theJob.setOutputValueClass(Text.class);
    theJob.setReducerClass(ValueAggregatorReducer.class);
    theJob.setCombinerClass(ValueAggregatorCombiner.class);
    theJob.setNumMapTasks(1);
    theJob.setNumReduceTasks(numOfReducers);
    return theJob;
  }
示例#3
0
  /** Runs this tool. */
  @SuppressWarnings("deprecation")
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), Docnos2Titles.class);

    // Read commandline arguments
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
    }
    String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
    String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
    String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
    String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String eLang = cmdline.getOptionValue(ELANG_OPTION);
    String fLang = cmdline.getOptionValue(FLANG_OPTION);
    String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);
    job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

    FileInputFormat.addInputPaths(job, eCollectionPath);
    FileInputFormat.addInputPaths(job, fCollectionPath);
    FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
    DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("eLang", eLang);
    job.set("fLang", fLang);
    job.set("PwsimPairs", pwsimOutputPath);
    job.set("Ivory.SampleFile", samplesFile);

    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "3000");
    job.set("mapreduce.map.java.opts", "-Xmx3000m");
    job.setBoolean("mapred.map.tasks.speculative.execution", false);
    job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setNumMapTasks(100);
    job.setNumReduceTasks(1);
    job.setInt("mapred.min.split.size", 2000000000);
    job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(PairOfIntString.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("E-collection path: " + eCollectionPath);
    sLogger.info("F-collection path: " + fCollectionPath);
    sLogger.info("Pwsim output path: " + pwsimOutputPath);
    sLogger.info("Output path: " + titlePairsPath);
    sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    System.out.println(
        "Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }