public int run(String[] args) throws Exception {
    if (args.length != 3) {
      System.err.println("Usage: " + getClass().getName() + " <input> <output> <nPopulation>");
      // ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }

    // Create a JobConf using the processed conf
    final JobConf jobConf = new JobConf(getConf(), getClass());

    // Specify various job-specific parameters
    jobConf.setJobName(MapreduceStringFinder.class.getSimpleName());

    // setting the input format
    jobConf.setInputFormat(Individuals.class);

    // setting the output key and value classes
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BooleanWritable.class);

    // setting the mapper class
    jobConf.setMapperClass(CIMapper.class);
    jobConf.setNumMapTasks(3); // setting the number of map tasks

    // setting the reducer class
    jobConf.setReducerClass(CIReducer.class);

    // setup input/output directories
    final String dataset = args[0];

    FileInputFormat.setInputPaths(jobConf, new Path(dataset));
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    final int pop = Integer.parseInt(args[2]);

    // based on the configuration, make this job multithreaded
    if (jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2) == 1) {
      jobConf.setMapRunnerClass(MultithreadedMapRunner.class);
      jobConf.setInt("mapred.map.multithreadedrunner.threads", 100);
    }
    // for computation-intensive work, do not let the job fail if the task tracker does not
    // respond with a heartbeat message before the timeout value
    final int timeout = 9000000;
    jobConf.setInt("mapred.task.timeout", timeout);

    // set the parameters to be available before a call to the mapper
    jobConf.setInt("popsize", pop);
    jobConf.setStrings("dataset", dataset);

    // int map = jobConf.getNumMapTasks();
    // System.out.println("Number of Maps"+ map);

    // start the map/reduce job
    System.out.println("Starting Job");

    // get the start time for this job
    final long startTime = System.currentTimeMillis();

    // Submit the job, then poll for progress until the job is complete
    JobClient.runJob(jobConf);

    // get the end time for this job
    final long endTime = System.currentTimeMillis();

    // get and report the duration of this job
    final double duration = (endTime - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    return 0;
  }
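This run method follows Hadoop's Tool contract, so a driver would normally hand it to ToolRunner, which strips the generic Hadoop options before delegating the remaining arguments. A minimal sketch of such a driver, assuming the enclosing class is MapreduceStringFinder and implements Tool (the driver class name is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class MapreduceStringFinderDriver {
  public static void main(String[] args) throws Exception {
    // ToolRunner consumes generic options (-D, -fs, -jt, ...) and passes
    // the remaining arguments to run(String[]) above
    int exitCode = ToolRunner.run(new Configuration(), new MapreduceStringFinder(), args);
    System.exit(exitCode);
  }
}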
  public JobBuilder mapRunner(Class<? extends MapRunner> mapRunner) throws IOException {
    _jobConf.setMapRunnerClass(mapRunner);
    _jobConf.setJarByClass(mapRunner);
    return this;
  }
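A hypothetical call site for this fluent builder; the JobBuilder constructor and build() method are assumptions, since only mapRunner(...) appears above:

// hypothetical usage; JobBuilder(JobConf) and build() are assumed here,
// and MyMapRunner stands in for any class extending MapRunner
JobConf conf = new JobBuilder(new JobConf())
    .mapRunner(MyMapRunner.class)
    .build();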
  public static void main(String[] args) {

    String accessKey = args[0];
    String secretKey = args[1];

    String[] paths = {
      // "2008/06",
      // "2008/07",
      // "2008/08",
      // "2008/09",
      // "2008/10",
      // "2008/11",
      "2009"
    };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

      LOG.info("Processing Path:" + paths[pathIndex]);

      JobConf job = new JobConf(S3GetMetdataJob.class);

      Path tempDir =
          new Path(
              job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

      LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
      System.out.println("Output Path is:" + tempDir);

      job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

      // setup s3 properties
      JetS3tARCSource.setMaxRetries(job, 1);
      // set up S3 credentials ...
      JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
      JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
      ARCSplitCalculator.setFilesPerSplit(job, 25);
      // set up arc reader properties
      ArcFileReader.setIOTimeoutValue(30000);
      // set input prefixes ...
      JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
      // and S3 bucket name ...
      JetS3tARCSource.setBucketName(job, "commoncrawl");
      // and setup arc source for ArcInputFormat
      ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
      // and set up input format ...
      job.setInputFormat(ARCInputFormat.class);
      // set map runner ...
      job.setMapRunnerClass(S3GetMetdataJob.class);
      // setup reducer (identity in this case ... )
      job.setReducerClass(IdentityReducer.class);
      // standard output format ...
      job.setOutputFormat(SequenceFileOutputFormat.class);
      // set output path
      FileOutputFormat.setOutputPath(job, tempDir);
      // map output types
      job.setMapOutputKeyClass(Text.class);
      job.setMapOutputValueClass(CrawlURLMetadata.class);
      // reduce output types
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(CrawlURLMetadata.class);
      // double the number of reducers ...
      // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

      // run the job ...
      try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());

        Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
        LOG.info("Copying Job Output to:" + finalPath);
        FileSystem fs = FileSystem.get(job);

        try {
          fs.mkdirs(finalPath.getParent());
          fs.rename(tempDir, finalPath);
          LOG.info("Copied Job Output to:" + finalPath);
        } finally {
          // fs.close();
        }

      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
      }
    }
  }
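This job registers S3GetMetdataJob itself as the map runner, so the class presumably implements the old-API MapRunnable interface. A minimal skeleton of such a runner, with illustrative types (the real job reads ARC records, whose input key/value classes are not shown above):

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class MetadataMapRunner implements MapRunnable<Text, Text, Text, CrawlURLMetadata> {
  public void configure(JobConf job) {
    // read job-level settings here if needed
  }

  public void run(RecordReader<Text, Text> input,
                  OutputCollector<Text, CrawlURLMetadata> output,
                  Reporter reporter) throws IOException {
    Text key = input.createKey();
    Text value = input.createValue();
    // a MapRunnable drives the record loop itself rather than having the
    // framework call map() once per record
    while (input.next(key, value)) {
      output.collect(key, new CrawlURLMetadata()); // placeholder metadata
      reporter.progress();
    }
  }
}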
Example #4
  @SuppressWarnings("static-access")
  @Override
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("tmp output directory")
            .create(OUTPUT_OPTION));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("index file")
            .create(INDEX_FILE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)
        || !cmdline.hasOption(OUTPUT_OPTION)
        || !cmdline.hasOption(INDEX_FILE_OPTION)) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    if (!inputPath.isAbsolute()) {
      System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
      return -1;
    }

    JobConf conf = new JobConf(getConf(), BuildWikipediaForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(
        String.format(
            "BuildWikipediaForwardIndex[%s: %s, %s: %s]",
            INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // delete the output directory if it exists already
    fs.delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF("edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex");
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
      String[] arr = line.toString().split("\\s+");

      int docno = Integer.parseInt(arr[0]);
      int offset = Integer.parseInt(arr[1]);
      short fileno = Short.parseShort(arr[2]);

      out.writeInt(docno);
      out.writeInt(offset);
      out.writeShort(fileno);

      cnt++;

      if (cnt % 100000 == 0) {
        LOG.info(cnt + " blocks written");
      }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
      throw new RuntimeException("Error: mismatch in block count!");
    }

    return 0;
  }
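The index file written above is a plain DataOutput stream: two UTF strings (the forward-index class name and the collection path), the block count, then one (docno, offset, fileno) triple per block. A sketch of the symmetric reader, assuming the same fs and indexFile as in the method above:

// reads the layout this tool writes: writeUTF, writeUTF, writeInt(count),
// then per block writeInt(docno), writeInt(offset), writeShort(fileno)
FSDataInputStream in = fs.open(new Path(indexFile));
String indexClass = in.readUTF();      // e.g. ...WikipediaForwardIndex
String collectionPath = in.readUTF();  // the SequenceFile input path
int nBlocks = in.readInt();
for (int i = 0; i < nBlocks; i++) {
  int docno = in.readInt();
  int offset = in.readInt();
  short fileno = in.readShort();
  // docno -> (fileno, offset) locates the block holding that document
}
in.close();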