Ejemplo n.º 1
0
  public int run(String[] args) throws Exception {

    if (args.length != 5) {
      printUsage();
      return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    String stoplistPath = args[4];

    sLogger.info("Tool: AFormatter");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(AFormatterWG.class);
    conf.setJobName("Authority Formatter -- Web Graph");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    // conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HITSNode.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setCompressMapOutput(true);
    conf.setSpeculativeExecution(false);
    // InputSampler.Sampler<IntWritable, Text> sampler = new
    // InputSampler.RandomSampler<IntWritable, Text>(0.1, 10, 10);
    // InputSampler.writePartitionFile(conf, sampler);
    // conf.setPartitionerClass(TotalOrderPartitioner.class);
    conf.setMapperClass(AFormatMapperIMC.class);
    conf.setCombinerClass(AFormatReducer.class);
    conf.setReducerClass(AFormatReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    Path stopList = new Path(stoplistPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    sLogger.info("Starting job");
    DistributedCache.addCacheFile(stopList.toUri(), conf);
    JobClient.runJob(conf);
    sLogger.info(
        "Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }
Ejemplo n.º 2
0
  @Override
  public int run(String[] args) throws Exception {

    System.out.println("\n\nConvolutionJob\n");
    JobConf conf = new JobConf(getConf(), ConvolutionJob.class);
    conf.setJobName("ConvolutionJob");

    this.cacheKernel(conf);
    this.CreateRats(conf);
    conf.setMapperClass(ConvolutionMapper.class);
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
      try {
        if ("-m".equals(args[i])) {
          conf.setNumMapTasks(Integer.parseInt(args[++i]));
        } else if ("-r".equals(args[i])) {
          conf.setNumReduceTasks(Integer.parseInt(args[++i]));
        } else {
          other_args.add(args[i]);
        }
      } catch (NumberFormatException except) {
        System.out.println("ERROR: Integer expected instead of " + args[i]);
        return printUsage();
      } catch (ArrayIndexOutOfBoundsException except) {
        System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
        return printUsage();
      }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
      System.out.println(
          "ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
      return printUsage();
    }

    conf.setNumReduceTasks(0);
    conf.setInputFormat(NonSplittableTextInputFormat.class);
    conf.setOutputFormat(MultiFileOutput.class);
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setCompressMapOutput(true);
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.type", "BLOCK");

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
    // FileOutputFormat.setCompressOutput(conf, true);

    JobClient.runJob(conf);

    return 0;
  }
Ejemplo n.º 3
0
  @Override
  public int run(String[] args) throws IOException {
    OptionParser p = new OptionParser();
    OptionSpec<String> maxwiOpt =
        p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
            .withRequiredArg()
            .ofType(String.class);
    OptionSpec<Float> thresholdOpt =
        p.accepts(thresholdOptName, "similarity threshold")
            .withRequiredArg()
            .ofType(Float.class)
            .defaultsTo(DEFAULT_THRESHOLD);
    OptionSpec<Integer> stripesOpt =
        p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(1);
    OptionSpec<Integer> spreadOpt =
        p.accepts(spreadOptName, "number of reducers per stripe")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_SPREAD);
    OptionSpec<Integer> factorOpt =
        p.accepts(factorOptName, "number of mappers per reducer")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_FACTOR);
    OptionSpec<Integer> maxVectorIDOpt =
        p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
    p.acceptsAll(Arrays.asList("h", "?"), "show help");

    OptionSet options = parseOptions(p, args);

    // to distinguish indexes built in successive runs
    DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
    Date date = new Date();

    float threshold = options.valueOf(thresholdOpt); // threshold
    if (threshold < 0 || threshold >= 1) {
      System.err.println(thresholdOptName + " should be between 0 and 1");
      System.exit(1);
    }

    int numStripes = options.valueOf(stripesOpt); // number of stripes
    if (numStripes < 1) {
      System.err.println(stripesOptName + " should be > 0");
      System.exit(1);
    }

    // MapReduce parameters
    int spread = options.valueOf(spreadOpt); // how many reducers per stripe
    if (spread < 1) {
      System.err.println(spreadOptName + " should be > 0");
      System.exit(1);
    }

    int factor = options.valueOf(factorOpt); // how many mappers per reducer
    if (factor < 1) {
      System.err.println(factorOptName + " should be > 0");
      System.exit(1);
    }

    int maxKey = 0;
    if (options.has(maxVectorIDOpt)) {
      maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
      if (maxKey < 1) {
        System.err.println(maxVectorIDOptName + " should be > 0");
        System.exit(1);
      }
    }

    int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
    int numMappers = numReducers * factor;
    int numBuckets = numMappers;

    // pick the file with max weights from command line
    String maxWiDir = options.valueOf(maxwiOpt);
    List<String> nonOptArgs = options.nonOptionArguments();

    LOG.info("Threshold set to " + threshold);
    LOG.info(
        String.format(
            "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
            numBuckets, factor, numStripes, spread, numReducers));

    // start building the jobs
    JobConf conf1 = new JobConf(getConf(), Similarity.class);
    conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf1.setInt(PARAM_APS_STRIPES, numStripes);
    DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

    Path inputPath = new Path(nonOptArgs.get(0));
    Path indexPath =
        new Path(
            nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
    // index filtering pruned nested directory
    Path indexOnlyPath = new Path(indexPath, "part*");
    Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);
    FileInputFormat.setInputPaths(conf1, inputPath);
    FileOutputFormat.setOutputPath(conf1, indexPath);

    conf1.setInputFormat(SequenceFileInputFormat.class);
    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setMapOutputKeyClass(LongWritable.class);
    conf1.setMapOutputValueClass(IndexItem.class);
    conf1.setOutputKeyClass(LongWritable.class);
    conf1.setOutputValueClass(IndexItemArrayWritable.class);
    conf1.setMapperClass(IndexerMapper.class);
    conf1.setReducerClass(IndexerReducer.class);

    // assuming input is sorted according to the key (vectorID) so that the
    // part files are locally sorted
    MultipleOutputs.addNamedOutput(
        conf1,
        PRUNED,
        SequenceFileOutputFormat.class,
        IntWritable.class,
        VectorComponentArrayWritable.class);

    // remove the stuff we added from the job name
    conf1.set(
        "mapred.job.name",
        "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
    conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf1.setSpeculativeExecution(false);
    conf1.setCompressMapOutput(true);
    // hash the posting lists in different buckets to distribute the load
    conf1.setNumReduceTasks(numBuckets);

    RunningJob job1 = JobClient.runJob(conf1);

    // part 2
    JobConf conf2 = new JobConf(getConf(), Similarity.class);

    if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

    MultipleInputs.addInputPath(
        conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
    MultipleInputs.addInputPath(
        conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
    FileOutputFormat.setOutputPath(conf2, outputPath);
    conf2.setCombinerClass(SimilarityCombiner.class);
    conf2.setReducerClass(SimilarityReducer.class);
    conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
    conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
    conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
    conf2.setMapOutputKeyClass(GenericKey.class);
    conf2.setMapOutputValueClass(GenericValue.class);
    conf2.setOutputKeyClass(VectorPair.class);
    conf2.setOutputValueClass(NullWritable.class);

    Counter numDocs =
        job1.getCounters()
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
    maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
    LOG.info("Setting max key value in input to " + maxKey);
    conf2.setInt(PARAM_APS_MAXKEY, maxKey);

    conf2.setInt(PARAM_APS_STRIPES, numStripes);
    conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
    conf2.set("mapred.job.name", "APS-" + outputPath.getName());

    conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf2.setSpeculativeExecution(false);
    conf2.setCompressMapOutput(true);
    conf2.setNumReduceTasks(numReducers);

    JobClient.runJob(conf2);

    return 0;
  }
 public JobBuilder compressMapOutput(boolean compress) throws IOException {
   _jobConf.setCompressMapOutput(compress);
   return this;
 }