Esempio n. 1
0
  /** Runs this tool. */
  @SuppressWarnings("deprecation")
  public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf(), Docnos2Titles.class);

    // Read commandline arguments
    CommandLine cmdline = parseArgs(args);
    if (cmdline == null) {
      printUsage();
    }
    String eCollectionPath = cmdline.getOptionValue(ECOLLECTION_OPTION);
    String fCollectionPath = cmdline.getOptionValue(FCOLLECTION_OPTION);
    String pwsimOutputPath = cmdline.getOptionValue(PWSIM_OPTION);
    String titlePairsPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION);
    String eLang = cmdline.getOptionValue(ELANG_OPTION);
    String fLang = cmdline.getOptionValue(FLANG_OPTION);
    String samplesFile = cmdline.getOptionValue(SAMPLEDOCNOS_OPTION);
    job.setJobName("Docnos2Titles_" + fLang + "-" + eLang);

    FileInputFormat.addInputPaths(job, eCollectionPath);
    FileInputFormat.addInputPaths(job, fCollectionPath);
    FileOutputFormat.setOutputPath(job, new Path(titlePairsPath));
    DistributedCache.addCacheFile(new URI(pwsimOutputPath), job);
    DistributedCache.addCacheFile(new URI(samplesFile), job);
    job.set("eLang", eLang);
    job.set("fLang", fLang);
    job.set("PwsimPairs", pwsimOutputPath);
    job.set("Ivory.SampleFile", samplesFile);

    job.setInt("mapred.task.timeout", 60000000);
    job.set("mapreduce.map.memory.mb", "3000");
    job.set("mapreduce.map.java.opts", "-Xmx3000m");
    job.setBoolean("mapred.map.tasks.speculative.execution", false);
    job.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setNumMapTasks(100);
    job.setNumReduceTasks(1);
    job.setInt("mapred.min.split.size", 2000000000);
    job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(PairOfIntString.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    sLogger.info("Running job " + job.getJobName() + "...");
    sLogger.info("E-collection path: " + eCollectionPath);
    sLogger.info("F-collection path: " + fCollectionPath);
    sLogger.info("Pwsim output path: " + pwsimOutputPath);
    sLogger.info("Output path: " + titlePairsPath);
    sLogger.info("Sample file?: " + ((samplesFile != null) ? samplesFile : "none"));

    long startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    System.out.println(
        "Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
  }
Esempio n. 2
0
  @Override
  public int run(String[] args) throws IOException {
    OptionParser p = new OptionParser();
    OptionSpec<String> maxwiOpt =
        p.accepts(maxwiOptName, "location of maxWi map file (HDFS) REQUIRED")
            .withRequiredArg()
            .ofType(String.class);
    OptionSpec<Float> thresholdOpt =
        p.accepts(thresholdOptName, "similarity threshold")
            .withRequiredArg()
            .ofType(Float.class)
            .defaultsTo(DEFAULT_THRESHOLD);
    OptionSpec<Integer> stripesOpt =
        p.accepts(stripesOptName, "number of stripes to divide the similarity matrix")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(1);
    OptionSpec<Integer> spreadOpt =
        p.accepts(spreadOptName, "number of reducers per stripe")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_SPREAD);
    OptionSpec<Integer> factorOpt =
        p.accepts(factorOptName, "number of mappers per reducer")
            .withRequiredArg()
            .ofType(Integer.class)
            .defaultsTo(DEFAULT_FACTOR);
    OptionSpec<Integer> maxVectorIDOpt =
        p.accepts(maxVectorIDOptName, "maximum vector ID").withRequiredArg().ofType(Integer.class);
    p.acceptsAll(Arrays.asList("h", "?"), "show help");

    OptionSet options = parseOptions(p, args);

    // to distinguish indexes built in successive runs
    DateFormat df = new SimpleDateFormat("yyyyMMdd-HHmmss");
    Date date = new Date();

    float threshold = options.valueOf(thresholdOpt); // threshold
    if (threshold < 0 || threshold >= 1) {
      System.err.println(thresholdOptName + " should be between 0 and 1");
      System.exit(1);
    }

    int numStripes = options.valueOf(stripesOpt); // number of stripes
    if (numStripes < 1) {
      System.err.println(stripesOptName + " should be > 0");
      System.exit(1);
    }

    // MapReduce parameters
    int spread = options.valueOf(spreadOpt); // how many reducers per stripe
    if (spread < 1) {
      System.err.println(spreadOptName + " should be > 0");
      System.exit(1);
    }

    int factor = options.valueOf(factorOpt); // how many mappers per reducer
    if (factor < 1) {
      System.err.println(factorOptName + " should be > 0");
      System.exit(1);
    }

    int maxKey = 0;
    if (options.has(maxVectorIDOpt)) {
      maxKey = options.valueOf(maxVectorIDOpt); // maximum value of the vector ID
      if (maxKey < 1) {
        System.err.println(maxVectorIDOptName + " should be > 0");
        System.exit(1);
      }
    }

    int numReducers = GenericKey.StripePartitioner.numReducers(numStripes, spread);
    int numMappers = numReducers * factor;
    int numBuckets = numMappers;

    // pick the file with max weights from command line
    String maxWiDir = options.valueOf(maxwiOpt);
    List<String> nonOptArgs = options.nonOptionArguments();

    LOG.info("Threshold set to " + threshold);
    LOG.info(
        String.format(
            "Buckets: %1$-10s Factor: %2$-10s Stripes: %3$-10s Spread: %4$-10s Reducers: %5$-10s",
            numBuckets, factor, numStripes, spread, numReducers));

    // start building the jobs
    JobConf conf1 = new JobConf(getConf(), Similarity.class);
    conf1.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf1.setInt(PARAM_APS_STRIPES, numStripes);
    DistributedCache.addCacheFile(URI.create(maxWiDir), conf1);

    Path inputPath = new Path(nonOptArgs.get(0));
    Path indexPath =
        new Path(
            nonOptArgs.get(0) + "-index-" + threshold + "-s" + numStripes + "_" + df.format(date));
    // index filtering pruned nested directory
    Path indexOnlyPath = new Path(indexPath, "part*");
    Path outputPath = new Path(nonOptArgs.get(1) + "-" + threshold + "-s" + numStripes);
    FileInputFormat.setInputPaths(conf1, inputPath);
    FileOutputFormat.setOutputPath(conf1, indexPath);

    conf1.setInputFormat(SequenceFileInputFormat.class);
    conf1.setOutputFormat(SequenceFileOutputFormat.class);
    conf1.setMapOutputKeyClass(LongWritable.class);
    conf1.setMapOutputValueClass(IndexItem.class);
    conf1.setOutputKeyClass(LongWritable.class);
    conf1.setOutputValueClass(IndexItemArrayWritable.class);
    conf1.setMapperClass(IndexerMapper.class);
    conf1.setReducerClass(IndexerReducer.class);

    // assuming input is sorted according to the key (vectorID) so that the
    // part files are locally sorted
    MultipleOutputs.addNamedOutput(
        conf1,
        PRUNED,
        SequenceFileOutputFormat.class,
        IntWritable.class,
        VectorComponentArrayWritable.class);

    // remove the stuff we added from the job name
    conf1.set(
        "mapred.job.name",
        "APS-" + indexPath.getName().substring(0, indexPath.getName().length() - 16));
    conf1.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf1.setSpeculativeExecution(false);
    conf1.setCompressMapOutput(true);
    // hash the posting lists in different buckets to distribute the load
    conf1.setNumReduceTasks(numBuckets);

    RunningJob job1 = JobClient.runJob(conf1);

    // part 2
    JobConf conf2 = new JobConf(getConf(), Similarity.class);

    if (numStripes > 0) FileUtils.mergeRestFile(conf2, indexPath, PRUNED, INDEX_INTERVAL);

    MultipleInputs.addInputPath(
        conf2, indexOnlyPath, SequenceFileInputFormat.class, SimilarityMapperIndex.class);
    MultipleInputs.addInputPath(
        conf2, inputPath, SequenceFileInputFormat.class, SimilarityMapperInput.class);
    FileOutputFormat.setOutputPath(conf2, outputPath);
    conf2.setCombinerClass(SimilarityCombiner.class);
    conf2.setReducerClass(SimilarityReducer.class);
    conf2.setPartitionerClass(GenericKey.StripePartitioner.class);
    conf2.setOutputKeyComparatorClass(GenericKey.Comparator.class);
    conf2.setOutputValueGroupingComparator(GenericKey.PrimaryComparator.class);
    conf2.setMapOutputKeyClass(GenericKey.class);
    conf2.setMapOutputValueClass(GenericValue.class);
    conf2.setOutputKeyClass(VectorPair.class);
    conf2.setOutputValueClass(NullWritable.class);

    Counter numDocs =
        job1.getCounters()
            .findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS");
    maxKey = maxKey > 0 ? maxKey : (int) numDocs.getValue();
    LOG.info("Setting max key value in input to " + maxKey);
    conf2.setInt(PARAM_APS_MAXKEY, maxKey);

    conf2.setInt(PARAM_APS_STRIPES, numStripes);
    conf2.setFloat(PARAM_APS_THRESHOLD, threshold);
    conf2.setInt(PARAM_APS_REDUCER_PER_STRIPE, spread);
    conf2.set("mapred.job.name", "APS-" + outputPath.getName());

    conf2.setNumTasksToExecutePerJvm(-1); // JVM reuse
    conf2.setSpeculativeExecution(false);
    conf2.setCompressMapOutput(true);
    conf2.setNumReduceTasks(numReducers);

    JobClient.runJob(conf2);

    return 0;
  }
 public JobBuilder delayReducersUntil(float pctMappersComplete) throws IOException {
   _jobConf.setFloat("mapred.reduce.slowstart.completed.maps", pctMappersComplete);
   return this;
 }