Пример #1
0
  /**
   * Parses the command line, configures the MapReduce job and runs it to completion.
   *
   * <p>The job uses {@code Map} as mapper and {@code GreeDiReducer} as reducer, with one reduce
   * task per partition. The partition and select counts are published through the job
   * configuration so that tasks can read them. Any existing output directory is deleted before
   * the job is submitted.
   *
   * @param args command-line arguments, forwarded to {@code parseArgs}
   * @return the negative code from {@code parseArgs} if the arguments are invalid, {@code 0} if
   *     the job completed successfully, {@code 1} if the job failed
   * @throws Exception if job setup, output cleanup or job execution fails
   */
  @Override
  public int run(String[] args) throws Exception {
    final int ret = parseArgs(args);
    if (ret < 0) {
      return ret;
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(GreeDiFirst.class);
    job.setJobName(String.format("Coverage-GreeDiFirst[%s %s]", partitionCount, selectCount));

    job.getConfiguration().setInt(Fields.PARTITION_COUNT.get(), partitionCount);
    job.getConfiguration().setInt(Fields.SELECT_COUNT.get(), selectCount);

    // One reducer per partition.
    job.setNumReduceTasks(partitionCount);

    SetupHelper.getInstance().setSequenceInput(job, inputPath).setTextOutput(job, outputPath);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DocumentWithVectorWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(GreeDiReducer.class);

    // Delete the output directory if it exists already. The filesystem is resolved from the
    // path itself so that an output location outside the default filesystem still works.
    final Path output = new Path(outputPath);
    output.getFileSystem(getConf()).delete(output, true);

    // Propagate job failure to the caller instead of always reporting success.
    return job.waitForCompletion(true) ? 0 : 1;
  }
Пример #2
0
    /**
     * Routes each record to a partition derived from its numeric key.
     *
     * <p>The key text is parsed as an integer and reduced modulo the partition count read from
     * the job configuration (falling back to {@code Defaults.PARTITION_COUNT}). The record is
     * emitted under that partition id, wrapped together with its key in a
     * {@code DocumentWithVectorWritable}.
     *
     * @param key record key; its text must be a parseable integer
     * @param value vector associated with the key
     * @param context task context used to read the configuration and emit output
     * @throws NumberFormatException if the key text is not a valid integer
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(Text key, VectorWritable value, Context context)
        throws IOException, InterruptedException {
      final int partitionCount =
          context
              .getConfiguration()
              .getInt(Fields.PARTITION_COUNT.get(), Defaults.PARTITION_COUNT.get());

      // Java's % keeps the sign of the dividend, so a negative id would produce a negative
      // partition. Normalise into [0, partitionCount); partitionCount is expected to be positive.
      final int remainder = Integer.parseInt(key.toString()) % partitionCount;
      final int partition = remainder < 0 ? remainder + partitionCount : remainder;
      final IntWritable outKey = new IntWritable(partition);

      final DocumentWithVectorWritable outValue = new DocumentWithVectorWritable(key, value);

      context.write(outKey, outValue);
    }
Пример #3
0
  /**
   * Parses the command-line options and stores them in the tool's fields.
   *
   * <p>Input path, output path and select count are required; if any of them is missing the
   * usage help is printed. The partition count is optional and falls back to
   * {@code Defaults.PARTITION_COUNT}; when given it must be a positive integer.
   *
   * @param args raw command-line arguments
   * @return {@code 0} if all options were parsed successfully, {@code -1} if the command line is
   *     malformed, a required option is missing, or an integer option is invalid
   */
  @SuppressWarnings("static-access")
  private int parseArgs(String[] args) {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Tfidf vectors")
            .create(Fields.INPUT.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Selected articles")
            .create(Fields.OUTPUT.get()));
    options.addOption(
        OptionBuilder.withArgName("integer")
            .hasArg()
            .withDescription("Partition count")
            .create(Fields.PARTITION_COUNT.get()));
    options.addOption(
        OptionBuilder.withArgName("integer")
            .hasArg()
            .withDescription("Select count")
            .create(Fields.SELECT_COUNT.get()));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      logger.fatal("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    // The select count has no default, so it is required just like the input and output paths;
    // without this check a missing value would reach Integer.parseInt as null.
    if (!cmdline.hasOption(Fields.INPUT.get())
        || !cmdline.hasOption(Fields.OUTPUT.get())
        || !cmdline.hasOption(Fields.SELECT_COUNT.get())) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    inputPath = cmdline.getOptionValue(Fields.INPUT.get());
    outputPath = cmdline.getOptionValue(Fields.OUTPUT.get());

    partitionCount = Defaults.PARTITION_COUNT.get();
    try {
      if (cmdline.hasOption(Fields.PARTITION_COUNT.get())) {
        partitionCount = Integer.parseInt(cmdline.getOptionValue(Fields.PARTITION_COUNT.get()));
        if (partitionCount <= 0) {
          System.err.println("Error: \"" + partitionCount + "\" has to be positive!");
          return -1;
        }
      }
      selectCount = Integer.parseInt(cmdline.getOptionValue(Fields.SELECT_COUNT.get()));
    } catch (NumberFormatException exp) {
      // Report a malformed integer as a usage error instead of crashing with a stack trace.
      logger.fatal("Error parsing integer option: " + exp.getMessage());
      return -1;
    }

    logger.info("Tool name: " + this.getClass().getName());
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - partitions: " + partitionCount);
    logger.info(" - select: " + selectCount);

    return 0;
  }