Ejemplo n.º 1
0
  @Override
  public int run(String[] args) throws Exception {
    final int ret = parseArgs(args);
    if (ret < 0) {
      return ret;
    }

    JobConf config = new JobConf(getConf(), TfIdfNovelty.class);
    config.setJobName("Influence-TfIdfNovelty");

    config.set(Fields.BASIS.get(), basisPath);
    if (datesPath != null) {
      config.set(Fields.DOC_DATES.get(), datesPath);
    }
    config.setBoolean(Fields.IGNORE.get(), ignoreDocs);
    if (bands > 0) {
      config.setInt(Fields.BANDS.get(), bands);
    }
    if (rows > 0) {
      config.setInt(Fields.ROWS.get(), rows);
    }

    SetupHelper.getInstance()
        .setSequenceInput(config, inputPath)
        .setSequenceOutput(config, outputPath);

    config.setMapOutputKeyClass(HashBandWritable.class);
    config.setMapOutputValueClass(DocumentWithVectorWritable.class);
    config.setMapperClass(TfIdfNoveltyLshMapper.class);

    if (outputBuckets) {
      config.setOutputKeyClass(HashBandWritable.class);
      config.setOutputValueClass(IntArrayWritable.class);
      config.setReducerClass(TfIdfNoveltyIdentityReducer.class);
    } else {
      config.setOutputKeyClass(Text.class);
      config.setOutputValueClass(VectorWritable.class);
      config.setReducerClass(TfIdfNoveltyReducer.class);
    }

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    JobClient.runJob(config);

    return 0;
  }
Ejemplo n.º 2
0
  @SuppressWarnings("static-access")
  private int parseArgs(String[] args) {
    Options options = new Options();
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Tfidf vectors")
            .create(Fields.INPUT.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Vectors' length")
            .create(Fields.BASIS.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Near documents")
            .create(Fields.OUTPUT.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Document dates")
            .create(Fields.DOC_DATES.get()));
    options.addOption(
        OptionBuilder.withDescription("Ignore docs without NN").create(Fields.IGNORE.get()));
    options.addOption(
        OptionBuilder.withDescription("Output buckets").create(Fields.OUTPUT_BUCKETS.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Number of bands")
            .create(Fields.BANDS.get()));
    options.addOption(
        OptionBuilder.withArgName("path")
            .hasArg()
            .withDescription("Number of rows")
            .create(Fields.ROWS.get()));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(Fields.INPUT.get())
        || !cmdline.hasOption(Fields.OUTPUT.get())
        || !cmdline.hasOption(Fields.BASIS.get())
        || (!cmdline.hasOption(Fields.DOC_DATES.get())
            && !cmdline.hasOption(Fields.OUTPUT_BUCKETS.get()))) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    inputPath = cmdline.getOptionValue(Fields.INPUT.get());
    outputPath = cmdline.getOptionValue(Fields.OUTPUT.get());
    basisPath = cmdline.getOptionValue(Fields.BASIS.get());
    datesPath = cmdline.getOptionValue(Fields.DOC_DATES.get());

    ignoreDocs = false;
    if (cmdline.hasOption(Fields.IGNORE.get())) {
      ignoreDocs = true;
    }
    outputBuckets = false;
    if (cmdline.hasOption(Fields.OUTPUT_BUCKETS.get())) {
      outputBuckets = true;
    }
    bands = -1;
    if (cmdline.hasOption(Fields.BANDS.get())) {
      bands = Integer.parseInt(cmdline.getOptionValue(Fields.BANDS.get()));
    }
    rows = -1;
    if (cmdline.hasOption(Fields.ROWS.get())) {
      rows = Integer.parseInt(cmdline.getOptionValue(Fields.ROWS.get()));
    }

    logger.info("Tool name: " + this.getClass().getName());
    logger.info(" - input: " + inputPath);
    logger.info(" - basis: " + basisPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dates: " + datesPath);
    logger.info(" - ignore: " + ignoreDocs);
    logger.info(" - outputBuckets: " + outputBuckets);
    logger.info(" - bands: " + bands);
    logger.info(" - rows: " + rows);

    return 0;
  }