Beispiel #1
0
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
  }
Beispiel #2
0
  @Override
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption(MinhashOptionCreator.minClusterSizeOption().create());
    addOption(MinhashOptionCreator.minVectorSizeOption().create());
    addOption(MinhashOptionCreator.hashTypeOption().create());
    addOption(MinhashOptionCreator.numHashFunctionsOption().create());
    addOption(MinhashOptionCreator.keyGroupsOption().create());
    addOption(MinhashOptionCreator.numReducersOption().create());
    addOption(MinhashOptionCreator.debugOutputOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(getConf(), output);
    }
    int minClusterSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
    int minVectorSize = Integer.valueOf(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
    String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
    int numHashFunctions = Integer.valueOf(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
    int keyGroups = Integer.valueOf(getOption(MinhashOptionCreator.KEY_GROUPS));
    int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
    boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);

    runJob(
        input,
        output,
        minClusterSize,
        minVectorSize,
        hashType,
        numHashFunctions,
        keyGroups,
        numReduceTasks,
        debugOutput);
    return 0;
  }
Beispiel #3
0
  @Override
  public int run(String[] args) throws Exception {
    String path = System.getProperty("user.dir");

    addInputOption();
    addOutputOption();

    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
        buildOption(
            TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Path labPath = new Path(path + "/../out/labelindex/");

    long labelSize = createLabelIndex(labPath);
    float alphaI = 1.0F;
    boolean trainComplementary = true;

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());
    HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
    HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
    HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances =
        prepareJob(
            new Path(path + "/../out/training"),
            new Path("/tmp/summedObservations"),
            SequenceFileInputFormat.class,
            IndexInstancesMapper.class,
            IntWritable.class,
            VectorWritable.class,
            VectorSumReducer.class,
            IntWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer =
        prepareJob(
            new Path("/tmp/summedObservations"),
            new Path("/tmp/weights"),
            SequenceFileInputFormat.class,
            WeightsMapper.class,
            Text.class,
            VectorWritable.class,
            VectorSumReducer.class,
            Text.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

    if (trainComplementary) {
      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight
      // Magnitude Errors
      Job thetaSummer =
          prepareJob(
              new Path("/tmp/summedObservations"),
              new Path("/tmp/thetas"),
              SequenceFileInputFormat.class,
              ThetaMapper.class,
              Text.class,
              VectorWritable.class,
              VectorSumReducer.class,
              Text.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      thetaSummer.setCombinerClass(VectorSumReducer.class);
      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
      thetaSummer
          .getConfiguration()
          .setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
      succeeded = thetaSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

    return 0;
  }
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + VectorSimilarityMeasures.list()
            + ')');
    addOption(
        "maxSimilaritiesPerRow",
        "m",
        "Number of maximum similarities per row (default: "
            + DEFAULT_MAX_SIMILARITIES_PER_ROW
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption(
        "excludeSelfSimilarity",
        "ess",
        "compute similarity of rows to themselves?",
        String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int numberOfColumns;

    if (hasOption("numberOfColumns")) {
      // Number of columns explicitly specified via CLI
      numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
      // else get the number of columns by determining the cardinality of a vector in the input
      // matrix
      numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
      similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
      similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      // Clear the temp path
      HadoopUtil.delete(getConf(), getTempPath());
      // Clear the output path
      HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold =
        hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job normsAndTranspose =
          prepareJob(
              getInputPath(),
              weightsPath,
              VectorNormMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class);
      normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
      Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
      normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
      normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
      normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      boolean succeeded = normsAndTranspose.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job pairwiseSimilarity =
          prepareJob(
              weightsPath,
              pairwiseSimilarityPath,
              CooccurrencesMapper.class,
              IntWritable.class,
              VectorWritable.class,
              SimilarityReducer.class,
              IntWritable.class,
              VectorWritable.class);
      pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
      Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
      pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
      pairwiseConf.set(NORMS_PATH, normsPath.toString());
      pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
      pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
      boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job asMatrix =
          prepareJob(
              pairwiseSimilarityPath,
              getOutputPath(),
              UnsymmetrifyMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeToTopKSimilaritiesReducer.class,
              IntWritable.class,
              VectorWritable.class);
      asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
      asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
      boolean succeeded = asMatrix.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }