public static NaiveBayesModel materialize(Path output, Configuration conf) throws IOException {
    FileSystem fs = output.getFileSystem(conf);

    Vector weightsPerLabel = null;
    Vector perLabelThetaNormalizer = null;
    Vector weightsPerFeature = null;
    Matrix weightsPerLabelAndFeature;
    float alphaI;

    FSDataInputStream in = fs.open(new Path(output, "naiveBayesModel.bin"));
    try {
      alphaI = in.readFloat();
      weightsPerFeature = VectorWritable.readVector(in);
      weightsPerLabel = VectorWritable.readVector(in);
      perLabelThetaNormalizer = VectorWritable.readVector(in);

      weightsPerLabelAndFeature =
          new SparseMatrix(weightsPerLabel.size(), weightsPerFeature.size());
      for (int label = 0; label < weightsPerLabelAndFeature.numRows(); label++) {
        weightsPerLabelAndFeature.assignRow(label, VectorWritable.readVector(in));
      }
    } finally {
      Closeables.closeQuietly(in);
    }
    NaiveBayesModel model =
        new NaiveBayesModel(
            weightsPerLabelAndFeature,
            weightsPerFeature,
            weightsPerLabel,
            perLabelThetaNormalizer,
            alphaI);
    model.validate();
    return model;
  }
Ejemplo n.º 2
0
  @Override
  public int run(String[] args) throws Exception {
    String path = System.getProperty("user.dir");

    addInputOption();
    addOutputOption();

    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
        buildOption(
            TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Path labPath = new Path(path + "/../out/labelindex/");

    long labelSize = createLabelIndex(labPath);
    float alphaI = 1.0F;
    boolean trainComplementary = true;

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());
    HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
    HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
    HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances =
        prepareJob(
            new Path(path + "/../out/training"),
            new Path("/tmp/summedObservations"),
            SequenceFileInputFormat.class,
            IndexInstancesMapper.class,
            IntWritable.class,
            VectorWritable.class,
            VectorSumReducer.class,
            IntWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer =
        prepareJob(
            new Path("/tmp/summedObservations"),
            new Path("/tmp/weights"),
            SequenceFileInputFormat.class,
            WeightsMapper.class,
            Text.class,
            VectorWritable.class,
            VectorSumReducer.class,
            Text.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

    if (trainComplementary) {
      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight
      // Magnitude Errors
      Job thetaSummer =
          prepareJob(
              new Path("/tmp/summedObservations"),
              new Path("/tmp/thetas"),
              SequenceFileInputFormat.class,
              ThetaMapper.class,
              Text.class,
              VectorWritable.class,
              VectorSumReducer.class,
              Text.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      thetaSummer.setCombinerClass(VectorSumReducer.class);
      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
      thetaSummer
          .getConfiguration()
          .setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
      succeeded = thetaSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

    return 0;
  }