Example #1
  /**
   * Run the job
   *
   * @param params the job parameters containing the gramSize, input/output folders, defaultCat
   *     and encoding
   */
  public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));
    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

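    // this conf parameter needs to be set to enable serialization of conf values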
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
  }
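  // Hypothetical invocation sketch, not part of the original driver: only "testDirPath" is read by
  // runJob above; the remaining keys ("gramSize", "defaultCat", "encoding") follow the Javadoc, and
  // the Parameters no-arg constructor and set(String, String) call are assumptions.
  public static void runClassifierExample() throws IOException {
    Parameters params = new Parameters();
    params.set("testDirPath", "/user/mahout/bayes-test-input");
    params.set("gramSize", "1");
    params.set("defaultCat", "unknown");
    params.set("encoding", "UTF-8");
    runJob(params);
  }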
Example #2
  /**
   * @param args command-line arguments (not used; all settings come from {@code Config})
   * @throws Exception if any stage of the clustering pipeline fails
   */
  public static void main(String args[]) throws Exception {

    log.setLevel(Level.INFO);

    Configuration v_HadoopConf = new Configuration();
    s_HadoopConf = v_HadoopConf;
    s_Config = Config.getInstance();
    String v_PathPrefix = s_Config.getBaseDir() + s_Config.getDataDir() + "mahout/";
    String v_TextDir = s_Config.getTextPath();

    Path m_DocumentDir = new Path(v_TextDir);
    Path m_SequenceDir = new Path(v_PathPrefix, "sequence/");
    Path m_TokensDir = new Path(v_PathPrefix, "tokens");
    Path m_TF = new Path(v_PathPrefix, "termfreq/");
    String m_VecFolder = "Vectors";
    Path m_tf_idf = new Path(v_PathPrefix, "tfidf/");

    boolean m_Sequential = true;

    HadoopUtil.delete(v_HadoopConf, new Path(v_PathPrefix, "clusters/"));

    if (!s_Config.getReuseTFIDF()) {
      createTFIDF(
          v_HadoopConf, m_DocumentDir, m_SequenceDir, m_TokensDir, m_TF, m_VecFolder, m_tf_idf);
    }

    HierarchicalKMeansClusterer v_Hkmc = new HierarchicalKMeansClusterer();
    SetTree<ClusterDescriptor> v_Tree = v_Hkmc.run(s_HadoopConf, m_Sequential);

    saveAsTree(v_Tree);

    saveAsXml(v_Tree);
  }
Example #3
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
  }
Example #4
  /**
   * Count the document frequencies of features in parallel using Map/Reduce. The input documents
   * have to be in {@link SequenceFile} format
   */
  private static void startDFCounting(Path input, Path output, Configuration baseConf)
      throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input);
    job.setJarByClass(TFIDFConverter.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermDocumentCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermDocumentCountReducer.class);
    job.setReducerClass(TermDocumentCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
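  // Illustrative helper, not part of the original class: read back the document-frequency counts
  // written above (IntWritable feature id -> LongWritable document frequency, as configured on the
  // job), using only the plain Hadoop SequenceFile.Reader API.
  private static Map<Integer, Long> readDocumentFrequencies(Path output, Configuration conf)
      throws IOException {
    Map<Integer, Long> df = new HashMap<Integer, Long>();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    for (FileStatus status : fs.globStatus(new Path(output, "part-*"))) {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
      try {
        IntWritable feature = new IntWritable();
        LongWritable count = new LongWritable();
        while (reader.next(feature, count)) {
          df.put(feature.get(), count.get());
        }
      } finally {
        reader.close();
      }
    }
    return df;
  }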
Example #5
    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      try {
        Configuration conf = ctx.getConfiguration();
        Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
        Preconditions.checkArgument(
            localFiles != null && localFiles.length >= 1,
            "missing paths from the DistributedCache");

        Path inputVectorPath = HadoopUtil.getSingleCachedFile(conf);

        SequenceFileValueIterator<VectorWritable> iterator =
            new SequenceFileValueIterator<VectorWritable>(inputVectorPath, true, conf);
        try {
          inputVector = iterator.next().get();
        } finally {
          Closeables.close(iterator, true);
        }

        int outDim = conf.getInt(OUTPUT_VECTOR_DIMENSION, Integer.MAX_VALUE);
        outputVector =
            conf.getBoolean(IS_SPARSE_OUTPUT, false)
                ? new RandomAccessSparseVector(outDim, 10)
                : new DenseVector(outDim);
      } catch (IOException ioe) {
        throw new IllegalStateException(ioe);
      }
    }
Example #6
  /**
   * Create Term Frequency-Inverse Document Frequency (Tf-Idf) Vectors from the input set of vectors
   * in {@link SequenceFile} format. This job uses a fixed limit on the maximum memory used by the
   * feature chunk per node, thereby splitting the process across multiple Map/Reduce passes.
   * calculateDF should be called before using this method.
   *
   * @param input input directory of the vectors in {@link SequenceFile} format
   * @param output output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}s
   *     of the documents are generated
   * @param datasetFeatures document frequency information calculated by calculateDF
   * @param minDf The minimum document frequency. Default 1
   * @param maxDF The max percentage of vectors for the DF. Can be used to remove really high
   *     frequency features. Expressed as an integer between 0 and 100. Default 99
   * @param numReducers The number of reducers to spawn. This also affects the possible parallelism
   *     since each reducer will typically produce a single output file containing tf-idf vectors
   *     for a subset of the documents in the corpus.
   */
  public static void processTfIdf(
      Path input,
      Path output,
      Configuration baseConf,
      Pair<Long[], List<Path>> datasetFeatures,
      int minDf,
      long maxDF,
      float normPower,
      boolean logNormalize,
      boolean sequentialAccessOutput,
      boolean namedVector,
      int numReducers)
      throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(
        normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
        "If specified normPower must be nonnegative",
        normPower);
    Preconditions.checkArgument(
        normPower == PartialVectorMerger.NO_NORMALIZING
            || (normPower > 1 && !Double.isInfinite(normPower))
            || !logNormalize,
        "normPower must be > 1 and not infinite if log normalization is chosen",
        normPower);

    int partialVectorIndex = 0;
    List<Path> partialVectorPaths = Lists.newArrayList();
    List<Path> dictionaryChunks = datasetFeatures.getSecond();
    for (Path dictionaryChunk : dictionaryChunks) {
      Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
      partialVectorPaths.add(partialVectorOutputPath);
      makePartialVectors(
          input,
          baseConf,
          datasetFeatures.getFirst()[0],
          datasetFeatures.getFirst()[1],
          minDf,
          maxDF,
          dictionaryChunk,
          partialVectorOutputPath,
          sequentialAccessOutput,
          namedVector);
    }

    Configuration conf = new Configuration(baseConf);

    Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);

    PartialVectorMerger.mergePartialVectors(
        partialVectorPaths,
        outputDir,
        baseConf,
        normPower,
        logNormalize,
        datasetFeatures.getFirst()[0].intValue(),
        sequentialAccessOutput,
        namedVector,
        numReducers);
    HadoopUtil.delete(conf, partialVectorPaths);
  }
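  // Illustrative call sequence, not in the original source: the document-frequency information is
  // computed first and the resulting Pair is handed to processTfIdf. The calculateDF signature
  // sketched here (input, output, conf, chunk size in MB) and the concrete argument values are
  // assumptions.
  public static void buildTfIdfVectors(Path tfVectors, Path outputDir, Configuration conf)
      throws IOException, InterruptedException, ClassNotFoundException {
    Pair<Long[], List<Path>> datasetFeatures = calculateDF(tfVectors, outputDir, conf, 100);
    processTfIdf(
        tfVectors,
        outputDir,
        conf,
        datasetFeatures,
        1,                                   // minDf
        99,                                  // maxDF, as a percentage of documents
        PartialVectorMerger.NO_NORMALIZING,  // normPower: no length normalization
        false,                               // logNormalize
        true,                                // sequentialAccessOutput
        false,                               // namedVector
        1);                                  // numReducers
  }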
  /**
   * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the
   * complete Document {@link org.apache.mahout.math.RandomAccessSparseVector}
   *
   * @param partialVectorPaths paths of the partial vectors in {@link
   *     org.apache.hadoop.io.SequenceFile} format
   * @param output output directory where the merged document vectors are created
   * @param baseConf job configuration
   * @param normPower The normalization value. Must be greater than or equal to 0 or equal to {@link
   *     #NO_NORMALIZING}
   * @param logNormalize whether to use log normalization
   * @param dimension the cardinality of the merged output vectors
   * @param sequentialAccess output vectors should be optimized for sequential access
   * @param namedVector output vectors should be named, retaining key (doc id) as a label
   * @param numReducers The number of reducers to spawn
   */
  public static void mergePartialVectors(
      Iterable<Path> partialVectorPaths,
      Path output,
      Configuration baseConf,
      float normPower,
      boolean logNormalize,
      int dimension,
      boolean sequentialAccess,
      boolean namedVector,
      int numReducers)
      throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(
        normPower == NO_NORMALIZING || normPower >= 0,
        "If specified normPower must be nonnegative",
        normPower);
    Preconditions.checkArgument(
        normPower == NO_NORMALIZING
            || (normPower > 1 && !Double.isInfinite(normPower))
            || !logNormalize,
        "normPower must be > 1 and not infinite if log normalization is chosen",
        normPower);

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(NAMED_VECTOR, namedVector);
    conf.setInt(DIMENSION, dimension);
    conf.setFloat(NORMALIZATION_POWER, normPower);
    conf.setBoolean(LOG_NORMALIZE, logNormalize);

    Job job = new Job(conf);
    job.setJobName("PartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
Example #8
 public static void main(String[] args) throws Exception {
   if (args.length > 0) {
     log.info("Running with only user-supplied arguments");
     ToolRunner.run(new Configuration(), new Job(), args);
   } else {
     log.info("Running with default arguments");
     Path output = new Path("output");
     HadoopUtil.delete(new Configuration(), output);
     run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
   }
 }
Example #9
  public static Job createTimesSquaredJob(
      Configuration initialConf,
      Vector v,
      int outputVectorDim,
      Path matrixInputPath,
      Path outputVectorPathBase,
      Class<? extends TimesSquaredMapper> mapClass,
      Class<? extends VectorSummingReducer> redClass)
      throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;

    try {
      inputVectorPathWriter =
          new SequenceFile.Writer(
              fs, initialConf, inputVectorPath, NullWritable.class, VectorWritable.class);
      inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
      Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

    Job job =
        HadoopUtil.prepareJob(
            matrixInputPath,
            new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
            SequenceFileInputFormat.class,
            mapClass,
            NullWritable.class,
            VectorWritable.class,
            redClass,
            NullWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class,
            initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

    return job;
  }
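  // Minimal usage sketch, not part of the original class: build the job and run it synchronously.
  // Passing the TimesSquaredMapper/VectorSummingReducer base classes and using v.size() as the
  // output dimension (as when computing A'Av) are assumptions.
  public static void runTimesSquared(Configuration conf, Vector v, Path matrixPath, Path outputBase)
      throws IOException, InterruptedException, ClassNotFoundException {
    Job job =
        createTimesSquaredJob(
            conf, v, v.size(), matrixPath, outputBase,
            TimesSquaredMapper.class, VectorSummingReducer.class);
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("TimesSquaredJob failed!");
    }
  }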
  /**
   * Run the job
   *
   * @param input the input pathname String
   * @param output the output pathname String
   * @param catFile the file containing the Wikipedia categories
   * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
   *     containing the category string
   */
  public static void runJob(
      String input,
      String output,
      String catFile,
      boolean exactMatchOnly,
      Class<? extends Analyzer> analyzerClass)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like this one can make or break a job.

    Set<String> categories = Sets.newHashSet();
    for (String line : new FileLineIterable(new File(catFile))) {
      categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }

    Stringifier<Set<String>> setStringifier =
        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    log.info("Input: {} Out: {} Categories: {}", input, output, catFile);
    job.setJarByClass(WikipediaDatasetCreatorDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WikipediaDatasetCreatorMapper.class);
    // TODO: job.setNumMapTasks(100);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(WikipediaDatasetCreatorReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
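  // Hypothetical invocation, not in the original source: feed a chunked Wikipedia XML dump through
  // the dataset creator using Lucene's StandardAnalyzer. The paths and category file name are
  // placeholders.
  public static void createWikipediaDataset() throws Exception {
    runJob(
        "wikipedia/chunks",       // input: chunked Wikipedia XML dump
        "wikipedia/dataset",      // output directory (TextOutputFormat)
        "categories.txt",         // one category per line
        false,                    // exactMatchOnly: substring match is enough
        StandardAnalyzer.class);  // any Lucene Analyzer subclass
  }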
  /**
   * Run the job
   *
   * @param input the input pathname String
   * @param output the output pathname String
   * @param catFile the file containing the Wikipedia categories
   * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply
   *     containing the category string
   * @param all if true select all categories
   * @throws ClassNotFoundException
   * @throws InterruptedException
   */
  public static void runJob(
      String input, String output, String catFile, boolean exactMatchOnly, boolean all)
      throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("xmlinput.start", "<page>");
    conf.set("xmlinput.end", "</page>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.setBoolean("all.files", all);
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(WikipediaMapper.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(WikipediaToSequenceFile.class);

    /*
     * conf.set("mapred.compress.map.output", "true"); conf.set("mapred.map.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compress", "true"); conf.set("mapred.output.compression.type",
     * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
     */
    HadoopUtil.overwriteOutput(outPath);

    Set<String> categories = new HashSet<String>();
    if (catFile.length() > 0) {
      for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
      }
    }

    DefaultStringifier<Set<String>> setStringifier =
        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);

    // set on the job's Configuration: the Job constructor above copied conf, so setting the
    // original conf at this point would not be seen by the tasks
    job.getConfiguration().set("wikipedia.categories", categoriesStr);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
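  // Hypothetical invocation, not in the original source: convert a chunked Wikipedia XML dump to
  // SequenceFile format, keeping only pages that match the listed categories. Paths and the
  // category file name are placeholders.
  public static void convertWikipediaDump() throws Exception {
    runJob(
        "wikipedia/chunks",    // input: chunked Wikipedia XML dump
        "wikipedia/seqfiles",  // output: SequenceFileOutputFormat of Text key/value pairs
        "categories.txt",      // category filter, one per line
        false,                 // exactMatchOnly
        false);                // all: if true, keep every page regardless of category
  }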
Example #12
  /**
   * Create partial tf-idf vectors using a chunk of features from the input vectors. The input
   * vectors have to be in {@link SequenceFile} format.
   *
   * @param input input directory of the vectors in {@link SequenceFile} format
   * @param featureCount Number of unique features in the dataset
   * @param vectorCount Number of vectors in the dataset
   * @param minDf The minimum document frequency. Default 1
   * @param maxDF The max percentage of vectors for the DF. Can be used to remove really high
   *     frequency features. Expressed as an integer between 0 and 100. Default 99
   * @param dictionaryFilePath location of the chunk of features and their ids
   * @param output output directory where the partial vectors are created
   * @param sequentialAccess output vectors should be optimized for sequential access
   * @param namedVector output vectors should be named, retaining key (doc id) as a label
   */
  private static void makePartialVectors(
      Path input,
      Configuration baseConf,
      Long featureCount,
      Long vectorCount,
      int minDf,
      long maxDF,
      Path dictionaryFilePath,
      Path output,
      boolean sequentialAccess,
      boolean namedVector)
      throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set(
        "io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setLong(FEATURE_COUNT, featureCount);
    conf.setLong(VECTOR_COUNT, vectorCount);
    conf.setInt(MIN_DF, minDf);
    conf.setLong(MAX_DF, maxDF);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
    DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);

    Job job = new Job(conf);
    job.setJobName(
        ": MakePartialVectors: input-folder: "
            + input
            + ", dictionary-file: "
            + dictionaryFilePath.toString());
    job.setJarByClass(TFIDFConverter.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFIDFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }
Example #13
  private static void createTFIDF(
      Configuration p_HadoopConf,
      Path p_DocumentDir,
      Path p_SequenceDir,
      Path p_TokensDir,
      Path p_TF,
      String p_VecFolder,
      Path p_tf_idf)
      throws Exception {

    log.info("Creating TF-IDF vectors started.");

    HadoopUtil.delete(p_HadoopConf, p_SequenceDir);
    HadoopUtil.delete(p_HadoopConf, p_TokensDir);
    HadoopUtil.delete(p_HadoopConf, p_TF);

    createSequence(p_DocumentDir.toString(), p_SequenceDir.toString());
    createTokens(p_SequenceDir, p_TokensDir, s_HadoopConf);
    createTFVectors(p_TokensDir, p_TF, p_VecFolder);
    createTFIDFVectors(new Path(p_TF, p_VecFolder), p_tf_idf, s_HadoopConf);

    log.info("Creating TF-IDF vectors finished.");
  }
Example #14
  @Override
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption(MinhashOptionCreator.minClusterSizeOption().create());
    addOption(MinhashOptionCreator.minVectorSizeOption().create());
    addOption(MinhashOptionCreator.hashTypeOption().create());
    addOption(MinhashOptionCreator.numHashFunctionsOption().create());
    addOption(MinhashOptionCreator.keyGroupsOption().create());
    addOption(MinhashOptionCreator.numReducersOption().create());
    addOption(MinhashOptionCreator.debugOutputOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
      return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(getConf(), output);
    }
    int minClusterSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
    int minVectorSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
    String hashType = getOption(MinhashOptionCreator.HASH_TYPE);
    int numHashFunctions = Integer.parseInt(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
    int keyGroups = Integer.parseInt(getOption(MinhashOptionCreator.KEY_GROUPS));
    int numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
    boolean debugOutput = hasOption(MinhashOptionCreator.DEBUG_OUTPUT);

    runJob(
        input,
        output,
        minClusterSize,
        minVectorSize,
        hashType,
        numHashFunctions,
        keyGroups,
        numReduceTasks,
        debugOutput);
    return 0;
  }
Example #15
  @Override
  public int run(String[] args) throws Exception {
    String path = System.getProperty("user.dir");

    addInputOption();
    addOutputOption();

    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
        buildOption(
            TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Path labPath = new Path(path + "/../out/labelindex/");

    long labelSize = createLabelIndex(labPath);
    float alphaI = 1.0F;
    boolean trainComplementary = true;

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());
    HadoopUtil.delete(getConf(), new Path("/tmp/summedObservations"));
    HadoopUtil.delete(getConf(), new Path("/tmp/weights"));
    HadoopUtil.delete(getConf(), new Path("/tmp/thetas"));

    // Add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances =
        prepareJob(
            new Path(path + "/../out/training"),
            new Path("/tmp/summedObservations"),
            SequenceFileInputFormat.class,
            IndexInstancesMapper.class,
            IntWritable.class,
            VectorWritable.class,
            VectorSumReducer.class,
            IntWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }
    // Sum up all the weights from the previous step, per label and per feature
    Job weightSummer =
        prepareJob(
            new Path("/tmp/summedObservations"),
            new Path("/tmp/weights"),
            SequenceFileInputFormat.class,
            WeightsMapper.class,
            Text.class,
            VectorWritable.class,
            VectorSumReducer.class,
            Text.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }

    // Put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/weights"), getConf());

    if (trainComplementary) {
      // Calculate the per label theta normalizers, write out to LABEL_THETA_NORMALIZER vector
      // see http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf - Section 3.2, Weight
      // Magnitude Errors
      Job thetaSummer =
          prepareJob(
              new Path("/tmp/summedObservations"),
              new Path("/tmp/thetas"),
              SequenceFileInputFormat.class,
              ThetaMapper.class,
              Text.class,
              VectorWritable.class,
              VectorSumReducer.class,
              Text.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      thetaSummer.setCombinerClass(VectorSumReducer.class);
      thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
      thetaSummer
          .getConfiguration()
          .setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
      succeeded = thetaSummer.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    // Put the per label theta normalizers into the cache
    HadoopUtil.cacheFiles(new Path("/tmp/thetas"), getConf());

    // Validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    getConf().setBoolean(NaiveBayesModel.COMPLEMENTARY_MODEL, trainComplementary);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path("/tmp/"), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(new Path(path + "/../out/model"), getConf());

    return 0;
  }
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + VectorSimilarityMeasures.list()
            + ')');
    addOption(
        "maxSimilaritiesPerRow",
        "m",
        "Number of maximum similarities per row (default: "
            + DEFAULT_MAX_SIMILARITIES_PER_ROW
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption(
        "excludeSelfSimilarity",
        "ess",
        "compute similarity of rows to themselves?",
        String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    int numberOfColumns;

    if (hasOption("numberOfColumns")) {
      // Number of columns explicitly specified via CLI
      numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
      // else get the number of columns by determining the cardinality of a vector in the input
      // matrix
      numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    String similarityClassname;
    try {
      similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
      similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      // Clear the temp path
      HadoopUtil.delete(getConf(), getTempPath());
      // Clear the output path
      HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold =
        hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job normsAndTranspose =
          prepareJob(
              getInputPath(),
              weightsPath,
              VectorNormMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class);
      normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
      Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
      normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
      normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
      normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      boolean succeeded = normsAndTranspose.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job pairwiseSimilarity =
          prepareJob(
              weightsPath,
              pairwiseSimilarityPath,
              CooccurrencesMapper.class,
              IntWritable.class,
              VectorWritable.class,
              SimilarityReducer.class,
              IntWritable.class,
              VectorWritable.class);
      pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
      Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
      pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
      pairwiseConf.set(NORMS_PATH, normsPath.toString());
      pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
      pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
      pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
      pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
      pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
      boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job asMatrix =
          prepareJob(
              pairwiseSimilarityPath,
              getOutputPath(),
              UnsymmetrifyMapper.class,
              IntWritable.class,
              VectorWritable.class,
              MergeToTopKSimilaritiesReducer.class,
              IntWritable.class,
              VectorWritable.class);
      asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
      asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
      boolean succeeded = asMatrix.waitForCompletion(true);
      if (!succeeded) {
        return -1;
      }
    }

    return 0;
  }