Example #1
0
  /**
   * Runs the item-similarity pipeline as a sequence of Hadoop jobs.
   *
   * <p>The steps, in order:
   *
   * <ol>
   *   <li>build the item ID index from the input,
   *   <li>count the users in the input,
   *   <li>convert the input into user vectors,
   *   <li>(maybe) prune the user vectors and transpose them into an item-user matrix,
   *   <li>run {@link RowSimilarityJob} on that matrix,
   *   <li>write the most similar item pairs to the output path.
   * </ol>
   *
   * <p>All intermediate data is written below the directory given by {@code --tempDir}. Each step
   * guarded by {@code shouldRunNextPhase} may be skipped depending on the parsed arguments; the
   * user-count lookup and the row-similarity step always run.
   *
   * @param args command line arguments; see the options registered at the top of this method
   * @return 0 if every executed step succeeded, -1 if the arguments could not be parsed or a
   *     map-reduce job failed, or the non-zero exit code of {@link RowSimilarityJob} if that step
   *     failed
   * @throws Exception if setting up or running one of the jobs throws
   */
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + SimilarityType.listEnumNames()
            + ')');
    addOption(
        "maxSimilaritiesPerItem",
        "m",
        "try to cap the number of similar items per item to this number "
            + "(default: "
            + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption(
        "maxCooccurrencesPerItem",
        "mo",
        "try to cap the number of cooccurrences per item to this number "
            + "(default: "
            + DEFAULT_MAX_COOCCURRENCES_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    // parseBoolean yields the primitive directly (valueOf would box and then unbox)
    boolean booleanData = Boolean.parseBoolean(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    // locations of the intermediate results, all below the temp directory
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    // Every job result is checked: continuing after a failed phase would make the following
    // phases read missing or partial data and still report success to the caller.

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job itemIDIndex =
          prepareJob(
              inputPath,
              itemIDIndexPath,
              TextInputFormat.class,
              ItemIDIndexMapper.class,
              VarIntWritable.class,
              VarLongWritable.class,
              ItemIDIndexReducer.class,
              VarIntWritable.class,
              VarLongWritable.class,
              SequenceFileOutputFormat.class);
      itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
      if (!itemIDIndex.waitForCompletion(true)) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job countUsers =
          prepareJob(
              inputPath,
              countUsersPath,
              TextInputFormat.class,
              CountUsersMapper.class,
              CountUsersKeyWritable.class,
              VarLongWritable.class,
              CountUsersReducer.class,
              VarIntWritable.class,
              NullWritable.class,
              TextOutputFormat.class);
      countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
      countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
      if (!countUsers.waitForCompletion(true)) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job toUserVector =
          prepareJob(
              inputPath,
              userVectorPath,
              TextInputFormat.class,
              ToItemPrefsMapper.class,
              VarLongWritable.class,
              // the mapper output value type depends on whether pref values are present
              booleanData ? VarLongWritable.class : EntityPrefWritable.class,
              ToUserVectorReducer.class,
              VarLongWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
      if (!toUserVector.waitForCompletion(true)) {
        return -1;
      }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job maybePruneAndTransponse =
          prepareJob(
              userVectorPath,
              itemUserMatrixPath,
              SequenceFileInputFormat.class,
              MaybePruneRowsMapper.class,
              IntWritable.class,
              DistributedRowMatrix.MatrixEntryWritable.class,
              ToItemVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      maybePruneAndTransponse
          .getConfiguration()
          .setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES, maxCooccurrencesPerItem);
      if (!maybePruneAndTransponse.waitForCompletion(true)) {
        return -1;
      }
    }

    // the user count written by the countUsers phase is the column count of the item-user matrix
    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...) */
    int rowSimilarityExitCode =
        ToolRunner.run(
            getConf(),
            new RowSimilarityJob(),
            new String[] {
              "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
              "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
              "--numberOfColumns",
              String.valueOf(numberOfUsers),
              "--similarityClassname",
              similarityClassName,
              "--maxSimilaritiesPerRow",
              // one more than requested per item
              // NOTE(review): presumably to leave room for each row's self-similarity - confirm
              String.valueOf(maxSimilarItemsPerItem + 1),
              "--tempDir",
              tempDirPath.toString()
            });
    if (rowSimilarityExitCode != 0) {
      return rowSimilarityExitCode;
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job mostSimilarItems =
          prepareJob(
              similarityMatrixPath,
              outputPath,
              SequenceFileInputFormat.class,
              MostSimilarItemPairsMapper.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              MostSimilarItemPairsReducer.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              TextOutputFormat.class);
      Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
      mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
      mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
      mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
      if (!mostSimilarItems.waitForCompletion(true)) {
        return -1;
      }
    }

    return 0;
  }
  /**
   * Integration test with a tiny data set: runs the complete {@link ItemSimilarityJob} and checks
   * the counted number of users as well as the item pairs written to the output.
   *
   * <pre>
   * user-item-matrix
   *
   *        Game   Mouse   PC    Disk
   * Jane    -       1      2      -
   * Paul    1       -      1      -
   * Fred    -       -      -      1
   * </pre>
   *
   * <p>The input lines have the form {@code userID,itemID,preference}, with users Jane=1, Paul=2,
   * Fred=3 and items Game=1, Mouse=2, PC=3, Disk=4. With the uncentered cosine similarity, the
   * expected values are 1/sqrt(5) (about 0.45) for Game/PC and 2/sqrt(5) (about 0.89) for
   * Mouse/PC; Disk shares no user with any other item.
   *
   * @throws Exception if running the job or reading its output fails
   */
  public void testCompleteJob() throws Exception {

    File inputFile = getTestTempFile("prefs.txt");
    File outputDir = getTestTempDir("output");
    // NOTE(review): presumably removed so that the job can create the output directory itself
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    writeLines(inputFile, "2,1,1", "1,2,1", "3,4,1", "1,3,2", "2,3,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    int exitCode =
        similarityJob.run(
            new String[] {
              "--tempDir",
              tmpDir.getAbsolutePath(),
              "--similarityClassname",
              DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName()
            });
    // fail early with a clear message instead of stumbling over missing output files later
    assertEquals(0, exitCode);

    File countUsersPart = new File(tmpDir, "countUsers");
    int numberOfUsers =
        TasteHadoopUtils.readIntFromFile(
            new Configuration(), new Path(countUsersPart.getAbsolutePath()));

    assertEquals(3, numberOfUsers);

    File outPart =
        outputDir
            .listFiles(
                new FilenameFilter() {
                  @Override
                  public boolean accept(File dir, String name) {
                    return name.startsWith("part-");
                  }
                })[0];

    int currentLine = 1;
    BufferedReader reader = new BufferedReader(new FileReader(outPart));
    try {
      String line;
      while ((line = reader.readLine()) != null) {

        // each output line is expected as: itemA <tab> itemB <tab> similarity
        String[] tokens = line.split("\t");

        long itemAID = Long.parseLong(tokens[0]);
        long itemBID = Long.parseLong(tokens[1]);
        double similarity = Double.parseDouble(tokens[2]);

        if (currentLine == 1) {
          assertEquals(1L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.45, similarity, 0.01);
        }

        if (currentLine == 2) {
          assertEquals(2L, itemAID);
          assertEquals(3L, itemBID);
          assertEquals(0.89, similarity, 0.01);
        }

        currentLine++;
      }
    } finally {
      // release the file handle even if an assertion above fails
      reader.close();
    }

    int linesWritten = currentLine - 1;
    assertEquals(2, linesWritten);
  }