Example 1
  /** tests {@link ToUserVectorReducer} using boolean data */
  @Test
  public void testToUserVectorReducerWithBooleanData() throws Exception {
    Reducer<VarLongWritable, VarLongWritable, VarLongWritable, VectorWritable>.Context context =
        EasyMock.createMock(Reducer.Context.class);

    context.write(
        EasyMock.eq(new VarLongWritable(12L)),
        MathHelper.vectorMatches(
            MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0),
            MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 1.0)));

    EasyMock.replay(context);

    new ToUserVectorReducer()
        .reduce(
            new VarLongWritable(12L),
            Arrays.asList(new VarLongWritable(34L), new VarLongWritable(56L)),
            context);

    EasyMock.verify(context);
  }
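This and the following tests all follow EasyMock's record-replay-verify lifecycle: calls made on the mock before replay() record expectations rather than executing anything, and verify() fails the test if a recorded call never arrived. A self-contained sketch of the pattern (the Sink interface is invented for illustration):

import org.easymock.EasyMock;

public class EasyMockLifecycleSketch {

  /** Invented stand-in for the mocked Mapper.Context / Reducer.Context. */
  public interface Sink {
    void write(String key, String value);
  }

  public static void main(String[] args) {
    Sink mock = EasyMock.createMock(Sink.class);
    mock.write("key", "value"); // record phase: declares an expected call
    EasyMock.replay(mock);      // switch the mock into replay mode
    mock.write("key", "value"); // the code under test would make this call
    EasyMock.verify(mock);      // passes; throws if an expectation was unmet
  }
}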
Example 2
  /** tests {@link ItemIDIndexMapper} */
  @Test
  public void testItemIDIndexMapper() throws Exception {
    Mapper<LongWritable, Text, VarIntWritable, VarLongWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);

    context.write(new VarIntWritable(TasteHadoopUtils.idToIndex(789L)), new VarLongWritable(789L));
    EasyMock.replay(context);

    new ItemIDIndexMapper().map(new LongWritable(123L), new Text("456,789,5.0"), context);

    EasyMock.verify(context);
  }
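ItemIDIndexMapper exists to reconcile long item IDs with Mahout's int-indexed vectors: note that the mapper reads the item ID from the second field of the input line ("789" in "456,789,5.0"), not from the map key, and emits TasteHadoopUtils.idToIndex of it. A hypothetical re-implementation of that mapping, just to illustrate its shape (the real method lives in TasteHadoopUtils and may differ in detail):

  // Hypothetical sketch: fold the two halves of the long together and mask
  // the sign bit so the result is a valid non-negative vector index.
  static int idToIndexSketch(long id) {
    return 0x7FFFFFFF & ((int) id ^ (int) (id >>> 32));
  }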
Example 3
  /** tests {@link ToUserVectorReducer} */
  @Test
  public void testToUserVectorReducer() throws Exception {
    Reducer<VarLongWritable, VarLongWritable, VarLongWritable, VectorWritable>.Context context =
        EasyMock.createMock(Reducer.Context.class);

    context.write(
        EasyMock.eq(new VarLongWritable(12L)),
        MathHelper.vectorMatches(
            MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0),
            MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 2.0)));

    EasyMock.replay(context);

    Collection<VarLongWritable> varLongWritables = new LinkedList<VarLongWritable>();
    varLongWritables.add(new EntityPrefWritable(34L, 1.0f));
    varLongWritables.add(new EntityPrefWritable(56L, 2.0f));

    new ToUserVectorReducer().reduce(new VarLongWritable(12L), varLongWritables, context);

    EasyMock.verify(context);
  }
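Examples 1 and 3 together pin down the reducer's contract: each incoming value contributes one vector element at idToIndex(itemID), weighted 1.0 for a plain VarLongWritable (boolean data) and by the stored preference for an EntityPrefWritable. A hedged sketch of a reduce body that would satisfy both tests (not Mahout's actual implementation):

  // Sketch only: mirrors the behavior the two tests above assert.
  @Override
  protected void reduce(VarLongWritable userID, Iterable<VarLongWritable> values, Context ctx)
      throws IOException, InterruptedException {
    Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    for (VarLongWritable value : values) {
      float prefValue =
          value instanceof EntityPrefWritable ? ((EntityPrefWritable) value).getPrefValue() : 1.0f;
      userVector.set(TasteHadoopUtils.idToIndex(value.get()), prefValue);
    }
    ctx.write(userID, new VectorWritable(userVector));
  }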
Example 4
  /** tests {@link org.apache.mahout.cf.taste.hadoop.item.ItemFilterAsVectorAndPrefsReducer} */
  @Test
  public void testItemFilterAsVectorAndPrefsReducer() throws Exception {
    Reducer<VarLongWritable, VarLongWritable, VarIntWritable, VectorAndPrefsWritable>.Context
        context = EasyMock.createMock(Reducer.Context.class);

    int itemIDIndex = TasteHadoopUtils.idToIndex(123L);
    context.write(
        EasyMock.eq(new VarIntWritable(itemIDIndex)),
        vectorAndPrefsForFilteringMatches(123L, 456L, 789L));

    EasyMock.replay(context);

    new ItemFilterAsVectorAndPrefsReducer()
        .reduce(
            new VarLongWritable(123L),
            Arrays.asList(new VarLongWritable(456L), new VarLongWritable(789L)),
            context);

    EasyMock.verify(context);
  }
Example 5
  @Override
  protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
      throws IOException, InterruptedException {

    int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get());
    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    /* artificial NaN summand to exclude this item from the recommendations for all users specified in userIDs */
    vector.set(itemIDIndex, Double.NaN);
    // This is the filtering trick:
    // from it we can infer that AggregateAndRecommendReducer filters out already-rated (user, item) pairs

    List<Long> userIDs = Lists.newArrayList();
    List<Float> prefValues = Lists.newArrayList();
    for (VarLongWritable userID : values) {
      userIDs.add(userID.get());
      prefValues.add(1.0f);
    }

    itemIDIndexWritable.set(itemIDIndex);
    vectorAndPrefs.set(vector, userIDs, prefValues);
    ctx.write(itemIDIndexWritable, vectorAndPrefs);
  }
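The NaN trick in the reduce method above works because NaN is absorbing under IEEE 754 addition: once the filter vector contributes a NaN summand to an item's score, the whole estimated preference becomes NaN, and the downstream recommendation step can drop such (user, item) pairs. A quick standalone demonstration:

public class NaNFilterDemo {

  public static void main(String[] args) {
    double partialEstimate = 3.5;      // some accumulated preference estimate
    double filterSummand = Double.NaN; // the artificial NaN summand from the filter vector
    double estimate = partialEstimate + filterSummand;
    System.out.println(Double.isNaN(estimate)); // true: the pair gets excluded
  }
}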
Example 6
  @Override
  public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(
        "similarityClassname",
        "s",
        "Name of distributed similarity class to instantiate, alternatively use "
            + "one of the predefined similarities ("
            + SimilarityType.listEnumNames()
            + ')');
    addOption(
        "maxSimilaritiesPerItem",
        "m",
        "try to cap the number of similar items per item to this number "
            + "(default: "
            + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption(
        "maxCooccurrencesPerItem",
        "mo",
        "try to cap the number of cooccurrences per item to this number "
            + "(default: "
            + DEFAULT_MAX_COOCCURRENCES_PER_ITEM
            + ')',
        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job itemIDIndex =
          prepareJob(
              inputPath,
              itemIDIndexPath,
              TextInputFormat.class,
              ItemIDIndexMapper.class,
              VarIntWritable.class,
              VarLongWritable.class,
              ItemIDIndexReducer.class,
              VarIntWritable.class,
              VarLongWritable.class,
              SequenceFileOutputFormat.class);
      itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
      itemIDIndex.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job countUsers =
          prepareJob(
              inputPath,
              countUsersPath,
              TextInputFormat.class,
              CountUsersMapper.class,
              CountUsersKeyWritable.class,
              VarLongWritable.class,
              CountUsersReducer.class,
              VarIntWritable.class,
              NullWritable.class,
              TextOutputFormat.class);
      countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
      countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
      countUsers.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job toUserVector =
          prepareJob(
              inputPath,
              userVectorPath,
              TextInputFormat.class,
              ToItemPrefsMapper.class,
              VarLongWritable.class,
              booleanData ? VarLongWritable.class : EntityPrefWritable.class,
              ToUserVectorReducer.class,
              VarLongWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
      toUserVector.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job maybePruneAndTranspose =
          prepareJob(
              userVectorPath,
              itemUserMatrixPath,
              SequenceFileInputFormat.class,
              MaybePruneRowsMapper.class,
              IntWritable.class,
              DistributedRowMatrix.MatrixEntryWritable.class,
              ToItemVectorsReducer.class,
              IntWritable.class,
              VectorWritable.class,
              SequenceFileOutputFormat.class);
      maybePruneAndTranspose
          .getConfiguration()
          .setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES, maxCooccurrencesPerItem);
      maybePruneAndTranspose.waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...) */
    ToolRunner.run(
        getConf(),
        new RowSimilarityJob(),
        new String[] {
          "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
          "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
          "--numberOfColumns",
          String.valueOf(numberOfUsers),
          "--similarityClassname",
          similarityClassName,
          "--maxSimilaritiesPerRow",
          String.valueOf(maxSimilarItemsPerItem + 1),
          "--tempDir",
          tempDirPath.toString()
        });

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
      Job mostSimilarItems =
          prepareJob(
              similarityMatrixPath,
              outputPath,
              SequenceFileInputFormat.class,
              MostSimilarItemPairsMapper.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              MostSimilarItemPairsReducer.class,
              EntityEntityWritable.class,
              DoubleWritable.class,
              TextOutputFormat.class);
      Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
      mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
      mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
      mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
      mostSimilarItems.waitForCompletion(true);
    }

    return 0;
  }
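Since the job registers its I/O via addInputOption()/addOutputOption(), a run() like the one above is normally launched through Hadoop's ToolRunner. A minimal driver sketch under that assumption; the paths are placeholders, and the similarity class name is the one the integration test below uses:

import org.apache.hadoop.util.ToolRunner;

public class ItemSimilarityDriver {

  public static void main(String[] args) throws Exception {
    // Placeholder paths; the similarity class import matches the test below.
    ToolRunner.run(
        new ItemSimilarityJob(),
        new String[] {
          "--input", "/path/to/prefs.txt",
          "--output", "/path/to/similarities",
          "--tempDir", "/path/to/tmp",
          "--similarityClassname",
          DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName()
        });
  }
}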
  /**
   * integration test with a tiny data set
   *
   * <pre>
   * user-item-matrix
   *
   *        Game   Mouse   PC    Disk
   * Jane    -       1      2      -
   * Paul    1       -      1      -
   * Fred    -       -      -      1
   * </pre>
   */
  public void testCompleteJob() throws Exception {

    File inputFile = getTestTempFile("prefs.txt");
    File outputDir = getTestTempDir("output");
    outputDir.delete();
    File tmpDir = getTestTempDir("tmp");

    writeLines(inputFile, "2,1,1", "1,2,1", "3,4,1", "1,3,2", "2,3,1");

    ItemSimilarityJob similarityJob = new ItemSimilarityJob();

    Configuration conf = new Configuration();
    conf.set("mapred.input.dir", inputFile.getAbsolutePath());
    conf.set("mapred.output.dir", outputDir.getAbsolutePath());
    conf.setBoolean("mapred.output.compress", false);

    similarityJob.setConf(conf);

    similarityJob.run(
        new String[] {
          "--tempDir",
          tmpDir.getAbsolutePath(),
          "--similarityClassname",
          DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName()
        });

    File countUsersPart = new File(tmpDir, "countUsers");
    int numberOfUsers =
        TasteHadoopUtils.readIntFromFile(
            new Configuration(), new Path(countUsersPart.getAbsolutePath()));

    assertEquals(3, numberOfUsers);

    File outPart =
        outputDir
            .listFiles(
                new FilenameFilter() {
                  @Override
                  public boolean accept(File dir, String name) {
                    return name.startsWith("part-");
                  }
                })[0];
    BufferedReader reader = new BufferedReader(new FileReader(outPart));

    String line;
    int currentLine = 1;
    while ((line = reader.readLine()) != null) {

      String[] tokens = line.split("\t");

      long itemAID = Long.parseLong(tokens[0]);
      long itemBID = Long.parseLong(tokens[1]);
      double similarity = Double.parseDouble(tokens[2]);

      if (currentLine == 1) {
        assertEquals(1L, itemAID);
        assertEquals(3L, itemBID);
        assertEquals(0.45, similarity, 0.01);
      }

      if (currentLine == 2) {
        assertEquals(2L, itemAID);
        assertEquals(3L, itemBID);
        assertEquals(0.89, similarity, 0.01);
      }

      currentLine++;
    }

    reader.close();

    int linesWritten = currentLine - 1;
    assertEquals(2, linesWritten);
  }
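The asserted similarities 0.45 and 0.89 can be checked by hand from the Javadoc matrix: as item vectors over the users (Jane, Paul, Fred), Game = (0, 1, 0), Mouse = (1, 0, 0) and PC = (2, 1, 0), so the uncentered cosine gives sim(Game, PC) = 1/√5 ≈ 0.447 and sim(Mouse, PC) = 2/√5 ≈ 0.894. A standalone check, independent of Mahout:

public class CosineCheck {

  static double cosine(double[] a, double[] b) {
    double dot = 0.0;
    double normA = 0.0;
    double normB = 0.0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  public static void main(String[] args) {
    double[] game = {0, 1, 0};  // item 1, preferences of Jane, Paul, Fred
    double[] mouse = {1, 0, 0}; // item 2
    double[] pc = {2, 1, 0};    // item 3
    System.out.println(cosine(game, pc));  // 0.447... matches the 0.45 assertion
    System.out.println(cosine(mouse, pc)); // 0.894... matches the 0.89 assertion
  }
}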