/** tests {@link ToUserVectorReducer} using boolean data */
@Test
public void testToUserVectorReducerWithBooleanData() throws Exception {
  Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
      EasyMock.createMock(Reducer.Context.class);

  context.write(EasyMock.eq(new VarLongWritable(12L)),
      MathHelper.vectorMatches(MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0),
          MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 1.0)));

  EasyMock.replay(context);

  new ToUserVectorReducer().reduce(new VarLongWritable(12L),
      Arrays.asList(new VarLongWritable(34L), new VarLongWritable(56L)), context);

  EasyMock.verify(context);
}
/** tests {@link ItemIDIndexMapper} */
@Test
public void testItemIDIndexMapper() throws Exception {
  Mapper<LongWritable,Text,VarIntWritable,VarLongWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);

  // input line "456,789,5.0" is userID,itemID,pref; the mapper is expected to
  // emit (index(itemID), itemID)
  context.write(new VarIntWritable(TasteHadoopUtils.idToIndex(789L)), new VarLongWritable(789L));
  EasyMock.replay(context);

  new ItemIDIndexMapper().map(new LongWritable(123L), new Text("456,789,5.0"), context);

  EasyMock.verify(context);
}
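TasteHadoopUtils.idToIndex is what makes the assertion above non-obvious: it hashes the 64-bit item ID down to a non-negative int so the ID can serve as a vector index. A minimal sketch of that idea, which may differ in detail from the shipped implementation:

// Hypothetical sketch of idToIndex: fold the long to an int, then clear the
// sign bit so the result is a valid non-negative vector index.
static int idToIndexSketch(long id) {
  return 0x7FFFFFFF & ((int) id ^ (int) (id >>> 32));
}

Note that this mapping can collide, which is presumably why the itemIDIndex phase materializes a separate index-to-ID table.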
/** tests {@link ToUserVectorReducer} */
@Test
public void testToUserVectorReducer() throws Exception {
  Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context context =
      EasyMock.createMock(Reducer.Context.class);

  context.write(EasyMock.eq(new VarLongWritable(12L)),
      MathHelper.vectorMatches(MathHelper.elem(TasteHadoopUtils.idToIndex(34L), 1.0),
          MathHelper.elem(TasteHadoopUtils.idToIndex(56L), 2.0)));

  EasyMock.replay(context);

  Collection<VarLongWritable> varLongWritables = new LinkedList<VarLongWritable>();
  varLongWritables.add(new EntityPrefWritable(34L, 1.0f));
  varLongWritables.add(new EntityPrefWritable(56L, 2.0f));

  new ToUserVectorReducer().reduce(new VarLongWritable(12L), varLongWritables, context);

  EasyMock.verify(context);
}
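Taken together, the two ToUserVectorReducer tests pin down one contract: all preferences of a user are folded into a single sparse vector keyed by the hashed item ID, where an EntityPrefWritable carries a real preference value and a plain VarLongWritable (boolean data) counts as 1.0. A minimal sketch of a reduce body satisfying both tests, not the verbatim Mahout source:

// Hypothetical sketch, inside a
// Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.
@Override
protected void reduce(VarLongWritable userID, Iterable<VarLongWritable> itemPrefs, Context ctx)
    throws IOException, InterruptedException {
  Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
  for (VarLongWritable itemPref : itemPrefs) {
    int index = TasteHadoopUtils.idToIndex(itemPref.get());
    // boolean data arrives as plain VarLongWritable and counts as 1.0
    float value = itemPref instanceof EntityPrefWritable
        ? ((EntityPrefWritable) itemPref).getPrefValue()
        : 1.0f;
    userVector.set(index, value);
  }
  ctx.write(userID, new VectorWritable(userVector));
}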
/** tests {@link org.apache.mahout.cf.taste.hadoop.item.ItemFilterAsVectorAndPrefsReducer} */
@Test
public void testItemFilterAsVectorAndPrefsReducer() throws Exception {
  Reducer<VarLongWritable,VarLongWritable,VarIntWritable,VectorAndPrefsWritable>.Context context =
      EasyMock.createMock(Reducer.Context.class);

  int itemIDIndex = TasteHadoopUtils.idToIndex(123L);
  context.write(EasyMock.eq(new VarIntWritable(itemIDIndex)),
      vectorAndPrefsForFilteringMatches(123L, 456L, 789L));

  EasyMock.replay(context);

  new ItemFilterAsVectorAndPrefsReducer().reduce(new VarLongWritable(123L),
      Arrays.asList(new VarLongWritable(456L), new VarLongWritable(789L)), context);

  EasyMock.verify(context);
}
@Override
protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
    throws IOException, InterruptedException {
  int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get());

  Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
  /* artificial NaN summand to exclude this item from the recommendations
   * for all users specified in userIDs */
  vector.set(itemIDIndex, Double.NaN);
  // This is the filtering trick. Working backwards from it, we can tell that
  // AggregateAndRecommendReducer filters out user-item pairs the user has already rated.

  List<Long> userIDs = Lists.newArrayList();
  List<Float> prefValues = Lists.newArrayList();
  for (VarLongWritable userID : values) {
    userIDs.add(userID.get());
    prefValues.add(1.0f);
  }

  itemIDIndexWritable.set(itemIDIndex);
  vectorAndPrefs.set(vector, userIDs, prefValues);
  ctx.write(itemIDIndexWritable, vectorAndPrefs);
}
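The NaN written here only pays off downstream: any partial score that sums over a NaN component becomes NaN itself, so the final reducer can exclude already-rated (or explicitly filtered) items simply by dropping NaN entries. A small hypothetical helper illustrating that check (the real AggregateAndRecommendReducer is more involved, and older Mahout versions iterate with iterateNonZero() instead of nonZeroes()):

import java.util.ArrayList;
import java.util.List;
import org.apache.mahout.math.Vector;

// Hypothetical helper: keep only vector entries that were not poisoned by the
// NaN summand above, i.e. items still eligible for recommendation.
public class NaNFilterSketch {
  static List<Integer> candidateItemIndices(Vector recommendationVector) {
    List<Integer> candidates = new ArrayList<Integer>();
    for (Vector.Element e : recommendationVector.nonZeroes()) {
      if (!Double.isNaN(e.get())) {
        candidates.add(e.index());
      }
    }
    return candidates;
  }
}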
@Override
public int run(String[] args) throws Exception {

  addInputOption();
  addOutputOption();
  addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, "
      + "alternatively use one of the predefined similarities ("
      + SimilarityType.listEnumNames() + ')');
  addOption("maxSimilaritiesPerItem", "m",
      "try to cap the number of similar items per item to this number "
      + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
      String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
  addOption("maxCooccurrencesPerItem", "mo",
      "try to cap the number of cooccurrences per item to this number "
      + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
      String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
  addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

  Map<String,String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  String similarityClassName = parsedArgs.get("--similarityClassname");
  int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
  int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
  boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();
  Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

  Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
  Path countUsersPath = new Path(tempDirPath, "countUsers");
  Path userVectorPath = new Path(tempDirPath, "userVectors");
  Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
  Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

  AtomicInteger currentPhase = new AtomicInteger();

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class,
        ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class,
        ItemIDIndexReducer.class, VarIntWritable.class, VarLongWritable.class,
        SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    itemIDIndex.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job countUsers = prepareJob(inputPath, countUsersPath, TextInputFormat.class,
        CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
        CountUsersReducer.class, VarIntWritable.class, NullWritable.class,
        TextOutputFormat.class);
    countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
    countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
    countUsers.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class,
        ToItemPrefsMapper.class, VarLongWritable.class,
        booleanData ? VarLongWritable.class : EntityPrefWritable.class,
        ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
        SequenceFileOutputFormat.class);
    toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    toUserVector.waitForCompletion(true);
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
        SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
        DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
        maxCooccurrencesPerItem);
    maybePruneAndTransponse.waitForCompletion(true);
  }

  int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

  /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this
   * call to something like new DistributedRowMatrix(...).rowSimilarity(...) */
  ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
      "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
      "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
      "--numberOfColumns", String.valueOf(numberOfUsers),
      "--similarityClassname", similarityClassName,
      "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
      "--tempDir", tempDirPath.toString() });

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath,
        SequenceFileInputFormat.class,
        MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        TextOutputFormat.class);
    Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
    mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
    mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
    mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
    mostSimilarItems.waitForCompletion(true);
  }

  return 0;
}
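Since ItemSimilarityJob extends AbstractJob (a Hadoop Tool), it is normally launched through ToolRunner, exactly the way run() above re-invokes RowSimilarityJob. A hedged sketch of a programmatic invocation; the option names come from the addOption calls above, while the paths and the similarity name are illustrative only:

// Hypothetical launcher; paths are made up, and SIMILARITY_COOCCURRENCE is
// assumed to be one of the SimilarityType enum names.
public static void main(String[] args) throws Exception {
  ToolRunner.run(new Configuration(), new ItemSimilarityJob(), new String[] {
      "--input", "/tmp/prefs.csv",
      "--output", "/tmp/similarities",
      "--tempDir", "/tmp/itemSimilarityTmp",
      "--similarityClassname", "SIMILARITY_COOCCURRENCE",
      "--maxSimilaritiesPerItem", "100",
      "--booleanData", "false" });
}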
/**
 * integration test with a tiny data set
 *
 * <pre>
 * user-item-matrix
 *
 *        Game Mouse PC Disk
 * Jane    -    1    2   -
 * Paul    1    -    1   -
 * Fred    -    -    -   1
 * </pre>
 */
@Test
public void testCompleteJob() throws Exception {

  File inputFile = getTestTempFile("prefs.txt");
  File outputDir = getTestTempDir("output");
  outputDir.delete();
  File tmpDir = getTestTempDir("tmp");

  writeLines(inputFile,
      "2,1,1",
      "1,2,1",
      "3,4,1",
      "1,3,2",
      "2,3,1");

  ItemSimilarityJob similarityJob = new ItemSimilarityJob();

  Configuration conf = new Configuration();
  conf.set("mapred.input.dir", inputFile.getAbsolutePath());
  conf.set("mapred.output.dir", outputDir.getAbsolutePath());
  conf.setBoolean("mapred.output.compress", false);

  similarityJob.setConf(conf);

  similarityJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(),
      "--similarityClassname",
      DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName() });

  File countUsersPart = new File(tmpDir, "countUsers");
  int numberOfUsers = TasteHadoopUtils.readIntFromFile(new Configuration(),
      new Path(countUsersPart.getAbsolutePath()));
  assertEquals(3, numberOfUsers);

  File outPart = outputDir.listFiles(new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.startsWith("part-");
    }
  })[0];
  BufferedReader reader = new BufferedReader(new FileReader(outPart));

  String line;
  int currentLine = 1;
  while ((line = reader.readLine()) != null) {

    String[] tokens = line.split("\t");

    long itemAID = Long.parseLong(tokens[0]);
    long itemBID = Long.parseLong(tokens[1]);
    double similarity = Double.parseDouble(tokens[2]);

    if (currentLine == 1) {
      assertEquals(1L, itemAID);
      assertEquals(3L, itemBID);
      assertEquals(0.45, similarity, 0.01);
    }

    if (currentLine == 2) {
      assertEquals(2L, itemAID);
      assertEquals(3L, itemBID);
      assertEquals(0.89, similarity, 0.01);
    }

    currentLine++;
  }
  reader.close();

  int linesWritten = currentLine - 1;
  assertEquals(2, linesWritten);
}
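The asserted numbers can be reproduced by hand from the matrix in the javadoc: over the user axis (Jane, Paul, Fred) the item columns are Game = (0,1,0), Mouse = (1,0,0) and PC = (2,1,0), so uncentered zero-assuming cosine gives sim(Game, PC) = 1/√5 ≈ 0.447 and sim(Mouse, PC) = 2/√5 ≈ 0.894, matching the 0.45 and 0.89 assertions. A standalone check of that arithmetic:

// Self-contained sanity check of the expected similarities; plain cosine over
// the item columns of the user-item matrix above.
public class CosineCheck {

  static double cosine(double[] a, double[] b) {
    double dot = 0.0;
    double normA = 0.0;
    double normB = 0.0;
    for (int i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
  }

  public static void main(String[] args) {
    double[] game = {0, 1, 0};  // item 1, components: Jane, Paul, Fred
    double[] mouse = {1, 0, 0}; // item 2
    double[] pc = {2, 1, 0};    // item 3
    System.out.println(cosine(game, pc));  // ~0.447, asserted as 0.45
    System.out.println(cosine(mouse, pc)); // ~0.894, asserted as 0.89
  }
}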