/**
 * Drives the item-similarity pipeline: parses the command line, then runs a sequence of
 * MapReduce jobs that write their intermediate results below the temp directory and the final
 * item-item similarities to the output path.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>itemIDIndex - written to {@code itemIDIndex}</li>
 *   <li>countUsers - written to {@code countUsers}</li>
 *   <li>toUserVector - written to {@code userVectors}</li>
 *   <li>maybePruneAndTranspose - written to {@code itemUserMatrix}</li>
 *   <li>{@link RowSimilarityJob} - written to {@code similarityMatrix} (always executed, it is
 *       not gated by {@code shouldRunNextPhase})</li>
 *   <li>mostSimilarItems - written to the output path</li>
 * </ol>
 *
 * @param args command line arguments, see the options registered at the top of the method
 * @return 0 on success; -1 if the arguments could not be parsed or one of the MapReduce jobs
 *         failed; the exit code of {@link RowSimilarityJob} if that job returned non-zero
 * @throws Exception if a job cannot be configured, submitted or monitored
 */
@Override
public int run(String[] args) throws Exception {

  addInputOption();
  addOutputOption();
  addOption(
      "similarityClassname",
      "s",
      "Name of distributed similarity class to instantiate, alternatively use "
          + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
  addOption(
      "maxSimilaritiesPerItem",
      "m",
      "try to cap the number of similar items per item to this number "
          + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
      String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
  addOption(
      "maxCooccurrencesPerItem",
      "mo",
      "try to cap the number of cooccurrences per item to this number "
          + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
      String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
  addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

  Map<String, String> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }

  String similarityClassName = parsedArgs.get("--similarityClassname");
  int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
  int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
  boolean booleanData = Boolean.parseBoolean(parsedArgs.get("--booleanData"));

  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();
  Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

  // locations of the intermediate results, all below the temp directory
  Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
  Path countUsersPath = new Path(tempDirPath, "countUsers");
  Path userVectorPath = new Path(tempDirPath, "userVectors");
  Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
  Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

  AtomicInteger currentPhase = new AtomicInteger();

  // Every job's completion status is checked: continuing after a failed job would only feed
  // missing or partial data into the following steps and report success to the caller.

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job itemIDIndex = prepareJob(
        inputPath,
        itemIDIndexPath,
        TextInputFormat.class,
        ItemIDIndexMapper.class,
        VarIntWritable.class,
        VarLongWritable.class,
        ItemIDIndexReducer.class,
        VarIntWritable.class,
        VarLongWritable.class,
        SequenceFileOutputFormat.class);
    itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
    if (!itemIDIndex.waitForCompletion(true)) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job countUsers = prepareJob(
        inputPath,
        countUsersPath,
        TextInputFormat.class,
        CountUsersMapper.class,
        CountUsersKeyWritable.class,
        VarLongWritable.class,
        CountUsersReducer.class,
        VarIntWritable.class,
        NullWritable.class,
        TextOutputFormat.class);
    countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
    countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
    if (!countUsers.waitForCompletion(true)) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job toUserVector = prepareJob(
        inputPath,
        userVectorPath,
        TextInputFormat.class,
        ToItemPrefsMapper.class,
        VarLongWritable.class,
        booleanData ? VarLongWritable.class : EntityPrefWritable.class,
        ToUserVectorReducer.class,
        VarLongWritable.class,
        VectorWritable.class,
        SequenceFileOutputFormat.class);
    toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
    if (!toUserVector.waitForCompletion(true)) {
      return -1;
    }
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job maybePruneAndTranspose = prepareJob(
        userVectorPath,
        itemUserMatrixPath,
        SequenceFileInputFormat.class,
        MaybePruneRowsMapper.class,
        IntWritable.class,
        DistributedRowMatrix.MatrixEntryWritable.class,
        ToItemVectorsReducer.class,
        IntWritable.class,
        VectorWritable.class,
        SequenceFileOutputFormat.class);
    maybePruneAndTranspose
        .getConfiguration()
        .setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES, maxCooccurrencesPerItem);
    if (!maybePruneAndTranspose.waitForCompletion(true)) {
      return -1;
    }
  }

  // the user count produced by the countUsers job is passed on as the number of matrix columns
  int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

  /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
   * new DistributedRowMatrix(...).rowSimilarity(...)
   */
  // NOTE(review): the "+ 1" on maxSimilaritiesPerRow presumably leaves room for each item's
  // similarity to itself - confirm against RowSimilarityJob.
  int rowSimilarityExitCode = ToolRunner.run(
      getConf(),
      new RowSimilarityJob(),
      new String[] {
        "-Dmapred.input.dir=" + itemUserMatrixPath.toString(),
        "-Dmapred.output.dir=" + similarityMatrixPath.toString(),
        "--numberOfColumns", String.valueOf(numberOfUsers),
        "--similarityClassname", similarityClassName,
        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
        "--tempDir", tempDirPath.toString()
      });
  if (rowSimilarityExitCode != 0) {
    return rowSimilarityExitCode;
  }

  if (shouldRunNextPhase(parsedArgs, currentPhase)) {
    Job mostSimilarItems = prepareJob(
        similarityMatrixPath,
        outputPath,
        SequenceFileInputFormat.class,
        MostSimilarItemPairsMapper.class,
        EntityEntityWritable.class,
        DoubleWritable.class,
        MostSimilarItemPairsReducer.class,
        EntityEntityWritable.class,
        DoubleWritable.class,
        TextOutputFormat.class);
    Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
    mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
    mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
    mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
    if (!mostSimilarItems.waitForCompletion(true)) {
      return -1;
    }
  }

  return 0;
}
/**
 * Integration test with a tiny data set: runs the complete {@link ItemSimilarityJob} and checks
 * the user count written to the temp directory as well as the item pairs written to the output.
 *
 * <pre>
 * user-item-matrix
 *
 *        Game   Mouse   PC    Disk
 * Jane    -       1      2      -
 * Paul    1       -      1      -
 * Fred    -       -      -      1
 * </pre>
 *
 * Each input line is {@code userID,itemID,preference}; with the uncentered zero-assuming cosine
 * the expected similarities are 1/sqrt(5) (about 0.45) for items 1 and 3 and 2/sqrt(5)
 * (about 0.89) for items 2 and 3.
 */
public void testCompleteJob() throws Exception {

  File inputFile = getTestTempFile("prefs.txt");
  File outputDir = getTestTempDir("output");
  // NOTE(review): removed up front presumably because the job refuses to write into an
  // already existing output directory - confirm
  outputDir.delete();
  File tmpDir = getTestTempDir("tmp");

  writeLines(inputFile,
      "2,1,1",
      "1,2,1",
      "3,4,1",
      "1,3,2",
      "2,3,1");

  ItemSimilarityJob similarityJob = new ItemSimilarityJob();

  Configuration conf = new Configuration();
  conf.set("mapred.input.dir", inputFile.getAbsolutePath());
  conf.set("mapred.output.dir", outputDir.getAbsolutePath());
  conf.setBoolean("mapred.output.compress", false);

  similarityJob.setConf(conf);

  int exitCode = similarityJob.run(
      new String[] {
        "--tempDir", tmpDir.getAbsolutePath(),
        "--similarityClassname",
        DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName()
      });
  // a non-zero exit code means the arguments were rejected
  assertEquals(0, exitCode);

  File countUsersPart = new File(tmpDir, "countUsers");
  int numberOfUsers = TasteHadoopUtils.readIntFromFile(
      new Configuration(), new Path(countUsersPart.getAbsolutePath()));

  assertEquals(3, numberOfUsers);

  File[] outParts = outputDir.listFiles(
      new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
          return name.startsWith("part-");
        }
      });
  // fail with a readable message instead of a NullPointerException or
  // ArrayIndexOutOfBoundsException when the job produced no output
  if (outParts == null || outParts.length == 0) {
    throw new IllegalStateException("no part-* file found in " + outputDir.getAbsolutePath());
  }
  File outPart = outParts[0];

  int currentLine = 1;
  BufferedReader reader = new BufferedReader(new FileReader(outPart));
  try {
    String line;
    while ((line = reader.readLine()) != null) {

      // each output line is: itemA <TAB> itemB <TAB> similarity
      String[] tokens = line.split("\t");

      long itemAID = Long.parseLong(tokens[0]);
      long itemBID = Long.parseLong(tokens[1]);
      double similarity = Double.parseDouble(tokens[2]);

      if (currentLine == 1) {
        assertEquals(1L, itemAID);
        assertEquals(3L, itemBID);
        assertEquals(0.45, similarity, 0.01);
      }

      if (currentLine == 2) {
        assertEquals(2L, itemAID);
        assertEquals(3L, itemBID);
        assertEquals(0.89, similarity, 0.01);
      }

      currentLine++;
    }
  } finally {
    // release the file handle even when one of the assertions above fails
    reader.close();
  }

  int linesWritten = currentLine - 1;
  assertEquals(2, linesWritten);
}