/** * Probes a model to determine the effect of a particular variable. This is done with the ade of a * trace dictionary which has recorded the locations in the feature vector that are modified by * various variable values. We can set these locations to 1 and then look at the resulting score. * This tells us the weight the model places on that variable. * * @param features A feature vector to use (destructively) * @param traceDictionary A trace dictionary containing variables and what locations in the * feature vector are affected by them * @param learner The model that we are probing to find weights on features */ public void update( Vector features, Map<String, Set<Integer>> traceDictionary, AbstractVectorClassifier learner) { // zero out feature vector features.assign(0); for (Map.Entry<String, Set<Integer>> entry : traceDictionary.entrySet()) { // get a feature and locations where it is stored in the feature vector String key = entry.getKey(); Set<Integer> value = entry.getValue(); // if we haven't looked at this feature yet if (!weightMap.containsKey(key)) { // put probe values in the feature vector for (Integer where : value) { features.set(where, 1); } // see what the model says Vector v = learner.classifyNoLink(features); weightMap.put(key, v); // and zero out those locations again for (Integer where : value) { features.set(where, 0); } } } }
/**
 * Checks that {@link VectorDistanceInvertedMapper} writes a single record keyed by the input
 * vector's name, whose value holds the distance from the input to each seed vector.
 *
 * <p>The input "other" is (2, 2) and the seeds are "foo" = (1, 1) and "foo2" = (2, 1); with a
 * {@link EuclideanDistanceMeasure} the expected output vector is (sqrt(2), 1).
 */
@Test
public void testVectorDistanceInvertedMapper() throws Exception {
  // seed vectors the mapper measures against
  Vector fooSeed = new RandomAccessSparseVector(2);
  fooSeed.set(0, 1);
  fooSeed.set(1, 1);
  Vector foo2Seed = new RandomAccessSparseVector(2);
  foo2Seed.set(0, 2);
  foo2Seed.set(1, 1);
  List<NamedVector> seeds = new ArrayList<NamedVector>();
  seeds.add(new NamedVector(fooSeed, "foo"));
  seeds.add(new NamedVector(foo2Seed, "foo2"));

  // the vector fed into the mapper
  Vector input = new NamedVector(new RandomAccessSparseVector(2), "other");
  input.set(0, 2);
  input.set(1, 2);

  // record the one write the mapper is expected to perform
  Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);
  Vector expectedDistances = new DenseVector(new double[] {Math.sqrt(2.0), 1.0});
  context.write(new Text("other"), new VectorWritable(expectedDistances));
  EasyMock.replay(context);

  // inject the collaborators by field name, then run the mapper
  VectorDistanceInvertedMapper mapper = new VectorDistanceInvertedMapper();
  setField(mapper, "measure", new EuclideanDistanceMeasure());
  setField(mapper, "seedVectors", seeds);
  mapper.map(new IntWritable(123), new VectorWritable(input), context);

  EasyMock.verify(context);
}
/**
 * Tests {@link MostSimilarItemPairsMapper}.
 *
 * <p>The mapper is given the similarity row of item 34, containing entries for items 12 (0.2),
 * 34 (1.0) and 56 (0.9), and is configured with {@code maxSimilarItemsPerItem = 1}. The only
 * write expected on the context is the pair (34, 56) with similarity 0.9.
 */
@Test
public void testMostSimilarItemsPairsMapper() throws Exception {
  // index -> item ID lookup injected into the mapper; here every index maps to the same ID
  OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
  indexItemIDMap.put(12, 12L);
  indexItemIDMap.put(34, 34L);
  indexItemIDMap.put(56, 56L);

  // record the single expected output
  Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);
  context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));
  EasyMock.replay(context);

  // similarity row for item 34
  Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
  vector.set(12, 0.2);
  vector.set(34, 1.0);
  vector.set(56, 0.9);

  MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
  setField(mapper, "indexItemIDMap", indexItemIDMap);
  setField(mapper, "maxSimilarItemsPerItem", 1);

  mapper.map(new IntWritable(34), new VectorWritable(vector), context);

  EasyMock.verify(context);
}
/**
 * Tests {@link PartialMultiplyMapper}.
 *
 * <p>The input bundles one similarity column with the ids 123 and 456 and the preference values
 * 1.0 and 3.0. The mapper is expected to write one record per id, each carrying that id's
 * preference value together with the similarity column.
 */
@Test
public void testPartialMultiplyMapper() throws Exception {
  Vector column = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
  column.set(3, 0.5);
  column.set(7, 0.8);

  // expected output values, one per id
  PrefAndSimilarityColumnWritable expectedFor123 = new PrefAndSimilarityColumnWritable();
  expectedFor123.set(1.0f, column);
  PrefAndSimilarityColumnWritable expectedFor456 = new PrefAndSimilarityColumnWritable();
  expectedFor456.set(3.0f, column);

  Mapper<VarIntWritable, VectorAndPrefsWritable, VarLongWritable, PrefAndSimilarityColumnWritable>
          .Context
      context = EasyMock.createMock(Mapper.Context.class);
  context.write(EasyMock.eq(new VarLongWritable(123L)), EasyMock.eq(expectedFor123));
  context.write(EasyMock.eq(new VarLongWritable(456L)), EasyMock.eq(expectedFor456));
  EasyMock.replay(context);

  VectorAndPrefsWritable input =
      new VectorAndPrefsWritable(column, Arrays.asList(123L, 456L), Arrays.asList(1.0f, 3.0f));
  PartialMultiplyMapper mapper = new PartialMultiplyMapper();
  mapper.map(new VarIntWritable(1), input, context);

  EasyMock.verify(context);
}
/**
 * Tests {@link ToVectorAndPrefReducer}.
 *
 * <p>For key 1 the reducer receives two preference entries (123 with 1.0, 456 with 2.0) and one
 * similarity column with elements 3 -> 0.5 and 7 -> 0.8. It is expected to write a single record
 * for key 1 that combines those ids, preference values and vector elements.
 */
@Test
public void testToVectorAndPrefReducer() throws Exception {
  VarIntWritable key = new VarIntWritable(1);

  Reducer<VarIntWritable, VectorOrPrefWritable, VarIntWritable, VectorAndPrefsWritable>.Context
      context = EasyMock.createMock(Reducer.Context.class);
  context.write(
      EasyMock.eq(new VarIntWritable(1)),
      vectorAndPrefsWritableMatches(
          Arrays.asList(123L, 456L),
          Arrays.asList(1.0f, 2.0f),
          MathHelper.elem(3, 0.5),
          MathHelper.elem(7, 0.8)));
  EasyMock.replay(context);

  // reducer inputs: two preferences followed by the similarity column
  Vector column = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
  column.set(3, 0.5);
  column.set(7, 0.8);
  VectorOrPrefWritable firstPref = new VectorOrPrefWritable(123L, 1.0f);
  VectorOrPrefWritable secondPref = new VectorOrPrefWritable(456L, 2.0f);
  VectorOrPrefWritable columnEntry = new VectorOrPrefWritable(column);

  ToVectorAndPrefReducer reducer = new ToVectorAndPrefReducer();
  reducer.reduce(key, Arrays.asList(firstPref, secondPref, columnEntry), context);

  EasyMock.verify(context);
}
/**
 * Puts all of the weight on the entry with the largest value.
 *
 * @param probabilities the vector whose maximum-valued index is selected
 * @return a new sparse vector of the same size as {@code probabilities} holding 1.0 at the index
 *     reported by {@code probabilities.maxValueIndex()}; no other entry is set
 */
@Override
public Vector select(Vector probabilities) {
  int winner = probabilities.maxValueIndex();
  Vector selection = new SequentialAccessSparseVector(probabilities.size());
  selection.set(winner, 1.0);
  return selection;
}
/**
 * Sums the partial dot-product vectors of one row, converts each non-zero sum into a similarity
 * value, and writes the similarities that reach the threshold.
 *
 * @param row index of the row being reduced
 * @param partialDots partial dot-product vectors for this row; the first element is read
 *     unconditionally, so at least one is required
 * @param ctx context the resulting similarity vector is written to
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx)
    throws IOException, InterruptedException {
  Iterator<VectorWritable> partials = partialDots.iterator();

  // the first partial vector doubles as the accumulator for all the others
  Vector summedDots = partials.next().get();
  while (partials.hasNext()) {
    Vector partial = partials.next().get();
    for (Iterator<Vector.Element> addends = partial.iterateNonZero(); addends.hasNext(); ) {
      Vector.Element addend = addends.next();
      int column = addend.index();
      summedDots.setQuick(column, summedDots.getQuick(column) + addend.get());
    }
  }

  Vector result = summedDots.like();
  int rowIndex = row.get();
  double rowNorm = norms.getQuick(rowIndex);

  for (Iterator<Vector.Element> dotsIt = summedDots.iterateNonZero(); dotsIt.hasNext(); ) {
    Vector.Element dot = dotsIt.next();
    int other = dot.index();
    double value =
        similarity.similarity(dot.get(), rowNorm, norms.getQuick(other), numberOfColumns);
    // keep only similarities that reach the configured threshold
    if (value >= treshold) {
      result.set(other, value);
    }
  }

  if (excludeSelfSimilarity) {
    // blank out the similarity of the row with itself
    result.setQuick(rowIndex, 0);
  }

  ctx.write(row, new VectorWritable(result));
}
// @Override public void reduce(VarLongWritable userID, Iterable<VarLongWritable> itemPrefs, Context context) throws IOException, InterruptedException { Vector userVector = new RandomAccessSparseVector(Integer.MAX_VALUE, 100); for (VarLongWritable itemPref : itemPrefs) { userVector.set((int) itemPref.get(), 1.0f); } context.write(userID, (new VectorWritable(userVector))); }
/**
 * Brings the persisted state in line with the in-memory state: stores basis vectors that are not
 * yet on the file system, refreshes the scale factor and diagonal matrix, and stores the
 * projections, norms, scale factor and singular vectors via {@code persistVector}.
 *
 * <p>Does nothing when {@code conf} is {@code null}. On completion the iteration number is set to
 * the count of basis vectors found or stored.
 *
 * @throws IOException declared for the file-system and persistence calls made here
 */
protected void updateHdfsState() throws IOException {
  if (conf == null) {
    return;
  }
  // count the basis vectors already stored by probing consecutive file names until one is missing
  int numBasisVectorsOnDisk = 0;
  Path nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + numBasisVectorsOnDisk);
  while (fs.exists(nextBasisVectorPath)) {
    nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + ++numBasisVectorsOnDisk);
  }
  // store the basis vectors that are available but not yet stored, stopping at the current
  // iteration number or at the first vector that cannot be obtained
  Vector nextVector;
  while (numBasisVectorsOnDisk < iterationNumber
      && (nextVector = getBasisVector(numBasisVectorsOnDisk)) != null) {
    persistVector(nextBasisVectorPath, numBasisVectorsOnDisk, nextVector);
    nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + ++numBasisVectorsOnDisk);
  }
  if (scaleFactor <= 0) {
    scaleFactor = getScaleFactor(); // load from disk if possible
  }
  diagonalMatrix = getDiagonalMatrix(); // load from disk if possible
  // norms takes the entries just right of the main diagonal, projections the main diagonal
  Vector norms = new DenseVector(diagonalMatrix.numCols() - 1);
  Vector projections = new DenseVector(diagonalMatrix.numCols());
  int i = 0;
  while (i < diagonalMatrix.numCols() - 1) {
    norms.set(i, diagonalMatrix.get(i, i + 1));
    projections.set(i, diagonalMatrix.get(i, i));
    i++;
  }
  // the last diagonal entry has no right-hand neighbour, so it is copied after the loop
  projections.set(i, diagonalMatrix.get(i, i));
  persistVector(new Path(baseDir, "projections"), 0, projections);
  persistVector(new Path(baseDir, "norms"), 0, norms);
  // the scalar scale factor is wrapped in a one-element vector so persistVector can store it
  persistVector(new Path(baseDir, "scaleFactor"), 0, new DenseVector(new double[] {scaleFactor}));
  // one file per singular vector, named after its map key
  for (Map.Entry<Integer, Vector> entry : singularVectors.entrySet()) {
    persistVector(
        new Path(singularVectorPath, SINGULAR_PREFIX + '_' + entry.getKey()),
        entry.getKey(),
        entry.getValue());
  }
  super.setIterationNumber(numBasisVectorsOnDisk);
}
/**
 * Checks that {@link VectorDistanceMapper} writes one record per seed vector, keyed by the tuple
 * (seed name, input key) and carrying the distance between the seed and the input vector.
 *
 * <p>The input (2, 2) is mapped under key 123 against the seeds "foo" = (1, 1) and
 * "foo2" = (2, 1); with a {@link EuclideanDistanceMeasure} the expected distances are sqrt(2)
 * and 1.
 */
@Test
public void testVectorDistanceMapper() throws Exception {
  Mapper<WritableComparable<?>, VectorWritable, StringTuple, DoubleWritable>.Context context =
      EasyMock.createMock(Mapper.Context.class);

  // expected record for seed "foo"
  StringTuple fooKey = new StringTuple();
  fooKey.add("foo");
  fooKey.add("123");
  context.write(fooKey, new DoubleWritable(Math.sqrt(2.0)));

  // expected record for seed "foo2"
  StringTuple foo2Key = new StringTuple();
  foo2Key.add("foo2");
  foo2Key.add("123");
  context.write(foo2Key, new DoubleWritable(1));

  EasyMock.replay(context);

  Vector input = new RandomAccessSparseVector(2);
  input.set(0, 2);
  input.set(1, 2);

  Vector fooSeed = new RandomAccessSparseVector(2);
  fooSeed.set(0, 1);
  fooSeed.set(1, 1);
  Vector foo2Seed = new RandomAccessSparseVector(2);
  foo2Seed.set(0, 2);
  foo2Seed.set(1, 1);
  List<NamedVector> seeds = new ArrayList<NamedVector>();
  seeds.add(new NamedVector(fooSeed, "foo"));
  seeds.add(new NamedVector(foo2Seed, "foo2"));

  // inject the collaborators by field name, then run the mapper
  VectorDistanceMapper mapper = new VectorDistanceMapper();
  setField(mapper, "measure", new EuclideanDistanceMeasure());
  setField(mapper, "seedVectors", seeds);
  mapper.map(new IntWritable(123), new VectorWritable(input), context);

  EasyMock.verify(context);
}
/**
 * Generates a corpus matrix of sampled counts. A single topic-count vector is drawn first by
 * incrementing {@code numTopicsPerDoc} randomly chosen entries; every document row is then filled
 * by counting the samples an {@code LDASampler} returns for that same topic vector.
 *
 * @param matrix model matrix handed to the sampler; its row count sizes the topic vector and its
 *     column count sizes the corpus rows
 * @param random source of randomness for the topic draws (also handed to the sampler)
 * @param numDocs number of rows in the returned corpus
 * @param numSamples number of samples requested from the sampler per document
 * @param numTopicsPerDoc number of random increments applied to the shared topic vector
 * @return a {@code numDocs} x {@code matrix.numCols()} matrix of sample counts
 */
public static Matrix sampledCorpus(
    Matrix matrix, Random random, int numDocs, int numSamples, int numTopicsPerDoc) {
  Matrix corpus = new SparseRowMatrix(numDocs, matrix.numCols());
  LDASampler sampler = new LDASampler(matrix, random);

  // one topic-count vector, shared by all documents
  Vector topicCounts = new DenseVector(matrix.numRows());
  int draw = 0;
  while (draw < numTopicsPerDoc) {
    int chosen = random.nextInt(topicCounts.size());
    double previous = topicCounts.get(chosen);
    topicCounts.set(chosen, previous + 1);
    draw++;
  }

  // tally the sampled column indices for each document
  for (int doc = 0; doc < numDocs; doc++) {
    int[] samples = sampler.sample(topicCounts, numSamples);
    for (int column : samples) {
      double count = corpus.get(doc, column);
      corpus.set(doc, column, count + 1);
    }
  }
  return corpus;
}
/**
 * Builds test data made of one large sample plus five smaller offset samples.
 *
 * <p>The first {@code K1} points are drawn from a {@code MultiNormal} constructed with
 * {@code radius} and a {@code ConstantVector(0, 10)}. Five further groups of 1000 points each are
 * drawn from a {@code MultiNormal} constructed with {@code radius} and a 10-element vector that
 * is zero except for the value 6 at position {@code i} (for i = 0..4). Every point gets weight 1
 * and a running row index.
 *
 * @param radius first constructor argument of every {@code MultiNormal} used here
 * @return the generated points, {@code K1 + 5000} in total
 */
private List<? extends WeightedVector> cubishTestData(double radius) {
  final int offsetGroups = 5;
  final int pointsPerGroup = 1000;
  final int dimension = 10;
  final double offset = 6;

  List<WeightedVector> data =
      Lists.newArrayListWithCapacity(K1 + offsetGroups * pointsPerGroup);
  int row = 0;

  MultiNormal g = new MultiNormal(radius, new ConstantVector(0, dimension));
  for (int i = 0; i < K1; i++) {
    data.add(new WeightedVector(g.sample(), 1, row++));
  }

  for (int i = 0; i < offsetGroups; i++) {
    Vector m = new DenseVector(dimension);
    // the original wrote "i == 0 ? 6 : 6"; both branches were identical, so the offset is constant
    m.set(i, offset);
    MultiNormal gx = new MultiNormal(radius, m);
    for (int j = 0; j < pointsPerGroup; j++) {
      data.add(new WeightedVector(gx.sample(), 1, row++));
    }
  }
  return data;
}
@Override protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx) throws IOException, InterruptedException { int itemIDIndex = TasteHadoopUtils.idToIndex(itemID.get()); Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1); /* artificial NaN summand to exclude this item from the recommendations for all users specified in userIDs */ vector.set(itemIDIndex, Double.NaN); // 这是过滤的trick // 从这里可以反推出来,AggregateAndRecommendReducer里面过滤了评分过的 user,item这种pair List<Long> userIDs = Lists.newArrayList(); List<Float> prefValues = Lists.newArrayList(); for (VarLongWritable userID : values) { userIDs.add(userID.get()); prefValues.add(1.0f); } itemIDIndexWritable.set(itemIDIndex); vectorAndPrefs.set(vector, userIDs, prefValues); ctx.write(itemIDIndexWritable, vectorAndPrefs); }