@Test
public void testDampedEqualKeys() {
    long[] keys = {2, 5, 6};
    double[] val1 = {1, 2, 1};
    double[] val2 = {1, 2, 5};
    SparseVector v1 = MutableSparseVector.wrap(keys, val1).freeze();
    SparseVector v2 = MutableSparseVector.wrap(keys, val2).freeze();
    assertEquals(0.375, dampedSimilarity.similarity(v1, v1), EPSILON);
    assertEquals(0.42705098, dampedSimilarity.similarity(v1, v2), EPSILON);
}
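// Worked check (a sketch): the expected values above are consistent with a damped cosine,
//     sim(a, b) = (a . b) / (||a|| * ||b|| + d),
// with damping d = 10, an assumption inferred from the constants rather than shown here:
//     v1 . v1 = 6 and ||v1||^2 = 6, so 6 / (6 + 10) = 0.375
//     v1 . v2 = 10 and ||v1|| * ||v2|| = sqrt(180) ≈ 13.4164, so 10 / 23.4164 ≈ 0.42705098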
@Test
public void testEqualKeys() {
    long[] keys = {2, 5, 6};
    double[] val1 = {1, 2, 1};
    double[] val2 = {1, 2, 5};
    SparseVector v1 = MutableSparseVector.wrap(keys, val1).freeze();
    SparseVector v2 = MutableSparseVector.wrap(keys, val2).freeze();
    assertEquals(1, similarity.similarity(v1, v1), EPSILON);
    assertEquals(0.745355993, similarity.similarity(v1, v2), EPSILON);
}
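// Worked check of the expected cosine (a sketch, not part of the test suite):
//     v1 . v2 = 1*1 + 2*2 + 1*5 = 10
//     ||v1|| = sqrt(1 + 4 + 1) = sqrt(6), ||v2|| = sqrt(1 + 4 + 25) = sqrt(30)
//     cos(v1, v2) = 10 / sqrt(6 * 30) = 10 / sqrt(180) ≈ 0.745355993, matching the assertion above.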
/**
 * Score items in a vector. The key domain of the provided vector is the items to score;
 * the score method sets the value for each item to its score (or unsets it if no score
 * can be provided). The previous values are discarded.
 *
 * @param user   The user ID.
 * @param scores The score vector.
 */
@Override
public void score(long user, @Nonnull MutableSparseVector scores) {
    // Score every item in the key domain, whether or not it currently has a value.
    for (VectorEntry e : scores.fast(VectorEntry.State.EITHER)) {
        long item = e.getKey();
        double score = prediction(user, item);
        scores.set(e, score);
    }
}
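// A minimal usage sketch: the caller chooses which items get scored by choosing the key
// domain of the vector it passes in. ('scorer' stands for an instance of this class, the
// IDs are hypothetical, and it.unimi.dsi.fastutil.longs.LongOpenHashSet is assumed.)
long userId = 42;
LongSet candidates = new LongOpenHashSet(new long[]{101, 102, 103});
MutableSparseVector output = MutableSparseVector.create(candidates);
scorer.score(userId, output);
// output.get(101) now holds item 101's score, if one could be computed.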
@Test
public void testOverlap() {
    long[] k1 = {1, 2, 5, 6};
    double[] val1 = {3, 1, 2, 1};
    long[] k2 = {2, 3, 5, 6, 7};
    double[] val2 = {1, 7, 2, 5, 0};
    SparseVector v1 = MutableSparseVector.wrap(k1, val1).freeze();
    SparseVector v2 = MutableSparseVector.wrap(k2, val2).freeze();
    assertEquals(1, similarity.similarity(v1, v1), EPSILON);
    assertEquals(1, similarity.similarity(v2, v2), EPSILON);
    assertEquals(0.29049645, similarity.similarity(v1, v2), EPSILON);
}
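// Worked check (a sketch): only the shared keys {2, 5, 6} contribute to the dot product:
//     v1 . v2 = 1*1 + 2*2 + 1*5 = 10
//     ||v1|| = sqrt(9 + 1 + 4 + 1) = sqrt(15), ||v2|| = sqrt(1 + 49 + 4 + 25 + 0) = sqrt(79)
//     cos(v1, v2) = 10 / sqrt(15 * 79) ≈ 0.29049645, matching the assertion above.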
@Test
public void testDisjoint() {
    long[] k1 = {2, 5, 6};
    double[] val1 = {1, 3, 2};
    long[] k2 = {3, 4, 7};
    double[] val2 = {1, 3, 2};
    SparseVector v1 = MutableSparseVector.wrap(k1, val1).freeze();
    SparseVector v2 = MutableSparseVector.wrap(k2, val2).freeze();
    assertEquals(0, similarity.similarity(v1, v2), EPSILON);
    assertEquals(0, dampedSimilarity.similarity(v1, v2), EPSILON);
}
@Override
public void predict(long uid, @Nonnull MutableSparseVector predictions) {
    logger.debug("predicting {} items for {}", predictions.keyDomain().size(), uid);
    OrdRecModel params = new OrdRecModel(quantizer);
    SparseVector ratings = makeUserVector(uid, userEventDao);
    LongSet keySet = LongUtils.setUnion(ratings.keySet(), predictions.keyDomain());
    MutableSparseVector scores = MutableSparseVector.create(keySet);
    itemScorer.score(uid, scores);
    params.train(ratings, scores);
    logger.debug("trained parameters for {}: {}", uid, params);

    Vector probabilities = Vector.createLength(params.getLevelCount());
    Long2ObjectMap<IVector> distChannel = null;
    if (reportDistribution) {
        distChannel = predictions.addChannel(RATING_PROBABILITY_CHANNEL);
    }

    for (VectorEntry e : predictions.fast(VectorEntry.State.EITHER)) {
        long iid = e.getKey();
        double score = scores.get(iid);
        params.getProbDistribution(score, probabilities);
        int mlIdx = probabilities.maxElementIndex();
        predictions.set(e, quantizer.getIndexValue(mlIdx));
        if (distChannel != null) {
            distChannel.put(e.getKey(), probabilities.immutable());
        }
    }
}
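// In effect, each prediction is the quantized rating level with the highest estimated
// probability given the item's base score s:
//     r-hat(u, i) = quantizer.getIndexValue( argmax_k P(rating = k | s) )
// and, when reportDistribution is set, the full distribution over levels is exposed to
// callers through the RATING_PROBABILITY_CHANNEL side channel.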
@Test
public void testItemMeanBaseline() {
    ItemScorer pred = new ItemMeanRatingItemScorer.Builder(dao, 0.0).get();
    long[] items = {5, 7, 10};
    double[] values = {3, 6, 4};
    // Note: this vector is not used by the assertions below.
    SparseVector map = MutableSparseVector.wrap(items, values).freeze();
    // unseen item, should be global mean
    assertThat(pred.score(10, 2), closeTo(RATINGS_DAT_MEAN, 0.001));
    // seen item - should be item average
    assertThat(pred.score(10, 5), closeTo(3.0, 0.001));
}
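// A sketch of the damped item-mean formula this scorer is expected to follow
// (the exact form is an assumption, not shown in this snippet):
//     b_i = mu + ( sum over users u who rated i of (r_ui - mu) ) / (n_i + damping)
// where mu is the global mean rating and n_i is the number of ratings for item i.
// With the damping of 0.0 used above, b_i reduces to item i's plain mean rating, and
// items with no ratings fall back to the global mean.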
/**
 * Build a rating matrix from the rating data. Each user's ratings are first normalized by
 * subtracting a baseline score (usually a mean).
 *
 * @param userMapping The index mapping of user IDs to row numbers.
 * @param itemMapping The index mapping of item IDs to column numbers.
 * @return A matrix storing the <i>normalized</i> user ratings.
 */
private RealMatrix createRatingMatrix(IdIndexMapping userMapping, IdIndexMapping itemMapping) {
    final int nusers = userMapping.size();
    final int nitems = itemMapping.size();

    // Create a matrix with users on rows and items on columns
    logger.info("creating {} by {} rating matrix", nusers, nitems);
    RealMatrix matrix = MatrixUtils.createRealMatrix(nusers, nitems);

    // populate it with data
    Cursor<UserHistory<Event>> users = userEventDAO.streamEventsByUser();
    try {
        for (UserHistory<Event> user : users) {
            // Get the row number for this user
            int u = userMapping.getIndex(user.getUserId());
            MutableSparseVector ratings = Ratings.userRatingVector(user.filter(Rating.class));
            MutableSparseVector baselines = MutableSparseVector.create(ratings.keySet());
            baselineScorer.score(user.getUserId(), baselines);

            // Populate this user's row with their ratings, minus the baseline scores
            for (VectorEntry entry : ratings.fast(State.SET)) {
                long itemid = entry.getKey();
                int i = itemMapping.getIndex(itemid);
                double rating = entry.getValue();
                double baseline = baselines.get(itemid);
                matrix.setEntry(u, i, rating - baseline);
            }
        }
    } finally {
        users.close();
    }

    return matrix;
}
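// A sketch of how the normalized matrix might feed an SVD factorization, using
// org.apache.commons.math3.linear.SingularValueDecomposition (compatible with the
// RealMatrix built above); the truncation to k factors is an assumption, not shown here:
RealMatrix ratings = createRatingMatrix(userMapping, itemMapping);
SingularValueDecomposition svd = new SingularValueDecomposition(ratings);
RealMatrix u = svd.getU();   // nusers x rank
RealMatrix s = svd.getS();   // rank x rank diagonal matrix of singular values
RealMatrix v = svd.getV();   // nitems x rank
// Keeping only the first k columns of U and V (and the leading k x k block of S)
// gives the usual rank-k latent-factor approximation of the normalized ratings.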
@Override
public double similarity(SparseVector vec1, SparseVector vec2) {
    // If either vector is empty, the similarity is undefined.
    if (Scalars.isZero(vec1.norm()) || Scalars.isZero(vec2.norm())) {
        return Double.NaN;
    }

    // Compute the difference of the two unit vectors over their combined key set.
    LongSet ts = LongUtils.setUnion(vec1.keySet(), vec2.keySet());
    MutableSparseVector v1 = MutableSparseVector.create(ts);
    v1.fill(0);
    v1.set(vec1);
    v1.multiply(1.0 / v1.norm());
    v1.addScaled(vec2, -1.0 / vec2.norm());

    final double distance = v1.norm();
    return 1 - distance;
}
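// Worked identity behind this similarity: for unit vectors u and v,
//     ||u - v||^2 = 2 - 2 * cos(u, v),
// so the value returned here is 1 - sqrt(2 - 2 * cos(u, v)) rather than the raw cosine.
// It equals 1 exactly when the vectors point in the same direction and shrinks as the
// angle between them grows.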
/**
 * The training function of OrdRec: learn the threshold parameters from the user's
 * ratings and base scores.
 */
@SuppressWarnings("ConstantConditions")
private void train(SparseVector ratings, MutableSparseVector scores) {
    Vector dbeta = Vector.createLength(beta.length());
    double dt1;
    // Run iterationCount passes of stochastic gradient updates over the ratings.
    for (int j = 0; j < iterationCount; j++) {
        for (VectorEntry rating : ratings.fast()) {
            long iid = rating.getKey();
            double score = scores.get(iid);
            int r = quantizer.index(rating.getValue());
            double probEqualR = getProbEQ(score, r);
            double probLessR = getProbLE(score, r);
            double probLessR1 = getProbLE(score, r - 1);

            dt1 = learningRate / probEqualR
                    * (probLessR * (1 - probLessR) * derivateOfBeta(r, 0, t1)
                       - probLessR1 * (1 - probLessR1) * derivateOfBeta(r - 1, 0, t1)
                       - regTerm * t1);

            double dbetaK;
            for (int k = 0; k < beta.length(); k++) {
                dbetaK = learningRate / probEqualR
                        * (probLessR * (1 - probLessR) * derivateOfBeta(r, k + 1, beta.get(k))
                           - probLessR1 * (1 - probLessR1) * derivateOfBeta(r - 1, k + 1, beta.get(k))
                           - regTerm * beta.get(k));
                dbeta.set(k, dbetaK);
            }
            t1 = t1 + dt1;
            beta.add(dbeta);
        }
    }
}
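// For context, a sketch of the ordinal model this code appears to implement (following
// Koren & Sill's OrdRec formulation): ratings are quantized into levels 1..R with
// thresholds t_1 and t_{r+1} = t_r + exp(beta_r), and
//     P(rating <= r | score) = sigmoid(t_r - score)
//     P(rating  = r | score) = P(<= r) - P(<= r-1)
// The loop above performs stochastic gradient updates on the log-likelihood of the
// observed ratings, with L2 regularization (regTerm) applied to t_1 and each beta_k.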
/**
 * Build a sparse vector directly from the list of IDs. This allows a scored ID list builder
 * to be used to efficiently accumulate a sparse vector. If the same ID is added multiple
 * times, the first instance is used.
 *
 * @return A sparse vector containing the data accumulated.
 */
public ImmutableSparseVector buildVector() {
    MutableSparseVector msv = MutableSparseVector.create(ids);
    final int size = size();
    for (int i = 0; i < size; i++) {
        msv.set(ids.get(i), scores.get(i));
    }

    for (ChannelStorage chan : channels.values()) {
        MutableSparseVector vchan = msv.getOrAddChannelVector(chan.symbol);
        for (int i = 0; i < size; i++) {
            vchan.set(ids.get(i), chan.values.get(i));
        }
    }

    for (TypedChannelStorage<?> chan : typedChannels.values()) {
        Long2ObjectMap vchan = msv.getOrAddChannel(chan.symbol);
        for (int i = 0; i < size; i++) {
            vchan.put(ids.get(i), chan.values.get(i));
        }
    }

    return msv.freeze();
}
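// Usage sketch (hypothetical; assumes the enclosing builder exposes an add(id, score)
// method, as the parallel 'ids' and 'scores' lists above suggest):
// builder.add(17, 3.5);
// builder.add(42, 4.0);
// ImmutableSparseVector vec = builder.buildVector();
// vec.get(17);   // 3.5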
/**
 * This method is where the model should actually be computed.
 *
 * @return The TF-IDF model (a model of item tag vectors).
 */
@Override
public TFIDFModel get() {
    // Build a map of tags to numeric IDs. This lets you convert tags (which are strings)
    // into long IDs that you can use as keys in a tag vector.
    Map<String, Long> tagIds = buildTagIdMap();

    // Create a vector to accumulate document frequencies for the IDF computation
    MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
    docFreq.fill(0);

    // We now proceed in 2 stages. First, we build a TF vector for each item.
    // While we do this, we also build the DF vector.
    // We will then apply the IDF to each TF vector and normalize it to a unit vector.

    // Create a map to store the item TF vectors.
    Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

    // Create a work vector to accumulate each item's tag vector.
    // This vector will be re-used for each item.
    MutableSparseVector work = MutableSparseVector.create(tagIds.values());

    // Iterate over the items to compute each item's vector.
    LongSet items = dao.getItemIds();
    for (long item : items) {
        // Reset the work vector for this item's tags.
        work.clear();
        // Now the vector is empty (all keys are 'unset').

        // Track which tags have already been counted toward this item's document frequency.
        Set<String> seenTags = new HashSet<String>();

        // Count each application of a tag to this item, and increment the document
        // frequency once for each unique tag on the item.
        for (String tag : dao.getItemTags(item)) {
            long tagId = tagIds.get(tag);
            // Unset keys have no value, so read the current count with a default of 0.
            work.set(tagId, work.get(tagId, 0) + 1);
            if (seenTags.add(tag)) {
                docFreq.set(tagId, docFreq.get(tagId) + 1);
            }
        }

        // Save a shrunk copy of the vector (only storing tags that apply to this item) in
        // our map; we'll apply IDF and normalize later.
        itemVectors.put(item, work.shrinkDomain());
        // work is ready to be reset and re-used for the next item
    }

    // Now we've seen all the items, so we have each item's TF vector and a global vector
    // of document frequencies.

    // Invert and log the document frequency. We can do this in-place.
    for (VectorEntry e : docFreq.fast()) {
        docFreq.set(e, Math.log(items.size() * 1.0 / e.getValue()));
    }
    // Now docFreq is a log-IDF vector.
    // So we can use it to apply IDF to each item vector to put it in the final model.

    // Create a map to store the final model data.
    Map<Long, SparseVector> modelData = Maps.newHashMap();
    for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
        MutableSparseVector tv = entry.getValue();
        // Convert this vector to a TF-IDF vector.
        for (VectorEntry e : tv.fast()) {
            tv.set(e, e.getValue() * docFreq.get(e.getKey()));
        }
        // Normalize the TF-IDF vector to be a unit vector.
        // The method tv.norm() gives the Euclidean length of the vector.
        tv.multiply(1.0 / tv.norm());
        // Store a frozen (immutable) version of the vector in the model data.
        modelData.put(entry.getKey(), tv.freeze());
    }

    // we technically don't need the IDF vector anymore, so long as we have no new tags
    return new TFIDFModel(tagIds, modelData);
}
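// For reference, a sketch of the standard formulation this builder computes:
//     tf(i, t) = number of times tag t is applied to item i
//     idf(t)   = log(N / df(t)),  where N = item count and df(t) = items bearing tag t
//     v_i(t)   = tf(i, t) * idf(t),  then v_i is scaled to unit length: v_i / ||v_i||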
/**
 * Create a new mutable vector over all tag IDs. The vector is initially empty, and its key
 * domain is the set of all tag IDs.
 *
 * @return A fresh vector over tag IDs.
 */
public MutableSparseVector newTagVector() {
    return MutableSparseVector.create(tagIds.values());
}
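// Usage sketch ('model' is an illustrative name for the enclosing object): the returned
// vector has every tag ID in its key domain but no values set, so entries must be set
// (or the whole vector filled) before they can be read.
MutableSparseVector tags = model.newTagVector();
tags.fill(0);   // now every tag ID maps to 0.0 and can be read or incremented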
/**
 * This method is where the model should actually be computed.
 *
 * @return The TF-IDF model (a model of item tag vectors).
 */
@Override
public TFIDFModel get() {
    // Build a map of tags to numeric IDs. This lets you convert tags (which are strings)
    // into long IDs that you can use as keys in a tag vector.
    Map<String, Long> tagIds = buildTagIdMap();

    // Create a vector to accumulate document frequencies for the IDF computation
    MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
    docFreq.fill(0);

    // We now proceed in 2 stages. First, we build a TF vector for each item.
    // While we do this, we also build the DF vector.
    // We will then apply the IDF to each TF vector and normalize it to a unit vector.

    // Create a map to store the item TF vectors.
    Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

    // Create a work vector to accumulate each item's tag vector.
    // This vector will be re-used for each item.
    MutableSparseVector work = MutableSparseVector.create(tagIds.values());

    // Iterate over the items to compute each item's vector.
    LongSet items = dao.getItemIds();
    for (long item : items) {
        // Reset the work vector for this item's tags.
        work.clear();
        // Now the vector is empty (all keys are 'unset').

        // Track the tags already counted toward this item's document frequency.
        Set<String> seenTags = new HashSet<String>();
        for (String tag : dao.getItemTags(item)) {
            long id = tagIds.get(tag);
            // Unset keys have no value, so read the current count with a default of 0.
            work.set(id, work.get(id, 0) + 1);
            if (seenTags.add(tag)) {
                docFreq.set(id, docFreq.get(id) + 1);
            }
        }

        // Save a shrunk copy of the vector (only storing tags that apply to this item) in
        // our map; we'll apply IDF and normalize later.
        itemVectors.put(item, work.shrinkDomain());
        // work is ready to be reset and re-used for the next item
    }

    // Now we've seen all the items, so we have each item's TF vector and a global vector
    // of document frequencies.

    // Invert and log the document frequency. We can do this in-place.
    for (VectorEntry e : docFreq.fast()) {
        docFreq.set(e.getKey(), Math.log(items.size() / e.getValue()));
    }
    // Now docFreq is a log-IDF vector.
    // So we can use it to apply IDF to each item vector to put it in the final model.

    // Create a map to store the final model data.
    Map<Long, SparseVector> modelData = Maps.newHashMap();
    for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
        MutableSparseVector tv = entry.getValue();
        // Convert this vector to a TF-IDF vector.
        for (VectorEntry e : tv.fast()) {
            tv.set(e.getKey(), e.getValue() * docFreq.get(e.getKey()));
        }
        // Normalize the TF-IDF vector to be a unit vector.
        // The method tv.norm() gives the Euclidean length of the vector.
        tv.multiply(1.0 / tv.norm());
        // Store a frozen (immutable) version of the vector in the model data.
        modelData.put(entry.getKey(), tv.freeze());
    }

    // we technically don't need the IDF vector anymore, so long as we have no new tags
    return new TFIDFModel(tagIds, modelData);
}
private SparseVector emptyVector() {
    long[] keys = {};
    double[] values = {};
    return MutableSparseVector.wrap(keys, values);
}
@Override
public MutableSparseVector unapply(MutableSparseVector vector) {
    vector.multiply(factor);
    return vector;
}
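// A sketch of the presumed forward transform (an assumption, not shown in this snippet):
// if apply() divides by the same factor, then unapply() restores the original scale and
// unapply(apply(v)) leaves v unchanged.
// @Override
// public MutableSparseVector apply(MutableSparseVector vector) {
//     vector.multiply(1.0 / factor);   // hypothetical counterpart to unapply()
//     return vector;
// }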