Ejemplo n.º 1
0
 /**
  * Checks the damped similarity function on two vectors that share the same key set: once for a
  * vector compared with itself and once for two vectors with different values.
  */
 @Test
 public void testDampedEqualKeys() {
   final long[] sharedKeys = {2, 5, 6};
   // Both vectors are wrapped around the same key array and then frozen.
   final SparseVector first =
       MutableSparseVector.wrap(sharedKeys, new double[] {1, 2, 1}).freeze();
   final SparseVector second =
       MutableSparseVector.wrap(sharedKeys, new double[] {1, 2, 5}).freeze();

   // Self-similarity under damping.
   assertEquals(0.375, dampedSimilarity.similarity(first, first), EPSILON);
   // Same keys, different values.
   assertEquals(0.42705098, dampedSimilarity.similarity(first, second), EPSILON);
 }
Ejemplo n.º 2
0
 /**
  * Checks the (undamped) similarity function on two vectors that share the same key set: a vector
  * compared with itself must score 1, and two vectors with different values score below 1.
  */
 @Test
 public void testEqualKeys() {
   final long[] sharedKeys = {2, 5, 6};
   // Both vectors are wrapped around the same key array and then frozen.
   final SparseVector first =
       MutableSparseVector.wrap(sharedKeys, new double[] {1, 2, 1}).freeze();
   final SparseVector second =
       MutableSparseVector.wrap(sharedKeys, new double[] {1, 2, 5}).freeze();

   // Self-similarity.
   assertEquals(1, similarity.similarity(first, first), EPSILON);
   // Same keys, different values.
   assertEquals(0.745355993, similarity.similarity(first, second), EPSILON);
 }
  /**
   * Scores every item in the key domain of {@code scores} for the given user. Each entry, whether
   * currently set or unset, is overwritten with the result of {@code prediction(user, item)}, so
   * any values previously stored in the vector are discarded.
   *
   * @param user The user ID.
   * @param scores The vector whose key domain lists the items to score; it receives the scores.
   */
  @Override
  public void score(long user, @Nonnull MutableSparseVector scores) {
    // Visit set and unset entries alike so the whole key domain gets a score.
    for (VectorEntry entry : scores.fast(VectorEntry.State.EITHER)) {
      scores.set(entry, prediction(user, entry.getKey()));
    }
  }
Ejemplo n.º 4
0
 /**
  * Checks the similarity function on two vectors whose key sets only partially overlap: each
  * vector must score 1 against itself, and the pair scores the expected intermediate value.
  */
 @Test
 public void testOverlap() {
   // Keys 2, 5 and 6 appear in both vectors; the remaining keys are unique to one side.
   final SparseVector left =
       MutableSparseVector.wrap(new long[] {1, 2, 5, 6}, new double[] {3, 1, 2, 1}).freeze();
   final SparseVector right =
       MutableSparseVector.wrap(new long[] {2, 3, 5, 6, 7}, new double[] {1, 7, 2, 5, 0})
           .freeze();

   assertEquals(1, similarity.similarity(left, left), EPSILON);
   assertEquals(1, similarity.similarity(right, right), EPSILON);
   assertEquals(0.29049645, similarity.similarity(left, right), EPSILON);
 }
Ejemplo n.º 5
0
 /**
  * Checks that two vectors with no keys in common score 0 under both the plain and the damped
  * similarity functions.
  */
 @Test
 public void testDisjoint() {
   // The two key sets share no element; the value arrays happen to be equal.
   final SparseVector left =
       MutableSparseVector.wrap(new long[] {2, 5, 6}, new double[] {1, 3, 2}).freeze();
   final SparseVector right =
       MutableSparseVector.wrap(new long[] {3, 4, 7}, new double[] {1, 3, 2}).freeze();

   assertEquals(0, similarity.similarity(left, right), EPSILON);
   assertEquals(0, dampedSimilarity.similarity(left, right), EPSILON);
 }
Ejemplo n.º 6
0
  /**
   * Predicts a rating for every item in the key domain of {@code predictions}.
   *
   * <p>A fresh {@code OrdRecModel} is trained for the user from their rating vector and from item
   * scores computed over the union of rated and requested items. For each requested item the
   * trained model produces a probability distribution over rating levels; the prediction is the
   * quantizer value of the level with the highest probability. When {@code reportDistribution} is
   * enabled, an immutable copy of each distribution is stored in the
   * {@code RATING_PROBABILITY_CHANNEL} channel of {@code predictions}.
   *
   * @param uid The user ID.
   * @param predictions The vector whose key domain lists the items to predict for; it receives the
   *     predicted ratings.
   */
  @Override
  public void predict(long uid, @Nonnull MutableSparseVector predictions) {
    logger.debug("predicting {} items for {}", predictions.keyDomain().size(), uid);

    OrdRecModel model = new OrdRecModel(quantizer);
    SparseVector userRatings = makeUserVector(uid, userEventDao);

    // Score both the items the user rated (for training) and the items requested (for prediction).
    LongSet itemsToScore = LongUtils.setUnion(userRatings.keySet(), predictions.keyDomain());
    MutableSparseVector itemScores = MutableSparseVector.create(itemsToScore);
    itemScorer.score(uid, itemScores);

    model.train(userRatings, itemScores);
    logger.debug("trained parameters for {}: {}", uid, model);

    // Scratch vector that getProbDistribution fills for each item; one slot per rating level.
    Vector levelProbabilities = Vector.createLength(model.getLevelCount());

    // The distribution channel is only created when reporting is requested.
    Long2ObjectMap<IVector> distributions =
        reportDistribution ? predictions.addChannel(RATING_PROBABILITY_CHANNEL) : null;

    for (VectorEntry entry : predictions.fast(VectorEntry.State.EITHER)) {
      long itemId = entry.getKey();
      model.getProbDistribution(itemScores.get(itemId), levelProbabilities);

      // Predict the most likely rating level.
      int bestLevel = levelProbabilities.maxElementIndex();
      predictions.set(entry, quantizer.getIndexValue(bestLevel));

      if (distributions != null) {
        // Store a snapshot, since the scratch vector is overwritten on the next iteration.
        distributions.put(itemId, levelProbabilities.immutable());
      }
    }
  }
Ejemplo n.º 7
0
 /**
  * Checks the item-mean scorer built from {@code dao} with a second builder argument of
  * {@code 0.0} (presumably the damping term; confirm against the builder): an item with no
  * ratings falls back to the global mean, and a rated item scores its own average.
  */
 @Test
 public void testItemMeanBaseline() {
   ItemScorer pred = new ItemMeanRatingItemScorer.Builder(dao, 0.0).get();
   // unseen item, should be global mean
   assertThat(pred.score(10, 2), closeTo(RATINGS_DAT_MEAN, 0.001));
   // seen item - should be item average
   assertThat(pred.score(10, 5), closeTo(3.0, 0.001));
 }
  /**
   * Builds a dense rating matrix from the rating data, with one row per user and one column per
   * item. Each stored value is the user's rating for the item minus the baseline score that
   * {@code baselineScorer} produces for that user and item. Entries for items a user has not
   * rated are never written here, so they keep the value the freshly created matrix starts with.
   *
   * @param userMapping The index mapping of user IDs to row numbers.
   * @param itemMapping The index mapping of item IDs to column numbers.
   * @return A matrix storing the <i>normalized</i> user ratings.
   */
  private RealMatrix createRatingMatrix(IdIndexMapping userMapping, IdIndexMapping itemMapping) {
    final int userCount = userMapping.size();
    final int itemCount = itemMapping.size();

    // Users index the rows, items index the columns.
    logger.info("creating {} by {} rating matrix", userCount, itemCount);
    RealMatrix normalized = MatrixUtils.createRealMatrix(userCount, itemCount);

    Cursor<UserHistory<Event>> histories = userEventDAO.streamEventsByUser();
    try {
      for (UserHistory<Event> history : histories) {
        final int row = userMapping.getIndex(history.getUserId());

        // Compute a baseline score for exactly the items this user has rated.
        MutableSparseVector userRatings =
            Ratings.userRatingVector(history.filter(Rating.class));
        MutableSparseVector userBaselines = MutableSparseVector.create(userRatings.keySet());
        baselineScorer.score(history.getUserId(), userBaselines);

        // Fill this user's row with rating-minus-baseline offsets.
        for (VectorEntry rated : ratings(userRatings)) {
          final long itemId = rated.getKey();
          final int column = itemMapping.getIndex(itemId);
          final double offset = rated.getValue() - userBaselines.get(itemId);
          normalized.setEntry(row, column, offset);
        }
      }
    } finally {
      // Always release the cursor, even if populating the matrix fails.
      histories.close();
    }

    return normalized;
  }

  /** Returns a fast iterable over the entries of {@code vector} that currently have a value. */
  private static Iterable<VectorEntry> ratings(MutableSparseVector vector) {
    return vector.fast(State.SET);
  }
Ejemplo n.º 9
0
  /**
   * Computes a distance-based similarity between two vectors: both vectors are scaled to unit
   * norm, their difference is taken over the union of their keys (missing keys count as 0), and
   * the result is {@code 1 - norm(difference)}.
   *
   * @param vec1 The first vector.
   * @param vec2 The second vector.
   * @return One minus the norm of the difference of the normalized vectors, or
   *     {@link Double#NaN} if either vector has a norm that {@code Scalars.isZero} treats as zero.
   */
  @Override
  public double similarity(SparseVector vec1, SparseVector vec2) {
    // A zero-norm vector cannot be normalized, so no similarity is defined.
    if (Scalars.isZero(vec1.norm())) {
      return Double.NaN;
    }
    if (Scalars.isZero(vec2.norm())) {
      return Double.NaN;
    }

    // Work over the union of keys so that both vectors can be represented in one domain.
    LongSet allKeys = LongUtils.setUnion(vec1.keySet(), vec2.keySet());
    MutableSparseVector difference = MutableSparseVector.create(allKeys);

    // Start from zeros, copy in vec1, and scale it to unit norm.
    difference.fill(0);
    difference.set(vec1);
    difference.multiply(1.0 / difference.norm());

    // Subtract the unit-normalized vec2.
    difference.addScaled(vec2, -1.0 / vec2.norm());

    return 1 - difference.norm();
  }
Ejemplo n.º 10
0
    /**
     * Trains the OrdRec parameters {@code t1} and {@code beta} on one user's ratings.
     *
     * <p>Makes {@code iterationCount} passes over {@code ratings}. For every rating, an update step
     * is computed for {@code t1} and for each element of {@code beta}, all from the parameter
     * values as they were before the step; the steps are then applied together.
     *
     * @param ratings The user's ratings, keyed by item ID.
     * @param scores The item scores; a score is looked up for every rated item.
     */
    @SuppressWarnings("ConstantConditions")
    private void train(SparseVector ratings, MutableSparseVector scores) {

      // Scratch vector holding the pending update for each element of beta.
      Vector dbeta = Vector.createLength(beta.length());
      // Pending update for t1.
      double dt1;
      // Repeat the pass over all ratings iterationCount times.
      for (int j = 0; j < iterationCount; j++) {
        for (VectorEntry rating : ratings.fast()) {
          long iid = rating.getKey();
          double score = scores.get(iid);
          // Index of the quantized rating level for this rating.
          int r = quantizer.index(rating.getValue());

          // Model probabilities for this score: exactly level r, at most r, and at most r - 1
          // (as suggested by the helper names getProbEQ / getProbLE).
          double probEqualR = getProbEQ(score, r);
          double probLessR = getProbLE(score, r);
          double probLessR_1 = getProbLE(score, r - 1);

          // Step for t1: learning rate times the difference of the two cumulative terms,
          // scaled by 1 / P(level == r), minus the regularization penalty on t1.
          dt1 =
              learningRate
                  / probEqualR
                  * (probLessR * (1 - probLessR) * derivateOfBeta(r, 0, t1)
                      - probLessR_1 * (1 - probLessR_1) * derivateOfBeta(r - 1, 0, t1)
                      - regTerm * t1);

          // Same form of step for each beta element; derivateOfBeta receives k + 1 here because
          // index 0 is used for t1 above.
          double dbetaK;
          for (int k = 0; k < beta.length(); k++) {
            dbetaK =
                learningRate
                    / probEqualR
                    * (probLessR * (1 - probLessR) * derivateOfBeta(r, k + 1, beta.get(k))
                        - probLessR_1
                            * (1 - probLessR_1)
                            * derivateOfBeta(r - 1, k + 1, beta.get(k))
                        - regTerm * beta.get(k));
            dbeta.set(k, dbetaK);
          }
          // Apply all steps only after every one has been computed from the old parameters.
          t1 = t1 + dt1;
          beta.add(dbeta);
        }
      }
    }
Ejemplo n.º 11
0
  /**
   * Builds a sparse vector directly from the accumulated IDs and scores, so that a scored ID list
   * builder can be used to accumulate a sparse vector efficiently. The values of every side
   * channel and typed side channel are copied into channels of the same symbol on the result.
   *
   * <p>NOTE(review): earlier documentation stated that when the same ID is added multiple times
   * the first instance is used. The entries are assigned in insertion order, so a later
   * occurrence of an ID appears to overwrite an earlier one; confirm which is intended.
   *
   * @return A sparse vector containing the data accumulated.
   */
  public ImmutableSparseVector buildVector() {
    MutableSparseVector vector = MutableSparseVector.create(ids);
    final int count = size();

    // Main scores.
    for (int pos = 0; pos < count; pos++) {
      vector.set(ids.get(pos), scores.get(pos));
    }

    // Unboxed (double-valued) side channels.
    for (ChannelStorage storage : channels.values()) {
      MutableSparseVector channel = vector.getOrAddChannelVector(storage.symbol);
      for (int pos = 0; pos < count; pos++) {
        channel.set(ids.get(pos), storage.values.get(pos));
      }
    }

    // Typed (object-valued) side channels; the raw map type is kept because the element type of
    // each storage is a wildcard.
    for (TypedChannelStorage<?> storage : typedChannels.values()) {
      Long2ObjectMap channel = vector.getOrAddChannel(storage.symbol);
      for (int pos = 0; pos < count; pos++) {
        channel.put(ids.get(pos), storage.values.get(pos));
      }
    }

    return vector.freeze();
  }
Ejemplo n.º 12
0
  /**
   * Computes the TF-IDF model: one unit-length tag vector per item.
   *
   * <p>The computation runs in two stages. First, a term-frequency (TF) vector is built for each
   * item while the document frequency (DF) of every tag is accumulated. Second, the DF vector is
   * converted in place to log-IDF values, each TF vector is multiplied by it, and the result is
   * normalized to unit length.
   *
   * <p>A vector whose norm is zero (an item with no tags, or whose tags all have an IDF of 0) is
   * stored as-is rather than being divided by zero, which would fill it with NaN.
   *
   * @return The TF-IDF model (a model of item tag vectors).
   */
  @Override
  public TFIDFModel get() {
    // Build a map of tags to numeric IDs.  This lets you convert tags (which are strings)
    // into long IDs that you can use as keys in a tag vector.
    Map<String, Long> tagIds = buildTagIdMap();

    // Create a vector to accumulate document frequencies for the IDF computation
    MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
    docFreq.fill(0);

    // Create a map to store the item TF vectors.
    Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

    // Create a work vector to accumulate each item's tag vector.
    // This vector will be re-used for each item.
    MutableSparseVector work = MutableSparseVector.create(tagIds.values());

    // Iterate over the items to compute each item's vector.
    LongSet items = dao.getItemIds();
    for (long item : items) {
      // Reset every tag count to zero for this item. Note that this leaves every key set (to 0)
      // rather than unset, so the copy stored below keeps an entry for every tag.
      work.fill(0);

      for (String tag : dao.getItemTags(item)) {
        long tagId = tagIds.get(tag);
        double countSoFar = work.get(tagId);
        if (countSoFar == 0) {
          // First time this tag is seen on this item: it counts once towards the tag's
          // document frequency, however many times it is applied.
          docFreq.set(tagId, docFreq.get(tagId) + 1);
        }
        work.set(tagId, countSoFar + 1);
      }

      // Save a copy of the vector in our map, we'll add IDF and normalize later.
      // NOTE(review): shrinkDomain() presumably drops only unset keys, and fill(0) sets them all,
      // so the copy is not actually smaller; left unchanged because consumers may rely on it.
      itemVectors.put(item, work.shrinkDomain());
      // work is ready to be reset and re-used for the next item
    }

    // Now we've seen all the items, so we have each item's TF vector and a global vector
    // of document frequencies.
    // Invert and log the document frequency.  We can do this in-place.
    final double itemCount = items.size();
    for (VectorEntry e : docFreq.fast()) {
      docFreq.set(e, Math.log(itemCount / e.getValue()));
    }

    // Now docFreq is a log-IDF vector.
    // So we can use it to apply IDF to each item vector to put it in the final model.
    // Create a map to store the final model data.
    Map<Long, SparseVector> modelData = Maps.newHashMap();
    for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
      MutableSparseVector tv = entry.getValue();

      // Convert this vector to a TF-IDF vector. Zero counts are skipped: their product is zero
      // anyway, and a tag that no item carries has an infinite IDF, so 0 * IDF would be NaN.
      for (VectorEntry e : tv.fast()) {
        double termFreq = e.getValue();
        if (termFreq != 0) {
          tv.set(e, termFreq * docFreq.get(e.getKey()));
        }
      }

      // Normalize the TF-IDF vector to be a unit vector, unless it has no length to normalize.
      double norm = tv.norm();
      if (norm > 0) {
        tv.multiply(1.0 / norm);
      }

      // Store a frozen (immutable) version of the vector in the model data.
      modelData.put(entry.getKey(), tv.freeze());
    }

    // we technically don't need the IDF vector anymore, so long as we have no new tags
    return new TFIDFModel(tagIds, modelData);
  }
Ejemplo n.º 13
0
 /**
  * Creates a new mutable vector whose key domain is the collection of all tag IDs held by this
  * object. No values are assigned here, so callers receive a vector they can populate freely.
  *
  * @return A fresh vector over tag IDs.
  */
 public MutableSparseVector newTagVector() {
   final MutableSparseVector fresh = MutableSparseVector.create(tagIds.values());
   return fresh;
 }
  /**
   * Computes the TF-IDF model: one unit-length tag vector per item.
   *
   * <p>The computation runs in two stages. First, a term-frequency (TF) vector is built for each
   * item while the document frequency (DF) of every tag is accumulated. Second, the DF vector is
   * converted in place to log-IDF values, each TF vector is multiplied by it, and the result is
   * normalized to unit length.
   *
   * <p>A vector whose norm is zero (for example when every tag on the item has an IDF of 0) is
   * stored as-is rather than being divided by zero, which would fill it with NaN.
   *
   * @return The TF-IDF model (a model of item tag vectors).
   */
  @Override
  public TFIDFModel get() {
    // Build a map of tags to numeric IDs.  This lets you convert tags (which are strings)
    // into long IDs that you can use as keys in a tag vector.
    Map<String, Long> tagIds = buildTagIdMap();

    // Create a vector to accumulate document frequencies for the IDF computation
    MutableSparseVector docFreq = MutableSparseVector.create(tagIds.values());
    docFreq.fill(0);

    // Create a map to store the item TF vectors.
    Map<Long, MutableSparseVector> itemVectors = Maps.newHashMap();

    // Create a work vector to accumulate each item's tag vector.
    // This vector will be re-used for each item.
    MutableSparseVector work = MutableSparseVector.create(tagIds.values());

    // Iterate over the items to compute each item's vector.
    LongSet items = dao.getItemIds();

    for (long item : items) {
      // Reset the work vector for this item's tags.
      work.clear();
      // Now the vector is empty (all keys are 'unset').

      for (String tag : dao.getItemTags(item)) {
        long tagId = tagIds.get(tag);

        // The work vector was cleared for this item, so a tag whose key is still unset has not
        // been seen on this item yet. Checking explicitly avoids using an exception from get()
        // as control flow and avoids a separate list of already-seen tags.
        boolean firstOccurrence = !work.containsKey(tagId);
        double countSoFar = firstOccurrence ? 0.0 : work.get(tagId);
        work.set(tagId, countSoFar + 1);

        if (firstOccurrence) {
          // Each item counts once towards a tag's document frequency.
          docFreq.set(tagId, docFreq.get(tagId) + 1);
        }
      }

      // Save a shrunk copy of the vector (only storing tags that apply to this item) in
      // our map, we'll add IDF and normalize later.
      itemVectors.put(item, work.shrinkDomain());
      // work is ready to be reset and re-used for the next item
    }

    // Now we've seen all the items, so we have each item's TF vector and a global vector
    // of document frequencies.
    // Invert and log the document frequency.  We can do this in-place.
    for (VectorEntry e : docFreq.fast()) {
      docFreq.set(e.getKey(), Math.log(items.size() / e.getValue()));
    }

    // Now docFreq is a log-IDF vector.
    // So we can use it to apply IDF to each item vector to put it in the final model.
    // Create a map to store the final model data.
    Map<Long, SparseVector> modelData = Maps.newHashMap();
    for (Map.Entry<Long, MutableSparseVector> entry : itemVectors.entrySet()) {
      MutableSparseVector tv = entry.getValue();

      // Convert this vector to a TF-IDF vector
      for (VectorEntry e : tv.fast()) {
        tv.set(e.getKey(), e.getValue() * docFreq.get(e.getKey()));
      }

      // Normalize the TF-IDF vector to be a unit vector, unless it has no length to normalize
      // (dividing by a zero norm would turn every entry into NaN).
      double norm = tv.norm();
      if (norm > 0) {
        tv.multiply(1 / norm);
      }

      // Store a frozen (immutable) version of the vector in the model data.
      modelData.put(entry.getKey(), tv.freeze());
    }

    // we technically don't need the IDF vector anymore, so long as we have no new tags
    return new TFIDFModel(tagIds, modelData);
  }
Ejemplo n.º 15
0
 /**
  * Creates a sparse vector with no keys and no values by wrapping two zero-length arrays.
  *
  * @return A vector with an empty key domain.
  */
 private SparseVector emptyVector() {
   return MutableSparseVector.wrap(new long[0], new double[0]);
 }
Ejemplo n.º 16
0
 /**
  * Reverses this transformation on a vector by multiplying it, in place, by {@code factor}.
  *
  * @param vector The vector to modify.
  * @return The same {@code vector} instance that was passed in, after modification.
  */
 @Override
 public MutableSparseVector unapply(MutableSparseVector vector) {
   // Mutates the argument directly; no copy is made.
   vector.multiply(factor);
   return vector;
 }