Example #1
0
  /**
   * Probes a model to determine the effect of each traced variable. This is done with the aid of
   * a trace dictionary which has recorded the locations in the feature vector that are modified
   * by each variable. For every variable that is not yet present in {@code weightMap}, those
   * locations are set to 1, the model is asked for its score, and the resulting vector is stored
   * under the variable's name.
   *
   * @param features A feature vector used as scratch space; its contents are overwritten
   * @param traceDictionary Maps each variable to the feature vector locations it affects
   * @param learner The model that is probed to find the weights it places on features
   */
  public void update(
      Vector features,
      Map<String, Set<Integer>> traceDictionary,
      AbstractVectorClassifier learner) {
    // begin with a cleared probe vector
    features.assign(0);
    for (Map.Entry<String, Set<Integer>> traced : traceDictionary.entrySet()) {
      String variable = traced.getKey();
      if (weightMap.containsKey(variable)) {
        // this variable has been probed before, keep the recorded result
        continue;
      }
      Set<Integer> locations = traced.getValue();

      // switch on every location this variable writes to
      for (Integer location : locations) {
        features.set(location, 1);
      }

      // remember what the model answers for this probe
      weightMap.put(variable, learner.classifyNoLink(features));

      // switch the same locations off again before the next variable
      for (Integer location : locations) {
        features.set(location, 0);
      }
    }
  }
  /**
   * Feeds the named point (2, 2) to a {@link VectorDistanceInvertedMapper} configured with the
   * seeds (1, 1) and (2, 1) and expects a single write, keyed by the point's name, of the vector
   * (sqrt(2), 1) holding the Euclidean distance to each seed.
   */
  @Test
  public void testVectorDistanceInvertedMapper() throws Exception {
    // the two seed vectors the mapper measures against
    Vector firstSeed = new RandomAccessSparseVector(2);
    firstSeed.set(0, 1);
    firstSeed.set(1, 1);
    Vector secondSeed = new RandomAccessSparseVector(2);
    secondSeed.set(0, 2);
    secondSeed.set(1, 1);
    List<NamedVector> seeds = new ArrayList<NamedVector>();
    seeds.add(new NamedVector(firstSeed, "foo"));
    seeds.add(new NamedVector(secondSeed, "foo2"));

    // the input point, named "other"
    Vector input = new NamedVector(new RandomAccessSparseVector(2), "other");
    input.set(0, 2);
    input.set(1, 2);

    // record the one expected write, then put the mock into replay mode
    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);
    Vector expectedDistances = new DenseVector(new double[] {Math.sqrt(2.0), 1.0});
    context.write(new Text("other"), new VectorWritable(expectedDistances));
    EasyMock.replay(context);

    VectorDistanceInvertedMapper mapper = new VectorDistanceInvertedMapper();
    setField(mapper, "measure", new EuclideanDistanceMeasure());
    setField(mapper, "seedVectors", seeds);

    mapper.map(new IntWritable(123), new VectorWritable(input), context);

    EasyMock.verify(context);
  }
  /**
   * tests {@link MostSimilarItemPairsMapper}: with {@code maxSimilarItemsPerItem} set to 1, the
   * only write expected for item 34 is the pair (34, 56) with value 0.9.
   */
  public void testMostSimilarItemsPairsMapper() throws Exception {
    // similarity column handed to the mapper for item 34
    Vector similarityColumn = new RandomAccessSparseVector(Integer.MAX_VALUE);
    similarityColumn.set(12, 0.2);
    similarityColumn.set(34, 1.0);
    similarityColumn.set(56, 0.9);

    // every index maps to the item ID of the same value
    OpenIntLongHashMap indexToItemID = new OpenIntLongHashMap();
    for (int index : new int[] {12, 34, 56}) {
      indexToItemID.put(index, (long) index);
    }

    // record the single expected write, then put the mock into replay mode
    Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);
    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));
    EasyMock.replay(context);

    MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
    setField(mapper, "indexItemIDMap", indexToItemID);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(similarityColumn), context);

    EasyMock.verify(context);
  }
Example #4
0
  /**
   * tests {@link PartialMultiplyMapper}: for an input carrying one similarity column plus the
   * preferences of users 123 and 456, one write per user is expected, each pairing that user's
   * preference value with the similarity column.
   */
  @Test
  public void testPartialMultiplyMapper() throws Exception {
    Vector column = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    column.set(3, 0.5);
    column.set(7, 0.8);

    // expected output values, one per user
    PrefAndSimilarityColumnWritable expectedFor123 = new PrefAndSimilarityColumnWritable();
    expectedFor123.set(1.0f, column);
    PrefAndSimilarityColumnWritable expectedFor456 = new PrefAndSimilarityColumnWritable();
    expectedFor456.set(3.0f, column);

    // record the expected writes, then put the mock into replay mode
    Mapper<VarIntWritable, VectorAndPrefsWritable, VarLongWritable, PrefAndSimilarityColumnWritable>
            .Context
        context = EasyMock.createMock(Mapper.Context.class);
    context.write(EasyMock.eq(new VarLongWritable(123L)), EasyMock.eq(expectedFor123));
    context.write(EasyMock.eq(new VarLongWritable(456L)), EasyMock.eq(expectedFor456));
    EasyMock.replay(context);

    VectorAndPrefsWritable input =
        new VectorAndPrefsWritable(column, Arrays.asList(123L, 456L), Arrays.asList(1.0f, 3.0f));

    new PartialMultiplyMapper().map(new VarIntWritable(1), input, context);

    EasyMock.verify(context);
  }
Example #5
0
  /**
   * tests {@link ToVectorAndPrefReducer}: two preference values and one similarity column
   * arriving under key 1 are expected to be written as one value that carries the user IDs, the
   * preference values and the column's elements.
   */
  @Test
  public void testToVectorAndPrefReducer() throws Exception {
    // reducer input: two preferences plus the similarity column
    Vector column = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
    column.set(3, 0.5);
    column.set(7, 0.8);
    VectorOrPrefWritable prefOf123 = new VectorOrPrefWritable(123L, 1.0f);
    VectorOrPrefWritable prefOf456 = new VectorOrPrefWritable(456L, 2.0f);
    VectorOrPrefWritable columnValue = new VectorOrPrefWritable(column);

    // record the single expected write, then put the mock into replay mode
    Reducer<VarIntWritable, VectorOrPrefWritable, VarIntWritable, VectorAndPrefsWritable>.Context
        context = EasyMock.createMock(Reducer.Context.class);
    context.write(
        EasyMock.eq(new VarIntWritable(1)),
        vectorAndPrefsWritableMatches(
            Arrays.asList(123L, 456L),
            Arrays.asList(1.0f, 2.0f),
            MathHelper.elem(3, 0.5),
            MathHelper.elem(7, 0.8)));
    EasyMock.replay(context);

    new ToVectorAndPrefReducer()
        .reduce(new VarIntWritable(1), Arrays.asList(prefOf123, prefOf456, columnValue), context);

    EasyMock.verify(context);
  }
 /**
  * Builds a weight vector of the same size as {@code probabilities} in which only the position
  * reported by {@code probabilities.maxValueIndex()} is set, to 1.0.
  *
  * @param probabilities the vector whose maximum entry is selected
  * @return a new sparse vector with a single 1.0 entry at the selected position
  */
 @Override
 public Vector select(Vector probabilities) {
   Vector selection = new SequentialAccessSparseVector(probabilities.size());
   selection.set(probabilities.maxValueIndex(), 1.0);
   return selection;
 }
    /**
     * Adds up all partial dot product vectors received for {@code row}, turns every non-zero
     * sum into a similarity value and writes the vector of those values that reach the
     * threshold.
     *
     * @param row the row the partial dot products belong to
     * @param partialDots partial dot product vectors; the first one is used as the accumulator
     *     and is modified in place
     * @param ctx the context the resulting similarity vector is written to
     */
    @Override
    protected void reduce(IntWritable row, Iterable<VectorWritable> partialDots, Context ctx)
        throws IOException, InterruptedException {
      // accumulate every further partial vector into the first one
      Iterator<VectorWritable> partials = partialDots.iterator();
      Vector summedDots = partials.next().get();
      while (partials.hasNext()) {
        Vector partial = partials.next().get();
        for (Iterator<Vector.Element> it = partial.iterateNonZero(); it.hasNext(); ) {
          Vector.Element element = it.next();
          int column = element.index();
          summedDots.setQuick(column, summedDots.getQuick(column) + element.get());
        }
      }

      int rowIndex = row.get();
      double rowNorm = norms.getQuick(rowIndex);
      Vector similarities = summedDots.like();
      for (Iterator<Vector.Element> it = summedDots.iterateNonZero(); it.hasNext(); ) {
        Vector.Element dot = it.next();
        double value =
            similarity.similarity(dot.get(), rowNorm, norms.getQuick(dot.index()), numberOfColumns);
        // values below the threshold are left out of the result
        if (value >= treshold) {
          similarities.set(dot.index(), value);
        }
      }

      // optionally blank out the row's similarity with itself
      if (excludeSelfSimilarity) {
        similarities.setQuick(rowIndex, 0);
      }
      ctx.write(row, new VectorWritable(similarities));
    }
 /**
  * Collects all items of one user into a sparse vector that holds 1.0 at the position of every
  * item, and writes that vector under the user's ID.
  *
  * <p>NOTE(review): item IDs are narrowed from {@code long} to {@code int} with a plain cast,
  * so IDs outside the int range end up at a different (possibly negative) index. Confirm that
  * callers only pass IDs that fit into an int.
  *
  * @param userID the user the items belong to
  * @param itemPrefs the IDs of the user's items
  * @param context the context the user vector is written to
  */
 // NOTE(review): @Override is commented out here; confirm the signature really overrides the
 // enclosing reducer's reduce method before enabling it.
 //		@Override
 public void reduce(VarLongWritable userID, Iterable<VarLongWritable> itemPrefs, Context context)
     throws IOException, InterruptedException {
   Vector preferences = new RandomAccessSparseVector(Integer.MAX_VALUE, 100);
   for (VarLongWritable item : itemPrefs) {
     int position = (int) item.get();
     preferences.set(position, 1.0f);
   }
   VectorWritable output = new VectorWritable(preferences);
   context.write(userID, output);
 }
 /**
  * Writes the current solver state to the file system: missing basis vectors, the projections
  * and norms taken from the diagonal matrix, the scale factor and all singular vectors. Does
  * nothing when {@code conf} is null. Finishes by passing the number of basis vectors found or
  * written on disk to {@code super.setIterationNumber}.
  *
  * @throws IOException declared by this method; presumably raised by the file system calls
  */
 protected void updateHdfsState() throws IOException {
   if (conf == null) {
     return;
   }
   // count the consecutively numbered basis vector files that already exist, starting at 0;
   // afterwards nextBasisVectorPath points at the first index that is not on disk
   int numBasisVectorsOnDisk = 0;
   Path nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + numBasisVectorsOnDisk);
   while (fs.exists(nextBasisVectorPath)) {
     nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + ++numBasisVectorsOnDisk);
   }
   // persist the basis vectors between that index and iterationNumber, stopping early as soon
   // as getBasisVector returns null
   Vector nextVector;
   while (numBasisVectorsOnDisk < iterationNumber
       && (nextVector = getBasisVector(numBasisVectorsOnDisk)) != null) {
     persistVector(nextBasisVectorPath, numBasisVectorsOnDisk, nextVector);
     nextBasisVectorPath = new Path(basisPath, BASIS_PREFIX + '_' + ++numBasisVectorsOnDisk);
   }
   if (scaleFactor <= 0) {
     scaleFactor = getScaleFactor(); // load from disk if possible
   }
   diagonalMatrix = getDiagonalMatrix(); // load from disk if possible
   // norms takes the entries directly above the diagonal (one fewer than the column count),
   // projections takes the diagonal itself
   Vector norms = new DenseVector(diagonalMatrix.numCols() - 1);
   Vector projections = new DenseVector(diagonalMatrix.numCols());
   int i = 0;
   while (i < diagonalMatrix.numCols() - 1) {
     norms.set(i, diagonalMatrix.get(i, i + 1));
     projections.set(i, diagonalMatrix.get(i, i));
     i++;
   }
   // the last diagonal entry has no neighbour above the diagonal, so it is copied separately
   projections.set(i, diagonalMatrix.get(i, i));
   persistVector(new Path(baseDir, "projections"), 0, projections);
   persistVector(new Path(baseDir, "norms"), 0, norms);
   // the scale factor is stored as a one-element vector
   persistVector(new Path(baseDir, "scaleFactor"), 0, new DenseVector(new double[] {scaleFactor}));
   // each singular vector gets its own file, named after its key
   for (Map.Entry<Integer, Vector> entry : singularVectors.entrySet()) {
     persistVector(
         new Path(singularVectorPath, SINGULAR_PREFIX + '_' + entry.getKey()),
         entry.getKey(),
         entry.getValue());
   }
   super.setIterationNumber(numBasisVectorsOnDisk);
 }
  /**
   * Feeds the point (2, 2) under key 123 to a {@link VectorDistanceMapper} configured with the
   * seeds "foo" = (1, 1) and "foo2" = (2, 1) and expects one write per seed: the tuple (seed
   * name, "123") together with the Euclidean distance, sqrt(2) and 1 respectively.
   */
  @Test
  public void testVectorDistanceMapper() throws Exception {
    // the two seed vectors the mapper measures against
    Vector firstSeed = new RandomAccessSparseVector(2);
    firstSeed.set(0, 1);
    firstSeed.set(1, 1);
    Vector secondSeed = new RandomAccessSparseVector(2);
    secondSeed.set(0, 2);
    secondSeed.set(1, 1);
    List<NamedVector> seeds = new ArrayList<NamedVector>();
    seeds.add(new NamedVector(firstSeed, "foo"));
    seeds.add(new NamedVector(secondSeed, "foo2"));

    // the input point
    Vector input = new RandomAccessSparseVector(2);
    input.set(0, 2);
    input.set(1, 2);

    // record one expected write per seed, then put the mock into replay mode
    Mapper<WritableComparable<?>, VectorWritable, StringTuple, DoubleWritable>.Context context =
        EasyMock.createMock(Mapper.Context.class);
    StringTuple fooKey = new StringTuple();
    fooKey.add("foo");
    fooKey.add("123");
    context.write(fooKey, new DoubleWritable(Math.sqrt(2.0)));
    StringTuple foo2Key = new StringTuple();
    foo2Key.add("foo2");
    foo2Key.add("123");
    context.write(foo2Key, new DoubleWritable(1));
    EasyMock.replay(context);

    VectorDistanceMapper mapper = new VectorDistanceMapper();
    setField(mapper, "measure", new EuclideanDistanceMeasure());
    setField(mapper, "seedVectors", seeds);

    mapper.map(new IntWritable(123), new VectorWritable(input), context);

    EasyMock.verify(context);
  }
Example #11
0
 /**
  * Generates a corpus matrix with one row per document and one column per column of
  * {@code matrix}. A single topic count vector is drawn first, by picking a random topic
  * {@code numTopicsPerDoc} times; the same vector is then used for every document. Each
  * document row counts how often each index was returned by the sampler.
  *
  * @param matrix the model matrix handed to the {@code LDASampler}; its row count sizes the
  *     topic vector and its column count sizes the corpus rows
  * @param random source of randomness for the topic picks and the sampler
  * @param numDocs number of document rows to generate
  * @param numSamples number of samples requested from the sampler per document
  * @param numTopicsPerDoc number of random topic picks added to the shared topic vector
  * @return the generated corpus
  */
 public static Matrix sampledCorpus(
     Matrix matrix, Random random, int numDocs, int numSamples, int numTopicsPerDoc) {
   Matrix corpus = new SparseRowMatrix(numDocs, matrix.numCols());
   // the sampler is created before any topic is drawn, keeping the use of random in order
   LDASampler sampler = new LDASampler(matrix, random);

   Vector topicCounts = new DenseVector(matrix.numRows());
   for (int remaining = numTopicsPerDoc; remaining > 0; remaining--) {
     int pickedTopic = random.nextInt(topicCounts.size());
     topicCounts.set(pickedTopic, topicCounts.get(pickedTopic) + 1);
   }

   for (int docId = 0; docId < numDocs; docId++) {
     int[] samples = sampler.sample(topicCounts, numSamples);
     for (int term : samples) {
       corpus.set(docId, term, corpus.get(docId, term) + 1);
     }
   }
   return corpus;
 }
Example #12
0
  /**
   * Builds test data made of {@code K1} samples drawn from a {@code MultiNormal} whose mean is
   * {@code new ConstantVector(0, 10)}, followed by 1000 samples from each of 5 further
   * {@code MultiNormal}s. The mean of the i-th of those is a 10-element vector that is 6 at
   * position i and unset elsewhere. Every sample is wrapped in a {@link WeightedVector} with
   * weight 1 and a running index that starts at 0.
   *
   * @param radius passed as the first argument to every {@code MultiNormal}
   * @return the samples, in the order they were generated
   */
  private List<? extends WeightedVector> cubishTestData(double radius) {
    final int offsetClusters = 5;
    final int samplesPerOffsetCluster = 1000;
    final int dimension = 10;
    // distance of each offset cluster's mean along its own axis
    final double offset = 6;

    List<WeightedVector> data =
        Lists.newArrayListWithCapacity(K1 + offsetClusters * samplesPerOffsetCluster);
    int row = 0;

    MultiNormal g = new MultiNormal(radius, new ConstantVector(0, dimension));
    for (int i = 0; i < K1; i++) {
      data.add(new WeightedVector(g.sample(), 1, row++));
    }

    for (int i = 0; i < offsetClusters; i++) {
      Vector m = new DenseVector(dimension);
      // the original ternary "i == 0 ? 6 : 6" had identical branches, so the value is always 6
      m.set(i, offset);
      MultiNormal gx = new MultiNormal(radius, m);
      for (int j = 0; j < samplesPerOffsetCluster; j++) {
        data.add(new WeightedVector(gx.sample(), 1, row++));
      }
    }
    return data;
  }
  /**
   * Emits, for one item, a vector whose only entry is NaN at the item's index, together with
   * the IDs of all users received for that item, each paired with a preference value of 1.0.
   *
   * @param itemID the item to mark
   * @param values the IDs of the users the marker is emitted for
   * @param ctx the context the item index and the marker are written to
   */
  @Override
  protected void reduce(VarLongWritable itemID, Iterable<VarLongWritable> values, Context ctx)
      throws IOException, InterruptedException {

    int index = TasteHadoopUtils.idToIndex(itemID.get());

    // every user gets the same preference value of 1.0
    List<Long> users = Lists.newArrayList();
    List<Float> prefs = Lists.newArrayList();
    for (VarLongWritable user : values) {
      users.add(user.get());
      prefs.add(1.0f);
    }

    /* artificial NaN summand to exclude this item from the recommendations for all users specified in userIDs */
    Vector nanMarker = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
    nanMarker.set(index, Double.NaN);
    // This is the filtering trick.
    // NOTE(review): it suggests that AggregateAndRecommendReducer filters out (user, item)
    // pairs that have already been rated; confirm against that class.

    itemIDIndexWritable.set(index);
    vectorAndPrefs.set(nanMarker, users, prefs);
    ctx.write(itemIDIndexWritable, vectorAndPrefs);
  }