/**
 * Draws a random sample of rows from this dataset.
 *
 * <p>If {@code numSamples} is greater than or equal to {@link #numExamples()}, no sampling
 * takes place and this dataset itself (not a copy) is returned.
 *
 * @param numSamples the number of rows to draw
 * @param rng the random generator used to pick row indices
 * @param withReplacement whether the same source row may be picked more than once;
 *        duplicates are tracked by example row number only, not by row content
 * @return a new dataset holding the sampled rows, or {@code this} when
 *         {@code numSamples >= numExamples()}
 */
public FloatDataSet sample(int numSamples, RandomGenerator rng, boolean withReplacement) {
    if (numSamples >= numExamples()) {
        return this;
    }

    FloatMatrix examples = new FloatMatrix(numSamples, getFirst().columns);
    FloatMatrix outcomes = new FloatMatrix(numSamples, numOutcomes());
    // Row indices already used; only consulted when sampling without replacement.
    Set<Integer> added = new HashSet<Integer>();
    for (int i = 0; i < numSamples; i++) {
        int picked = rng.nextInt(numExamples());
        if (!withReplacement) {
            // numSamples < numExamples() here, so an unused index always exists
            // and this retry loop terminates.
            while (added.contains(picked)) {
                picked = rng.nextInt(numExamples());
            }
            // Record the index so later iterations cannot pick the same row again.
            added.add(picked);
        }
        examples.putRow(i, get(picked).getFirst());
        outcomes.putRow(i, get(picked).getSecond());
    }
    return new FloatDataSet(examples, outcomes);
}
/**
 * Fills a bit set with randomly chosen bits and checks that {@code nextSetBit} from a random
 * starting index skips only clear bits and, when it reports a hit, lands on a set bit.
 */
@Test
public void testNextBitSetRandom() {
  RandomGenerator rng = RandomManager.getRandom();
  for (int trial = 0; trial < 100; trial++) {
    BitSet bits = new BitSet(NUM_BITS);
    // Note: the loop bound draws a fresh random number on every iteration.
    for (int k = 0; k < 20 + rng.nextInt(50); k++) {
      bits.set(rng.nextInt(NUM_BITS));
    }

    int start = rng.nextInt(NUM_BITS);
    int found = bits.nextSetBit(start);

    // Everything from the start index up to the reported hit (or the end, when
    // there is no hit) must be clear.
    int clearUntil = (found == -1) ? NUM_BITS : found;
    for (int k = start; k < clearUntil; k++) {
      assertFalse(bits.get(k));
    }
    if (found != -1) {
      assertTrue(bits.get(found));
    }
  }
}
/**
 * Builds two models over identical random user/item vectors and known items, differing only
 * in the third constructor argument (1.0 vs 0.5), then checks that their top-N results agree
 * on a sufficiently long leading prefix on average.
 */
@Test
public void testLSHEffect() {
  RandomGenerator rng = RandomManager.getRandom();
  PoissonDistribution knownItemCountDistribution = new PoissonDistribution(
      rng,
      20,
      PoissonDistribution.DEFAULT_EPSILON,
      PoissonDistribution.DEFAULT_MAX_ITERATIONS);

  int features = 20;
  // NOTE(review): the third argument is presumably the LSH sample rate - confirm
  // against ALSServingModel.
  ALSServingModel exactModel = new ALSServingModel(features, true, 1.0, null);
  ALSServingModel lshModel = new ALSServingModel(features, true, 0.5, null);

  // The same count is used for both the number of users and the number of items.
  int userItemCount = 20000;

  // Users: identical vector and known-item set in both models.
  for (int u = 0; u < userItemCount; u++) {
    String userID = "U" + u;
    float[] userVector = VectorMath.randomVectorF(features, rng);
    exactModel.setUserVector(userID, userVector);
    lshModel.setUserVector(userID, userVector);

    int knownCount = knownItemCountDistribution.sample();
    Collection<String> knownItemIDs = new ArrayList<>(knownCount);
    for (int k = 0; k < knownCount; k++) {
      knownItemIDs.add("I" + rng.nextInt(userItemCount));
    }
    exactModel.addKnownItems(userID, knownItemIDs);
    lshModel.addKnownItems(userID, knownItemIDs);
  }

  // Items: identical vector in both models.
  for (int it = 0; it < userItemCount; it++) {
    String itemID = "I" + it;
    float[] itemVector = VectorMath.randomVectorF(features, rng);
    exactModel.setItemVector(itemID, itemVector);
    lshModel.setItemVector(itemID, itemVector);
  }

  int numRecs = 10;
  Mean meanMatchLength = new Mean();

  // Per-user top-N scored with DotsFunction.
  for (int u = 0; u < userItemCount; u++) {
    String userID = "U" + u;
    List<Pair<String, Double>> exactRecs = exactModel.topN(
        new DotsFunction(exactModel.getUserVector(userID)), null, numRecs, null);
    List<Pair<String, Double>> lshRecs = lshModel.topN(
        new DotsFunction(lshModel.getUserVector(userID)), null, numRecs, null);
    meanMatchLength.increment(commonPrefixLength(lshRecs, exactRecs));
  }
  log.info("Mean matching prefix: {}", meanMatchLength.getResult());
  assertTrue(meanMatchLength.getResult() >= 4.0);

  meanMatchLength.clear();

  // Per-item top-N scored with CosineAverageFunction.
  for (int it = 0; it < userItemCount; it++) {
    String itemID = "I" + it;
    List<Pair<String, Double>> exactRecs = exactModel.topN(
        new CosineAverageFunction(exactModel.getItemVector(itemID)), null, numRecs, null);
    List<Pair<String, Double>> lshRecs = lshModel.topN(
        new CosineAverageFunction(lshModel.getItemVector(itemID)), null, numRecs, null);
    meanMatchLength.increment(commonPrefixLength(lshRecs, exactRecs));
  }
  log.info("Mean matching prefix: {}", meanMatchLength.getResult());
  assertTrue(meanMatchLength.getResult() >= 5.0);
}

/**
 * Counts how many leading positions hold equal elements in both lists, stopping at the
 * first mismatch or at the end of the shorter list.
 */
private static int commonPrefixLength(List<Pair<String, Double>> candidate,
                                      List<Pair<String, Double>> reference) {
  int length = 0;
  while (length < candidate.size()
      && length < reference.size()
      && candidate.get(length).equals(reference.get(length))) {
    length++;
  }
  return length;
}