示例#1
0
  @Test
  public void testBloomFilter() {
    DataFrame df = context.range(1000);

    BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03);
    Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter1.mightContain(i));
    }

    BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03);
    Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter2.mightContain(i * 3));
    }

    BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5);
    Assert.assertTrue(filter3.bitSize() == 64 * 5);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter3.mightContain(i));
    }

    BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5);
    Assert.assertTrue(filter4.bitSize() == 64 * 5);
    for (int i = 0; i < 1000; i++) {
      Assert.assertTrue(filter4.mightContain(i * 3));
    }
  }
示例#2
0
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Row[] expected = {RowFactory.create(0, 5), RowFactory.create(1, 8)};
   Assert.assertArrayEquals(expected, actual);
 }
示例#3
0
 @Test
 public void testSampleBy() {
   DataFrame df = context.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
   DataFrame sampled = df.stat().<Integer>sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
   Row[] actual = sampled.groupBy("key").count().orderBy("key").collect();
   Assert.assertEquals(0, actual[0].getLong(0));
   Assert.assertTrue(0 <= actual[0].getLong(1) && actual[0].getLong(1) <= 8);
   Assert.assertEquals(1, actual[1].getLong(0));
   Assert.assertTrue(2 <= actual[1].getLong(1) && actual[1].getLong(1) <= 13);
 }
示例#4
0
  @Test
  public void testCountMinSketch() {
    DataFrame df = context.range(1000);

    CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42);
    Assert.assertEquals(sketch1.totalCount(), 1000);
    Assert.assertEquals(sketch1.depth(), 10);
    Assert.assertEquals(sketch1.width(), 20);

    CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42);
    Assert.assertEquals(sketch2.totalCount(), 1000);
    Assert.assertEquals(sketch2.depth(), 10);
    Assert.assertEquals(sketch2.width(), 20);

    CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42);
    Assert.assertEquals(sketch3.totalCount(), 1000);
    Assert.assertEquals(sketch3.relativeError(), 0.001, 1e-4);
    Assert.assertEquals(sketch3.confidence(), 0.99, 5e-3);

    CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42);
    Assert.assertEquals(sketch4.totalCount(), 1000);
    Assert.assertEquals(sketch4.relativeError(), 0.001, 1e-4);
    Assert.assertEquals(sketch4.confidence(), 0.99, 5e-3);
  }