示例#1
0
 /**
  * Merges another bloom filter into this one by bitwise OR-ing the underlying bit data. Only
  * <b>this</b> instance is mutated. Callers are responsible for sizing the filters so that the
  * merged result does not become saturated.
  *
  * @param that the bloom filter whose contents are merged in; it is not mutated
  * @throws IllegalArgumentException if {@code isCompatible(that) == false}
  * @since 15.0
  */
 public void putAll(BloomFilter<T> that) {
   checkNotNull(that);
   // Compatibility is verified piece by piece so the failure message names the first mismatch.
   checkArgument(that != this, "Cannot combine a BloomFilter with itself.");
   checkArgument(
       numHashFunctions == that.numHashFunctions,
       "BloomFilters must have the same number of hash functions (%s != %s)",
       numHashFunctions,
       that.numHashFunctions);
   checkArgument(
       bitSize() == that.bitSize(),
       "BloomFilters must have the same size underlying bit arrays (%s != %s)",
       bitSize(),
       that.bitSize());
   checkArgument(
       strategy.equals(that.strategy),
       "BloomFilters must have equal strategies (%s != %s)",
       strategy,
       that.strategy);
   checkArgument(
       funnel.equals(that.funnel),
       "BloomFilters must have equal funnels (%s != %s)",
       funnel,
       that.funnel);
   bits.putAll(that.bits);
 }
示例#2
0
 /**
  * Creates filters whose expected insertion counts exceed {@code Integer.MAX_VALUE}; the test
  * passes as long as neither creation throws.
  */
 @SuppressWarnings("CheckReturnValue")
 @AndroidIncompatible // OutOfMemoryError
 public void testLargeNumberOfInsertions() {
   // Deliberately terrible FPPs are used so that Java does not run out of memory.
   long justOverIntMax = 42L + Integer.MAX_VALUE;
   BloomFilter.create(Funnels.unencodedCharsFunnel(), justOverIntMax, 0.28);

   long farOverIntMax = 50L * Integer.MAX_VALUE;
   BloomFilter.create(Funnels.unencodedCharsFunnel(), farOverIntMax, 0.99);
 }
示例#3
0
 /** Checks that a filter is not compatible with itself and that merging it into itself fails. */
 public void testPutAllWithSelf() {
   BloomFilter<Integer> filter = BloomFilter.create(Funnels.integerFunnel(), 1);
   try {
     boolean compatibleWithSelf = filter.isCompatible(filter);
     assertFalse(compatibleWithSelf);
     filter.putAll(filter);
     fail();
   } catch (IllegalArgumentException expected) {
     // Expected: putAll rejects the same instance as its argument.
   }
 }
示例#4
0
 /**
  * Checks that the reported bit size equals the optimal number of bits rounded up to a whole
  * number of 64-bit words, for expected insertion counts from 1 to 9999.
  */
 public void testBitSize() {
   final double fpp = 0.03;
   int expectedInsertions = 1;
   while (expectedInsertions < 10000) {
     long optimalBits = BloomFilter.optimalNumOfBits(expectedInsertions, fpp);
     long wordCount = LongMath.divide(optimalBits, 64, RoundingMode.CEILING);
     int arraySize = Ints.checkedCast(wordCount);
     assertEquals(
         arraySize * Long.SIZE,
         BloomFilter.create(Funnels.unencodedCharsFunnel(), expectedInsertions, fpp).bitSize());
     expectedInsertions++;
   }
 }
示例#5
0
 /**
  * Asserts that the given filter does not report two fresh objects as present, then inserts 100
  * new objects and asserts that each one is reported as present via both {@code mightContain}
  * and {@code apply}.
  *
  * @param bf the filter under test; it is mutated by the insertions
  */
 private void checkSanity(BloomFilter<Object> bf) {
   assertFalse(bf.mightContain(new Object()));
   assertFalse(bf.apply(new Object()));

   int remaining = 100;
   while (remaining-- > 0) {
     Object element = new Object();
     bf.put(element);
     assertTrue(bf.mightContain(element));
     assertTrue(bf.apply(element));
   }
 }
示例#6
0
 /**
  * Checks that {@code put} returns {@code true} exactly when {@code mightContain} returned
  * {@code false} for the same element immediately beforehand.
  */
 public void testPutReturnValue() {
   for (int round = 0; round < 10; round++) {
     BloomFilter<String> filter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
     for (int attempt = 0; attempt < 10; attempt++) {
       String element = new Object().toString();
       boolean reportedBefore = filter.mightContain(element);
       boolean bitsChanged = filter.put(element);
       // Exactly one of the two flags must be set.
       assertTrue(reportedBefore ^ bitsChanged);
     }
   }
 }
示例#7
0
  /**
   * Writes a populated filter with {@code writeTo} and checks that {@code readFrom} yields a
   * filter equal to the original.
   */
  public void testCustomSerialization() throws Exception {
    Funnel<byte[]> funnel = Funnels.byteArrayFunnel();
    BloomFilter<byte[]> original = BloomFilter.create(funnel, 100);
    for (int value = 0; value < 100; value++) {
      original.put(Ints.toByteArray(value));
    }

    ByteArrayOutputStream sink = new ByteArrayOutputStream();
    original.writeTo(sink);

    byte[] serialized = sink.toByteArray();
    BloomFilter<byte[]> restored =
        BloomFilter.readFrom(new ByteArrayInputStream(serialized), funnel);
    assertEquals(original, restored);
  }
示例#8
0
 /**
  * Checks that {@code expectedFpp} starts at zero, rises strictly whenever {@code put} reports a
  * change, and stays the same otherwise, until it reaches 1.0.
  */
 public void testExpectedFpp() {
   BloomFilter<Object> bf = BloomFilter.create(HashTestUtils.BAD_FUNNEL, 10, 0.03);
   double previousFpp = bf.expectedFpp();
   assertEquals(0.0, previousFpp);
   // usually completed in less than 200 iterations
   while (previousFpp != 1.0) {
     boolean changed = bf.put(new Object());
     double currentFpp = bf.expectedFpp();
     if (changed) {
       // A changed filter must have a strictly higher fpp.
       assertTrue(currentFpp > previousFpp);
     } else {
       // An unchanged filter must report the same fpp.
       assertTrue(currentFpp == previousFpp);
     }
     previousFpp = currentFpp;
   }
 }
  /**
   * Inserts one million pseudo-random longs and then replays the same seeded sequence to check
   * that every inserted value is reported as possibly present.
   */
  public void testBloom() throws Exception {
    final long seed = 1L;
    final int numInsertions = 1000000;
    final double fpp = 0.03D;

    BloomFilter<Long> filter = BloomFilter.create(Funnels.longFunnel(), numInsertions, fpp);

    Random writer = new Random(seed);
    for (int remaining = numInsertions; remaining > 0; remaining--) {
      filter.put(writer.nextLong());
    }

    // A second generator with the same seed reproduces exactly the values inserted above.
    Random reader = new Random(seed);
    for (int remaining = numInsertions; remaining > 0; remaining--) {
      assertTrue(filter.mightContain(reader.nextLong()));
    }
  }
 /**
  * Reads the expected insertion count, false positive rate and name from {@code config} and
  * creates a fresh bloom filter sized from the first two.
  *
  * @param config settings map; the value under {@code NAME_KEY} is dereferenced directly, so a
  *     missing or null entry results in a {@link NullPointerException}
  */
 @Override
 public void configure(Map<String, Object> config) {
   Object insertionsSetting = config.get(EXPECTED_INSERTIONS_KEY);
   this.expectedInsertions = toInt(insertionsSetting);

   Object falsePositiveSetting = config.get(FALSE_POSITIVE_RATE_KEY);
   this.falsePositiveRate = toDouble(falsePositiveSetting);

   Object nameSetting = config.get(NAME_KEY);
   this.name = nameSetting.toString();

   this.filter =
       BloomFilter.create(LOOKUPKEY_FUNNEL, this.expectedInsertions, this.falsePositiveRate);
 }
示例#11
0
 /** Verifies that the optimal number of hash functions is always strictly positive. */
 public void testOptimalHashes() {
   int n = 1;
   while (n < 1000) {
     int m = 0;
     while (m < 1000) {
       assertTrue(0 < BloomFilter.optimalNumOfHashFunctions(n, m));
       m++;
     }
     n++;
   }
 }
示例#12
0
 /**
  * Creates a spill map backed by the given spill file, with an initial mapped buffer and one
  * bloom filter for that buffer.
  *
  * @param file the spill file handed to the mapped byte buffer
  * @param thresholdBytes size argument passed when creating each mapped byte buffer
  * @throws IOException declared by this constructor; presumably raised while creating the
  *     mapped buffer (confirm against MappedByteBufferMap)
  */
 public SpillMap(SpillFile file, int thresholdBytes) throws IOException {
   this.spillFile = file;
   this.thresholdBytes = thresholdBytes;
   this.byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, file);
   this.bFilters = Lists.newArrayList();
   this.bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000));
 }
示例#13
0
 /** Runs the sanity check over a grid of false positive rates and expected insertion counts. */
 public void testBasic() {
   for (double fpr = 0.0000001; fpr < 0.1; fpr *= 10) {
     int expectedInsertions = 1;
     while (expectedInsertions <= 10000) {
       BloomFilter<Object> bf =
           BloomFilter.create(HashTestUtils.BAD_FUNNEL, expectedInsertions, fpr);
       checkSanity(bf);
       expectedInsertions *= 10;
     }
   }
 }
示例#14
0
 /**
  * Reports whether the given bloom filter can be combined with this one. Two bloom filters are
  * compatible only if they:
  *
  * <ul>
  *   <li>are not the same instance
  *   <li>use the same number of hash functions
  *   <li>have the same bit size
  *   <li>use the same strategy
  *   <li>have equal funnels
  * </ul>
  *
  * @param that The bloom filter to check for compatibility.
  * @return {@code true} if every condition above holds, {@code false} otherwise
  * @since 15.0
  */
 public boolean isCompatible(BloomFilter<T> that) {
   checkNotNull(that);
   if (this == that) {
     return false;
   }
   if (this.numHashFunctions != that.numHashFunctions) {
     return false;
   }
   if (this.bitSize() != that.bitSize()) {
     return false;
   }
   if (!this.strategy.equals(that.strategy)) {
     return false;
   }
   return this.funnel.equals(that.funnel);
 }
示例#15
0
 /**
  * Checks that creating a filter with an extremely small false positive probability is rejected
  * with an {@link IllegalArgumentException}.
  */
 @SuppressWarnings("CheckReturnValue")
 public void testFailureWhenMoreThan255HashFunctionsAreNeeded() {
   final int n = 1000;
   final double p =
       0.00000000000000000000000000000000000000000000000000000000000000000000000000000001;
   try {
     BloomFilter.create(Funnels.unencodedCharsFunnel(), n, p);
     fail();
   } catch (IllegalArgumentException expected) {
     // Expected: creation is refused for these parameters.
   }
 }
示例#16
0
 /**
  * Checks that empty filters created with different funnels, expected insertions or false
  * positive probabilities each fall into their own equality group.
  */
 public void testEquals_empty() {
   EqualsTester tester = new EqualsTester();

   // Byte-array funnel, every size/fpp combination.
   tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 100, 0.01));
   tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 100, 0.02));
   tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 200, 0.01));
   tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 200, 0.02));

   // Unencoded-chars funnel, the same combinations.
   tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.01));
   tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.02));
   tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.01));
   tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.02));

   tester.testEquals();
 }
示例#17
0
 // Checks whether the current page has enough space to fit the new serialized tuple.
 // If it does not, the current buffer is flushed, a new page is obtained and a new bloom
 // filter is added for that page.
 // TODO: The code does not ensure that pages are optimally packed.
 //       It only tries to fill up the current page as much as possible; once it is
 //       exhausted it requests a new page. Instead it would be nice to load the next page
 //       that could fit the new value.
 private void ensureSpace(byte[] value) {
   if (byteMap.canFit(value)) {
     // Nothing to do: the current page still has room.
     return;
   }
   // Flush the current buffer before moving on.
   byteMap.flushBuffer();
   // Switch to the next page.
   byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, spillFile);
   // Each page gets its own bloom filter.
   bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000));
 }
  /**
   * Builds the normalizer and, if a path is given, loads redirect pairs from a Snappy-compressed
   * file with one tab-separated {@code source<TAB>target} pair per line. Spaces in both titles
   * are replaced with underscores.
   *
   * <p>When {@code useBloomFilter} is set, only the source titles are recorded in the bloom
   * filter, and lookups go through an LRU cache that computes its values with
   * {@code TitleNameIndexer.normalize}. Otherwise every pair is stored in an in-memory map.
   *
   * @param pathToEvaluationRedirectsData path of the compressed redirects file, or {@code null}
   *     to skip loading
   * @throws IOException if the redirects file cannot be opened or read
   */
  private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException {

    if (useBloomFilter) {
      redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS);
      redirects =
          new LRUCache<String, String>(5000) {
            protected String loadValue(String src) {
              // Normalize once and reuse the result; fall back to the raw title on null.
              String normalized = TitleNameIndexer.normalize(src);
              return normalized == null ? src : normalized;
            }
          };
    } else {
      redirects = new StringMap<String>();
    }
    if (showInitProgress) {
      System.out.println(
          "Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    }
    if (pathToEvaluationRedirectsData != null) {
      InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData);
      LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8);
      try {
        long linecount = 0;
        while (iterator.hasNext()) {
          String line = iterator.nextLine();
          // The counter only advances while progress reporting is enabled.
          if (showInitProgress && linecount++ % 100000 == 0) {
            System.out.println("loading the latest redirects; linecount=" + linecount);
          }
          String[] parts = StringUtils.split(line, '\t');

          String src = parts[0].trim().replace(' ', '_');
          String trg = parts[1].trim().replace(' ', '_');
          if (useBloomFilter) {
            redirectFilter.put(src);
          } else {
            redirects.put(src, trg);
          }
        }
      } finally {
        // Release the underlying stream even when reading or parsing a line fails.
        iterator.close();
      }
    }
    redirects = Collections.unmodifiableMap(redirects);
    if (showInitProgress) {
      System.out.println(
          "Done  - Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    }
  }
 /**
  * Merges the bloom filter of another {@code BloomAccessTracker} into this tracker's filter.
  *
  * @param tracker the tracker to merge in; must be a {@code BloomAccessTracker}
  * @return this tracker, after the merge
  * @throws IllegalStateException if this tracker's filter is null, or if {@code tracker} is not
  *     a {@code BloomAccessTracker}
  */
 @Override
 public AccessTracker union(AccessTracker tracker) {
   if (filter == null) {
     throw new IllegalStateException(
         "Unable to union access tracker, because this tracker is not initialized.");
   }
   if (!(tracker instanceof BloomAccessTracker)) {
     throw new IllegalStateException(
         "Unable to union access tracker, because it's not of the right type (BloomAccessTracker)");
   }
   BloomAccessTracker other = (BloomAccessTracker) tracker;
   filter.putAll(other.getFilter());
   return this;
 }
示例#20
0
 /**
  * Checks that {@code BloomFilter.create} rejects a negative expected insertion count and false
  * positive probabilities of exactly 0.0 and 1.0.
  */
 @SuppressWarnings("CheckReturnValue")
 public void testPreconditions() {
   final int negativeInsertions = -1;
   final double zeroFpp = 0.0;
   final double certainFpp = 1.0;

   // Negative expected insertions, default fpp.
   try {
     BloomFilter.create(Funnels.unencodedCharsFunnel(), negativeInsertions);
     fail();
   } catch (IllegalArgumentException expected) {
   }

   // Negative expected insertions, explicit fpp.
   try {
     BloomFilter.create(Funnels.unencodedCharsFunnel(), negativeInsertions, 0.03);
     fail();
   } catch (IllegalArgumentException expected) {
   }

   // An fpp of exactly zero.
   try {
     BloomFilter.create(Funnels.unencodedCharsFunnel(), 1, zeroFpp);
     fail();
   } catch (IllegalArgumentException expected) {
   }

   // An fpp of exactly one.
   try {
     BloomFilter.create(Funnels.unencodedCharsFunnel(), 1, certainFpp);
     fail();
   } catch (IllegalArgumentException expected) {
   }
 }
示例#21
0
  /** Verifies that the optimal bit count is never negative, including for extreme inputs. */
  @SuppressWarnings("CheckReturnValue")
  public void testOptimalSize() {
    for (int insertions = 1; insertions < 1000; insertions++) {
      for (double fpp = Double.MIN_VALUE; fpp < 1.0; fpp += 0.001) {
        long bits = BloomFilter.optimalNumOfBits(insertions, fpp);
        assertTrue(bits >= 0);
      }
    }

    // some random values
    Random random = new Random(0);
    int repeats = 0;
    while (repeats < 10000) {
      int insertions = random.nextInt(1 << 16);
      double fpp = random.nextDouble();
      assertTrue(BloomFilter.optimalNumOfBits(insertions, fpp) >= 0);
      repeats++;
    }

    // and some crazy values (this used to be capped to Integer.MAX_VALUE, now it can go bigger)
    long hugeBits = BloomFilter.optimalNumOfBits(Integer.MAX_VALUE, Double.MIN_VALUE);
    assertEquals(3327428144502L, hugeBits);
    try {
      BloomFilter.create(HashTestUtils.BAD_FUNNEL, Integer.MAX_VALUE, Double.MIN_VALUE);
      fail("we can't represent such a large BF!");
    } catch (IllegalArgumentException expected) {
      assertThat(expected).hasMessage("Could not create BloomFilter of 3327428144502 bits");
    }
  }
示例#22
0
  /**
   * Reserializes a populated filter and checks that the copy still reports every inserted
   * element and has the same expected false positive probability.
   */
  public void testJavaSerialization() {
    BloomFilter<byte[]> original = BloomFilter.create(Funnels.byteArrayFunnel(), 100);
    int value = 0;
    while (value < 10) {
      original.put(Ints.toByteArray(value));
      value++;
    }

    BloomFilter<byte[]> roundTripped = SerializableTester.reserialize(original);
    for (int probe = 0; probe < 10; probe++) {
      byte[] key = Ints.toByteArray(probe);
      assertTrue(roundTripped.mightContain(key));
    }
    assertEquals(original.expectedFpp(), roundTripped.expectedFpp());

    SerializableTester.reserializeAndAssert(original);
  }
示例#23
0
  /**
   * Checks that filters created for different expected insertion counts are incompatible and
   * that merging them fails in both directions.
   */
  public void testPutAllDifferentSizes() {
    BloomFilter<Integer> small = BloomFilter.create(Funnels.integerFunnel(), 1);
    BloomFilter<Integer> large = BloomFilter.create(Funnels.integerFunnel(), 10);

    // Merging the larger filter into the smaller one.
    try {
      boolean compatible = small.isCompatible(large);
      assertFalse(compatible);
      small.putAll(large);
      fail();
    } catch (IllegalArgumentException expected) {
    }

    // Merging the smaller filter into the larger one.
    try {
      boolean compatible = large.isCompatible(small);
      assertFalse(compatible);
      large.putAll(small);
      fail();
    } catch (IllegalArgumentException expected) {
    }
  }
示例#24
0
  /**
   * Checks that two filters holding the same elements are equal and that they stop being equal
   * once one of them receives an extra element.
   */
  public void testEquals() {
    BloomFilter<String> first = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
    BloomFilter<String> second = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
    for (String shared : new String[] {"1", "2"}) {
      first.put(shared);
      second.put(shared);
    }

    EqualsTester sameContents = new EqualsTester();
    sameContents.addEqualityGroup(first, second);
    sameContents.testEquals();

    second.put("3");

    EqualsTester differentContents = new EqualsTester();
    differentContents.addEqualityGroup(first);
    differentContents.addEqualityGroup(second);
    differentContents.testEquals();
  }
  /**
   * Follows the redirect chain starting at {@code title} and returns the last title reached.
   *
   * <p>The walk stops when a title has no redirect, when a title redirects to itself, or when
   * the hop limit is exceeded. The warning is printed only when the hop limit actually cut the
   * walk short, not when a long chain happened to resolve on its own.
   *
   * @param title the title to resolve
   * @return the resolved title, or {@code title} itself when the bloom filter rules out a
   *     redirect or no redirect is found
   */
  protected String redirect(String title) {
    // Fast path: the bloom filter says this title was never recorded as a redirect source.
    if (useBloomFilter && !redirectFilter.mightContain(title)) return title;

    // The walk gives up once more than this many redirects have been followed. The bound is
    // kept identical to the previous behaviour so returned titles do not change.
    final int maxFollowed = 50;
    int followed = 0;
    boolean gaveUp = false;

    String to = redirects.get(title);
    while (to != null && !to.equals(title)) {
      if (followed > maxFollowed) {
        gaveUp = true;
        break;
      }
      title = to;
      to = redirects.get(title);
      followed++;
    }

    if (gaveUp) {
      System.out.println(
          "Fixed point reached with title : "
              + title
              + " ; stopping to loop in redirects at this moment");
    }
    return title;
  }
示例#26
0
  /**
   * Populates a MURMUR128_MITZ_32 filter with the even numbers below two million (as strings)
   * and pins its exact set of false positives among the odd numbers.
   */
  public void testCreateAndCheckMitz32BloomFilterWithKnownFalsePositives() {
    final int numInsertions = 1000000;
    final int upperBound = numInsertions * 2;
    BloomFilter<String> filter =
        BloomFilter.create(
            Funnels.unencodedCharsFunnel(),
            numInsertions,
            0.03,
            BloomFilterStrategies.MURMUR128_MITZ_32);

    // Insert "numInsertions" even numbers into the BF.
    for (int even = 0; even < upperBound; even += 2) {
      filter.put(Integer.toString(even));
    }

    // Every inserted even number must be reported as possibly present.
    for (int even = 0; even < upperBound; even += 2) {
      String key = Integer.toString(even);
      assertTrue(filter.mightContain(key));
    }

    // The complete list of odd false positives below 900; every other odd number below 900
    // must be reported as absent.
    ImmutableSet<Integer> falsePositives =
        ImmutableSet.of(
            49, 51, 59, 163, 199, 321, 325, 363, 367, 469, 545, 561, 727, 769, 773, 781);
    for (int odd = 1; odd < 900; odd += 2) {
      if (falsePositives.contains(odd)) {
        continue;
      }
      assertFalse("BF should not contain " + odd, filter.mightContain(Integer.toString(odd)));
    }

    // Across all odd numbers there must be exactly 29824 false positives.
    final int knownNumberOfFalsePositives = 29824;
    int observedFalsePositives = 0;
    for (int odd = 1; odd < upperBound; odd += 2) {
      boolean reported = filter.mightContain(Integer.toString(odd));
      if (reported) {
        observedFalsePositives++;
      }
    }
    assertEquals(knownNumberOfFalsePositives, observedFalsePositives);

    double actualFpp = (double) knownNumberOfFalsePositives / numInsertions;
    double expectedFpp = filter.expectedFpp();
    // The normal order of (expected, actual) is reversed here on purpose.
    assertEquals(actualFpp, expectedFpp, 0.00015);
  }
示例#27
0
  /**
   * Populates a MURMUR128_MITZ_64 filter (UTF-8 string funnel) with the even numbers below two
   * million and pins its exact set of false positives among the odd numbers.
   */
  public void testCreateAndCheckBloomFilterWithKnownUtf8FalsePositives64() {
    final int numInsertions = 1000000;
    final int upperBound = numInsertions * 2;
    BloomFilter<String> filter =
        BloomFilter.create(
            Funnels.stringFunnel(UTF_8),
            numInsertions,
            0.03,
            BloomFilterStrategies.MURMUR128_MITZ_64);

    // Insert "numInsertions" even numbers into the BF.
    for (int even = 0; even < upperBound; even += 2) {
      filter.put(Integer.toString(even));
    }

    // Every inserted even number must be reported as possibly present.
    for (int even = 0; even < upperBound; even += 2) {
      String key = Integer.toString(even);
      assertTrue(filter.mightContain(key));
    }

    // The complete list of odd false positives below 900; every other odd number below 900
    // must be reported as absent.
    ImmutableSet<Integer> falsePositives = ImmutableSet.of(129, 471, 723, 89, 751, 835, 871);
    for (int odd = 1; odd < 900; odd += 2) {
      if (falsePositives.contains(odd)) {
        continue;
      }
      assertFalse("BF should not contain " + odd, filter.mightContain(Integer.toString(odd)));
    }

    // Across all odd numbers there must be exactly 29763 false positives.
    final int knownNumberOfFalsePositives = 29763;
    int observedFalsePositives = 0;
    for (int odd = 1; odd < upperBound; odd += 2) {
      boolean reported = filter.mightContain(Integer.toString(odd));
      if (reported) {
        observedFalsePositives++;
      }
    }
    assertEquals(knownNumberOfFalsePositives, observedFalsePositives);

    double actualFpp = (double) knownNumberOfFalsePositives / numInsertions;
    double expectedFpp = filter.expectedFpp();
    // The normal order of (expected, actual) is reversed here on purpose.
    assertEquals(actualFpp, expectedFpp, 0.00033);
  }
 /**
  * Replaces the current bloom filter with a newly created one built from the stored expected
  * insertion count and false positive rate.
  */
 @Override
 public void reset() {
   // NOTE(review): the numInsertions counter updated in logAccess is not cleared here —
   // confirm that this is intended.
   this.filter =
       BloomFilter.create(LOOKUPKEY_FUNNEL, this.expectedInsertions, this.falsePositiveRate);
 }
 /**
  * Asks the bloom filter whether the key might have been recorded.
  *
  * @param key the lookup key to test
  * @return the result of {@code filter.mightContain(key)}
  */
 @Override
 public boolean hasSeen(LookupKey key) {
   boolean possiblySeen = filter.mightContain(key);
   return possiblySeen;
 }
 /**
  * Records an access: bumps the insertion counter and adds the key to the bloom filter.
  *
  * @param key the lookup key that was accessed
  */
 @Override
 public void logAccess(LookupKey key) {
   // The counter is advanced before the filter is touched, on every call.
   ++numInsertions;
   filter.put(key);
 }