/**
 * Merges another bloom filter into this one by OR-ing the underlying bit data together. Only
 * <b>this</b> instance is modified. It is the caller's responsibility to size the filters so
 * that the merged result does not become saturated.
 *
 * @param that the bloom filter whose bits are merged into this one; it is not mutated
 * @throws IllegalArgumentException if {@code isCompatible(that) == false}
 * @since 15.0
 */
public void putAll(BloomFilter<T> that) {
  checkNotNull(that);
  checkArgument(this != that, "Cannot combine a BloomFilter with itself.");

  // Every compatibility requirement is checked on its own so that the failure message
  // names the exact property that differs.
  checkArgument(
      numHashFunctions == that.numHashFunctions,
      "BloomFilters must have the same number of hash functions (%s != %s)",
      numHashFunctions,
      that.numHashFunctions);
  checkArgument(
      bitSize() == that.bitSize(),
      "BloomFilters must have the same size underlying bit arrays (%s != %s)",
      bitSize(),
      that.bitSize());
  checkArgument(
      strategy.equals(that.strategy),
      "BloomFilters must have equal strategies (%s != %s)",
      strategy,
      that.strategy);
  checkArgument(
      funnel.equals(that.funnel),
      "BloomFilters must have equal funnels (%s != %s)",
      funnel,
      that.funnel);

  bits.putAll(that.bits);
}
@SuppressWarnings("CheckReturnValue") @AndroidIncompatible // OutOfMemoryError public void testLargeNumberOfInsertions() { // We use horrible FPPs here to keep Java from OOM'ing BloomFilter.create(Funnels.unencodedCharsFunnel(), 42L + Integer.MAX_VALUE, 0.28); BloomFilter.create(Funnels.unencodedCharsFunnel(), 50L * Integer.MAX_VALUE, 0.99); }
/** Verifies that a filter reports itself as incompatible and that {@code putAll(self)} throws. */
public void testPutAllWithSelf() {
  BloomFilter<Integer> filter = BloomFilter.create(Funnels.integerFunnel(), 1);
  try {
    assertFalse(filter.isCompatible(filter));
    filter.putAll(filter);
    fail();
  } catch (IllegalArgumentException expected) {
    // Expected: combining a filter with itself is rejected.
  }
}
/**
 * Checks, for every expected insertion count from 1 to 9999, that the created filter's bit
 * size equals the optimal bit count rounded up to a whole number of 64-bit words.
 */
public void testBitSize() {
  final double fpp = 0.03;
  for (int insertions = 1; insertions < 10000; insertions++) {
    long optimalBits = BloomFilter.optimalNumOfBits(insertions, fpp);
    // Number of 64-bit words needed to hold the optimal bit count.
    int wordCount = Ints.checkedCast(LongMath.divide(optimalBits, 64, RoundingMode.CEILING));
    int expectedBitSize = wordCount * Long.SIZE;
    assertEquals(
        expectedBitSize,
        BloomFilter.create(Funnels.unencodedCharsFunnel(), insertions, fpp).bitSize());
  }
}
/**
 * Asserts basic behavior of the given filter: fresh objects are not reported as contained
 * before any insertion, and each of 100 inserted objects is reported as possibly contained
 * right after it is put, via both {@code mightContain} and {@code apply}.
 *
 * @param bf the filter under test; it is mutated by this check
 */
private void checkSanity(BloomFilter<Object> bf) {
  assertFalse(bf.mightContain(new Object()));
  assertFalse(bf.apply(new Object()));

  int remaining = 100;
  while (remaining > 0) {
    Object element = new Object();
    bf.put(element);
    assertTrue(bf.mightContain(element));
    assertTrue(bf.apply(element));
    remaining--;
  }
}
/**
 * Verifies that the value returned by {@code put} is always the opposite of what
 * {@code mightContain} reported for the same element immediately beforehand.
 */
public void testPutReturnValue() {
  for (int round = 0; round < 10; round++) {
    BloomFilter<String> filter = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
    for (int attempt = 0; attempt < 10; attempt++) {
      String element = new Object().toString();
      boolean presentBefore = filter.mightContain(element);
      boolean bitsChanged = filter.put(element);
      // Exactly one of the two must hold.
      assertTrue(presentBefore ^ bitsChanged);
    }
  }
}
/**
 * Writes a populated filter with {@code writeTo} and checks that {@code readFrom} yields a
 * filter equal to the original.
 */
public void testCustomSerialization() throws Exception {
  Funnel<byte[]> funnel = Funnels.byteArrayFunnel();
  BloomFilter<byte[]> original = BloomFilter.create(funnel, 100);
  for (int value = 0; value < 100; value++) {
    original.put(Ints.toByteArray(value));
  }

  ByteArrayOutputStream sink = new ByteArrayOutputStream();
  original.writeTo(sink);

  ByteArrayInputStream source = new ByteArrayInputStream(sink.toByteArray());
  assertEquals(original, BloomFilter.readFrom(source, funnel));
}
public void testExpectedFpp() { BloomFilter<Object> bf = BloomFilter.create(HashTestUtils.BAD_FUNNEL, 10, 0.03); double fpp = bf.expectedFpp(); assertEquals(0.0, fpp); // usually completed in less than 200 iterations while (fpp != 1.0) { boolean changed = bf.put(new Object()); double newFpp = bf.expectedFpp(); // if changed, the new fpp is strictly higher, otherwise it is the same assertTrue(changed ? newFpp > fpp : newFpp == fpp); fpp = newFpp; } }
/**
 * Inserts one million pseudo-random longs and then replays the same seeded sequence to check
 * that every inserted value is reported as possibly contained.
 */
public void testBloom() throws Exception {
  final int numInsertions = 1000000;
  final double fpp = 0.03D;
  BloomFilter<Long> filter = BloomFilter.create(Funnels.longFunnel(), numInsertions, fpp);

  Random writer = new Random(1L);
  for (int count = 0; count < numInsertions; count++) {
    filter.put(writer.nextLong());
  }

  // Same seed, so this generator reproduces exactly the values inserted above.
  Random reader = new Random(1L);
  for (int count = 0; count < numInsertions; count++) {
    assertTrue(filter.mightContain(reader.nextLong()));
  }
}
/**
 * Reads the expected insertion count, false positive rate and name from the supplied
 * configuration and creates a new filter from the first two.
 *
 * @param config configuration map; the value under {@code NAME_KEY} is dereferenced directly,
 *     so a missing name results in a {@link NullPointerException}
 */
@Override
public void configure(Map<String, Object> config) {
  Object rawInsertions = config.get(EXPECTED_INSERTIONS_KEY);
  expectedInsertions = toInt(rawInsertions);

  Object rawFalsePositiveRate = config.get(FALSE_POSITIVE_RATE_KEY);
  falsePositiveRate = toDouble(rawFalsePositiveRate);

  Object rawName = config.get(NAME_KEY);
  name = rawName.toString();

  filter = BloomFilter.create(LOOKUPKEY_FUNNEL, expectedInsertions, falsePositiveRate);
}
/** Checks that the optimal number of hash functions is never reported as zero (or less). */
public void testOptimalHashes() {
  for (int n = 1; n < 1000; n++) {
    int m = 0;
    while (m < 1000) {
      int hashFunctions = BloomFilter.optimalNumOfHashFunctions(n, m);
      assertTrue(hashFunctions > 0);
      m++;
    }
  }
}
/**
 * Creates a spill map over the given spill file. The constructor obtains an initial mapped
 * byte buffer map and starts the bloom filter list with a single byte-array filter created
 * for 1000 expected insertions.
 *
 * @param file the spill file handed to the mapped byte buffer map
 * @param thresholdBytes the threshold, in bytes, handed to the mapped byte buffer map
 * @throws IOException declared by this constructor
 */
public SpillMap(SpillFile file, int thresholdBytes) throws IOException {
  this.spillFile = file;
  this.thresholdBytes = thresholdBytes;

  this.byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, spillFile);

  this.bFilters = Lists.newArrayList();
  this.bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000));
}
/**
 * Runs the sanity check against filters built from many combinations of false positive rate
 * and expected insertion count.
 */
public void testBasic() {
  double fpr = 0.0000001;
  while (fpr < 0.1) {
    int expectedInsertions = 1;
    while (expectedInsertions <= 10000) {
      checkSanity(BloomFilter.create(HashTestUtils.BAD_FUNNEL, expectedInsertions, fpr));
      expectedInsertions *= 10;
    }
    fpr *= 10;
  }
}
/**
 * Determines whether a given bloom filter is compatible with this bloom filter. For two bloom
 * filters to be compatible, they must:
 *
 * <ul>
 *   <li>not be the same instance
 *   <li>have the same number of hash functions
 *   <li>have the same bit size
 *   <li>have the same strategy
 *   <li>have equal funnels
 * </ul>
 *
 * @param that The bloom filter to check for compatibility.
 * @return {@code true} if all of the conditions above hold, {@code false} otherwise
 * @throws NullPointerException if {@code that} is null
 * @since 15.0
 */
public boolean isCompatible(BloomFilter<T> that) {
  checkNotNull(that);
  return (this != that)
      && (this.numHashFunctions == that.numHashFunctions)
      && (this.bitSize() == that.bitSize())
      && (this.strategy.equals(that.strategy))
      && (this.funnel.equals(that.funnel));
}
/**
 * Verifies that creating a filter with an extremely small false positive probability is
 * rejected with an {@link IllegalArgumentException}.
 */
@SuppressWarnings("CheckReturnValue")
public void testFailureWhenMoreThan255HashFunctionsAreNeeded() {
  final int expectedInsertions = 1000;
  final double tinyFpp =
      0.00000000000000000000000000000000000000000000000000000000000000000000000000000001;
  try {
    BloomFilter.create(Funnels.unencodedCharsFunnel(), expectedInsertions, tinyFpp);
    fail();
  } catch (IllegalArgumentException expected) {
    // Expected: creation is rejected for this probability.
  }
}
/**
 * Verifies that empty filters created with different funnels, expected insertion counts or
 * false positive probabilities all fall into distinct equality groups.
 */
public void testEquals_empty() {
  EqualsTester tester = new EqualsTester();

  // Byte-array funnel, every combination of {100, 200} insertions and {0.01, 0.02} fpp.
  tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 100, 0.01));
  tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 100, 0.02));
  tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 200, 0.01));
  tester.addEqualityGroup(BloomFilter.create(Funnels.byteArrayFunnel(), 200, 0.02));

  // The same combinations with the unencoded-chars funnel.
  tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.01));
  tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.02));
  tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.01));
  tester.addEqualityGroup(BloomFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.02));

  tester.testEquals();
}
// Funtion checks if current page has enough space to fit the new serialized tuple // If not it flushes the current buffer and gets a new page // TODO: The code does not ensure that pages are optimally packed. // It only tries to fill up the current page as much as possbile, if its // exhausted it requests a new page. Instead it would be nice to load the next page // that could fit the new value. private void ensureSpace(byte[] value) { if (!byteMap.canFit(value)) { // Flush current buffer byteMap.flushBuffer(); // Get next page byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, spillFile); // Create new bloomfilter bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000)); } }
private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException { if (useBloomFilter) { redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS); redirects = new LRUCache<String, String>(5000) { protected String loadValue(String src) { String normalized = TitleNameIndexer.normalize(src); if (normalized == null) return src; return TitleNameIndexer.normalize(src); } }; } else redirects = new StringMap<String>(); if (showInitProgress) System.out.println( "Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version"); if (pathToEvaluationRedirectsData != null) { InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData); LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8); long linecount = 0; while (iterator.hasNext()) { String line = iterator.nextLine(); if (showInitProgress && linecount++ % 100000 == 0) System.out.println("loading the latest redirects; linecount=" + linecount); String[] parts = StringUtils.split(line, '\t'); String src = parts[0].trim().replace(' ', '_'); String trg = parts[1].trim().replace(' ', '_'); if (useBloomFilter) redirectFilter.put(src); else redirects.put(src, trg); } iterator.close(); } redirects = Collections.unmodifiableMap(redirects); if (showInitProgress) System.out.println( "Done - Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version"); }
/**
 * Merges the filter of another {@code BloomAccessTracker} into this tracker's filter.
 *
 * @param tracker the tracker to merge in; must be a {@code BloomAccessTracker}
 * @return this tracker, after the merge
 * @throws IllegalStateException if this tracker has no filter yet, or if {@code tracker} is
 *     not a {@code BloomAccessTracker}
 */
@Override
public AccessTracker union(AccessTracker tracker) {
  if (filter == null) {
    throw new IllegalStateException(
        "Unable to union access tracker, because this tracker is not initialized.");
  }
  if (!(tracker instanceof BloomAccessTracker)) {
    throw new IllegalStateException(
        "Unable to union access tracker, because it's not of the right type (BloomAccessTracker)");
  }

  BloomAccessTracker other = (BloomAccessTracker) tracker;
  filter.putAll(other.getFilter());
  return this;
}
/**
 * Verifies that {@code BloomFilter.create} rejects a negative expected insertion count and
 * false positive probabilities of exactly 0.0 and 1.0.
 */
@SuppressWarnings("CheckReturnValue")
public void testPreconditions() {
  // Negative expected insertions, default fpp.
  try {
    BloomFilter.create(Funnels.unencodedCharsFunnel(), -1);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }

  // Negative expected insertions, explicit fpp.
  try {
    BloomFilter.create(Funnels.unencodedCharsFunnel(), -1, 0.03);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }

  // fpp of zero.
  try {
    BloomFilter.create(Funnels.unencodedCharsFunnel(), 1, 0.0);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }

  // fpp of one.
  try {
    BloomFilter.create(Funnels.unencodedCharsFunnel(), 1, 1.0);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }
}
/** Tests that we always get a non-negative optimal size. */ @SuppressWarnings("CheckReturnValue") public void testOptimalSize() { for (int n = 1; n < 1000; n++) { for (double fpp = Double.MIN_VALUE; fpp < 1.0; fpp += 0.001) { assertTrue(BloomFilter.optimalNumOfBits(n, fpp) >= 0); } } // some random values Random random = new Random(0); for (int repeats = 0; repeats < 10000; repeats++) { assertTrue(BloomFilter.optimalNumOfBits(random.nextInt(1 << 16), random.nextDouble()) >= 0); } // and some crazy values (this used to be capped to Integer.MAX_VALUE, now it can go bigger assertEquals(3327428144502L, BloomFilter.optimalNumOfBits(Integer.MAX_VALUE, Double.MIN_VALUE)); try { BloomFilter.create(HashTestUtils.BAD_FUNNEL, Integer.MAX_VALUE, Double.MIN_VALUE); fail("we can't represent such a large BF!"); } catch (IllegalArgumentException expected) { assertThat(expected).hasMessage("Could not create BloomFilter of 3327428144502 bits"); } }
/**
 * Reserializes a populated filter and checks that the copy still reports every inserted
 * element, has the same expected fpp, and passes {@code reserializeAndAssert}.
 */
public void testJavaSerialization() {
  BloomFilter<byte[]> original = BloomFilter.create(Funnels.byteArrayFunnel(), 100);
  for (int value = 0; value < 10; value++) {
    original.put(Ints.toByteArray(value));
  }

  BloomFilter<byte[]> copy = SerializableTester.reserialize(original);
  for (int value = 0; value < 10; value++) {
    byte[] encoded = Ints.toByteArray(value);
    assertTrue(copy.mightContain(encoded));
  }
  assertEquals(original.expectedFpp(), copy.expectedFpp());

  SerializableTester.reserializeAndAssert(original);
}
/**
 * Verifies, in both directions, that filters created for different expected insertion counts
 * are reported as incompatible and that {@code putAll} between them throws.
 */
public void testPutAllDifferentSizes() {
  BloomFilter<Integer> small = BloomFilter.create(Funnels.integerFunnel(), 1);
  BloomFilter<Integer> large = BloomFilter.create(Funnels.integerFunnel(), 10);

  // small <- large
  try {
    assertFalse(small.isCompatible(large));
    small.putAll(large);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }

  // large <- small
  try {
    assertFalse(large.isCompatible(small));
    large.putAll(small);
    fail();
  } catch (IllegalArgumentException expected) {
    // expected
  }
}
/**
 * Verifies that two filters holding the same elements are equal, and that they stop being
 * equal once one of them receives an additional element.
 */
public void testEquals() {
  BloomFilter<String> first = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
  first.put("1");
  first.put("2");

  BloomFilter<String> second = BloomFilter.create(Funnels.unencodedCharsFunnel(), 100);
  second.put("1");
  second.put("2");

  EqualsTester sameContents = new EqualsTester();
  sameContents.addEqualityGroup(first, second);
  sameContents.testEquals();

  second.put("3");

  EqualsTester differentContents = new EqualsTester();
  differentContents.addEqualityGroup(first);
  differentContents.addEqualityGroup(second);
  differentContents.testEquals();
}
/**
 * Follows the redirect chain starting at {@code title} and returns the last title reached.
 *
 * <p>In bloom filter mode a title the filter definitely does not contain is returned
 * unchanged without consulting {@code redirects}. The chain is followed until there is no
 * further target, a title maps to itself, or 51 hops have been made. A message is printed
 * only when the walk was cut off by the hop limit while a further, different target was
 * still pending.
 *
 * @param title the title to resolve
 * @return the title at the end of the (possibly truncated) redirect chain
 */
protected String redirect(String title) {
  if (useBloomFilter && !redirectFilter.mightContain(title)) {
    return title;
  }

  String to = redirects.get(title);
  int hops = 0;
  // Hops are counted from 0 through 50, i.e. at most 51 redirects are followed.
  while (to != null && !to.equals(title) && hops <= 50) {
    title = to;
    to = redirects.get(title);
    hops++;
  }

  // If a different target is still pending, the loop can only have stopped because of the
  // hop limit. Chains that end naturally, whatever their length, are not reported.
  if (to != null && !to.equals(title)) {
    System.out.println(
        "Fixed point reached with title : "
            + title
            + " ; stopping to loop in redirects at this moment");
  }
  return title;
}
public void testCreateAndCheckMitz32BloomFilterWithKnownFalsePositives() { int numInsertions = 1000000; BloomFilter<String> bf = BloomFilter.create( Funnels.unencodedCharsFunnel(), numInsertions, 0.03, BloomFilterStrategies.MURMUR128_MITZ_32); // Insert "numInsertions" even numbers into the BF. for (int i = 0; i < numInsertions * 2; i += 2) { bf.put(Integer.toString(i)); } // Assert that the BF "might" have all of the even numbers. for (int i = 0; i < numInsertions * 2; i += 2) { assertTrue(bf.mightContain(Integer.toString(i))); } // Now we check for known false positives using a set of known false positives. // (These are all of the false positives under 900.) ImmutableSet<Integer> falsePositives = ImmutableSet.of( 49, 51, 59, 163, 199, 321, 325, 363, 367, 469, 545, 561, 727, 769, 773, 781); for (int i = 1; i < 900; i += 2) { if (!falsePositives.contains(i)) { assertFalse("BF should not contain " + i, bf.mightContain(Integer.toString(i))); } } // Check that there are exactly 29824 false positives for this BF. int knownNumberOfFalsePositives = 29824; int numFpp = 0; for (int i = 1; i < numInsertions * 2; i += 2) { if (bf.mightContain(Integer.toString(i))) { numFpp++; } } assertEquals(knownNumberOfFalsePositives, numFpp); double actualFpp = (double) knownNumberOfFalsePositives / numInsertions; double expectedFpp = bf.expectedFpp(); // The normal order of (expected, actual) is reversed here on purpose. assertEquals(actualFpp, expectedFpp, 0.00015); }
public void testCreateAndCheckBloomFilterWithKnownUtf8FalsePositives64() { int numInsertions = 1000000; BloomFilter<String> bf = BloomFilter.create( Funnels.stringFunnel(UTF_8), numInsertions, 0.03, BloomFilterStrategies.MURMUR128_MITZ_64); // Insert "numInsertions" even numbers into the BF. for (int i = 0; i < numInsertions * 2; i += 2) { bf.put(Integer.toString(i)); } // Assert that the BF "might" have all of the even numbers. for (int i = 0; i < numInsertions * 2; i += 2) { assertTrue(bf.mightContain(Integer.toString(i))); } // Now we check for known false positives using a set of known false positives. // (These are all of the false positives under 900.) ImmutableSet<Integer> falsePositives = ImmutableSet.of(129, 471, 723, 89, 751, 835, 871); for (int i = 1; i < 900; i += 2) { if (!falsePositives.contains(i)) { assertFalse("BF should not contain " + i, bf.mightContain(Integer.toString(i))); } } // Check that there are exactly 29763 false positives for this BF. int knownNumberOfFalsePositives = 29763; int numFpp = 0; for (int i = 1; i < numInsertions * 2; i += 2) { if (bf.mightContain(Integer.toString(i))) { numFpp++; } } assertEquals(knownNumberOfFalsePositives, numFpp); double actualFpp = (double) knownNumberOfFalsePositives / numInsertions; double expectedFpp = bf.expectedFpp(); // The normal order of (expected, actual) is reversed here on purpose. assertEquals(actualFpp, expectedFpp, 0.00033); }
/**
 * Replaces the current filter with a newly created one built from the stored expected
 * insertion count and false positive rate.
 */
@Override
public void reset() {
  filter =
      BloomFilter.create(
          LOOKUPKEY_FUNNEL,
          expectedInsertions,
          falsePositiveRate);
}
/**
 * Reports whether the filter might contain the given key.
 *
 * @param key the key to look up
 * @return the result of the filter's {@code mightContain} for {@code key}
 */
@Override
public boolean hasSeen(LookupKey key) {
  boolean possiblySeen = filter.mightContain(key);
  return possiblySeen;
}
/**
 * Records an access to the given key: increments the insertion counter and puts the key into
 * the filter.
 *
 * @param key the key that was accessed
 */
@Override
public void logAccess(LookupKey key) {
  // The counter is bumped first, so it advances even if the put below throws.
  ++numInsertions;
  filter.put(key);
}