/**
 * Looks up the hash function that corresponds to a predefined hash type.
 *
 * <p>The instance is obtained from the matching implementation's own
 * {@code getInstance()} factory.
 *
 * @param type predefined hash type constant ({@code JENKINS_HASH} or {@code MURMUR_HASH})
 * @return the hash function instance for {@code type}, or {@code null} if the type is
 *     not one of the recognized constants
 */
public static Hash getInstance(int type) {
  if (type == JENKINS_HASH) {
    return JenkinsHash.getInstance();
  }
  if (type == MURMUR_HASH) {
    return MurmurHash.getInstance();
  }
  // Unknown type: callers are expected to handle the null result.
  return null;
}
/** * Loads bloom filter meta data from file input. * * @param meta * @throws IllegalArgumentException */ public ByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException { int version = meta.getInt(); if (version != VERSION) throw new IllegalArgumentException("Bad version"); this.byteSize = meta.getInt(); this.hashCount = meta.getInt(); this.keyCount = new AtomicInteger(meta.getInt()); this.maxKeys = this.keyCount.intValue(); this.hash = MurmurHash.getInstance(); sanityCheck(); allocBloom(); }
/**
 * Creates an empty bloom filter sized for an expected number of keys and a target
 * false positive rate.
 *
 * @param maxKeys expected number of keys to be inserted; must be positive
 * @param errorRate desired false positive rate
 * @param foldFactor the byte size is rounded up to a multiple of
 *     {@code 2^foldFactor} so that the filter can later be folded
 * @throws IllegalArgumentException if {@code maxKeys} is not positive (presumably also
 *     when {@code sanityCheck()} rejects the computed sizes; that method is not
 *     visible here)
 */
public ByteBloomFilter(int maxKeys, float errorRate, int foldFactor)
    throws IllegalArgumentException {
  // Reject a non-positive key count up front: zero would otherwise surface as an
  // undeclared ArithmeticException from the division below.
  if (maxKeys <= 0) {
    throw new IllegalArgumentException("maxKeys must be positive: " + maxKeys);
  }

  /*
   * Bloom filters are very sensitive to the number of elements inserted
   * into them. For HBase, the number of entries depends on the size of
   * the data stored in the column. Currently the default region size is
   * 256MB, so entry count ~= 256MB / (average value size for column).
   * Despite this rule of thumb, there is no efficient way to calculate
   * the entry count after compactions. Therefore, it is often easier to
   * use a dynamic bloom filter that will add extra space instead of
   * allowing the error rate to grow.
   *
   * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/
   *   BloomFilterSurvey.pdf )
   *
   * m denotes the number of bits in the Bloom filter (bitSize)
   * n denotes the number of elements inserted into the Bloom filter (maxKeys)
   * k represents the number of hash functions used (nbHash)
   * e represents the desired false positive rate for the bloom (err)
   *
   * If we fix the error rate (e) and know the number of entries, then the
   * optimal bloom size is
   *   m = -(n * ln(err)) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
   *
   * The probability of false positives is minimized when k = (m/n) * ln(2).
   */
  int bitSize = (int) Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));

  // Divide in floating point: integer division would truncate m/n before it is
  // scaled by ln(2) and could yield one hash function fewer than the formula asks for.
  int functionCount = (int) Math.ceil(Math.log(2) * ((double) bitSize / maxKeys));

  // Round the byte size up to the next multiple of 2^foldFactor so folding is possible.
  int byteSize = (bitSize + 7) / 8;
  int mask = (1 << foldFactor) - 1;
  if ((mask & byteSize) != 0) {
    byteSize >>= foldFactor;
    ++byteSize;
    byteSize <<= foldFactor;
  }

  this.byteSize = byteSize;
  this.hashCount = functionCount;
  this.hash = MurmurHash.getInstance();
  this.keyCount = new AtomicInteger(0);
  this.maxKeys = maxKeys;

  sanityCheck();
  allocBloom();
}