Example #1
0
 /**
  * Resolves a hash type code to its hash function implementation.
  *
  * <p>The returned object is whatever the matching implementation's own
  * {@code getInstance()} hands back.
  *
  * @param type one of the predefined hash type constants
  * @return the hash function for {@code type}, or null when the type is not recognized
  */
 public static Hash getInstance(int type) {
   if (type == JENKINS_HASH) {
     return JenkinsHash.getInstance();
   }
   if (type == MURMUR_HASH) {
     return MurmurHash.getInstance();
   }
   // Unknown type code: callers are expected to handle a null result.
   return null;
 }
Example #2
0
  /**
   * Loads bloom filter meta data from file input.
   *
   * @param meta
   * @throws IllegalArgumentException
   */
  public ByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException {
    int version = meta.getInt();
    if (version != VERSION) throw new IllegalArgumentException("Bad version");

    this.byteSize = meta.getInt();
    this.hashCount = meta.getInt();
    this.keyCount = new AtomicInteger(meta.getInt());
    this.maxKeys = this.keyCount.intValue();

    this.hash = MurmurHash.getInstance();
    sanityCheck();

    allocBloom();
  }
Example #3
0
  /**
   * Creates an empty bloom filter sized for an expected number of keys and a
   * target false positive rate.
   *
   * @param maxKeys expected number of keys to be inserted; must be positive
   * @param errorRate desired false positive rate
   * @param foldFactor the byte size is rounded up to a multiple of
   *     2^foldFactor so the filter can later be folded that many times
   * @throws IllegalArgumentException if {@code maxKeys} is not positive, or if
   *     {@code sanityCheck()} rejects the computed parameters
   */
  public ByteBloomFilter(int maxKeys, float errorRate, int foldFactor)
      throws IllegalArgumentException {
    // Fail with the declared exception type instead of an ArithmeticException
    // from the division below.
    if (maxKeys <= 0) {
      throw new IllegalArgumentException("maxKeys must be > 0, got " + maxKeys);
    }

    /*
     * Bloom filters are very sensitive to the number of elements inserted
     * into them. For HBase, the number of entries depends on the size of
     * the data stored in the column. Currently the default region size is
     * 256MB, so entry count ~= 256MB / (average value size for column).
     * Despite this rule of thumb, there is no efficient way to calculate
     * the entry count after compactions. Therefore, it is often easier to
     * use a dynamic bloom filter that will add extra space instead of
     * allowing the error rate to grow.
     *
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/
     * BloomFilterSurvey.pdf )
     *
     * m denotes the number of bits in the Bloom filter (bitSize)
     * n denotes the number of elements inserted into the Bloom filter (maxKeys)
     * k represents the number of hash functions used (functionCount)
     * e represents the desired false positive rate for the bloom (errorRate)
     *
     * If we fix the error rate (e) and know the number of entries, then the
     * optimal bloom size is
     *   m = -(n * ln(e)) / (ln(2)^2) ~= n * ln(e) / ln(0.6185)
     *
     * The probability of false positives is minimized when k = (m/n) * ln(2).
     */
    int bitSize = (int) Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));
    // Divide as doubles: integer division would truncate the bits-per-key
    // ratio before it is scaled by ln(2) and skew the hash function count.
    int functionCount = (int) Math.ceil(Math.log(2) * ((double) bitSize / maxKeys));

    // increase byteSize so folding is possible: round up to the next
    // multiple of 2^foldFactor unless it already is one
    int byteSize = (bitSize + 7) / 8;
    int mask = (1 << foldFactor) - 1;
    if ((mask & byteSize) != 0) {
      byteSize >>= foldFactor;
      ++byteSize;
      byteSize <<= foldFactor;
    }

    this.byteSize = byteSize;
    this.hashCount = functionCount;
    this.hash = MurmurHash.getInstance();
    this.keyCount = new AtomicInteger(0);
    this.maxKeys = maxKeys;

    sanityCheck();

    allocBloom();
  }