// Murmur is faster than an SHA-based approach and provides as-good collision
 // resistance.  The combinatorial generation approach described in
 // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
 // does prove to work in actual tests, and is obviously faster
 // than performing further iterations of murmur.
 /**
  * Computes {@code hashCount} bucket values for a region of a byte array.
  *
  * <p>Only two Murmur hashes are evaluated (the second is seeded with the first). The i-th
  * bucket is {@code Math.abs((hash1 + i * hash2) % max)}.
  *
  * @param b array holding the bytes to hash
  * @param offset offset forwarded to {@code MurmurHash.hash64}
  * @param length length forwarded to {@code MurmurHash.hash64}
  * @param hashCount number of bucket values to produce
  * @param max modulus applied to each combined hash
  * @return array of {@code hashCount} bucket values
  */
 static long[] getHashBuckets(byte[] b, int offset, int length, int hashCount, long max) {
   final long[] buckets = new long[hashCount];
   final long base = MurmurHash.hash64(b, offset, length, 0L);
   final long stride = MurmurHash.hash64(b, offset, length, base);
   // A running sum equals base + k * stride under long wrap-around arithmetic.
   long combined = base;
   for (int k = 0; k < buckets.length; k++) {
     buckets[k] = Math.abs(combined % max);
     combined += stride;
   }
   return buckets;
 }
 // Murmur is faster than an SHA-based approach and provides as-good collision
 // resistance.  The combinatorial generation approach described in
 // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
 // does prove to work in actual tests, and is obviously faster
 // than performing further iterations of murmur.
 /**
  * Computes {@code hashCount} bucket values for the remaining bytes of a buffer.
  *
  * <p>Only two Murmur hashes are evaluated (the second is seeded with the first), both over
  * the range starting at {@code b.position()} and spanning {@code b.remaining()} bytes. The
  * i-th bucket is {@code Math.abs((hash1 + i * hash2) % max)}.
  *
  * @param b buffer holding the bytes to hash
  * @param hashCount number of bucket values to produce
  * @param max modulus applied to each combined hash
  * @return array of {@code hashCount} bucket values
  */
 static long[] getHashBuckets(ByteBuffer b, int hashCount, long max) {
   final long[] buckets = new long[hashCount];
   final long base = MurmurHash.hash64(b, b.position(), b.remaining(), 0L);
   final long stride = MurmurHash.hash64(b, b.position(), b.remaining(), base);
   // A running sum equals base + k * stride under long wrap-around arithmetic.
   long combined = base;
   for (int k = 0; k < buckets.length; k++) {
     buckets[k] = Math.abs(combined % max);
     combined += stride;
   }
   return buckets;
 }
Example #3
0
 /**
  * Returns the shared hash function instance for a predefined hash type.
  *
  * @param type predefined hash type constant ({@code JENKINS_HASH} or {@code MURMUR_HASH})
  * @return the matching hash function instance, or {@code null} if {@code type} is not one
  *     of the recognized constants
  */
 public static Hash getInstance(int type) {
   if (type == JENKINS_HASH) {
     return JenkinsHash.getInstance();
   }
   if (type == MURMUR_HASH) {
     return MurmurHash.getInstance();
   }
   // Unknown type: callers are expected to handle null.
   return null;
 }
Example #4
0
 /** Checks the seeded 64-bit hash of every sample input against {@code results64_seed}. */
 @Test
 public void testHash64ByteArrayIntInt() {
   int idx = 0;
   while (idx < input.length) {
     final long actual = MurmurHash.hash64(input[idx], input[idx].length, 0x344d1f5c);
     final String failureMessage =
         String.format(
             "Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x",
             idx, actual, results64_seed[idx]);
     final boolean matches = actual == results64_seed[idx];
     assertTrue(failureMessage, matches);
     idx++;
   }
 }
  /**
   * Vectorizes text into a term-frequency map keyed by the 64-bit MurmurHash of each token.
   *
   * <p>Every token produced by the analyzer is hashed with {@code MurmurHash.hash64}. The
   * hash-to-word association is recorded in the shared {@code DocWordHashMap} the first time a
   * hash is seen, and the occurrence count per hash is accumulated in the returned map. If
   * tokenizing or closing the stream fails with an {@code IOException}, the stack trace is
   * printed and the returned map is emptied.
   *
   * @param analyzer the analyzer used to tokenize the text
   * @param field the Lucene field name
   * @param content the text content
   * @return a sorted map from token hash to occurrence count; empty if an I/O error occurred
   */
  public Map<Long, Integer> vectorize(Analyzer analyzer, String field, String content) {

    Map<Long, Integer> map = new TreeMap<Long, Integer>();
    DocWordHashMap wordHash = DocWordHashMap.getInstance();
    TokenStream ts = null;
    try {
      ts = analyzer.tokenStream(field, content);

      // Reset the TokenStream before consuming it.
      ts.reset();

      // Obtain the term attribute once and reuse the reference for every token, as the
      // TokenStream consumer workflow prescribes, instead of looking it up per token.
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

      // Iterate over the tokens.
      while (ts.incrementToken()) {
        String word = term.toString();
        // Hash each token with MurmurHash.
        long hash = MurmurHash.hash64(word);
        if (!wordHash.isContainKey(hash)) {
          wordHash.setWordStringHash(hash, word);
        }
        // Single lookup: a missing key means this is the first occurrence.
        Integer seen = map.get(hash);
        map.put(hash, seen == null ? 1 : seen + 1);
      }

      // Perform end-of-stream operations, e.g. set the final offset.
      ts.end();
    } catch (IOException e) {
      // CorruptIndexException and LockObtainFailedException are IOException subtypes and
      // were handled identically, so one handler covers all three cases.
      e.printStackTrace();
      map.clear();
    } finally {
      // Release all resources held by the TokenStream.
      if (ts != null) {
        try {
          ts.close();
        } catch (IOException e) {
          e.printStackTrace();
          map.clear();
        }
      }
    }
    return map;
  }
Example #6
0
 /** Checks the seeded 32-bit hash of every sample input against {@code results32_seed}. */
 @Test
 public void testHash32ByteArrayIntInt() {
   for (int idx = 0; idx < input.length; idx++) {
     final int actual = MurmurHash.hash32(input[idx], input[idx].length, 0x71b4954d);
     if (actual == results32_seed[idx]) {
       continue;
     }
     final String failureMessage =
         String.format(
             "Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x",
             idx, actual, results32_seed[idx]);
     fail(failureMessage);
   }
 }
Example #7
0
  /**
   * Loads bloom filter meta data from file input.
   *
   * @param meta
   * @throws IllegalArgumentException
   */
  public ByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException {
    int version = meta.getInt();
    if (version != VERSION) throw new IllegalArgumentException("Bad version");

    this.byteSize = meta.getInt();
    this.hashCount = meta.getInt();
    this.keyCount = new AtomicInteger(meta.getInt());
    this.maxKeys = this.keyCount.intValue();

    this.hash = MurmurHash.getInstance();
    sanityCheck();

    allocBloom();
  }
Example #8
0
  /**
   * Creates an empty bloom filter sized for an expected key count and target error rate.
   *
   * <p>The bit size and hash function count are derived from {@code maxKeys} and
   * {@code errorRate}; the byte size is then rounded up to a multiple of
   * {@code 2^foldFactor} so the filter can be folded. After the fields are set,
   * {@code sanityCheck()} and {@code allocBloom()} are invoked.
   *
   * @param maxKeys expected number of keys to be inserted; must be positive
   * @param errorRate desired false positive rate
   * @param foldFactor the byte size is padded to a multiple of {@code 2^foldFactor}
   * @throws IllegalArgumentException if {@code maxKeys} is not positive
   */
  public ByteBloomFilter(int maxKeys, float errorRate, int foldFactor)
      throws IllegalArgumentException {
    // Reject a non-positive key count up front: zero would otherwise surface as a
    // division-by-zero instead of the declared IllegalArgumentException.
    if (maxKeys <= 0) {
      throw new IllegalArgumentException("maxKeys must be > 0");
    }

    /*
     * Bloom filters are very sensitive to the number of elements inserted
     * into them. For HBase, the number of entries depends on the size of
     * the data stored in the column. Currently the default region size is
     * 256MB, so entry count ~= 256MB / (average value size for column).
     * Despite this rule of thumb, there is no efficient way to calculate
     * the entry count after compactions. Therefore, it is often easier to
     * use a dynamic bloom filter that will add extra space instead of
     * allowing the error rate to grow.
     *
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/
     * BloomFilterSurvey.pdf )
     *
     * m denotes the number of bits in the Bloom filter (bitSize)
     * n denotes the number of elements inserted into the Bloom filter (maxKeys)
     * k represents the number of hash functions used (functionCount)
     * e represents the desired false positive rate for the bloom (errorRate)
     *
     * If we fix the error rate (e) and know the number of entries, then the
     * optimal bloom size is m = -(n * ln(e)) / (ln(2)^2) ~= n * ln(e) / ln(0.6185).
     *
     * The probability of false positives is minimized when k = (m / n) * ln(2).
     */
    int bitSize = (int) Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));
    // Divide in floating point: integer division would truncate the bits-per-key ratio
    // before it is scaled by ln(2), which can under-count the hash functions.
    int functionCount = (int) Math.ceil(Math.log(2) * ((double) bitSize / maxKeys));

    // increase byteSize so folding is possible
    int byteSize = (bitSize + 7) / 8;
    int mask = (1 << foldFactor) - 1;
    if ((mask & byteSize) != 0) {
      // Round up to the next multiple of 2^foldFactor.
      byteSize >>= foldFactor;
      ++byteSize;
      byteSize <<= foldFactor;
    }

    this.byteSize = byteSize;
    this.hashCount = functionCount;
    this.hash = MurmurHash.getInstance();
    this.keyCount = new AtomicInteger(0);
    this.maxKeys = maxKeys;

    sanityCheck();

    allocBloom();
  }
Example #9
0
 /** Checks {@code MurmurHash.hash64(String, int, int)} on the sample text against a known value. */
 @Test
 public void testHash64StringIntInt() {
   final long actual = MurmurHash.hash64(text, 2, text.length() - 4);
   final long expected = 0xa8b33145194985a2L;
   assertTrue(actual == expected);
 }
Example #10
0
 /** Checks {@code MurmurHash.hash64(String)} on the sample text against a known value. */
 @Test
 public void testHash64String() {
   final long actual = MurmurHash.hash64(text);
   final long expected = 0x0920e0c1b7eeb261L;
   assertTrue(actual == expected);
 }
Example #11
0
 /** Checks {@code MurmurHash.hash32(String, int, int)} on the sample text against a known value. */
 @Test
 public void testHash32StringIntInt() {
   final int actual = MurmurHash.hash32(text, 2, text.length() - 4);
   final int expected = 0x4d666d90;
   assertTrue(actual == expected);
 }
Example #12
0
 /** Checks {@code MurmurHash.hash32(String)} on the sample text against a known value. */
 @Test
 public void testHash32String() {
   final int actual = MurmurHash.hash32(text);
   final int expected = 0xb3bf597e;
   assertTrue(actual == expected);
 }