// Murmur is faster than an SHA-based approach and provides as-good collision // resistance. The combinatorial generation approach described in // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf // does prove to work in actual tests, and is obviously faster // than performing further iterations of murmur. static long[] getHashBuckets(byte[] b, int offset, int length, int hashCount, long max) { long[] result = new long[hashCount]; long hash1 = MurmurHash.hash64(b, offset, length, 0L); long hash2 = MurmurHash.hash64(b, offset, length, hash1); for (int i = 0; i < hashCount; ++i) { result[i] = Math.abs((hash1 + (long) i * hash2) % max); } return result; }
// Murmur is faster than an SHA-based approach and provides as-good collision // resistance. The combinatorial generation approach described in // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf // does prove to work in actual tests, and is obviously faster // than performing further iterations of murmur. static long[] getHashBuckets(ByteBuffer b, int hashCount, long max) { long[] result = new long[hashCount]; long hash1 = MurmurHash.hash64(b, b.position(), b.remaining(), 0L); long hash2 = MurmurHash.hash64(b, b.position(), b.remaining(), hash1); for (int i = 0; i < hashCount; ++i) { result[i] = Math.abs((hash1 + (long) i * hash2) % max); } return result; }
/**
 * Looks up the shared hash function instance for a predefined hash type.
 *
 * @param type one of the predefined hash type constants
 *             ({@code JENKINS_HASH} or {@code MURMUR_HASH})
 * @return the matching hash function instance, or {@code null} if
 *         {@code type} is not a recognized hash type
 */
public static Hash getInstance(int type) {
    if (type == JENKINS_HASH) {
        return JenkinsHash.getInstance();
    }
    if (type == MURMUR_HASH) {
        return MurmurHash.getInstance();
    }
    // Unknown type: callers are expected to handle null.
    return null;
}
/**
 * Checks the seeded 64-bit hash of every example input against the
 * corresponding entry of {@code results64_seed}.
 */
@Test
public void testHash64ByteArrayIntInt() {
    final int seed = 0x344d1f5c;
    for (int idx = 0; idx < input.length; idx++) {
        final long actual = MurmurHash.hash64(input[idx], input[idx].length, seed);
        final String message =
                String.format(
                        "Unexpected hash64 result for example %d: 0x%016x instead of 0x%016x",
                        idx, actual, results64_seed[idx]);
        assertTrue(message, actual == results64_seed[idx]);
    }
}
/** * 文本向量化 * * @param analyzer - 选择的分词器对象 * @param field - lucene域名 * @param content - 文本内容 */ public Map<Long, Integer> vectorize(Analyzer analyzer, String field, String content) { Map<Long, Integer> map = new TreeMap<Long, Integer>(); DocWordHashMap wordHash = DocWordHashMap.getInstance(); TokenStream ts = null; try { ts = analyzer.tokenStream(field, content); // 迭代获取分词结果 // 重置TokenStream ts.reset(); while (ts.incrementToken()) { String word = ts.addAttribute(CharTermAttribute.class).toString(); // 逐个MurmurHash词元 long hash = MurmurHash.hash64(word); if (!wordHash.isContainKey(hash)) { wordHash.setWordStringHash(hash, word); } if (!map.containsKey(hash)) { map.put(hash, 1); } else { map.put(hash, map.get(hash) + 1); } } // 关闭TokenStream ts.end(); // Perform end-of-stream operations, e.g. set the final // offset. } catch (CorruptIndexException e) { e.printStackTrace(); map.clear(); } catch (LockObtainFailedException e) { e.printStackTrace(); map.clear(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); map.clear(); } finally { // 释放TokenStream的所有资源 if (ts != null) { try { ts.close(); } catch (IOException e) { e.printStackTrace(); map.clear(); } } } return map; }
/**
 * Checks the seeded 32-bit hash of every example input against the
 * corresponding entry of {@code results32_seed}, failing on the first
 * mismatch.
 */
@Test
public void testHash32ByteArrayIntInt() {
    final int seed = 0x71b4954d;
    for (int idx = 0; idx < input.length; idx++) {
        final int actual = MurmurHash.hash32(input[idx], input[idx].length, seed);
        if (actual == results32_seed[idx]) {
            continue;
        }
        fail(
                String.format(
                        "Unexpected hash32 result for example %d: 0x%08x instead of 0x%08x",
                        idx, actual, results32_seed[idx]));
    }
}
/** * Loads bloom filter meta data from file input. * * @param meta * @throws IllegalArgumentException */ public ByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException { int version = meta.getInt(); if (version != VERSION) throw new IllegalArgumentException("Bad version"); this.byteSize = meta.getInt(); this.hashCount = meta.getInt(); this.keyCount = new AtomicInteger(meta.getInt()); this.maxKeys = this.keyCount.intValue(); this.hash = MurmurHash.getInstance(); sanityCheck(); allocBloom(); }
/**
 * Creates an empty bloom filter sized for an expected number of keys and a
 * target false positive rate.
 *
 * @param maxKeys    expected number of keys to insert; must be positive
 * @param errorRate  desired false positive rate
 * @param foldFactor the byte size is rounded up to a multiple of
 *                   {@code 2^foldFactor} so that folding is possible
 * @throws IllegalArgumentException if {@code maxKeys} is not positive
 *         (NOTE(review): {@code sanityCheck()} presumably rejects other
 *         invalid sizes with this exception as well; confirm)
 */
public ByteBloomFilter(int maxKeys, float errorRate, int foldFactor)
        throws IllegalArgumentException {
    // A non-positive key count cannot be sized; reject it up front with the
    // declared exception type rather than failing on a division by zero.
    if (maxKeys <= 0) {
        throw new IllegalArgumentException("maxKeys must be positive: " + maxKeys);
    }

    /*
     * Bloom filters are very sensitive to the number of elements inserted
     * into them. For HBase, the number of entries depends on the size of
     * the data stored in the column. Currently the default region size is
     * 256MB, so entry count ~= 256MB / (average value size for column).
     * Despite this rule of thumb, there is no efficient way to calculate
     * the entry count after compactions. Therefore, it is often easier to
     * use a dynamic bloom filter that will add extra space instead of
     * allowing the error rate to grow.
     *
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/
     * BloomFilterSurvey.pdf )
     *
     * m denotes the number of bits in the Bloom filter (bitSize)
     * n denotes the number of elements inserted into the Bloom filter (maxKeys)
     * k represents the number of hash functions used (functionCount)
     * e represents the desired false positive rate for the bloom (errorRate)
     *
     * If we fix the error rate (e) and know the number of entries, then the
     * optimal bloom size is
     *   m = -(n * ln(e)) / (ln(2)^2) ~= n * ln(e) / ln(0.6185)
     *
     * The probability of false positives is minimized when k = (m / n) * ln(2).
     */
    int bitSize = (int) Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));
    // Divide in floating point: an integer m / n would be truncated before
    // the multiplication and could yield one hash function too few.
    int functionCount = (int) Math.ceil(Math.log(2) * ((double) bitSize / maxKeys));

    // Increase byteSize so folding is possible: round up to the next
    // multiple of 2^foldFactor unless it already is one.
    int byteSize = (bitSize + 7) / 8;
    int mask = (1 << foldFactor) - 1;
    if ((mask & byteSize) != 0) {
        byteSize >>= foldFactor;
        ++byteSize;
        byteSize <<= foldFactor;
    }

    this.byteSize = byteSize;
    this.hashCount = functionCount;
    this.hash = MurmurHash.getInstance();
    this.keyCount = new AtomicInteger(0);
    this.maxKeys = maxKeys;

    sanityCheck();
    allocBloom();
}
/**
 * Checks the 64-bit hash of {@code text} when called with the range
 * arguments {@code 2} and {@code text.length() - 4}.
 */
@Test
public void testHash64StringIntInt() {
    final long expected = 0xa8b33145194985a2L;
    final long actual = MurmurHash.hash64(text, 2, text.length() - 4);
    assertTrue(actual == expected);
}
/** Checks the 64-bit hash of the whole {@code text} fixture. */
@Test
public void testHash64String() {
    final long expected = 0x0920e0c1b7eeb261L;
    final long actual = MurmurHash.hash64(text);
    assertTrue(actual == expected);
}
/**
 * Checks the 32-bit hash of {@code text} when called with the range
 * arguments {@code 2} and {@code text.length() - 4}.
 */
@Test
public void testHash32StringIntInt() {
    final int expected = 0x4d666d90;
    final int actual = MurmurHash.hash32(text, 2, text.length() - 4);
    assertTrue(actual == expected);
}
/** Checks the 32-bit hash of the whole {@code text} fixture. */
@Test
public void testHash32String() {
    final int expected = 0xb3bf597e;
    final int actual = MurmurHash.hash32(text);
    assertTrue(actual == expected);
}