@Test public void testOrcHiveBloomFilterSerde() throws Exception { BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05); bloomFilterWrite.addString(TEST_STRING); assertTrue(bloomFilterWrite.testString(TEST_STRING)); OrcProto.BloomFilter.Builder bloomFilterBuilder = OrcProto.BloomFilter.newBuilder(); bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet())); bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions()); OrcProto.BloomFilter bloomFilter = bloomFilterBuilder.build(); OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance(); byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex); // Read through method InputStream inputStream = new ByteArrayInputStream(bytes); OrcMetadataReader metadataReader = new OrcMetadataReader(); List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream); assertEquals(bloomFilters.size(), 1); assertTrue(bloomFilters.get(0).testString(TEST_STRING)); assertFalse(bloomFilters.get(0).testString(TEST_STRING_NOT_WRITTEN)); assertEquals(bloomFilterWrite.getBitSize(), bloomFilters.get(0).getBitSize()); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions()); // Validate bit set assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet())); // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs) CodedInputStream input = CodedInputStream.newInstance(bytes); OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input); List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList(); assertEquals(bloomFilterList.size(), 1); OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0); // Validate contents of ORC bloom filter bit set assertTrue( Arrays.equals( Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet())); // hash 
functions assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions()); // bit size assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount()); }
/**
 * Adds {@code bloomFilter} to a copy of {@code bloomFilterIndex} and returns the serialized
 * bytes of the resulting index.
 *
 * <p>Asserts that the filter and the resulting index are initialized and that the index holds
 * exactly one bloom filter, so the supplied index is expected to contain none.
 *
 * @param bloomFilter the ORC bloom filter message to append
 * @param bloomFilterIndex the index message the filter is appended to
 * @return the serialized form of the populated index
 * @throws IOException if writing the message to the in-memory stream fails
 */
private static byte[] serializeBloomFilterToIndex(OrcProto.BloomFilter bloomFilter, OrcProto.BloomFilterIndex bloomFilterIndex)
        throws IOException
{
    assertTrue(bloomFilter.isInitialized());

    OrcProto.BloomFilterIndex populatedIndex = bloomFilterIndex.toBuilder()
            .addBloomFilter(bloomFilter)
            .build();
    assertTrue(populatedIndex.isInitialized());
    assertEquals(populatedIndex.getBloomFilterCount(), 1);

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    populatedIndex.writeTo(buffer);
    buffer.flush();
    return buffer.toByteArray();
}
/**
 * Builds a {@code HiveBloomFilter} from an ORC protobuf bloom filter message.
 *
 * @param emptyOrcBloomFilter the ORC bloom filter message to convert
 * @return a filter constructed from the message's bitset list, its size in bits, and its
 *         number of hash functions
 */
private static HiveBloomFilter toHiveBloomFilter(OrcProto.BloomFilter emptyOrcBloomFilter)
{
    // Size in bits: one 64-bit long per bitset entry
    int bitCount = emptyOrcBloomFilter.getBitsetCount() * Long.SIZE;

    return new HiveBloomFilter(
            emptyOrcBloomFilter.getBitsetList(),
            bitCount,
            emptyOrcBloomFilter.getNumHashFunctions());
}
/**
 * Converts a {@code BloomFilter} into its ORC protobuf message, copying the bit set words and
 * the number of hash functions.
 *
 * @param bloomFilter the filter to convert
 * @return the built {@code OrcProto.BloomFilter} message
 */
private static OrcProto.BloomFilter toOrcBloomFilter(BloomFilter bloomFilter)
{
    return OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(bloomFilter.getBitSet()))
            .setNumHashFunctions(bloomFilter.getNumHashFunctions())
            .build();
}