@Test public void testHiveBloomFilterSerde() throws Exception { BloomFilter bloomFilter = new BloomFilter(1_000_000L, 0.05); // String bloomFilter.addString(TEST_STRING); assertTrue(bloomFilter.testString(TEST_STRING)); assertFalse(bloomFilter.testString(TEST_STRING_NOT_WRITTEN)); // Integer bloomFilter.addLong(TEST_INTEGER); assertTrue(bloomFilter.testLong(TEST_INTEGER)); assertFalse(bloomFilter.testLong(TEST_INTEGER + 1)); // Re-construct HiveBloomFilter hiveBloomFilter = new HiveBloomFilter( ImmutableList.copyOf(Longs.asList(bloomFilter.getBitSet())), bloomFilter.getBitSize(), bloomFilter.getNumHashFunctions()); // String assertTrue(hiveBloomFilter.testString(TEST_STRING)); assertFalse(hiveBloomFilter.testString(TEST_STRING_NOT_WRITTEN)); // Integer assertTrue(hiveBloomFilter.testLong(TEST_INTEGER)); assertFalse(hiveBloomFilter.testLong(TEST_INTEGER + 1)); }
@Test public void testBloomFilterPredicateValuesExisting() throws Exception { BloomFilter bloomFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01); for (Object o : TEST_VALUES.keySet()) { if (o instanceof Long) { bloomFilter.addLong((Long) o); } else if (o instanceof Integer) { bloomFilter.addLong((Integer) o); } else if (o instanceof String) { bloomFilter.addString((String) o); } else if (o instanceof BigDecimal) { bloomFilter.addString(o.toString()); } else if (o instanceof Slice) { bloomFilter.addString(((Slice) o).toStringUtf8()); } else if (o instanceof Timestamp) { bloomFilter.addLong(((Timestamp) o).getTime()); } else if (o instanceof Double) { bloomFilter.addDouble((Double) o); } else { fail("Unsupported type " + o.getClass()); } } for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) { boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue()); assertTrue(matched, "type " + testValue.getClass()); } // test unsupported type: can be supported by ORC but is not implemented yet assertTrue( checkInBloomFilter(bloomFilter, new Date(), DATE), "unsupported type DATE should always return true"); }
/**
 * Converts a {@code BloomFilter} into its {@code OrcProto.BloomFilter} message form,
 * copying the bit set and the number of hash functions.
 *
 * @param bloomFilter the filter whose bit set and hash-function count are copied
 * @return the built protobuf message
 */
private static OrcProto.BloomFilter toOrcBloomFilter(BloomFilter bloomFilter) {
    return OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(bloomFilter.getBitSet()))
            .setNumHashFunctions(bloomFilter.getNumHashFunctions())
            .build();
}
@Test public void testOrcHiveBloomFilterSerde() throws Exception { BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05); bloomFilterWrite.addString(TEST_STRING); assertTrue(bloomFilterWrite.testString(TEST_STRING)); OrcProto.BloomFilter.Builder bloomFilterBuilder = OrcProto.BloomFilter.newBuilder(); bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet())); bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions()); OrcProto.BloomFilter bloomFilter = bloomFilterBuilder.build(); OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance(); byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex); // Read through method InputStream inputStream = new ByteArrayInputStream(bytes); OrcMetadataReader metadataReader = new OrcMetadataReader(); List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream); assertEquals(bloomFilters.size(), 1); assertTrue(bloomFilters.get(0).testString(TEST_STRING)); assertFalse(bloomFilters.get(0).testString(TEST_STRING_NOT_WRITTEN)); assertEquals(bloomFilterWrite.getBitSize(), bloomFilters.get(0).getBitSize()); assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions()); // Validate bit set assertTrue(Arrays.equals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet())); // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs) CodedInputStream input = CodedInputStream.newInstance(bytes); OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input); List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList(); assertEquals(bloomFilterList.size(), 1); OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0); // Validate contents of ORC bloom filter bit set assertTrue( Arrays.equals( Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet())); // hash 
functions assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions()); // bit size assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount()); }