コード例 #1
0
  /**
   * Populates a {@link BloomFilter} with one string and one integral value, rebuilds a
   * {@link HiveBloomFilter} from the source filter's bit set, bit size and hash-function count,
   * and asserts that the rebuilt filter reports the same membership results as the source.
   */
  @Test
  public void testHiveBloomFilterSerde() throws Exception {
    BloomFilter source = new BloomFilter(1_000_000L, 0.05);

    // Source filter: the written string is found, the unwritten one is not.
    source.addString(TEST_STRING);
    assertTrue(source.testString(TEST_STRING));
    assertFalse(source.testString(TEST_STRING_NOT_WRITTEN));

    // Source filter: the written integer is found, its successor is not.
    source.addLong(TEST_INTEGER);
    assertTrue(source.testLong(TEST_INTEGER));
    assertFalse(source.testLong(TEST_INTEGER + 1));

    // Rebuild from the three pieces of state exposed by the source filter.
    long[] sourceBits = source.getBitSet();
    HiveBloomFilter rebuilt =
        new HiveBloomFilter(
            ImmutableList.copyOf(Longs.asList(sourceBits)),
            source.getBitSize(),
            source.getNumHashFunctions());

    // Rebuilt filter must give the same answers for the string probes...
    assertTrue(rebuilt.testString(TEST_STRING));
    assertFalse(rebuilt.testString(TEST_STRING_NOT_WRITTEN));

    // ...and for the integer probes.
    assertTrue(rebuilt.testLong(TEST_INTEGER));
    assertFalse(rebuilt.testLong(TEST_INTEGER + 1));
  }
コード例 #2
0
  /**
   * Writes a {@link BloomFilter} into an {@code OrcProto.BloomFilter} message, serializes it via
   * {@code serializeBloomFilterToIndex}, and verifies the bytes two ways: once through
   * {@link OrcMetadataReader#readBloomFilterIndexes} and once by parsing the protobuf directly.
   */
  @Test
  public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter written = new BloomFilter(1000L, 0.05);

    written.addString(TEST_STRING);
    assertTrue(written.testString(TEST_STRING));

    // Copy the written filter's state into the ORC protobuf message.
    long[] writtenBits = written.getBitSet();
    OrcProto.BloomFilter protoFilter =
        OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(writtenBits))
            .setNumHashFunctions(written.getNumHashFunctions())
            .build();

    // NOTE(review): presumably the helper wraps the single filter into the (empty) index and
    // returns its serialized form -- confirm against serializeBloomFilterToIndex.
    OrcProto.BloomFilterIndex emptyIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] bytes = serializeBloomFilterToIndex(protoFilter, emptyIndex);

    // Path 1: read the bytes back through the metadata reader.
    OrcMetadataReader reader = new OrcMetadataReader();
    InputStream serialized = new ByteArrayInputStream(bytes);
    List<HiveBloomFilter> readFilters = reader.readBloomFilterIndexes(serialized);

    assertEquals(readFilters.size(), 1);

    HiveBloomFilter readFilter = readFilters.get(0);
    assertTrue(readFilter.testString(TEST_STRING));
    assertFalse(readFilter.testString(TEST_STRING_NOT_WRITTEN));

    assertEquals(written.getBitSize(), readFilter.getBitSize());
    assertEquals(written.getNumHashFunctions(), readFilter.getNumHashFunctions());

    // The raw bit set must survive the round trip unchanged.
    assertTrue(Arrays.equals(readFilter.getBitSet(), writtenBits));

    // Path 2: parse the protobuf directly, which exposes the raw bit set for easier inspection.
    OrcProto.BloomFilterIndex parsedIndex =
        OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(bytes));
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);

    OrcProto.BloomFilter parsedFilter = parsedFilters.get(0);

    // Bit set contents of the parsed ORC message match what was written.
    assertTrue(Arrays.equals(Longs.toArray(parsedFilter.getBitsetList()), writtenBits));

    // Number of hash functions matches.
    assertEquals(written.getNumHashFunctions(), parsedFilter.getNumHashFunctions());

    // Number of long words in the bit set matches.
    assertEquals(writtenBits.length, parsedFilter.getBitsetCount());
  }