Пример #1
0
  /**
   * Populates a {@code BloomFilter} with one string and one integer, rebuilds it as a
   * {@code HiveBloomFilter} from its bit set, bit size and hash-function count, and checks that
   * the rebuilt filter gives the same membership answers as the source filter.
   */
  @Test
  public void testHiveBloomFilterSerde() throws Exception {
    BloomFilter source = new BloomFilter(1_000_000L, 0.05);

    // A written string must be found; a string that was never written must not.
    source.addString(TEST_STRING);
    assertTrue(source.testString(TEST_STRING));
    assertFalse(source.testString(TEST_STRING_NOT_WRITTEN));

    // Same check for an integer value and its immediate neighbour.
    source.addLong(TEST_INTEGER);
    assertTrue(source.testLong(TEST_INTEGER));
    assertFalse(source.testLong(TEST_INTEGER + 1));

    // Rebuild the filter purely from the state exposed by the source filter.
    ImmutableList<Long> bitSetWords = ImmutableList.copyOf(Longs.asList(source.getBitSet()));
    HiveBloomFilter rebuilt =
        new HiveBloomFilter(bitSetWords, source.getBitSize(), source.getNumHashFunctions());

    // The rebuilt filter must answer the string probes identically.
    assertTrue(rebuilt.testString(TEST_STRING));
    assertFalse(rebuilt.testString(TEST_STRING_NOT_WRITTEN));

    // ... and the integer probes as well.
    assertTrue(rebuilt.testLong(TEST_INTEGER));
    assertFalse(rebuilt.testLong(TEST_INTEGER + 1));
  }
Пример #2
0
  /**
   * Adds every key of {@code TEST_VALUES} to a bloom filter and verifies that
   * {@code checkInBloomFilter} reports each of them as matched for its associated type. Also
   * verifies that a {@code DATE} probe, which this test treats as unsupported, is reported as
   * matched.
   */
  @Test
  public void testBloomFilterPredicateValuesExisting() throws Exception {
    BloomFilter bloomFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01);

    // Insert each key through the add method that corresponds to its runtime class.
    for (Object o : TEST_VALUES.keySet()) {
      if (o instanceof Long) {
        bloomFilter.addLong((Long) o);
      } else if (o instanceof Integer) {
        bloomFilter.addLong((Integer) o);
      } else if (o instanceof String) {
        bloomFilter.addString((String) o);
      } else if (o instanceof BigDecimal) {
        // Decimals are stored through their string form.
        bloomFilter.addString(o.toString());
      } else if (o instanceof Slice) {
        bloomFilter.addString(((Slice) o).toStringUtf8());
      } else if (o instanceof Timestamp) {
        // Timestamps are stored as the long returned by getTime().
        bloomFilter.addLong(((Timestamp) o).getTime());
      } else if (o instanceof Double) {
        bloomFilter.addDouble((Double) o);
      } else {
        fail("Unsupported type " + o.getClass());
      }
    }

    for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
      boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue());
      // Report the class of the probed value; testValue itself is a Map.Entry, whose class
      // would say nothing about which value failed.
      assertTrue(matched, "type " + testValue.getKey().getClass());
    }

    // test unsupported type: can be supported by ORC but is not implemented yet
    assertTrue(
        checkInBloomFilter(bloomFilter, new Date(), DATE),
        "unsupported type DATE should always return true");
  }
Пример #3
0
 /**
  * Builds an {@code OrcProto.BloomFilter} message from the given bloom filter by copying its
  * bit set words and its number of hash functions.
  *
  * @param bloomFilter the filter whose bit set and hash-function count are copied
  * @return the built protobuf message
  */
 private static OrcProto.BloomFilter toOrcBloomFilter(BloomFilter bloomFilter) {
   return OrcProto.BloomFilter.newBuilder()
       .addAllBitset(Longs.asList(bloomFilter.getBitSet()))
       .setNumHashFunctions(bloomFilter.getNumHashFunctions())
       .build();
 }
Пример #4
0
  /**
   * Serializes a bloom filter into an ORC bloom filter index and verifies it twice: once by
   * reading it back through {@code OrcMetadataReader#readBloomFilterIndexes}, and once by parsing
   * the raw bytes as an {@code OrcProto.BloomFilterIndex}.
   *
   * <p>NOTE(review): the assertions below use the (actual, expected) argument order, which is
   * what the literal-expected calls in this test imply (TestNG convention) — confirm against the
   * file's static imports.
   */
  @Test
  public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);

    bloomFilterWrite.addString(TEST_STRING);
    assertTrue(bloomFilterWrite.testString(TEST_STRING));

    OrcProto.BloomFilter.Builder bloomFilterBuilder = OrcProto.BloomFilter.newBuilder();
    bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet()));
    bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions());

    OrcProto.BloomFilter bloomFilter = bloomFilterBuilder.build();
    OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex);

    // Read through method
    InputStream inputStream = new ByteArrayInputStream(bytes);
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);

    assertEquals(bloomFilters.size(), 1);
    HiveBloomFilter bloomFilterFromReader = bloomFilters.get(0);

    assertTrue(bloomFilterFromReader.testString(TEST_STRING));
    assertFalse(bloomFilterFromReader.testString(TEST_STRING_NOT_WRITTEN));

    // The written filter is the expected side; the filter read back is the value under test.
    assertEquals(bloomFilterFromReader.getBitSize(), bloomFilterWrite.getBitSize());
    assertEquals(
        bloomFilterFromReader.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());

    // Validate bit set
    assertTrue(Arrays.equals(bloomFilterFromReader.getBitSet(), bloomFilterWrite.getBitSet()));

    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bytes);
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex =
        OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);

    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);

    // Validate contents of ORC bloom filter bit set
    assertTrue(
        Arrays.equals(
            Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));

    // hash functions
    assertEquals(bloomFilterRead.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());

    // number of words in the bit set (array length, not the bit size)
    assertEquals(bloomFilterRead.getBitsetCount(), bloomFilterWrite.getBitSet().length);
  }