public void testShortNameCollisionsDirectNew() throws IOException {
    // Registers one single-character name per char value in [0, 700) with a
    // child of a root created via createRoot(333), then pins the exact
    // primary/secondary/tertiary/spill-over distribution.
    final int nameCount = 700;
    final ByteQuadsCanonicalizer table =
        ByteQuadsCanonicalizer.createRoot(333).makeChild(JsonFactory.Feature.collectDefaults());

    int code = 0;
    while (code < nameCount) {
      final String name = String.valueOf((char) code);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      table.addName(name, quads, quads.length);
      ++code;
    }

    assertEquals(nameCount, table.size());
    assertEquals(1024, table.bucketCount());

    // Primary is good, but secondary spills cluster in nasty way...
    assertEquals(564, table.primaryCount());
    assertEquals(122, table.secondaryCount());
    assertEquals(14, table.tertiaryCount());
    assertEquals(0, table.spilloverCount());

    // Cross-check: the four counters together must account for every name.
    final int placed =
        table.primaryCount()
            + table.secondaryCount()
            + table.tertiaryCount()
            + table.spilloverCount();
    assertEquals(nameCount, placed);
  }
  // Pins the slot distribution obtained when 12000 synthetic field names
  // (from fieldNameFor) are added as UTF-8 bytes, so that an accidental
  // increase in collisions shows up as a test failure.
  public void testSyntheticWithBytesNew() throws IOException {
    // fixed seed keeps the results reproducible
    final int seed = 33333;
    final int nameCount = 12000;

    final ByteQuadsCanonicalizer root = ByteQuadsCanonicalizer.createRoot(seed);
    final ByteQuadsCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    for (int index = 0; index != nameCount; index++) {
      final String name = fieldNameFor(index);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      table.addName(name, quads, quads.length);
    }

    assertEquals(nameCount, table.size());
    assertEquals(16384, table.bucketCount());

    // Exact values are fragile, but they are what guards the low collision rate:
    // 8534 of 12000 (about 71%) land in primary slots,
    assertEquals(8534, table.primaryCount());
    // 2534 (about 21%) in secondary,
    assertEquals(2534, table.secondaryCount());
    // the remaining 932 (about 8%) in tertiary,
    assertEquals(932, table.tertiaryCount());
    // which leaves nothing for spill-over.
    assertEquals(0, table.spilloverCount());
  }
  // Runs 100 "documents" of 250 distinct names each against the deprecated
  // byte-based table. Every document gets its own child of a shared root; the
  // child is released afterwards and the size reported by the root is checked.
  @SuppressWarnings("deprecation")
  public void testThousandsOfSymbolsWithOldBytes() throws IOException {
    final int seed = 33333;
    final int docCount = 100;
    final int namesPerDoc = 250;
    final Charset utf8 = Charset.forName("UTF-8");
    final BytesToNameCanonicalizer root = BytesToNameCanonicalizer.createRoot(seed);

    // Size the root is expected to report after each release.
    int expectedRootSize = 0;

    for (int doc = 0; doc < docCount; ++doc) {
      final BytesToNameCanonicalizer child =
          root.makeChild(JsonFactory.Feature.collectDefaults());
      final String prefix = "f_" + doc + "_";

      for (int i = 0; i < namesPerDoc; ++i) {
        final String name = prefix + i;
        final int[] quads = BytesToNameCanonicalizer.calcQuads(name.getBytes(utf8));
        child.addName(name, quads, quads.length);
        // a name that was just added must be retrievable via the same quads
        assertEquals(name, child.findName(quads, quads.length).getName());
      }
      child.release();

      // Expectation grows by one document's worth of names, and starts over
      // from zero once it would exceed MAX_ENTRIES_FOR_REUSE.
      final int grown = expectedRootSize + namesPerDoc;
      expectedRootSize = (grown > BytesToNameCanonicalizer.MAX_ENTRIES_FOR_REUSE) ? 0 : grown;
      assertEquals(expectedRootSize, root.size());
    }
  }
  // [core#191]
  // Pins collision statistics for 600 single-character names, first with the
  // char-based table and then with the deprecated byte-based table.
  @SuppressWarnings("deprecation")
  public void testShortNameCollisionsDirect() throws IOException {
    final int nameCount = 600;

    // Part 1: char-based table; findSymbol is called only to register the name
    final CharsToNameCanonicalizer charTable = CharsToNameCanonicalizer.createRoot(1);
    for (int code = 0; code < nameCount; ++code) {
      final String name = String.valueOf((char) code);
      final char[] chars = name.toCharArray();
      final int hash = charTable.calcHash(name);
      charTable.findSymbol(chars, 0, chars.length, hash);
    }
    assertEquals(nameCount, charTable.size());
    assertEquals(1024, charTable.bucketCount());
    assertEquals(16, charTable.collisionCount());
    assertEquals(1, charTable.maxCollisionLength());

    // Part 2: byte-based table, fed with the UTF-8 encoding of the same names
    final BytesToNameCanonicalizer byteTable =
        BytesToNameCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults());
    for (int code = 0; code < nameCount; ++code) {
      final String name = String.valueOf((char) code);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      byteTable.addName(name, quads, quads.length);
    }
    assertEquals(nameCount, byteTable.size());
    assertEquals(1024, byteTable.bucketCount());
    assertEquals(209, byteTable.collisionCount());
    assertEquals(1, byteTable.maxCollisionLength());
  }
  // Pins collision statistics of the deprecated byte-based table for 400
  // names that are textual unicode escapes: a backslash, 'u' and four
  // lowercase hex digits.
  @SuppressWarnings("deprecation")
  public void testShortQuotedDirectBytesOld() throws IOException {
    final int nameCount = 400;
    final BytesToNameCanonicalizer root = BytesToNameCanonicalizer.createRoot(1);
    final BytesToNameCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    int value = 0;
    while (value < nameCount) {
      final String name = String.format("\\u%04x", value);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = BytesToNameCanonicalizer.calcQuads(encoded);
      table.addName(name, quads, quads.length);
      ++value;
    }

    assertEquals(nameCount, table.size());
    assertEquals(1024, table.bucketCount());

    // expected collision statistics for this input
    assertEquals(44, table.collisionCount());
    assertEquals(2, table.maxCollisionLength());
  }
  // Pins the slot distribution of the quad-based table for 400 names that
  // are textual unicode escapes: a backslash, 'u' and four lowercase hex
  // digits.
  public void testShortQuotedDirectBytes() throws IOException {
    final int nameCount = 400;
    final ByteQuadsCanonicalizer root = ByteQuadsCanonicalizer.createRoot(123);
    final ByteQuadsCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    int value = 0;
    while (value < nameCount) {
      final String name = String.format("\\u%04x", value);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      table.addName(name, quads, quads.length);
      ++value;
    }

    assertEquals(nameCount, table.size());
    assertEquals(512, table.bucketCount());

    // 285 + 90 + 25 = 400: every name is placed without needing spill-over
    assertEquals(285, table.primaryCount());
    assertEquals(90, table.secondaryCount());
    assertEquals(25, table.tertiaryCount());
    assertEquals(0, table.spilloverCount());
  }
  // [core#187]: unexpectedly high number of collisions for straight numbers
  // Adds the decimal strings "10000" .. "39999" to the deprecated byte-based
  // table and pins the resulting collision statistics.
  @SuppressWarnings("deprecation")
  public void testCollisionsWithBytes187() throws IOException {
    final int nameCount = 30000;
    final int firstNumber = 10000;

    final BytesToNameCanonicalizer root = BytesToNameCanonicalizer.createRoot(1);
    final BytesToNameCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    for (int offset = 0; offset < nameCount; ++offset) {
      final String name = String.valueOf(firstNumber + offset);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = BytesToNameCanonicalizer.calcQuads(encoded);
      table.addName(name, quads, quads.length);
    }

    assertEquals(nameCount, table.size());
    assertEquals(65536, table.bucketCount());

    // total number of collisions is acceptable
    assertEquals(5782, table.collisionCount());
    // and so is the maximum collision length
    assertEquals(24, table.maxCollisionLength());
  }
  // [Issue#145]
  // Runs 100 "documents" of 250 distinct names each against the char-based
  // table. Every document gets its own child of a shared root; the child is
  // released afterwards and the size reported by the root is checked.
  public void testThousandsOfSymbolsWithChars() throws IOException {
    final int SEED = 33333;

    CharsToNameCanonicalizer symbolsCRoot = CharsToNameCanonicalizer.createRoot(SEED);
    // Size the root is expected to report after each release.
    int exp = 0;

    for (int doc = 0; doc < 100; ++doc) {
      CharsToNameCanonicalizer symbolsC =
          symbolsCRoot.makeChild(JsonFactory.Feature.collectDefaults());
      for (int i = 0; i < 250; ++i) {
        String name = "f_" + doc + "_" + i;
        char[] ch = name.toCharArray();
        String str = symbolsC.findSymbol(ch, 0, ch.length, symbolsC.calcHash(name));
        // Check the returned text itself: a non-null but wrong symbol would
        // slip through a plain null check.
        assertEquals(name, str);
      }
      symbolsC.release();
      // Expectation grows by 250 per document and starts over from zero once
      // it exceeds MAX_ENTRIES_FOR_REUSE.
      exp += 250;
      if (exp > CharsToNameCanonicalizer.MAX_ENTRIES_FOR_REUSE) {
        exp = 0;
      }
      assertEquals(exp, symbolsCRoot.size());
    }
  }
  // Since 2.6
  // Runs 100 "documents" of 250 distinct names each against the quad-based
  // table. Every document gets its own child of a shared root; the child is
  // released afterwards and the size reported by the root is checked.
  // Finally the distribution inside the last child is pinned.
  public void testThousandsOfSymbolsWithNew() throws IOException {
    final int seed = 33333;
    final int docCount = 100;
    final int namesPerDoc = 250;
    final Charset utf8 = Charset.forName("UTF-8");
    final ByteQuadsCanonicalizer root = ByteQuadsCanonicalizer.createRoot(seed);

    // Size the root is expected to report after each release.
    int expectedRootSize = 0;
    // Kept outside the loop so the last child can be inspected afterwards.
    ByteQuadsCanonicalizer lastChild = null;

    for (int doc = 0; doc < docCount; ++doc) {
      lastChild = root.makeChild(JsonFactory.Feature.collectDefaults());
      final String prefix = "f_" + doc + "_";

      for (int i = 0; i < namesPerDoc; ++i) {
        final String name = prefix + i;
        final int[] quads = calcQuads(name.getBytes(utf8));

        lastChild.addName(name, quads, quads.length);
        // a name that was just added must be retrievable via the same quads
        final String found = lastChild.findName(quads, quads.length);
        assertEquals(name, found);
      }
      lastChild.release();

      // Expectation grows by one document's worth of names, and starts over
      // from zero once it would exceed MAX_ENTRIES_FOR_REUSE.
      final int grown = expectedRootSize + namesPerDoc;
      expectedRootSize = (grown > ByteQuadsCanonicalizer.MAX_ENTRIES_FOR_REUSE) ? 0 : grown;
      assertEquals(expectedRootSize, root.size());
    }

    /* 05-Feb-2015, tatu: Fragile, but it is important to ensure that collision
     *   rates are not accidentally increased...
     */
    assertEquals(6250, lastChild.size());
    assertEquals(4761, lastChild.primaryCount()); // about 76% in primary
    assertEquals(1190, lastChild.secondaryCount()); // about 19% in secondary
    assertEquals(299, lastChild.tertiaryCount()); // about 5% in tertiary
    assertEquals(0, lastChild.spilloverCount()); // and nothing spilled over
  }
Example #10
0
  // Test for verifying stability of hashCode, wrt collisions, using
  // synthetic field name generation and byte-based input (UTF-8).
  // Besides pinning the collision statistics, it also checks that every
  // name added can be looked up again once all insertions are done.
  @SuppressWarnings("deprecation")
  public void testSyntheticWithBytesOld() throws IOException {
    // pass seed, to keep results consistent:
    final int SEED = 33333;
    BytesToNameCanonicalizer symbols =
        BytesToNameCanonicalizer.createRoot(SEED).makeChild(JsonFactory.Feature.collectDefaults());

    final int COUNT = 12000;
    // Remember exactly what was added so each entry can be looked up below.
    final String[] ids = new String[COUNT];
    final int[][] quadsById = new int[COUNT][];
    for (int i = 0; i < COUNT; ++i) {
      String id = fieldNameFor(i);
      int[] quads = calcQuads(id.getBytes("UTF-8"));
      symbols.addName(id, quads, quads.length);
      ids[i] = id;
      quadsById[i] = quads;
    }
    assertEquals(COUNT, symbols.size());
    assertEquals(16384, symbols.bucketCount());

    assertEquals(3476, symbols.collisionCount());
    // longest collision chain not optimal but ok:
    assertEquals(15, symbols.maxCollisionLength());

    // Also verify that entries are actually found, using the same quads that
    // were passed to addName.
    for (int i = 0; i < COUNT; ++i) {
      final int[] quads = quadsById[i];
      assertNotNull(
          "No entry found for name '" + ids[i] + "'", symbols.findName(quads, quads.length));
      assertEquals(ids[i], symbols.findName(quads, quads.length).getName());
    }
  }
Example #11
0
  // Another variant, but with 1-quad names: the decimal strings "0" .. "9999"
  public void testCollisionsWithBytesNew187b() throws IOException {
    final int nameCount = 10000;
    final ByteQuadsCanonicalizer root = ByteQuadsCanonicalizer.createRoot(1);
    final ByteQuadsCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    int number = 0;
    while (number < nameCount) {
      final String name = String.valueOf(number);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      table.addName(name, quads, quads.length);
      ++number;
    }

    assertEquals(nameCount, table.size());
    assertEquals(16384, table.bucketCount());

    // Exact values are fragile, but they are what guards the low collision rate.
    // Primary share is on the low side here, about 54%,
    assertEquals(5402, table.primaryCount());
    // secondary is higher than usual, above 25%,
    assertEquals(2744, table.secondaryCount());
    // most of the rest fits into tertiary,
    assertEquals(1834, table.tertiaryCount());
    // and a small remainder ends up in spill-over.
    assertEquals(20, table.spilloverCount());
  }
Example #12
0
  // [core#187]: unexpectedly high number of collisions for straight numbers
  // Adds the decimal strings "10000" .. "52999" to the quad-based table and
  // pins the resulting slot distribution.
  public void testCollisionsWithBytesNew187a() throws IOException {
    final int nameCount = 43000;
    final int firstNumber = 10000;

    final ByteQuadsCanonicalizer root = ByteQuadsCanonicalizer.createRoot(1);
    final ByteQuadsCanonicalizer table = root.makeChild(JsonFactory.Feature.collectDefaults());

    for (int offset = 0; offset < nameCount; ++offset) {
      final String name = String.valueOf(firstNumber + offset);
      final byte[] encoded = name.getBytes("UTF-8");
      final int[] quads = calcQuads(encoded);
      table.addName(name, quads, quads.length);
    }

    assertEquals(nameCount, table.size());
    assertEquals(65536, table.bucketCount());

    /* 29-Mar-2015, tatu: To get collision counts down for this
     *    test took quite a bit of tweaking...
     */
    // 32342 + 8863 + 1795 = 43000, i.e. every name fits in the first three counts
    assertEquals(32342, table.primaryCount());
    assertEquals(8863, table.secondaryCount());
    assertEquals(1795, table.tertiaryCount());

    // finally managed to get this to 0; other variants produced thousands
    assertEquals(0, table.spilloverCount());
  }