public void testShortNameCollisionsDirectNew() throws IOException { final int COUNT = 700; { ByteQuadsCanonicalizer symbols = ByteQuadsCanonicalizer.createRoot(333).makeChild(JsonFactory.Feature.collectDefaults()); for (int i = 0; i < COUNT; ++i) { String id = String.valueOf((char) i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(1024, symbols.bucketCount()); // Primary is good, but secondary spills cluster in nasty way... assertEquals(564, symbols.primaryCount()); assertEquals(122, symbols.secondaryCount()); assertEquals(14, symbols.tertiaryCount()); assertEquals(0, symbols.spilloverCount()); assertEquals( COUNT, symbols.primaryCount() + symbols.secondaryCount() + symbols.tertiaryCount() + symbols.spilloverCount()); } }
public void testSyntheticWithBytesNew() throws IOException { // pass seed, to keep results consistent: final int SEED = 33333; ByteQuadsCanonicalizer symbols = ByteQuadsCanonicalizer.createRoot(SEED).makeChild(JsonFactory.Feature.collectDefaults()); final int COUNT = 12000; for (int i = 0; i < COUNT; ++i) { String id = fieldNameFor(i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(16384, symbols.bucketCount()); // fragile, but essential to verify low collision counts; // anywhere between 70-80% primary matches assertEquals(8534, symbols.primaryCount()); // secondary between 10-20% assertEquals(2534, symbols.secondaryCount()); // and most of remaining in tertiary assertEquals(932, symbols.tertiaryCount()); // so that spill-over is empty or close to assertEquals(0, symbols.spilloverCount()); }
// Simulates 100 "documents" of 250 distinct names each against one shared
// BytesToNameCanonicalizer root: every document gets its own child table, each name
// is added and immediately looked up, and after release() the root size is checked
// against a running expectation that restarts from 0 once it exceeds
// MAX_ENTRIES_FOR_REUSE.
@SuppressWarnings("deprecation")
public void testThousandsOfSymbolsWithOldBytes() throws IOException {
    final int SEED = 33333;
    final int NAMES_PER_DOC = 250;
    final Charset utf8 = Charset.forName("UTF-8");
    final BytesToNameCanonicalizer root = BytesToNameCanonicalizer.createRoot(SEED);

    int expectedRootSize = 0;
    for (int docNr = 0; docNr < 100; ++docNr) {
        final BytesToNameCanonicalizer child =
                root.makeChild(JsonFactory.Feature.collectDefaults());

        for (int nameNr = 0; nameNr < NAMES_PER_DOC; ++nameNr) {
            final String name = "f_" + docNr + "_" + nameNr;
            final int[] q = BytesToNameCanonicalizer.calcQuads(name.getBytes(utf8));
            child.addName(name, q, q.length);

            // a freshly added name must be retrievable through the same quads
            final Name found = child.findName(q, q.length);
            assertEquals(name, found.getName());
        }
        child.release();

        final int grown = expectedRootSize + NAMES_PER_DOC;
        expectedRootSize = (grown > BytesToNameCanonicalizer.MAX_ENTRIES_FOR_REUSE) ? 0 : grown;
        assertEquals(expectedRootSize, root.size());
    }
}
// [core#191] @SuppressWarnings("deprecation") public void testShortNameCollisionsDirect() throws IOException { final int COUNT = 600; // First, char-based { CharsToNameCanonicalizer symbols = CharsToNameCanonicalizer.createRoot(1); for (int i = 0; i < COUNT; ++i) { String id = String.valueOf((char) i); char[] ch = id.toCharArray(); symbols.findSymbol(ch, 0, ch.length, symbols.calcHash(id)); } assertEquals(COUNT, symbols.size()); assertEquals(1024, symbols.bucketCount()); assertEquals(16, symbols.collisionCount()); assertEquals(1, symbols.maxCollisionLength()); } // then byte-based { BytesToNameCanonicalizer symbols = BytesToNameCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults()); for (int i = 0; i < COUNT; ++i) { String id = String.valueOf((char) i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(1024, symbols.bucketCount()); assertEquals(209, symbols.collisionCount()); assertEquals(1, symbols.maxCollisionLength()); } }
// Adds 400 six-character names (a backslash, the letter 'u' and the loop index as
// four lower-case hex digits) to a byte-based BytesToNameCanonicalizer child table
// and pins its size, bucket count and collision statistics.
@SuppressWarnings("deprecation")
public void testShortQuotedDirectBytesOld() throws IOException {
    final int COUNT = 400;
    final BytesToNameCanonicalizer table =
            BytesToNameCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults());

    int value = 0;
    while (value < COUNT) {
        final String name = String.format("\\u%04x", value);
        final byte[] utf8Bytes = name.getBytes("UTF-8");
        final int[] q = BytesToNameCanonicalizer.calcQuads(utf8Bytes);
        table.addName(name, q, q.length);
        ++value;
    }

    assertEquals(COUNT, table.size());
    assertEquals(1024, table.bucketCount());
    assertEquals(44, table.collisionCount());
    assertEquals(2, table.maxCollisionLength());
}
// Adds 400 six-character names (a backslash, the letter 'u' and the loop index as
// four lower-case hex digits) to a ByteQuadsCanonicalizer child table (root seed 123)
// and pins its size, bucket count and slot distribution.
public void testShortQuotedDirectBytes() throws IOException {
    final int COUNT = 400;
    final ByteQuadsCanonicalizer table =
            ByteQuadsCanonicalizer.createRoot(123).makeChild(JsonFactory.Feature.collectDefaults());

    int value = 0;
    while (value < COUNT) {
        final String name = String.format("\\u%04x", value);
        final byte[] utf8Bytes = name.getBytes("UTF-8");
        final int[] q = calcQuads(utf8Bytes);
        table.addName(name, q, q.length);
        ++value;
    }

    assertEquals(COUNT, table.size());
    assertEquals(512, table.bucketCount());

    // 285 + 90 + 25 + 0 accounts for all 400 entries
    assertEquals(285, table.primaryCount());
    assertEquals(90, table.secondaryCount());
    assertEquals(25, table.tertiaryCount());
    assertEquals(0, table.spilloverCount());
}
// [core#187]: unexpectedly high number of collisions for straight numbers @SuppressWarnings("deprecation") public void testCollisionsWithBytes187() throws IOException { BytesToNameCanonicalizer symbols = BytesToNameCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults()); final int COUNT = 30000; for (int i = 0; i < COUNT; ++i) { String id = String.valueOf(10000 + i); int[] quads = BytesToNameCanonicalizer.calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } // System.out.printf("Byte stuff: collisions %d, max-coll %d\n", symbols.collisionCount(), // symbols.maxCollisionLength()); assertEquals(COUNT, symbols.size()); assertEquals(65536, symbols.bucketCount()); // collision count acceptable assertEquals(5782, symbols.collisionCount()); // as well as collision counts assertEquals(24, symbols.maxCollisionLength()); }
// [Issue#145] public void testThousandsOfSymbolsWithChars() throws IOException { final int SEED = 33333; CharsToNameCanonicalizer symbolsCRoot = CharsToNameCanonicalizer.createRoot(SEED); int exp = 0; for (int doc = 0; doc < 100; ++doc) { CharsToNameCanonicalizer symbolsC = symbolsCRoot.makeChild(JsonFactory.Feature.collectDefaults()); for (int i = 0; i < 250; ++i) { String name = "f_" + doc + "_" + i; char[] ch = name.toCharArray(); String str = symbolsC.findSymbol(ch, 0, ch.length, symbolsC.calcHash(name)); assertNotNull(str); } symbolsC.release(); exp += 250; if (exp > CharsToNameCanonicalizer.MAX_ENTRIES_FOR_REUSE) { exp = 0; } assertEquals(exp, symbolsCRoot.size()); } }
// Since 2.6 public void testThousandsOfSymbolsWithNew() throws IOException { final int SEED = 33333; ByteQuadsCanonicalizer symbolsBRoot = ByteQuadsCanonicalizer.createRoot(SEED); final Charset utf8 = Charset.forName("UTF-8"); int exp = 0; ByteQuadsCanonicalizer symbolsB = null; // loop to get for (int doc = 0; doc < 100; ++doc) { symbolsB = symbolsBRoot.makeChild(JsonFactory.Feature.collectDefaults()); for (int i = 0; i < 250; ++i) { String name = "f_" + doc + "_" + i; int[] quads = calcQuads(name.getBytes(utf8)); symbolsB.addName(name, quads, quads.length); String n = symbolsB.findName(quads, quads.length); assertEquals(name, n); } symbolsB.release(); exp += 250; if (exp > ByteQuadsCanonicalizer.MAX_ENTRIES_FOR_REUSE) { exp = 0; } assertEquals(exp, symbolsBRoot.size()); } /* 05-Feb-2015, tatu: Fragile, but it is important to ensure that collision * rates are not accidentally increased... */ assertEquals(6250, symbolsB.size()); assertEquals(4761, symbolsB.primaryCount()); // 80% primary hit rate assertEquals(1190, symbolsB.secondaryCount()); // 13% secondary assertEquals(299, symbolsB.tertiaryCount()); // 7% tertiary assertEquals(0, symbolsB.spilloverCount()); // and couple of leftovers }
// Test for verifying stability of hashCode, wrt collisions, using // synthetic field name generation and byte-based input (UTF-8) @SuppressWarnings("deprecation") public void testSyntheticWithBytesOld() throws IOException { // pass seed, to keep results consistent: final int SEED = 33333; BytesToNameCanonicalizer symbols = BytesToNameCanonicalizer.createRoot(SEED).makeChild(JsonFactory.Feature.collectDefaults()); final int COUNT = 12000; for (int i = 0; i < COUNT; ++i) { String id = fieldNameFor(i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(16384, symbols.bucketCount()); // System.out.printf("Byte stuff: collisions %d, max-coll %d\n", symbols.collisionCount(), // symbols.maxCollisionLength()); assertEquals(3476, symbols.collisionCount()); // longest collision chain not optimal but ok: assertEquals(15, symbols.maxCollisionLength()); // But also verify entries are actually found? }
// Another variant, but with 1-quad names public void testCollisionsWithBytesNew187b() throws IOException { ByteQuadsCanonicalizer symbols = ByteQuadsCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults()); final int COUNT = 10000; for (int i = 0; i < COUNT; ++i) { String id = String.valueOf(i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(16384, symbols.bucketCount()); // fragile, but essential to verify low collision counts; // here bit low primary, 55% assertEquals(5402, symbols.primaryCount()); // secondary higher than usual, above 25% assertEquals(2744, symbols.secondaryCount()); // and most of remaining in tertiary assertEquals(1834, symbols.tertiaryCount()); // with a bit of spillover assertEquals(20, symbols.spilloverCount()); }
// [core#187]: unexpectedly high number of collisions for straight numbers public void testCollisionsWithBytesNew187a() throws IOException { ByteQuadsCanonicalizer symbols = ByteQuadsCanonicalizer.createRoot(1).makeChild(JsonFactory.Feature.collectDefaults()); final int COUNT = 43000; for (int i = 0; i < COUNT; ++i) { String id = String.valueOf(10000 + i); int[] quads = calcQuads(id.getBytes("UTF-8")); symbols.addName(id, quads, quads.length); } assertEquals(COUNT, symbols.size()); assertEquals(65536, symbols.bucketCount()); /* 29-Mar-2015, tatu: To get collision counts down for this * test took quite a bit of tweaking... */ assertEquals(32342, symbols.primaryCount()); assertEquals(8863, symbols.secondaryCount()); assertEquals(1795, symbols.tertiaryCount()); // finally managed to get this to 0; other variants produced thousands assertEquals(0, symbols.spilloverCount()); }