public SpillMap(SpillFile file, int thresholdBytes) throws IOException {
    this.thresholdBytes = thresholdBytes;
    this.spillFile = file;
    byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, spillFile);
    bFilters = Lists.newArrayList();
    bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000));
}
// Checks whether the current page has enough space to fit the new serialized tuple.
// If not, it flushes the current buffer and gets a new page.
// TODO: The code does not ensure that pages are optimally packed.
// It only tries to fill up the current page as much as possible; once it is
// exhausted, it requests a new page. Instead it would be nice to load the next
// page that could fit the new value.
private void ensureSpace(byte[] value) {
    if (!byteMap.canFit(value)) {
        // Flush current buffer
        byteMap.flushBuffer();
        // Get next page
        byteMap = MappedByteBufferMap.newMappedByteBufferMap(thresholdBytes, spillFile);
        // Create a new Bloom filter for the new page
        bFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(), 1000));
    }
}
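A possible read path for the pattern above (a sketch, not part of the original SpillMap): each page gets its own Bloom filter, so a lookup can skip pages whose filter rules the key out and only pay the page-load cost on a possible hit. searchPage is an assumed helper here, not real SpillMap API.

private byte[] getSketch(byte[] key) throws IOException {
    for (int page = 0; page < bFilters.size(); page++) {
        // Filter says the key is definitely absent from this page: skip the expensive load.
        if (!bFilters.get(page).mightContain(key)) {
            continue;
        }
        // Hypothetical helper; may still miss, since Bloom filters allow false positives.
        byte[] result = searchPage(page, key);
        if (result != null) {
            return result;
        }
    }
    return null;
}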
private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException {
    if (useBloomFilter) {
        redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS);
        redirects = new LRUCache<String, String>(5000) {
            protected String loadValue(String src) {
                String normalized = TitleNameIndexer.normalize(src);
                if (normalized == null)
                    return src;
                return normalized; // reuse the result instead of normalizing twice
            }
        };
    } else
        redirects = new StringMap<String>();
    if (showInitProgress)
        System.out.println(
            "Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    if (pathToEvaluationRedirectsData != null) {
        InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData);
        LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8);
        long linecount = 0;
        while (iterator.hasNext()) {
            String line = iterator.nextLine();
            if (showInitProgress && linecount++ % 100000 == 0)
                System.out.println("loading the latest redirects; linecount=" + linecount);
            String[] parts = StringUtils.split(line, '\t');
            String src = parts[0].trim().replace(' ', '_');
            String trg = parts[1].trim().replace(' ', '_');
            if (useBloomFilter)
                redirectFilter.put(src);
            else
                redirects.put(src, trg);
        }
        iterator.close();
    }
    redirects = Collections.unmodifiableMap(redirects);
    if (showInitProgress)
        System.out.println(
            "Done - Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
}
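A plausible consumer of redirectFilter (hypothetical; the original snippet shows only the constructor): the filter is probed first so that titles that were never inserted, and are therefore definitely not redirects, never touch the LRU cache or the index.

public String normalizeTitle(String title) {
    // mightContain == false means the title is definitely not a known redirect.
    if (useBloomFilter && !redirectFilter.mightContain(title))
        return title;
    String target = redirects.get(title); // may trigger LRUCache.loadValue above
    return target != null ? target : title;
}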
/** Compares the load distribution of consistent hashing and HRW (rendezvous) hashing. */
public class Compare {
    private static final HashFunction hfunc = Hashing.murmur3_128();
    private static final Funnel<CharSequence> strFunnel =
        Funnels.stringFunnel(Charset.defaultCharset());

    public static void main(String[] args) {
        Map<String, AtomicInteger> distribution = Maps.newHashMap();
        System.out.println("======: ConsistentHash :========");
        ConsistentHash<String, String> c =
            new ConsistentHash<>(hfunc, strFunnel, strFunnel, getNodes(distribution));
        for (int i = 0; i < 10000; i++) {
            distribution.get(c.get("" + i)).incrementAndGet();
        }
        for (Entry<String, AtomicInteger> e : distribution.entrySet()) {
            System.out.println(e.getKey() + "," + e.getValue().get());
            e.getValue().set(0);
        }
        System.out.println("====== remove 2 ========");
        for (int i = 0; i < 2; i++) {
            c.remove("Node" + i);
            distribution.remove("Node" + i);
        }
        for (int i = 0; i < 10000; i++) {
            distribution.get(c.get("" + i)).incrementAndGet();
        }
        for (Entry<String, AtomicInteger> e : distribution.entrySet()) {
            System.out.println(e.getKey() + "," + e.getValue().get());
        }

        System.out.println("======: RendezvousHash :========");
        distribution = Maps.newHashMap();
        RendezvousHash<String, String> r =
            new RendezvousHash<>(hfunc, strFunnel, strFunnel, getNodes(distribution));
        for (int i = 0; i < 10000; i++) {
            distribution.get(r.get("" + i)).incrementAndGet();
        }
        for (Entry<String, AtomicInteger> e : distribution.entrySet()) {
            System.out.println(e.getKey() + "," + e.getValue().get());
            e.getValue().set(0);
        }
        System.out.println("====== remove 2 ========");
        for (int i = 0; i < 2; i++) {
            r.remove("Node" + i);
            distribution.remove("Node" + i);
        }
        for (int i = 0; i < 10000; i++) {
            distribution.get(r.get("" + i)).incrementAndGet();
        }
        for (Entry<String, AtomicInteger> e : distribution.entrySet()) {
            System.out.println(e.getKey() + "," + e.getValue().get());
        }
    }

    private static List<String> getNodes(Map<String, AtomicInteger> distribution) {
        List<String> nodes = Lists.newArrayList();
        for (int i = 0; i < 5; i++) {
            nodes.add("Node" + i);
            distribution.put("Node" + i, new AtomicInteger());
        }
        return nodes;
    }
}
/**
 * Hashes the contents of this byte source using the given hash function.
 *
 * @throws IOException if an I/O error occurs in the process of reading from this source
 */
public HashCode hash(HashFunction hashFunction) throws IOException {
    Hasher hasher = hashFunction.newHasher();
    copyTo(Funnels.asOutputStream(hasher));
    return hasher.hash();
}
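A quick usage sketch for the method above, using real Guava APIs (ByteSource.wrap, Hashing.sha256()); the input bytes are just an illustrative placeholder.

import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import com.google.common.io.ByteSource;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class HashDemo {
    public static void main(String[] args) throws IOException {
        ByteSource source = ByteSource.wrap("hello".getBytes(StandardCharsets.UTF_8));
        // Streams the source's bytes through the hasher via Funnels.asOutputStream.
        HashCode code = source.hash(Hashing.sha256());
        System.out.println(code); // hex digest of the source's contents
    }
}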
private void createNewBloomFilter(int expectedInsertions) throws IOException {
    bloomFilter = BloomFilter.create(
        Funnels.byteArrayFunnel(), expectedInsertions, FALSE_POSITIVE_PROBABILITY);
    this.persistBloomFilter();
}
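A minimal standalone sketch of the three-argument create() used above: the third parameter caps the expected false-positive probability. The constant value and the keys here are illustrative, not taken from the original class.

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomDemo {
    private static final double FALSE_POSITIVE_PROBABILITY = 0.01; // assumed value

    public static void main(String[] args) {
        BloomFilter<byte[]> filter =
            BloomFilter.create(Funnels.byteArrayFunnel(), 10_000, FALSE_POSITIVE_PROBABILITY);
        filter.put(new byte[] {1, 2, 3});
        System.out.println(filter.mightContain(new byte[] {1, 2, 3})); // true
        System.out.println(filter.mightContain(new byte[] {4, 5, 6})); // false with high probability
    }
}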