/**
 * Copies entries from an existing environment db to a new one. If
 * historyMap is not provided, only logs the entries that would have been
 * copied.
 *
 * @param sourceDir existing environment database directory
 * @param historyMap new environment db (or null for a dry run)
 * @return number of records copied (or that would have been, in a dry run)
 * @throws DatabaseException
 */
private static int copyPersistEnv(File sourceDir,
        StoredSortedMap<String, Map> historyMap) throws DatabaseException {
    int count = 0;

    // open the source env history DB, copying entries to target env
    EnhancedEnvironment sourceEnv = setupCopyEnvironment(sourceDir, true);
    StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
    DatabaseConfig historyDbConfig = HISTORY_DB_CONFIG.toDatabaseConfig();
    historyDbConfig.setReadOnly(true);
    Database sourceHistoryDB = sourceEnv.openDatabase(null,
            URI_HISTORY_DBNAME, historyDbConfig);
    StoredSortedMap<String, Map> sourceHistoryMap =
            new StoredSortedMap<String, Map>(sourceHistoryDB,
                    new StringBinding(),
                    new SerialBinding<Map>(sourceClassCatalog, Map.class),
                    true);

    Iterator<Entry<String, Map>> iter = sourceHistoryMap.entrySet().iterator();
    while (iter.hasNext()) {
        Entry<String, Map> item = iter.next();
        if (logger.isLoggable(Level.FINE)) {
            logger.fine(item.getKey() + " " + new JSONObject(item.getValue()));
        }
        if (historyMap != null) {
            historyMap.put(item.getKey(), item.getValue());
        }
        count++;
    }
    StoredIterator.close(iter);
    sourceHistoryDB.close();
    sourceEnv.close();
    return count;
}
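// A minimal standalone sketch of the open-read-only-and-iterate pattern the
// method above relies on, using the BDB-JE collections API directly. The
// database names ("classCatalog", "uri_history") are illustrative
// assumptions, not the constants used above, and writeAllowed is false here
// because this sketch never writes.
import java.io.File;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredIterator;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

class ReadOnlyHistoryDump {
    public static void main(String[] args) throws Exception {
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setReadOnly(true);
        Environment env = new Environment(new File(args[0]), envConfig);

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setReadOnly(true);
        Database catalogDb = env.openDatabase(null, "classCatalog", dbConfig);
        StoredClassCatalog catalog = new StoredClassCatalog(catalogDb);

        Database historyDb = env.openDatabase(null, "uri_history", dbConfig);
        StoredSortedMap<String, Map> historyMap = new StoredSortedMap<String, Map>(
                historyDb, new StringBinding(),
                new SerialBinding<Map>(catalog, Map.class), false);

        Iterator<Entry<String, Map>> iter = historyMap.entrySet().iterator();
        try {
            while (iter.hasNext()) {
                Entry<String, Map> item = iter.next();
                System.out.println(item.getKey() + " " + item.getValue());
            }
        } finally {
            StoredIterator.close(iter); // releases the underlying cursor
        }
        historyDb.close();
        catalogDb.close();
        env.close();
    }
}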
/**
 * Expunge an entry from memMap while updating diskMap.
 *
 * @param entry a SoftEntry<V> obtained from refQueuePoll()
 */
private synchronized void pageOutStaleEntry(SoftEntry<V> entry) {
    PhantomEntry<V> phantom = entry.phantom;

    // Still in memMap? If not, it was paged out by an earlier direct
    // access before being placed in the reference queue; just return.
    if (memMap.get(phantom.key) != entry) { // NOTE: intentional identity compare
        return;
    }

    // recover hidden value
    V phantomValue = phantom.doctoredGet();

    // Expected value present? (should be; the only clear() is at the end
    // of this method, after the entry is removed from memMap)
    if (phantomValue == null) {
        logger.log(Level.WARNING, "unexpected null phantomValue", new Exception());
        return; // nothing to do
    }

    // the given entry instance is still in memMap; we have the key and the
    // phantom value, so the diskMap can be updated
    diskMap.put(phantom.key, phantomValue); // unchecked cast
    expungeStatsDiskPut.incrementAndGet();

    // remove memMap entry
    boolean removed = memMap.remove(phantom.key, entry);
    if (!removed) {
        logger.log(Level.WARNING, "expunge memMap.remove() ineffective",
                new Exception());
    }
    phantom.clear(); // truly allows GC of unreferenced V object
}
/**
 * An incremental, poll-based expunger.
 *
 * <p>Package-protected for unit-test visibility.
 */
@SuppressWarnings("unchecked")
synchronized void pageOutStaleEntries() {
    int c = 0;
    long startTime = System.currentTimeMillis();
    for (SoftEntry<V> entry; (entry = (SoftEntry<V>) refQueue.poll()) != null;) {
        pageOutStaleEntry(entry);
        c++;
    }
    if (c > 0 && logger.isLoggable(Level.FINER)) {
        long endTime = System.currentTimeMillis();
        try {
            logger.finer("DB: " + db.getDatabaseName() + ", Expunged: " + c
                    + ", Diskmap size: " + diskMap.size() + ", Cache size: "
                    + memMap.size() + ", in " + (endTime - startTime) + "ms");
        } catch (DatabaseException e) {
            logger.log(Level.FINER, "exception while logging", e);
        }
    }
}
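// A JDK-only sketch of the SoftReference/ReferenceQueue mechanism the two
// methods above are built on: when the collector clears a soft reference,
// the reference object itself is enqueued, and a poll() loop like
// pageOutStaleEntries() can then do per-entry cleanup. Class and field
// names here are illustrative, not the real SoftEntry/PhantomEntry types;
// clear()/enqueue() stand in for what the collector would do.
import java.lang.ref.ReferenceQueue;
import java.lang.ref.SoftReference;

class RefQueueDemo {
    static class KeyedSoftRef<V> extends SoftReference<V> {
        final String key;
        KeyedSoftRef(String key, V value, ReferenceQueue<V> q) {
            super(value, q);
            this.key = key;
        }
    }

    public static void main(String[] args) {
        ReferenceQueue<byte[]> queue = new ReferenceQueue<byte[]>();
        KeyedSoftRef<byte[]> ref =
                new KeyedSoftRef<byte[]>("someKey", new byte[1024], queue);
        ref.clear();   // simulate the collector clearing the referent...
        ref.enqueue(); // ...and enqueueing the reference object

        KeyedSoftRef<byte[]> stale = (KeyedSoftRef<byte[]>) queue.poll();
        if (stale != null) {
            // in pageOutStaleEntry() this is where diskMap.put() and
            // memMap.remove() would happen for stale.key
            System.out.println("would page out: " + stale.key);
        }
    }
}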
/**
 * Merge any data from the Map stored in the URI-history store into the
 * current instance.
 *
 * <p>TODO: ensure compatibility with use of PersistLoadProcessor; suppress
 * double-loading
 *
 * @param curi CrawlURI to receive prior state data
 */
protected void mergePrior(CrawlURI curi) {
    String key = PersistProcessor.persistKeyFor(curi);
    @SuppressWarnings({"rawtypes", "unchecked"})
    Map<String, Map> prior = (Map<String, Map>) store.get(key);
    if (prior != null) {
        // merge in keys
        curi.getData().putAll(prior);
    }
}
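// A plain java.util sketch of the merge semantics above: putAll() is a
// shallow merge, so prior values replace current top-level keys wholesale
// (nested maps are swapped out, not deep-merged). Key names here are
// illustrative, not Heritrix's actual history keys.
import java.util.HashMap;
import java.util.Map;

class MergePriorDemo {
    public static void main(String[] args) {
        Map<String, Object> current = new HashMap<String, Object>();
        current.put("fetch-history", "fresh");

        Map<String, Object> prior = new HashMap<String, Object>();
        prior.put("fetch-history", "from-store");
        prior.put("last-status", 200);

        current.putAll(prior);
        // prints {last-status=200, fetch-history=from-store}: prior values
        // win for colliding keys
        System.out.println(current);
    }
}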
/**
 * Populates an environment db from a persist log. If historyMap is not
 * provided, only logs the entries that would have been populated.
 *
 * @param persistLogReader persist log
 * @param historyMap new environment db (or null for a dry run)
 * @return number of records loaded (or that would have been, in a dry run)
 * @throws UnsupportedEncodingException
 * @throws DatabaseException
 */
private static int populatePersistEnvFromLog(BufferedReader persistLogReader,
        StoredSortedMap<String, Map> historyMap)
        throws UnsupportedEncodingException, DatabaseException {
    int count = 0;

    Iterator<String> iter = new LineReadingIterator(persistLogReader);
    while (iter.hasNext()) {
        String line = iter.next();
        if (line.length() == 0) {
            continue;
        }
        String[] splits = line.split(" ");
        if (splits.length != 2) {
            logger.severe("bad line has " + splits.length
                    + " fields (should be 2): " + line);
            continue;
        }

        Map alist;
        try {
            alist = (Map) SerializationUtils.deserialize(
                    Base64.decodeBase64(splits[1].getBytes("UTF-8")));
        } catch (Exception e) {
            logger.severe("caught exception " + e + " deserializing line: " + line);
            continue;
        }

        if (logger.isLoggable(Level.FINE)) {
            logger.fine(splits[0] + " " + ArchiveUtils.prettyString(alist));
        }

        if (historyMap != null) {
            try {
                historyMap.put(splits[0], alist);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "caught exception after loading "
                        + count + " urls from the persist log (perhaps crawl "
                        + "was stopped by user?)", e);
                IOUtils.closeQuietly(persistLogReader);
                // seems to finish most cleanly when we return rather than
                // throw something
                return count;
            }
        }
        count++;
    }
    IOUtils.closeQuietly(persistLogReader);
    return count;
}
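// A round-trip sketch of the persist-log line format the loop above parses:
// one record per line, "<key> <base64(java-serialized Map)>", using the
// same commons-codec Base64 and commons-lang SerializationUtils calls. The
// key shown is illustrative; real persist keys are derived by
// PersistProcessor.persistKeyFor().
import java.util.HashMap;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.SerializationUtils;

class PersistLogLineDemo {
    public static void main(String[] args) throws Exception {
        HashMap<String, Object> alist = new HashMap<String, Object>();
        alist.put("status", 200);

        // write: key, single space, base64 of the serialized map
        String line = "http://example.com/ " + new String(
                Base64.encodeBase64(SerializationUtils.serialize(alist)),
                "UTF-8");

        // read back, exactly as populatePersistEnvFromLog() does
        String[] splits = line.split(" ");
        HashMap restored = (HashMap) SerializationUtils.deserialize(
                Base64.decodeBase64(splits[1].getBytes("UTF-8")));
        System.out.println(splits[0] + " " + restored);
    }
}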
/**
 * Call this method on an instance created with the default constructor, or
 * on a deserialized instance that you want to reconnect with an extant
 * bdbje environment. Do not call this method if you used the
 * {@link #CachedBdbMap(File, String, Class, Class)} constructor.
 *
 * @param env
 * @param dbName
 * @param valueClass
 * @param classCatalog
 * @throws DatabaseException
 */
@SuppressWarnings("unchecked")
public void initialize(final Environment env, String dbName,
        final Class valueClass, final StoredClassCatalog classCatalog)
        throws DatabaseException {
    // TODO: initial capacity should be related to number of seeds, max
    // depth, max docs
    this.memMap = new ConcurrentHashMap<String, SoftEntry<V>>(
            8192, // initial capacity
            0.9f, // acceptable load factor
            64    // est. number of concurrent threads
    );
    this.refQueue = new ReferenceQueue<V>();
    canary = new SoftReference<LowMemoryCanary>(new LowMemoryCanary());

    this.db = openDatabase(env, dbName);
    this.diskMap = createDiskMap(this.db, classCatalog, valueClass);
    this.count = new AtomicLong(diskMap.size());
}
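// A hedged usage sketch for initialize(): the environment and class-catalog
// setup below is an assumption about the surrounding application, the
// "classCatalog"/"exampleDb" database names are illustrative, and the
// single value-type parameter follows the memMap/refQueue fields above
// (adjust if your CachedBdbMap is declared with a key type as well).
import java.io.File;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

class ReconnectDemo {
    public static void main(String[] args) throws Exception {
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setAllowCreate(true);
        Environment env = new Environment(new File(args[0]), envConfig);

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        Database catalogDb = env.openDatabase(null, "classCatalog", dbConfig);
        StoredClassCatalog classCatalog = new StoredClassCatalog(catalogDb);

        // default-constructed (or freshly deserialized) map, reconnected to
        // the extant environment; String values are just for illustration
        CachedBdbMap<String> map = new CachedBdbMap<String>();
        map.initialize(env, "exampleDb", String.class, classCatalog);
    }
}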
/** Return an entry set view of the shipment storage container. */
public final StoredEntrySet getShipmentEntrySet() {
    return (StoredEntrySet) shipmentMap.entrySet();
}

/** Return an entry set view of the supplier storage container. */
public final StoredEntrySet getSupplierEntrySet() {
    return (StoredEntrySet) supplierMap.entrySet();
}

/** Return an entry set view of the part storage container. */
public final StoredEntrySet getPartEntrySet() {
    return (StoredEntrySet) partMap.entrySet();
}
/* (non-Javadoc)
 * @see org.archive.util.ObjectIdentityCache#keySet()
 */
public Set<String> keySet() {
    return diskMap.keySet();
}
/* (non-Javadoc)
 * @see org.archive.util.ObjectIdentityCache#get(java.lang.String, org.archive.util.ObjectIdentityBdbCache)
 */
public V getOrUse(final String key, Supplier<V> supplierOrNull) {
    countOfGets.incrementAndGet();
    if (countOfGets.get() % 10000 == 0) {
        logCacheSummary();
    }

    // check mem cache
    SoftEntry<V> entry = memMap.get(key);
    if (entry != null) {
        V val = entry.get();
        if (val != null) {
            // the concurrent garden path: in mem, valid
            cacheHit.incrementAndGet();
            return val;
        }
    }

    // everything in other difficult cases happens inside this block
    synchronized (this) {
        // recheck mem cache -- if another thread beat us into the sync
        // block and already filled the key
        entry = memMap.get(key);
        if (entry != null) {
            V val = entry.get();
            if (val != null) {
                cacheHit.incrementAndGet();
                return val;
            }
        }

        // persist to disk all ref-enqueued stale (soft-ref-cleared) entries now
        pageOutStaleEntries();
        // and catch if this exact entry not yet ref-enqueued
        if (memMap.get(key) != null) {
            pageOutStaleEntry(entry);
            if (memMap.get(key) != null) {
                logger.log(Level.SEVERE, "nulled key " + key + " not paged-out",
                        new Exception());
            }
        }

        // check disk
        V valDisk = (V) diskMap.get(key);
        if (valDisk == null) {
            // never yet created; consider creating
            if (supplierOrNull == null) {
                return null;
            }
            // create using provided Supplier
            valDisk = supplierOrNull.get();
            supplierUsed.incrementAndGet();
            // putting the initial value directly into diskMap (rather than
            // just the memMap until page-out) ensures diskMap.keySet()
            // provides a complete view
            V prevVal = diskMap.putIfAbsent(key, valDisk);
            count.incrementAndGet();
            if (prevVal != null) {
                // ERROR: diskMap modification since previous
                // diskMap.get() should be impossible
                logger.log(Level.SEVERE,
                        "diskMap modified outside synchronized block?");
            }
        } else {
            diskHit.incrementAndGet();
        }

        // keep the new val in memMap
        SoftEntry<V> newEntry = new SoftEntry<V>(key, valDisk, refQueue);
        SoftEntry<V> prevVal = memMap.putIfAbsent(key, newEntry);
        if (prevVal != null) {
            // ERROR: memMap modification since previous
            // memMap.get() should be impossible
            logger.log(Level.SEVERE,
                    "memMap modified outside synchronized block?",
                    new Exception());
        }
        return valDisk;
    }
}
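// A hedged usage sketch for getOrUse(): a null supplier is a pure lookup,
// while a non-null Supplier atomically creates-and-caches on a miss, so
// every thread asking for the same key sees the same instance. 'cache'
// stands for an initialized instance of the cache class above (shown with
// a hypothetical AtomicLong value type), and the anonymous subclass assumes
// org.archive.util.Supplier has a no-arg constructor and an overridable
// get(); adjust if your Supplier type differs.
import java.util.concurrent.atomic.AtomicLong;

class GetOrUseDemo {
    static void demo(ObjectIdentityBdbCache<AtomicLong> cache) {
        // pure lookup: returns null if the key has never been created
        AtomicLong maybe = cache.getOrUse("example.com", null);

        // get-or-create: the Supplier only runs if the key is absent
        AtomicLong counter = cache.getOrUse("example.com",
                new Supplier<AtomicLong>() {
                    public AtomicLong get() {
                        return new AtomicLong(0);
                    }
                });
        counter.incrementAndGet();
    }
}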
/** Return an entity set view of the shipment storage container. */
public StoredSortedValueSet getShipmentSet() {
    return (StoredSortedValueSet) shipmentMap.values();
}

/** Return an entity set view of the supplier storage container. */
public StoredSortedValueSet getSupplierSet() {
    return (StoredSortedValueSet) supplierMap.values();
}

/** Return an entity set view of the part storage container. */
public StoredSortedValueSet getPartSet() {
    return (StoredSortedValueSet) partMap.values();
}
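// A hedged sketch of consuming these views, modeled on the BDB-JE
// shipment/supplier/part collections example this class appears to follow
// ('views' stands for an instance of the views class above, SampleViews in
// the published example; entity toString() output is assumed). The explicit
// StoredIterator.close() releases the underlying cursor and is a harmless
// no-op for iterators that don't need it.
import java.util.Iterator;

import com.sleepycat.collections.StoredIterator;

class PrintPartsDemo {
    static void printParts(SampleViews views) {
        Iterator i = views.getPartSet().iterator();
        try {
            while (i.hasNext()) {
                System.out.println(i.next());
            }
        } finally {
            StoredIterator.close(i);
        }
    }
}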