/**
   * Copies entries from an existing environment db to a new one. If historyMap is not provided,
   * only logs the entries that would have been copied.
   *
   * @param sourceDir existing environment database directory
   * @param historyMap new environment db (or null for a dry run)
    * @return number of records copied (or that would have been copied, in a dry run)
   * @throws DatabaseException
   */
  private static int copyPersistEnv(File sourceDir, StoredSortedMap<String, Map> historyMap)
      throws DatabaseException {
    int count = 0;

    // open the source env history DB, copying entries to target env
    EnhancedEnvironment sourceEnv = setupCopyEnvironment(sourceDir, true);
    StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
    DatabaseConfig historyDbConfig = HISTORY_DB_CONFIG.toDatabaseConfig();
    historyDbConfig.setReadOnly(true);
    Database sourceHistoryDB = sourceEnv.openDatabase(null, URI_HISTORY_DBNAME, historyDbConfig);
    StoredSortedMap<String, Map> sourceHistoryMap =
        new StoredSortedMap<String, Map>(
            sourceHistoryDB,
            new StringBinding(),
            new SerialBinding<Map>(sourceClassCatalog, Map.class),
            true);

    Iterator<Entry<String, Map>> iter = sourceHistoryMap.entrySet().iterator();
    while (iter.hasNext()) {
      Entry<String, Map> item = iter.next();
      if (logger.isLoggable(Level.FINE)) {
        logger.fine(item.getKey() + " " + new JSONObject(item.getValue()));
      }

      if (historyMap != null) {
        historyMap.put(item.getKey(), item.getValue());
      }
      count++;
    }
    StoredIterator.close(iter);
    sourceHistoryDB.close();
    sourceEnv.close();

    return count;
  }
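  // Hedged dry-run sketch (directory path assumed): passing null for
  // historyMap makes copyPersistEnv log each would-be record and just count.
  private static int demoDryRunCopy() throws DatabaseException {
    File sourceDir = new File("/var/heritrix/state"); // assumed source env dir
    return copyPersistEnv(sourceDir, null); // dry run: log and count only
  }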
  /**
   * Expunge an entry from memMap while updating diskMap.
   *
   * @param entry a {@code SoftEntry<V>} obtained from the reference queue
   */
  private synchronized void pageOutStaleEntry(SoftEntry<V> entry) {
    PhantomEntry<V> phantom = entry.phantom;

    // Still in memMap? If not, it was already paged out by an earlier direct
    // access before being placed on the reference queue; just return.
    if (memMap.get(phantom.key) != entry) { // NOTE: intentional identity compare
      return;
    }

    // recover hidden value
    V phantomValue = phantom.doctoredGet();

    // Expected value present? (It should be; the only clear() happens at the
    // end of this method, after the entry is removed from memMap.)
    if (phantomValue == null) {
      logger.log(Level.WARNING, "unexpected null phantomValue", new Exception());
      return; // nothing to do
    }

    // The given entry instance is still in memMap; we have the key and the
    // phantom value, so diskMap can be updated.
    diskMap.put(phantom.key, phantomValue); // unchecked cast
    expungeStatsDiskPut.incrementAndGet();

    //  remove memMap entry
    boolean removed = memMap.remove(phantom.key, entry);
    if (!removed) {
      logger.log(Level.WARNING, "expunge memMap.remove() ineffective", new Exception());
    }
    phantom.clear(); // truly allows GC of unreferenced V object
  }
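  // Standalone illustration (hedged; not part of the class above) of the
  // two-argument ConcurrentMap.remove used there: the mapping goes away only
  // if the current value matches the one supplied. Assuming SoftEntry does
  // not override equals(), that match is an identity check on the exact
  // entry, which pairs with the "intentional identity compare" note above.
  static void demoConditionalRemove() {
    java.util.concurrent.ConcurrentMap<String, Object> m =
        new java.util.concurrent.ConcurrentHashMap<String, Object>();
    Object original = new Object();
    m.put("k", original);
    boolean removed = m.remove("k", new Object()); // false: different instance
    removed = m.remove("k", original); // true: same instance still mapped
  }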
 /**
  * An incremental, poll-based expunger.
  *
  * <p>Package-protected for unit-test visibility.
  */
 @SuppressWarnings("unchecked")
 synchronized void pageOutStaleEntries() {
   int c = 0;
   long startTime = System.currentTimeMillis();
   for (SoftEntry<V> entry; (entry = (SoftEntry<V>) refQueue.poll()) != null; ) {
     pageOutStaleEntry(entry);
     c++;
   }
   if (c > 0 && logger.isLoggable(Level.FINER)) {
     long endTime = System.currentTimeMillis();
     try {
       logger.finer(
           "DB: "
               + db.getDatabaseName()
               + ",  Expunged: "
               + c
               + ", Diskmap size: "
               + diskMap.size()
               + ", Cache size: "
               + memMap.size()
               + ", in "
               + (endTime - startTime)
               + "ms");
     } catch (DatabaseException e) {
       logger.log(Level.FINER, "exception while logging", e);
     }
   }
 }
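  // Hedged test-side sketch: soft references are enqueued only after the
  // collector clears them, so a unit test typically provokes GC before
  // invoking this package-protected expunger. System.gc() is advisory; a
  // real test may need to create genuine memory pressure instead.
  void expungeForTest() throws InterruptedException {
    System.gc();
    Thread.sleep(100); // give the reference handler time to enqueue
    pageOutStaleEntries();
  }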
 /**
  * Merge any data from the Map stored in the URI-history store into the given CrawlURI.
  *
  * <p>TODO: ensure compatibility with use of PersistLoadProcessor; suppress double-loading
  *
  * @param curi CrawlURI to receive prior state data
  */
 protected void mergePrior(CrawlURI curi) {
   String key = PersistProcessor.persistKeyFor(curi);
   @SuppressWarnings({"rawtypes", "unchecked"})
   Map<String, Map> prior = (Map<String, Map>) store.get(key);
   if (prior != null) {
     // merge in keys
     curi.getData().putAll(prior);
   }
 }
  /**
   * Populates an environment db from a persist log. If historyMap is not provided, only logs the
   * entries that would have been populated.
   *
   * @param persistLogReader persist log
   * @param historyMap new environment db (or null for a dry run)
    * @return number of records loaded (or that would have been loaded, in a dry run)
   * @throws UnsupportedEncodingException
   * @throws DatabaseException
   */
  private static int populatePersistEnvFromLog(
      BufferedReader persistLogReader, StoredSortedMap<String, Map> historyMap)
      throws UnsupportedEncodingException, DatabaseException {
    int count = 0;

    Iterator<String> iter = new LineReadingIterator(persistLogReader);
    while (iter.hasNext()) {
      String line = iter.next();
      if (line.length() == 0) {
        continue;
      }
      String[] splits = line.split(" ");
      if (splits.length != 2) {
        logger.severe("bad line has " + splits.length + " fields (should be 2): " + line);
        continue;
      }

      Map alist;
      try {
        alist =
            (Map) SerializationUtils.deserialize(Base64.decodeBase64(splits[1].getBytes("UTF-8")));
      } catch (Exception e) {
        logger.severe("caught exception " + e + " deserializing line: " + line);
        continue;
      }

      if (logger.isLoggable(Level.FINE)) {
        logger.fine(splits[0] + " " + ArchiveUtils.prettyString(alist));
      }

      if (historyMap != null) {
        try {
          historyMap.put(splits[0], alist);
        } catch (Exception e) {
          logger.log(
              Level.SEVERE,
              "caught exception after loading "
                  + count
                  + " urls from the persist log (perhaps crawl was stopped by user?)",
              e);
          IOUtils.closeQuietly(persistLogReader);

          // seems to finish most cleanly when we return rather than throw something
          return count;
        }
      }

      count++;
    }
    IOUtils.closeQuietly(persistLogReader);

    return count;
  }
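  // Hedged sketch of the line format parsed above: "<key> <base64(serialized
  // Map)>", one record per line. The key shown is illustrative; the encoding
  // simply mirrors the deserialization path (commons-lang SerializationUtils
  // plus commons-codec Base64).
  private static String demoPersistLogLine() throws UnsupportedEncodingException {
    HashMap<String, Object> history = new HashMap<String, Object>();
    history.put("fetch-began-time", 1234567890L); // assumed example datum
    byte[] b64 = Base64.encodeBase64(SerializationUtils.serialize(history));
    return "http://(org,example,)/" + " " + new String(b64, "UTF-8");
  }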
  /**
   * Call this method when you have an instance created with the default constructor, or a
   * deserialized instance that you want to reconnect to an extant bdbje environment. Do not call
   * this method if you used the {@link #CachedBdbMap(File, String, Class, Class)} constructor.
   *
   * @param env the open bdbje environment to connect to
   * @param dbName name of the database to open within the environment
   * @param valueClass class of the values stored in the map
   * @param classCatalog class catalog used for the value serial binding
   * @throws DatabaseException
   */
  @SuppressWarnings("unchecked")
  public void initialize(
      final Environment env,
      String dbName,
      final Class valueClass,
      final StoredClassCatalog classCatalog)
      throws DatabaseException {
    // TODO: initial capacity should be related to number of seeds, max depth, max docs
    this.memMap =
        new ConcurrentHashMap<String, SoftEntry<V>>(
            8192, // initial capacity
            0.9f, // acceptable load factor
            64 // est. number of concurrent threads
            );
    this.refQueue = new ReferenceQueue<V>();
    canary = new SoftReference<LowMemoryCanary>(new LowMemoryCanary());

    this.db = openDatabase(env, dbName);
    this.diskMap = createDiskMap(this.db, classCatalog, valueClass);
    this.count = new AtomicLong(diskMap.size());
  }
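  // Hedged reconnection sketch (database name and value class assumed):
  // rebinds a default-constructed or freshly deserialized instance to an
  // already-open environment; EnhancedEnvironment supplies the class catalog,
  // as in copyPersistEnv above.
  void demoReconnect(EnhancedEnvironment env) throws DatabaseException {
    initialize(env, "uriHistory", HashMap.class, env.getClassCatalog());
  }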
  /** Return an entry set view of the shipment storage container. */
  public final StoredEntrySet getShipmentEntrySet() {

    return (StoredEntrySet) shipmentMap.entrySet();
  }
  /** Return an entry set view of the supplier storage container. */
  public final StoredEntrySet getSupplierEntrySet() {

    return (StoredEntrySet) supplierMap.entrySet();
  }
  /** Return an entry set view of the part storage container. */
  public final StoredEntrySet getPartEntrySet() {

    return (StoredEntrySet) partMap.entrySet();
  }
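  // Hedged iteration sketch: iterators over stored collections hold bdbje
  // cursors, so close them explicitly via StoredIterator.close(), the same
  // discipline copyPersistEnv applies above.
  public void printShipmentEntries() {
    Iterator<?> i = getShipmentEntrySet().iterator();
    try {
      while (i.hasNext()) {
        System.out.println(i.next()); // each element is a Map.Entry
      }
    } finally {
      StoredIterator.close(i);
    }
  }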
 /* (non-Javadoc)
  * @see org.archive.util.ObjectIdentityCache#keySet()
  */
 public Set<String> keySet() {
   return diskMap.keySet();
 }
  /* (non-Javadoc)
   * @see org.archive.util.ObjectIdentityCache#get(java.lang.String, org.archive.util.ObjectIdentityBdbCache)
   */
  public V getOrUse(final String key, Supplier<V> supplierOrNull) {
    countOfGets.incrementAndGet();

    if (countOfGets.get() % 10000 == 0) {
      logCacheSummary();
    }

    // check mem cache
    SoftEntry<V> entry = memMap.get(key);
    if (entry != null) {
      V val = entry.get();
      if (val != null) {
        // the concurrent garden path: in mem, valid
        cacheHit.incrementAndGet();
        return val;
      }
    }

    // all other, harder cases are handled inside this synchronized block
    synchronized (this) {
      // recheck mem cache -- if another thread beat us into sync
      // block and already filled the key
      entry = memMap.get(key);
      if (entry != null) {
        V val = entry.get();
        if (val != null) {
          cacheHit.incrementAndGet();
          return val;
        }
      }
      // persist to disk all ref-enqueued stale (soft-ref-cleared) entries now
      pageOutStaleEntries();
      // and catch the case where this exact entry was not yet ref-enqueued
      if (memMap.get(key) != null) {
        pageOutStaleEntry(entry);
        if (memMap.get(key) != null) {
          logger.log(Level.SEVERE, "nulled key " + key + " not paged-out", new Exception());
        }
      }

      // check disk
      V valDisk = (V) diskMap.get(key);
      if (valDisk == null) {
        // never yet created, consider creating
        if (supplierOrNull == null) {
          return null;
        }
        // create using provided Supplier
        valDisk = supplierOrNull.get();
        supplierUsed.incrementAndGet();
        // putting initial value directly into diskMap
        // (rather than just the memMap until page-out)
        // ensures diskMap.keySet() provides complete view
        V prevVal = diskMap.putIfAbsent(key, valDisk);
        count.incrementAndGet();
        if (prevVal != null) {
          // ERROR: diskMap modification since previous
          // diskMap.get() should be impossible
          logger.log(Level.SEVERE, "diskMap modified outside synchronized block?");
        }
      } else {
        diskHit.incrementAndGet();
      }

      // keep new val in memMap
      SoftEntry<V> newEntry = new SoftEntry<V>(key, valDisk, refQueue);
      SoftEntry<V> prevVal = memMap.putIfAbsent(key, newEntry);
      if (prevVal != null) {
        // ERROR: memMap modification since previous
        // memMap.get() should be impossible
        logger.log(Level.SEVERE, "memMap modified outside synchronized block?", new Exception());
      }
      return valDisk;
    }
  }
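  // Hedged usage sketch (key and value construction assumed; Supplier is the
  // org.archive.util type whose get() is called above): the supplier runs
  // only when the key is in neither memMap nor diskMap, and passing null
  // probes without creating anything.
  V demoGetOrCreate(final V fresh) {
    return getOrUse("http://(org,example,)/", new Supplier<V>() {
      public V get() {
        return fresh; // assumed create-on-first-use value
      }
    });
  }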
  /** Return an entity set view of the shipment storage container. */
  public StoredSortedValueSet getShipmentSet() {

    return (StoredSortedValueSet) shipmentMap.values();
  }
  /** Return an entity set view of the supplier storage container. */
  public StoredSortedValueSet getSupplierSet() {

    return (StoredSortedValueSet) supplierMap.values();
  }
  /** Return an entity set view of the part storage container. */
  public StoredSortedValueSet getPartSet() {

    return (StoredSortedValueSet) partMap.values();
  }