Exemplo n.º 1
0
public class Memtable implements Comparable<Memtable> {
  private static final Logger logger = LoggerFactory.getLogger(Memtable.class);

  // Shared pool from which every memtable (and estimateRowOverhead) obtains its allocator.
  static final MemtablePool MEMORY_POOL = DatabaseDescriptor.getMemtableAllocatorPool();
  // Estimated fixed on-heap cost of one partition entry, computed once at class initialisation by
  // estimateRowOverhead(). The sample size comes from the system property
  // "cassandra.memtable_row_overhead_computation_step" (default 100000).
  private static final int ROW_OVERHEAD_HEAP_SIZE =
      estimateRowOverhead(
          Integer.parseInt(
              System.getProperty("cassandra.memtable_row_overhead_computation_step", "100000")));

  // Allocator backing this memtable; null for the test-only mock constructor.
  private final MemtableAllocator allocator;
  // Running totals maintained by put(): accumulated size deltas and operation counts.
  private final AtomicLong liveDataSize = new AtomicLong(0);
  private final AtomicLong currentOperations = new AtomicLong(0);

  // The write barrier for directing writes to this memtable during a switch; null until
  // setDiscarding() is called.
  private volatile OpOrder.Barrier writeBarrier;
  // The precise upper bound of ReplayPosition owned by this memtable; assigned in setDiscarding().
  private volatile AtomicReference<ReplayPosition> commitLogUpperBound;
  // The precise lower bound of ReplayPosition owned by this memtable; equal to its predecessor's
  // commitLogUpperBound.
  private AtomicReference<ReplayPosition> commitLogLowerBound;
  // The approximate lower bound owned by this memtable; must be <= commitLogLowerBound once our
  // predecessor has been finalised, and this is enforced in
  // ColumnFamilyStore.setCommitLogUpperBound.
  private final ReplayPosition approximateCommitLogLowerBound = CommitLog.instance.getContext();

  /**
   * Orders memtables by their approximate commit log lower bound, i.e. the commit log context
   * captured when each memtable was constructed.
   *
   * @param that the memtable to compare against
   * @return the result of comparing this memtable's approximate lower bound with {@code that}'s
   */
  @Override
  public int compareTo(Memtable that) {
    return this.approximateCommitLogLowerBound.compareTo(that.approximateCommitLogLowerBound);
  }

  /**
   * Marker subclass of {@link ReplayPosition}. When the commit log upper bound holds an instance
   * of this type, {@code accepts()} stops advancing the bound and only compares incoming positions
   * against it.
   */
  public static final class LastReplayPosition extends ReplayPosition {
    /**
     * Creates a marker with the same segment and position as {@code copy}.
     *
     * @param copy the position whose segment and position are duplicated
     */
    public LastReplayPosition(ReplayPosition copy) {
      super(copy.segment, copy.position);
    }
  }

  // The memtable is indexed by PartitionPosition only so that key ranges can be selected
  // using Token.KeyBound. However, put() ensures that only DecoratedKey instances are
  // actually stored.
  private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> partitions =
      new ConcurrentSkipListMap<>();
  // Owning store; null for the test-only mock constructor.
  public final ColumnFamilyStore cfs;
  // System.nanoTime() at construction; isExpired() measures the memtable's age against it.
  private final long creationNano = System.nanoTime();

  // The smallest timestamp for all partitions stored in this memtable.
  // NOTE(review): plain (non-volatile) long updated by put() without synchronisation — confirm
  // that approximate values are acceptable to readers.
  private long minTimestamp = Long.MAX_VALUE;

  // Records the comparator of the CFS at the creation of the memtable. This
  // is only used when a user updates the CF comparator, to know whether the
  // memtable was created with the new or the old comparator.
  public final ClusteringComparator initialComparator;

  // Accumulate, across all put() calls, the columns and encoding stats handed to the flush
  // writer in writeSortedContents().
  private final ColumnsCollector columnsCollector;
  private final StatsCollector statsCollector = new StatsCollector();

  /**
   * Creates a memtable for {@code cfs} with a fresh allocator taken from the shared memory pool.
   *
   * <p>Only to be used by init(), to set up the very first memtable for the cfs.
   *
   * @param commitLogLowerBound reference holding this memtable's commit log lower bound
   * @param cfs the column family store that owns this memtable
   */
  public Memtable(AtomicReference<ReplayPosition> commitLogLowerBound, ColumnFamilyStore cfs) {
    this.cfs = cfs;
    this.commitLogLowerBound = commitLogLowerBound;
    this.allocator = MEMORY_POOL.newAllocator();
    // remember the comparator in force when this memtable was created
    this.initialComparator = cfs.metadata.comparator;
    // presumably arms the periodic flush governed by memtable_flush_period_in_ms — TODO confirm
    this.cfs.scheduleFlush();
    // pre-register the table's currently declared static and regular columns
    this.columnsCollector = new ColumnsCollector(cfs.metadata.partitionColumns());
  }

  // ONLY to be used for testing, to create a mock Memtable
  @VisibleForTesting
  public Memtable(CFMetaData metadata) {
    this.initialComparator = metadata.comparator;
    this.cfs = null;
    this.allocator = null;
    this.columnsCollector = new ColumnsCollector(metadata.partitionColumns());
  }

  /**
   * @return the allocator backing this memtable; {@code null} for a mock memtable created through
   *     the test-only constructor
   */
  public MemtableAllocator getAllocator() {
    return allocator;
  }

  /**
   * @return the current value of the live data size counter, which {@code put()} advances by the
   *     size delta of every applied update
   */
  public long getLiveDataSize() {
    return liveDataSize.get();
  }

  /**
   * @return the running total of operation counts of all updates applied through {@code put()}
   */
  public long getOperations() {
    return currentOperations.get();
  }

  /**
   * Starts switching this memtable out: installs the write barrier and the commit log upper bound
   * consulted by {@code accepts()}, then marks the allocator as discarding.
   *
   * @param writeBarrier barrier separating writes that still belong to this memtable from later
   *     ones
   * @param lastReplayPosition reference that holds (and will finalise) this memtable's commit log
   *     upper bound
   */
  @VisibleForTesting
  public void setDiscarding(
      OpOrder.Barrier writeBarrier, AtomicReference<ReplayPosition> lastReplayPosition) {
    // may only be called once per memtable (checked only when assertions are enabled)
    assert this.writeBarrier == null;
    // assign the upper bound before the barrier: accepts() reads writeBarrier first and only
    // dereferences commitLogUpperBound after it has observed a non-null barrier
    this.commitLogUpperBound = lastReplayPosition;
    this.writeBarrier = writeBarrier;
    allocator.setDiscarding();
  }

  /** Marks the backing allocator as discarded by delegating to {@code allocator.setDiscarded()}. */
  void setDiscarded() {
    allocator.setDiscarded();
  }

  /**
   * Decides if this memtable should take the write, or if it should go to the next memtable.
   *
   * @param opGroup the operation group the write is running in
   * @param replayPosition the commit log position of the write, or {@code null} when the write is
   *     not durable, in which case only the barrier decides
   * @return true if the write belongs to this memtable, false if it must go to a later one
   */
  public boolean accepts(OpOrder.Group opGroup, ReplayPosition replayPosition) {
    // if the barrier hasn't been set yet, then this memtable is still taking ALL writes
    OpOrder.Barrier barrier = this.writeBarrier;
    if (barrier == null) return true;
    // if the barrier has been set, but is in the past, we are definitely destined for a future
    // memtable
    if (!barrier.isAfter(opGroup)) return false;
    // if we aren't durable we are directed only by the barrier
    if (replayPosition == null) return true;
    while (true) {
      // otherwise we check if we are in the past/future wrt the CL boundary;
      // if the boundary hasn't been finalised yet, we simply update it to the max of
      // its current value and ours; if it HAS been finalised, we simply accept its judgement
      // this permits us to coordinate a safe boundary, as the boundary choice is made
      // atomically wrt our max() maintenance, so an operation cannot sneak into the past
      ReplayPosition currentLast = commitLogUpperBound.get();
      // a LastReplayPosition marks the finalised boundary: accept only positions at or below it
      if (currentLast instanceof LastReplayPosition)
        return currentLast.compareTo(replayPosition) >= 0;
      // the boundary is not final and already covers this position
      if (currentLast != null && currentLast.compareTo(replayPosition) >= 0) return true;
      // try to advance the boundary to our position; on a lost race, re-read and retry
      if (commitLogUpperBound.compareAndSet(currentLast, replayPosition)) return true;
    }
  }

  /**
   * @return the position currently held by the commit log lower bound reference that was supplied
   *     at construction
   */
  public ReplayPosition getCommitLogLowerBound() {
    return commitLogLowerBound.get();
  }

  /**
   * Returns the position currently held by the commit log upper bound reference.
   *
   * <p>NOTE(review): within this class the reference is only assigned by {@code setDiscarding()},
   * so calling this method before then dereferences {@code null} — confirm callers respect that.
   *
   * @return the current commit log upper bound
   */
  public ReplayPosition getCommitLogUpperBound() {
    return commitLogUpperBound.get();
  }

  /** @return whether the backing allocator reports itself as live */
  public boolean isLive() {
    return allocator.isLive();
  }

  /** @return true if no partition has been stored in this memtable */
  public boolean isClean() {
    return partitions.isEmpty();
  }

  /**
   * @param position the commit log position to test against
   * @return true if this memtable's approximate commit log lower bound is strictly before {@code
   *     position}
   */
  public boolean mayContainDataBefore(ReplayPosition position) {
    return approximateCommitLogLowerBound.compareTo(position) < 0;
  }

  /**
   * Tells whether this memtable has outlived the table's configured flush period.
   *
   * @return true if the CF's memtable_flush_period_in_ms is positive and at least that much time
   *     has elapsed since this memtable was created; false otherwise
   */
  public boolean isExpired() {
    final int flushPeriodMs = cfs.metadata.params.memtableFlushPeriodInMs;
    if (flushPeriodMs <= 0) {
      // a non-positive period means the memtable never expires by age
      return false;
    }
    final long ageNanos = System.nanoTime() - creationNano;
    final long limitNanos = TimeUnit.MILLISECONDS.toNanos(flushPeriodMs);
    return ageNanos >= limitNanos;
  }

  /**
   * Applies {@code update} to the partition it targets, creating the partition first if this
   * memtable does not hold it yet, and folds the update into the memtable-wide counters.
   *
   * <p>Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the
   * appropriate OpOrdering.
   *
   * @param update the partition update to merge in
   * @param indexer the index transaction passed through to the partition
   * @param opGroup the operation group under which memory is allocated
   * @return the second element of the pair returned by {@code addAllWithSizeDelta}; the first
   *     element is the size delta added to the live data size
   */
  long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) {
    AtomicBTreePartition previous = partitions.get(update.partitionKey());

    long initialSize = 0;
    if (previous == null) {
      // clone the key through the allocator so the stored key is owned by this memtable
      final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
      AtomicBTreePartition empty = new AtomicBTreePartition(cfs.metadata, cloneKey, allocator);
      // We'll add the columns later. This avoids wasting work if we get beaten in the putIfAbsent
      previous = partitions.putIfAbsent(cloneKey, empty);
      if (previous == null) {
        previous = empty;
        // allocate the row overhead after the fact; this saves over-allocating and having to free
        // afterwards, but means we can overshoot our declared limit.
        int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
        allocator.onHeap().allocate(overhead, opGroup);
        // NOTE(review): fixed 8-byte charge for a newly created partition — origin of the
        // constant is not visible here
        initialSize = 8;
      } else {
        // another writer inserted the partition first: give back the key we cloned
        allocator.reclaimer().reclaimImmediately(cloneKey);
      }
    }

    long[] pair = previous.addAllWithSizeDelta(update, opGroup, indexer);
    minTimestamp = Math.min(minTimestamp, previous.stats().minTimestamp);
    liveDataSize.addAndGet(initialSize + pair[0]);
    columnsCollector.update(update.columns());
    statsCollector.update(update.stats());
    currentOperations.addAndGet(update.operationCount());
    return pair[1];
  }

  /** @return the number of partitions currently stored, as reported by the partition map */
  public int partitionCount() {
    return partitions.size();
  }

  /**
   * Renders a one-line summary for logging: table name, {@code hashCode()}, pretty-printed live
   * data size, operation count and this memtable's share of the on-heap and off-heap limits.
   *
   * @return the formatted description
   */
  @Override
  public String toString() {
    return String.format(
        "Memtable-%s@%s(%s serialized bytes, %s ops, %.0f%%/%.0f%% of on/off-heap limit)",
        cfs.name,
        hashCode(),
        FBUtilities.prettyPrintMemory(liveDataSize.get()),
        currentOperations,
        100 * allocator.onHeap().ownershipRatio(),
        100 * allocator.offHeap().ownershipRatio());
  }

  /**
   * Builds an iterator over the partitions whose keys fall inside the key range of {@code
   * dataRange}.
   *
   * <p>A minimum-valued left or right bound is treated as "unbounded" on that side. Inclusiveness
   * of each end is derived from the concrete bounds type: {@code Bounds} includes both ends,
   * {@code IncludingExcludingBounds} only the start, {@code Range} only the stop.
   *
   * @param columnFilter the columns to expose from each partition
   * @param dataRange the key range and per-partition clustering filter
   * @param isForThrift passed through to the resulting iterator
   * @return an iterator over the selected view of this memtable
   */
  public MemtableUnfilteredPartitionIterator makePartitionIterator(
      final ColumnFilter columnFilter, final DataRange dataRange, final boolean isForThrift) {
    final AbstractBounds<PartitionPosition> requested = dataRange.keyRange();

    final boolean unboundedStart = requested.left.isMinimum();
    final boolean unboundedStop = requested.right.isMinimum();

    final boolean closedBothEnds = requested instanceof Bounds;
    final boolean startInclusive =
        closedBothEnds || requested instanceof IncludingExcludingBounds;
    final boolean stopInclusive = closedBothEnds || requested instanceof Range;

    // pick the narrowest view of the partition map that covers the requested range
    final Map<PartitionPosition, AtomicBTreePartition> selected;
    if (unboundedStart && unboundedStop) {
      selected = partitions;
    } else if (unboundedStart) {
      selected = partitions.headMap(requested.right, stopInclusive);
    } else if (unboundedStop) {
      selected = partitions.tailMap(requested.left, startInclusive);
    } else {
      selected =
          partitions.subMap(requested.left, startInclusive, requested.right, stopInclusive);
    }

    // the selected partitions are only scanned for their minimum local deletion time when the
    // compaction strategy manager purges repaired tombstones only; otherwise MAX_VALUE is used
    final int minLocalDeletionTime =
        cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones()
            ? findMinLocalDeletionTime(selected.entrySet().iterator())
            : Integer.MAX_VALUE;

    return new MemtableUnfilteredPartitionIterator(
        cfs,
        selected.entrySet().iterator(),
        isForThrift,
        minLocalDeletionTime,
        columnFilter,
        dataRange);
  }

  /**
   * Drains {@code iterator} and returns the smallest {@code minLocalDeletionTime} found in the
   * stats of the visited partitions.
   *
   * @param iterator entries to inspect; fully consumed by this call
   * @return the minimum value seen, or {@code Integer.MAX_VALUE} if there were no entries
   */
  private int findMinLocalDeletionTime(
      Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iterator) {
    int smallest = Integer.MAX_VALUE;
    while (iterator.hasNext()) {
      final AtomicBTreePartition partition = iterator.next().getValue();
      final int candidate = partition.stats().minLocalDeletionTime;
      if (candidate < smallest) {
        smallest = candidate;
      }
    }
    return smallest;
  }

  /**
   * @param key the partition key to look up
   * @return the partition stored under {@code key}, or {@code null} if this memtable has none
   */
  public Partition getPartition(DecoratedKey key) {
    return partitions.get(key);
  }

  /**
   * Writes this memtable out as sstables in a data directory able to hold its estimated size.
   *
   * @return the sstable readers produced by the write
   * @throws RuntimeException if no writeable location is available for the estimated size
   */
  public Collection<SSTableReader> flush() {
    final long bytesNeeded = estimatedSize();

    final Directories.DataDirectory target =
        cfs.getDirectories().getWriteableLocation(bytesNeeded);
    if (target == null) {
      throw new RuntimeException("Insufficient disk space to write " + bytesNeeded + " bytes");
    }

    final File outputDirectory = cfs.getDirectories().getLocationForDisk(target);
    assert outputDirectory != null : "Flush task is not bound to any disk";

    return writeSortedContents(outputDirectory);
  }

  /**
   * @return the smallest partition timestamp recorded by {@code put()} so far, or {@code
   *     Long.MAX_VALUE} if nothing has been recorded
   */
  public long getMinTimestamp() {
    return minTimestamp;
  }

  /**
   * Test hook: inflates the recorded live data size by 2^50 bytes (1 PiB), giving this memtable
   * too big a size so that flushing it always fails.
   */
  @VisibleForTesting
  public void makeUnflushable() {
    // 1024^5 == 2^50
    final long onePebibyte = 1L << 50;
    liveDataSize.addAndGet(onePebibyte);
  }

  /**
   * Estimates how many bytes flushing this memtable will write.
   *
   * @return (2 x total key bytes + live data size) scaled by 1.2 and truncated to a long
   */
  private long estimatedSize() {
    long totalKeyBytes = 0;
    for (PartitionPosition position : partitions.keySet()) {
      // make sure we don't write nonsensical keys
      assert position instanceof DecoratedKey;
      final DecoratedKey decorated = (DecoratedKey) position;
      totalKeyBytes += decorated.getKey().remaining();
    }

    // keys are counted twice: once for the index entries, once for the keys in the data file
    final long rawBytes = totalKeyBytes + totalKeyBytes + liveDataSize.get();
    // 20% on top for bloom filter and row index overhead
    final double withOverhead = rawBytes * 1.2;
    return (long) withOverhead;
  }

  /**
   * Writes every non-empty partition of this memtable, in the partition map's iteration order, to
   * a new sstable under {@code sstableDirectory}.
   *
   * <p>If nothing was written the writer is aborted and an empty collection is returned. Log
   * statements use SLF4J placeholders so that messages are only formatted when the corresponding
   * level is enabled.
   *
   * @param sstableDirectory directory in which the sstable is created
   * @return the readers for the written sstables, or an empty collection if no data was written
   */
  private Collection<SSTableReader> writeSortedContents(File sstableDirectory) {
    boolean isBatchLogTable =
        cfs.name.equals(SystemKeyspace.BATCHES)
            && cfs.keyspace.getName().equals(SystemKeyspace.NAME);

    // pass the memtable itself: toString() is then only evaluated if debug logging is enabled
    logger.debug("Writing {}", this);

    Collection<SSTableReader> ssTables;
    try (SSTableTxnWriter writer =
        createFlushWriter(
            cfs.getSSTablePath(sstableDirectory), columnsCollector.get(), statsCollector.get())) {
      boolean trackContention = logger.isTraceEnabled();
      int heavilyContendedRowCount = 0;
      // (we can't clear out the map as-we-go to free up memory,
      //  since the memtable is being used for queries in the "pending flush" category)
      for (AtomicBTreePartition partition : partitions.values()) {
        // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
        // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly
        // local, so we don't need to preserve tombstones for repair. So if both operations are in
        // this memtable (which will almost always be the case if there is no ongoing failure), we
        // can just skip the entry (CASSANDRA-4667).
        if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
          continue;

        if (trackContention && partition.usePessimisticLocking()) heavilyContendedRowCount++;

        if (!partition.isEmpty()) {
          try (UnfilteredRowIterator iter = partition.unfilteredIterator()) {
            writer.append(iter);
          }
        }
      }

      if (writer.getFilePointer() > 0) {
        logger.debug(
            "Completed flushing {} ({}) for commitlog position {}",
            writer.getFilename(),
            FBUtilities.prettyPrintMemory(writer.getFilePointer()),
            commitLogUpperBound);

        // sstables should contain non-repaired data.
        ssTables = writer.finish(true);
      } else {
        logger.debug(
            "Completed flushing {}; nothing needed to be retained.  Commitlog position was {}",
            writer.getFilename(),
            commitLogUpperBound);
        writer.abort();
        ssTables = Collections.emptyList();
      }

      // the counter is only ever incremented when trace logging was enabled above
      if (heavilyContendedRowCount > 0)
        logger.trace(
            "High update contention in {}/{} partitions of {} ",
            heavilyContendedRowCount,
            partitions.size(),
            this);

      return ssTables;
    }
  }

  /**
   * Creates the writer used to flush this memtable to {@code filename}.
   *
   * <p>The sstable metadata records the commit log interval covered by this memtable, taken from
   * its lower and upper bound references, and the sstable is created as unrepaired.
   *
   * @param filename path of the sstable to create
   * @param columns the columns to declare in the serialization header
   * @param stats the encoding stats to declare in the serialization header
   * @return a writer wrapping an offline flush transaction
   */
  @SuppressWarnings("resource") // log and writer closed by SSTableTxnWriter
  public SSTableTxnWriter createFlushWriter(
      String filename, PartitionColumns columns, EncodingStats stats) {
    // we operate "offline" here, as we expose the resulting reader consciously when done
    // (although we may want to modify this behaviour in future, to encapsulate full flush behaviour
    // in LifecycleTransaction)
    LifecycleTransaction txn = null;
    try {
      txn = LifecycleTransaction.offline(OperationType.FLUSH);
      MetadataCollector sstableMetadataCollector =
          new MetadataCollector(cfs.metadata.comparator)
              .commitLogIntervals(
                  new IntervalSet(commitLogLowerBound.get(), commitLogUpperBound.get()));

      return new SSTableTxnWriter(
          txn,
          cfs.createSSTableMultiWriter(
              Descriptor.fromFilename(filename),
              (long) partitions.size(),
              ActiveRepairService.UNREPAIRED_SSTABLE,
              sstableMetadataCollector,
              new SerializationHeader(true, cfs.metadata, columns, stats),
              txn));
    } catch (Throwable t) {
      // the writer never took ownership of the transaction, so close it here before rethrowing
      if (txn != null) txn.close();
      throw t;
    }
  }

  /**
   * Measures the approximate heap overhead of one partition entry by filling a throw-away skip
   * list with {@code count} cloned keys and dividing its deep size by {@code count}.
   *
   * <p>The per-entry average is rounded down when its fractional part is below 0.05 and up
   * otherwise; the deep size of a {@code LongToken} is then subtracted and {@code
   * AtomicBTreePartition.EMPTY_SIZE} added.
   *
   * @param count number of sample entries to insert
   * @return the estimated per-partition overhead in bytes
   */
  private static int estimateRowOverhead(final int count) {
    try (final OpOrder.Group group = new OpOrder().start()) {
      final MemtableAllocator allocator = MEMORY_POOL.newAllocator();
      final ConcurrentNavigableMap<PartitionPosition, Object> sample =
          new ConcurrentSkipListMap<>();
      final Object placeholder = new Object();

      int index = 0;
      while (index < count) {
        final BufferDecoratedKey key =
            new BufferDecoratedKey(new LongToken(index), ByteBufferUtil.EMPTY_BYTE_BUFFER);
        sample.put(allocator.clone(key, group), placeholder);
        index++;
      }

      final double averageSize = ObjectSizes.measureDeep(sample) / (double) count;
      final double wholePart = Math.floor(averageSize);
      // round down only when the fractional part is negligible, otherwise round up
      final double rounded = (averageSize - wholePart) < 0.05 ? wholePart : Math.ceil(averageSize);

      int rowOverhead = (int) rounded;
      // the token's own size is accounted for separately by the caller
      rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
      rowOverhead += AtomicBTreePartition.EMPTY_SIZE;

      // release the sampling allocator
      allocator.setDiscarding();
      allocator.setDiscarded();
      return rowOverhead;
    }
  }

  /**
   * Partition iterator over a view of a memtable's partition map. Each call to {@link #next()}
   * applies the data range's clustering filter and the column filter to one stored partition.
   */
  public static class MemtableUnfilteredPartitionIterator
      extends AbstractUnfilteredPartitionIterator {
    private final ColumnFamilyStore cfs;
    // underlying entries, in the order supplied by the caller
    private final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter;
    private final boolean isForThrift;
    private final int minLocalDeletionTime;
    private final ColumnFilter columnFilter;
    private final DataRange dataRange;

    /**
     * @param cfs the store whose metadata this iterator reports
     * @param iter the map entries to iterate over
     * @param isForThrift value reported by {@link #isForThrift()}
     * @param minLocalDeletionTime value reported by {@link #getMinLocalDeletionTime()}
     * @param columnFilter the columns to expose from each partition
     * @param dataRange supplies the clustering filter for each partition key
     */
    public MemtableUnfilteredPartitionIterator(
        ColumnFamilyStore cfs,
        Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter,
        boolean isForThrift,
        int minLocalDeletionTime,
        ColumnFilter columnFilter,
        DataRange dataRange) {
      this.cfs = cfs;
      this.iter = iter;
      this.isForThrift = isForThrift;
      this.minLocalDeletionTime = minLocalDeletionTime;
      this.columnFilter = columnFilter;
      this.dataRange = dataRange;
    }

    /** @return the flag supplied at construction */
    public boolean isForThrift() {
      return isForThrift;
    }

    /** @return the minimum local deletion time supplied at construction */
    public int getMinLocalDeletionTime() {
      return minLocalDeletionTime;
    }

    /** @return the metadata of the store this iterator was created for */
    public CFMetaData metadata() {
      return cfs.metadata;
    }

    /** @return true if the underlying entry iterator has more partitions */
    public boolean hasNext() {
      return iter.hasNext();
    }

    /**
     * Advances to the next partition.
     *
     * @return a row iterator over that partition, restricted by the clustering filter for its key
     *     and by the column filter
     */
    public UnfilteredRowIterator next() {
      Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iter.next();
      // Actual stored key should be true DecoratedKey
      assert entry.getKey() instanceof DecoratedKey;
      DecoratedKey key = (DecoratedKey) entry.getKey();
      ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
      return filter.getUnfilteredRowIterator(columnFilter, entry.getValue());
    }
  }

  /**
   * Tracks which columns have been written to the memtable.
   *
   * <p>Columns known at construction are kept in a fixed map of flags; any column seen later
   * that is not in that map is recorded in a separate concurrent set.
   */
  private static class ColumnsCollector {
    private final HashMap<ColumnDefinition, AtomicBoolean> predefined = new HashMap<>();
    private final ConcurrentSkipListSet<ColumnDefinition> extra = new ConcurrentSkipListSet<>();

    /**
     * Registers every static and regular column of {@code columns} as known but not yet seen.
     *
     * @param columns the columns declared when the collector is created
     */
    ColumnsCollector(PartitionColumns columns) {
      for (ColumnDefinition staticColumn : columns.statics) {
        predefined.put(staticColumn, new AtomicBoolean());
      }
      for (ColumnDefinition regularColumn : columns.regulars) {
        predefined.put(regularColumn, new AtomicBoolean());
      }
    }

    /**
     * Records every static and regular column of {@code columns} as seen.
     *
     * @param columns the columns touched by an update
     */
    public void update(PartitionColumns columns) {
      for (ColumnDefinition staticColumn : columns.statics) {
        update(staticColumn);
      }
      for (ColumnDefinition regularColumn : columns.regulars) {
        update(regularColumn);
      }
    }

    /** Flags a single column as seen, or adds it to the extras if it was not predefined. */
    private void update(ColumnDefinition definition) {
      final AtomicBoolean seen = predefined.get(definition);
      if (seen == null) {
        extra.add(definition);
        return;
      }
      // only write the flag when it is not already set
      if (!seen.get()) {
        seen.set(true);
      }
    }

    /**
     * @return the predefined columns that have been flagged as seen, plus all extra columns
     */
    public PartitionColumns get() {
      final PartitionColumns.Builder result = PartitionColumns.builder();
      for (Map.Entry<ColumnDefinition, AtomicBoolean> entry : predefined.entrySet()) {
        final boolean wasSeen = entry.getValue().get();
        if (wasSeen) {
          result.add(entry.getKey());
        }
      }
      return result.addAll(extra).build();
    }
  }

  /** Accumulates encoding stats by merging each update into an atomically replaced value. */
  private static class StatsCollector {
    private final AtomicReference<EncodingStats> stats =
        new AtomicReference<>(EncodingStats.NO_STATS);

    /**
     * Merges {@code newStats} into the accumulated stats, retrying until the compare-and-set
     * succeeds.
     *
     * @param newStats the stats of the update being applied
     */
    public void update(EncodingStats newStats) {
      EncodingStats observed;
      EncodingStats merged;
      do {
        observed = stats.get();
        merged = observed.mergeWith(newStats);
      } while (!stats.compareAndSet(observed, merged));
    }

    /** @return the stats accumulated so far */
    public EncodingStats get() {
      return stats.get();
    }
  }
}
Exemplo n.º 2
0
 /**
  * Checks this memtable's age against the CF's memtable_flush_period_in_ms.
  *
  * @return true if the period is positive and the time elapsed since creation has reached it;
  *     false otherwise
  */
 public boolean isExpired() {
   final int periodMs = cfs.metadata.params.memtableFlushPeriodInMs;
   if (periodMs > 0) {
     final long elapsedNanos = System.nanoTime() - creationNano;
     return elapsedNanos >= TimeUnit.MILLISECONDS.toNanos(periodMs);
   }
   // a non-positive period means the memtable never expires by age
   return false;
 }