예제 #1
0
  public MemtableUnfilteredPartitionIterator makePartitionIterator(
      final ColumnFilter columnFilter, final DataRange dataRange, final boolean isForThrift) {
    AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();

    boolean startIsMin = keyRange.left.isMinimum();
    boolean stopIsMin = keyRange.right.isMinimum();

    boolean isBound = keyRange instanceof Bounds;
    boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
    boolean includeStop = isBound || keyRange instanceof Range;
    Map<PartitionPosition, AtomicBTreePartition> subMap;
    if (startIsMin)
      subMap = stopIsMin ? partitions : partitions.headMap(keyRange.right, includeStop);
    else
      subMap =
          stopIsMin
              ? partitions.tailMap(keyRange.left, includeStart)
              : partitions.subMap(keyRange.left, includeStart, keyRange.right, includeStop);

    int minLocalDeletionTime = Integer.MAX_VALUE;

    // avoid iterating over the memtable if we purge all tombstones
    if (cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones())
      minLocalDeletionTime = findMinLocalDeletionTime(subMap.entrySet().iterator());

    final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter =
        subMap.entrySet().iterator();

    return new MemtableUnfilteredPartitionIterator(
        cfs, iter, isForThrift, minLocalDeletionTime, columnFilter, dataRange);
  }
예제 #2
0
  /**
   * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the
   * appropriate OpOrdering.
   *
   * <p>replayPosition should only be null if this is a secondary index, in which case it is
   * *expected* to be null
   */
  long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup) {
    AtomicBTreePartition previous = partitions.get(update.partitionKey());

    long initialSize = 0;
    if (previous == null) {
      final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
      AtomicBTreePartition empty = new AtomicBTreePartition(cfs.metadata, cloneKey, allocator);
      // We'll add the columns later. This avoids wasting works if we get beaten in the putIfAbsent
      previous = partitions.putIfAbsent(cloneKey, empty);
      if (previous == null) {
        previous = empty;
        // allocate the row overhead after the fact; this saves over allocating and having to free
        // after, but
        // means we can overshoot our declared limit.
        int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
        allocator.onHeap().allocate(overhead, opGroup);
        initialSize = 8;
      } else {
        allocator.reclaimer().reclaimImmediately(cloneKey);
      }
    }

    long[] pair = previous.addAllWithSizeDelta(update, opGroup, indexer);
    minTimestamp = Math.min(minTimestamp, previous.stats().minTimestamp);
    liveDataSize.addAndGet(initialSize + pair[0]);
    columnsCollector.update(update.columns());
    statsCollector.update(update.stats());
    currentOperations.addAndGet(update.operationCount());
    return pair[1];
  }
예제 #3
0
  @SuppressWarnings("resource") // log and writer closed by SSTableTxnWriter
  public SSTableTxnWriter createFlushWriter(
      String filename, PartitionColumns columns, EncodingStats stats) {
    // we operate "offline" here, as we expose the resulting reader consciously when done
    // (although we may want to modify this behaviour in future, to encapsulate full flush behaviour
    // in LifecycleTransaction)
    LifecycleTransaction txn = null;
    try {
      txn = LifecycleTransaction.offline(OperationType.FLUSH);
      MetadataCollector sstableMetadataCollector =
          new MetadataCollector(cfs.metadata.comparator)
              .commitLogIntervals(
                  new IntervalSet(commitLogLowerBound.get(), commitLogUpperBound.get()));

      return new SSTableTxnWriter(
          txn,
          cfs.createSSTableMultiWriter(
              Descriptor.fromFilename(filename),
              (long) partitions.size(),
              ActiveRepairService.UNREPAIRED_SSTABLE,
              sstableMetadataCollector,
              new SerializationHeader(true, cfs.metadata, columns, stats),
              txn));
    } catch (Throwable t) {
      if (txn != null) txn.close();
      throw t;
    }
  }
예제 #4
0
 private static int estimateRowOverhead(final int count) {
   // calculate row overhead
   try (final OpOrder.Group group = new OpOrder().start()) {
     int rowOverhead;
     MemtableAllocator allocator = MEMORY_POOL.newAllocator();
     ConcurrentNavigableMap<PartitionPosition, Object> partitions = new ConcurrentSkipListMap<>();
     final Object val = new Object();
     for (int i = 0; i < count; i++)
       partitions.put(
           allocator.clone(
               new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group),
           val);
     double avgSize = ObjectSizes.measureDeep(partitions) / (double) count;
     rowOverhead =
         (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
     rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
     rowOverhead += AtomicBTreePartition.EMPTY_SIZE;
     allocator.setDiscarding();
     allocator.setDiscarded();
     return rowOverhead;
   }
 }
예제 #5
0
 private long estimatedSize() {
   long keySize = 0;
   for (PartitionPosition key : partitions.keySet()) {
     //  make sure we don't write non-sensical keys
     assert key instanceof DecoratedKey;
     keySize += ((DecoratedKey) key).getKey().remaining();
   }
   return (long)
       ((keySize // index entries
               + keySize // keys in data file
               + liveDataSize.get()) // data
           * 1.2); // bloom filter and row index overhead
 }
예제 #6
0
  private Collection<SSTableReader> writeSortedContents(File sstableDirectory) {
    boolean isBatchLogTable =
        cfs.name.equals(SystemKeyspace.BATCHES)
            && cfs.keyspace.getName().equals(SystemKeyspace.NAME);

    logger.debug("Writing {}", Memtable.this.toString());

    Collection<SSTableReader> ssTables;
    try (SSTableTxnWriter writer =
        createFlushWriter(
            cfs.getSSTablePath(sstableDirectory), columnsCollector.get(), statsCollector.get())) {
      boolean trackContention = logger.isTraceEnabled();
      int heavilyContendedRowCount = 0;
      // (we can't clear out the map as-we-go to free up memory,
      //  since the memtable is being used for queries in the "pending flush" category)
      for (AtomicBTreePartition partition : partitions.values()) {
        // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
        // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly
        // local,
        // we don't need to preserve tombstones for repair. So if both operation are in this
        // memtable (which will almost always be the case if there is no ongoing failure), we can
        // just skip the entry (CASSANDRA-4667).
        if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
          continue;

        if (trackContention && partition.usePessimisticLocking()) heavilyContendedRowCount++;

        if (!partition.isEmpty()) {
          try (UnfilteredRowIterator iter = partition.unfilteredIterator()) {
            writer.append(iter);
          }
        }
      }

      if (writer.getFilePointer() > 0) {
        logger.debug(
            String.format(
                "Completed flushing %s (%s) for commitlog position %s",
                writer.getFilename(),
                FBUtilities.prettyPrintMemory(writer.getFilePointer()),
                commitLogUpperBound));

        // sstables should contain non-repaired data.
        ssTables = writer.finish(true);
      } else {
        logger.debug(
            "Completed flushing {}; nothing needed to be retained.  Commitlog position was {}",
            writer.getFilename(),
            commitLogUpperBound);
        writer.abort();
        ssTables = Collections.emptyList();
      }

      if (heavilyContendedRowCount > 0)
        logger.trace(
            String.format(
                "High update contention in %d/%d partitions of %s ",
                heavilyContendedRowCount, partitions.size(), Memtable.this.toString()));

      return ssTables;
    }
  }
예제 #7
0
 public Partition getPartition(DecoratedKey key) {
   return partitions.get(key);
 }
예제 #8
0
 public int partitionCount() {
   return partitions.size();
 }
예제 #9
0
 public boolean isClean() {
   return partitions.isEmpty();
 }