public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter,
                                                                 final DataRange dataRange,
                                                                 final boolean isForThrift)
{
    AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();

    boolean startIsMin = keyRange.left.isMinimum();
    boolean stopIsMin = keyRange.right.isMinimum();

    boolean isBound = keyRange instanceof Bounds;
    boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
    boolean includeStop = isBound || keyRange instanceof Range;

    Map<PartitionPosition, AtomicBTreePartition> subMap;
    if (startIsMin)
        subMap = stopIsMin ? partitions : partitions.headMap(keyRange.right, includeStop);
    else
        subMap = stopIsMin
               ? partitions.tailMap(keyRange.left, includeStart)
               : partitions.subMap(keyRange.left, includeStart, keyRange.right, includeStop);

    int minLocalDeletionTime = Integer.MAX_VALUE;

    // avoid iterating over the memtable if we purge all tombstones
    if (cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones())
        minLocalDeletionTime = findMinLocalDeletionTime(subMap.entrySet().iterator());

    final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter = subMap.entrySet().iterator();

    return new MemtableUnfilteredPartitionIterator(cfs, iter, isForThrift, minLocalDeletionTime, columnFilter, dataRange);
}
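// A quick reference (a sketch, not part of the original source) for how the AbstractBounds
// subtypes tested above map onto start/stop inclusivity for the subMap selection:
//
//   Bounds                    -> [left, right]   includeStart = true,  includeStop = true
//   Range                     -> (left, right]   includeStart = false, includeStop = true
//   IncludingExcludingBounds  -> [left, right)   includeStart = true,  includeStop = false
//   ExcludingBounds           -> (left, right)   includeStart = false, includeStop = false
//
// e.g. for a Range, partitions.subMap(left, false, right, true) excludes the start key and
// includes the stop key.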
/**
 * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the
 * appropriate OpOrdering.
 *
 * <p>replayPosition should only be null if this is a secondary index, in which case it is
 * *expected* to be null
 */
long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
{
    AtomicBTreePartition previous = partitions.get(update.partitionKey());

    long initialSize = 0;
    if (previous == null)
    {
        final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
        AtomicBTreePartition empty = new AtomicBTreePartition(cfs.metadata, cloneKey, allocator);
        // We'll add the columns later. This avoids wasting work if we get beaten in the putIfAbsent.
        previous = partitions.putIfAbsent(cloneKey, empty);
        if (previous == null)
        {
            previous = empty;
            // allocate the row overhead after the fact; this saves over-allocating and having to
            // free afterwards, but means we can overshoot our declared limit.
            int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
            allocator.onHeap().allocate(overhead, opGroup);
            initialSize = 8;
        }
        else
        {
            allocator.reclaimer().reclaimImmediately(cloneKey);
        }
    }

    long[] pair = previous.addAllWithSizeDelta(update, opGroup, indexer);
    minTimestamp = Math.min(minTimestamp, previous.stats().minTimestamp);
    liveDataSize.addAndGet(initialSize + pair[0]);
    columnsCollector.update(update.columns());
    statsCollector.update(update.stats());
    currentOperations.addAndGet(update.operationCount());
    return pair[1];
}
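// A minimal sketch (hypothetical names, not from this class) of the optimistic putIfAbsent
// pattern used above: speculatively build the new value, and if another thread won the race,
// reclaim our copy and continue with the winner's entry.
//
//   ConcurrentMap<K, V> map = new ConcurrentSkipListMap<>();
//   V fresh = makeEmptyValue(key);            // speculative allocation
//   V existing = map.putIfAbsent(key, fresh);
//   if (existing == null)
//       accountForOverhead(fresh);            // we won: charge the allocator for the new entry
//   else
//       reclaim(fresh);                       // we lost: free the speculative allocation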
@SuppressWarnings("resource") // log and writer closed by SSTableTxnWriter public SSTableTxnWriter createFlushWriter( String filename, PartitionColumns columns, EncodingStats stats) { // we operate "offline" here, as we expose the resulting reader consciously when done // (although we may want to modify this behaviour in future, to encapsulate full flush behaviour // in LifecycleTransaction) LifecycleTransaction txn = null; try { txn = LifecycleTransaction.offline(OperationType.FLUSH); MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.metadata.comparator) .commitLogIntervals( new IntervalSet(commitLogLowerBound.get(), commitLogUpperBound.get())); return new SSTableTxnWriter( txn, cfs.createSSTableMultiWriter( Descriptor.fromFilename(filename), (long) partitions.size(), ActiveRepairService.UNREPAIRED_SSTABLE, sstableMetadataCollector, new SerializationHeader(true, cfs.metadata, columns, stats), txn)); } catch (Throwable t) { if (txn != null) txn.close(); throw t; } }
private static int estimateRowOverhead(final int count)
{
    // calculate row overhead
    try (final OpOrder.Group group = new OpOrder().start())
    {
        int rowOverhead;
        MemtableAllocator allocator = MEMORY_POOL.newAllocator();
        ConcurrentNavigableMap<PartitionPosition, Object> partitions = new ConcurrentSkipListMap<>();
        final Object val = new Object();
        for (int i = 0; i < count; i++)
            partitions.put(allocator.clone(new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);

        double avgSize = ObjectSizes.measureDeep(partitions) / (double) count;
        rowOverhead = (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
        rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
        rowOverhead += AtomicBTreePartition.EMPTY_SIZE;
        allocator.setDiscarding();
        allocator.setDiscarded();
        return rowOverhead;
    }
}
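// Reading of the arithmetic above (numbers are illustrative): the average per-entry deep size
// is rounded up unless the fractional part is under 0.05, e.g. avgSize = 104.03 -> 104 but
// avgSize = 104.40 -> 105. The LongToken size is then subtracted because the probe map's
// synthetic token keys are not part of the generic per-row overhead, and the empty
// AtomicBTreePartition size is added back because real memtable entries hold an
// AtomicBTreePartition rather than the bare Object used as the probe value.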
private long estimatedSize()
{
    long keySize = 0;
    for (PartitionPosition key : partitions.keySet())
    {
        // make sure we don't write nonsensical keys
        assert key instanceof DecoratedKey;
        keySize += ((DecoratedKey) key).getKey().remaining();
    }
    return (long) ((keySize // index entries
                    + keySize // keys in data file
                    + liveDataSize.get()) // data
                   * 1.2); // bloom filter and row index overhead
}
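// Worked example (numbers are illustrative): with 1 MiB of key bytes and 10 MiB of live data,
// the estimate is (1 + 1 + 10) MiB * 1.2 = 14.4 MiB. The key bytes are counted once for the
// index entries and once for the keys in the data file, and the whole sum is then padded by
// 20% to cover the bloom filter and row index.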
private Collection<SSTableReader> writeSortedContents(File sstableDirectory)
{
    boolean isBatchLogTable = cfs.name.equals(SystemKeyspace.BATCHES) && cfs.keyspace.getName().equals(SystemKeyspace.NAME);

    logger.debug("Writing {}", Memtable.this.toString());

    Collection<SSTableReader> ssTables;
    try (SSTableTxnWriter writer = createFlushWriter(cfs.getSSTablePath(sstableDirectory), columnsCollector.get(), statsCollector.get()))
    {
        boolean trackContention = logger.isTraceEnabled();
        int heavilyContendedRowCount = 0;
        // (we can't clear out the map as-we-go to free up memory,
        //  since the memtable is being used for queries in the "pending flush" category)
        for (AtomicBTreePartition partition : partitions.values())
        {
            // Each batchlog partition is a separate entry in the log. And for an entry, we only do
            // 2 operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly
            // local, so we don't need to preserve tombstones for repair. So if both operations are
            // in this memtable (which will almost always be the case if there is no ongoing
            // failure), we can just skip the entry (CASSANDRA-4667).
            if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
                continue;

            if (trackContention && partition.usePessimisticLocking())
                heavilyContendedRowCount++;

            if (!partition.isEmpty())
            {
                try (UnfilteredRowIterator iter = partition.unfilteredIterator())
                {
                    writer.append(iter);
                }
            }
        }

        if (writer.getFilePointer() > 0)
        {
            logger.debug(String.format("Completed flushing %s (%s) for commitlog position %s",
                                       writer.getFilename(),
                                       FBUtilities.prettyPrintMemory(writer.getFilePointer()),
                                       commitLogUpperBound));
            // sstables should contain non-repaired data.
            ssTables = writer.finish(true);
        }
        else
        {
            logger.debug("Completed flushing {}; nothing needed to be retained. Commitlog position was {}",
                         writer.getFilename(), commitLogUpperBound);
            writer.abort();
            ssTables = Collections.emptyList();
        }

        if (heavilyContendedRowCount > 0)
            logger.trace(String.format("High update contention in %d/%d partitions of %s",
                                       heavilyContendedRowCount, partitions.size(), Memtable.this.toString()));

        return ssTables;
    }
}
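// A concrete reading of the batchlog skip condition above (a sketch of the three states it
// distinguishes for a batchlog partition in this memtable):
//
//   insert only:                no partition deletion, hasRows == true  -> flushed normally
//   insert + delete both here:  partition deletion,    hasRows == true  -> skipped entirely
//                                                                          (CASSANDRA-4667)
//   delete only (insert already
//   flushed to disk):           partition deletion,    hasRows == false -> tombstone flushed so
//                                                                          it can shadow the
//                                                                          on-disk insert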
public Partition getPartition(DecoratedKey key)
{
    return partitions.get(key);
}
public int partitionCount()
{
    return partitions.size();
}
public boolean isClean()
{
    return partitions.isEmpty();
}