public ColumnFamilyStore testSingleSSTableCompaction(String strategyClassName) throws Exception
{
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.setCompactionStrategyClass(strategyClassName);

    // disable compaction while flushing
    store.disableAutoCompaction();

    long timestamp = populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); // ttl=3s

    store.forceBlockingFlush();
    assertEquals(1, store.getSSTables().size());
    long originalSize = store.getSSTables().iterator().next().uncompressedLength();

    // wait long enough for the ttl'd data to become a single-sstable compaction candidate
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
           || CompactionManager.instance.getActiveCompactions() > 0)
        TimeUnit.SECONDS.sleep(1);

    // the sstable with expired data should have been compacted down
    assertEquals(1, store.getSSTables().size());
    long size = store.getSSTables().iterator().next().uncompressedLength();
    assertTrue("should be less than " + originalSize + ", but was " + size, size < originalSize);

    // make sure the max timestamp of compacted sstables is recorded properly after compaction
    assertMaxTimestamp(store, timestamp);

    return store;
}
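// The populate(...) helper called above is not defined in this section. A minimal
// sketch of what it plausibly looks like, inferred from the inlined loop in the
// older variant of this test further below; the exact signature, the 10-column row
// shape, and the "first column never expires" trick are assumptions drawn from
// that variant, not the verbatim helper.
private long populate(String ks, String cf, int startRowKey, int endRowKey, int ttl)
{
    long timestamp = System.currentTimeMillis();
    for (int i = startRowKey; i <= endRowKey; i++)
    {
        DecoratedKey key = Util.dk(Integer.toString(i));
        RowMutation rm = new RowMutation(ks, key.key);
        for (int j = 0; j < 10; j++)
            rm.add(cf,
                   ByteBufferUtil.bytes(Integer.toString(j)),
                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
                   timestamp,
                   j > 0 ? ttl : 0); // keep the first column alive so the sstable is never empty
        rm.apply();
    }
    return timestamp;
}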
public int blockForWrites() throws IOException
{
    for (Map.Entry<Integer, AtomicInteger> entry : invalidMutations.entrySet())
        logger.info(String.format("Skipped %d mutations from unknown (probably removed) CF with id %d",
                                  entry.getValue().intValue(), entry.getKey()));

    // wait for all the writes to finish on the mutation stage
    FBUtilities.waitOnFutures(futures);
    logger.debug("Finished waiting on mutations from recovery");

    // flush replayed tables
    futures.clear();
    for (Table table : tablesRecovered)
        futures.addAll(table.flush());
    FBUtilities.waitOnFutures(futures);

    return replayedCount.get();
}
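// Hedged sketch of the blocking semantics blockForWrites() relies on: a
// waitOnFutures-style helper presumably just gets each future in turn, surfacing
// any failure as an unchecked exception so recovery aborts loudly. This is an
// assumption about FBUtilities.waitOnFutures, not its verbatim source.
public static void waitOnFutures(Iterable<Future<?>> futures)
{
    for (Future<?> f : futures)
    {
        try
        {
            f.get(); // block until the mutation or flush completes
        }
        catch (Exception e)
        {
            throw new RuntimeException(e);
        }
    }
}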
public ColumnFamilyStore testSingleSSTableCompaction(String strategyClassName) throws Exception
{
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.setCompactionStrategyClass(strategyClassName);

    // disable compaction while flushing
    store.disableAutoCompaction();

    long timestamp = System.currentTimeMillis();
    for (int i = 0; i < 10; i++)
    {
        DecoratedKey key = Util.dk(Integer.toString(i));
        RowMutation rm = new RowMutation(KEYSPACE1, key.key);
        for (int j = 0; j < 10; j++)
            rm.add("Standard1",
                   ByteBufferUtil.bytes(Integer.toString(j)),
                   ByteBufferUtil.EMPTY_BYTE_BUFFER,
                   timestamp,
                   j > 0 ? 3 : 0); // let the first column never expire, since deleting all columns does not produce an sstable
        rm.apply();
    }
    store.forceBlockingFlush();
    assertEquals(1, store.getSSTables().size());
    long originalSize = store.getSSTables().iterator().next().uncompressedLength();

    // wait long enough for the ttl'd columns to become a single-sstable compaction candidate
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
           || CompactionManager.instance.getActiveCompactions() > 0)
        TimeUnit.SECONDS.sleep(1);

    // the sstable with expired columns should have been compacted down
    assertEquals(1, store.getSSTables().size());
    long size = store.getSSTables().iterator().next().uncompressedLength();
    assertTrue("should be less than " + originalSize + ", but was " + size, size < originalSize);

    // make sure the max timestamp of compacted sstables is recorded properly after compaction
    assertMaxTimestamp(store, timestamp);

    return store;
}
public static void recover(File[] clogs) throws IOException
{
    Set<Table> tablesRecovered = new HashSet<Table>();
    List<Future<?>> futures = new ArrayList<Future<?>>();
    byte[] bytes = new byte[4096];
    Map<Integer, AtomicInteger> invalidMutations = new HashMap<Integer, AtomicInteger>();

    for (File file : clogs)
    {
        int bufferSize = (int) Math.min(file.length(), 32 * 1024 * 1024);
        BufferedRandomAccessFile reader = new BufferedRandomAccessFile(file.getAbsolutePath(), "r", bufferSize);

        try
        {
            CommitLogHeader clHeader = null;
            int replayPosition = 0;
            String headerPath = CommitLogHeader.getHeaderPathFromSegmentPath(file.getAbsolutePath());
            try
            {
                clHeader = CommitLogHeader.readCommitLogHeader(headerPath);
                replayPosition = clHeader.getReplayPosition();
            }
            catch (IOException ioe)
            {
                logger.info(headerPath + " incomplete, missing or corrupt. Everything is ok, don't panic. CommitLog will be replayed from the beginning");
                logger.debug("exception was", ioe);
            }
            if (replayPosition < 0)
            {
                logger.debug("skipping replay of fully-flushed {}", file);
                continue;
            }
            reader.seek(replayPosition);

            if (logger.isDebugEnabled())
                logger.debug("Replaying " + file + " starting at " + reader.getFilePointer());

            /* read the log, populate RowMutations and apply them */
            while (!reader.isEOF())
            {
                if (logger.isDebugEnabled())
                    logger.debug("Reading mutation at " + reader.getFilePointer());

                long claimedCRC32;
                Checksum checksum = new CRC32();
                int serializedSize;
                try
                {
                    // any of the reads may hit EOF
                    serializedSize = reader.readInt();
                    long claimedSizeChecksum = reader.readLong();
                    checksum.update(serializedSize);
                    if (checksum.getValue() != claimedSizeChecksum || serializedSize <= 0)
                        break; // entry wasn't synced correctly/fully. that's ok.

                    if (serializedSize > bytes.length)
                        bytes = new byte[(int) (1.2 * serializedSize)];
                    reader.readFully(bytes, 0, serializedSize);
                    claimedCRC32 = reader.readLong();
                }
                catch (EOFException eof)
                {
                    break; // last CL entry didn't get completely written. that's ok.
                }

                checksum.update(bytes, 0, serializedSize);
                if (claimedCRC32 != checksum.getValue())
                {
                    // this entry must not have been fsynced. probably the rest is bad too,
                    // but just in case there is no harm in trying them (since we still read on an entry boundary)
                    continue;
                }

                /* deserialize the commit log entry */
                ByteArrayInputStream bufIn = new ByteArrayInputStream(bytes, 0, serializedSize);
                RowMutation rm = null;
                try
                {
                    rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn));
                }
                catch (UnserializableColumnFamilyException ex)
                {
                    AtomicInteger i = invalidMutations.get(ex.cfId);
                    if (i == null)
                    {
                        i = new AtomicInteger(1);
                        invalidMutations.put(ex.cfId, i);
                    }
                    else
                        i.incrementAndGet();
                    continue;
                }

                if (logger.isDebugEnabled())
                    logger.debug(String.format("replaying mutation for %s.%s: %s",
                                               rm.getTable(),
                                               rm.key(),
                                               "{" + StringUtils.join(rm.getColumnFamilies(), ", ") + "}"));

                final Table table = Table.open(rm.getTable());
                tablesRecovered.add(table);
                final Collection<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>(rm.getColumnFamilies());
                final long entryLocation = reader.getFilePointer();
                final CommitLogHeader finalHeader = clHeader;
                final RowMutation frm = rm;
                Runnable runnable = new WrappedRunnable()
                {
                    public void runMayThrow() throws IOException
                    {
                        RowMutation newRm = new RowMutation(frm.getTable(), frm.key());

                        // Rebuild the row mutation, omitting column families that a) have already been flushed,
                        // b) are part of a cf that was dropped. Keep in mind that the cf.name() is suspect;
                        // do everything based on the cfid instead.
                        for (ColumnFamily columnFamily : columnFamilies)
                        {
                            if (CFMetaData.getCF(columnFamily.id()) == null)
                                continue; // null means the cf has been dropped

                            if (finalHeader == null
                                || (finalHeader.isDirty(columnFamily.id()) && entryLocation >= finalHeader.getPosition(columnFamily.id())))
                                newRm.add(columnFamily);
                        }
                        if (!newRm.isEmpty())
                        {
                            Table.open(newRm.getTable()).apply(newRm, null, false);
                        }
                    }
                };
                futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
                if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
                {
                    FBUtilities.waitOnFutures(futures);
                    futures.clear();
                }
            }
        }
        finally
        {
            reader.close();
            logger.info("Finished reading " + file);
        }
    }

    for (Map.Entry<Integer, AtomicInteger> entry : invalidMutations.entrySet())
        logger.info(String.format("Skipped %d mutations from unknown (probably removed) CF with id %d",
                                  entry.getValue().intValue(), entry.getKey()));

    // wait for all the writes to finish on the mutation stage
    FBUtilities.waitOnFutures(futures);
    logger.debug("Finished waiting on mutations from recovery");

    // flush replayed tables
    futures.clear();
    for (Table table : tablesRecovered)
        futures.addAll(table.flush());
    FBUtilities.waitOnFutures(futures);
    logger.info("Recovery complete");
}
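// For reference, the on-disk entry framing the replay loop above expects is
// [int size][long CRC(size)][size bytes of serialized mutation][long CRC(size + payload)],
// with the second CRC accumulated over both the size field and the payload.
// A minimal writer-side sketch, inferred purely from the reads above; the method
// name and use of DataOutputStream are illustrative, not the actual writer code.
private static void writeLogEntry(DataOutputStream out, byte[] serializedMutation) throws IOException
{
    Checksum checksum = new CRC32();
    out.writeInt(serializedMutation.length);
    checksum.update(serializedMutation.length);        // checksum covers the size field first
    out.writeLong(checksum.getValue());                // claimedSizeChecksum on the read side
    out.write(serializedMutation);
    checksum.update(serializedMutation, 0, serializedMutation.length);
    out.writeLong(checksum.getValue());                // claimedCRC32 on the read side
}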
public void recover(File file) throws IOException
{
    logger.info("Replaying " + file.getPath());
    final long segment = CommitLogSegment.idFromFilename(file.getName());
    RandomAccessReader reader = RandomAccessReader.open(new File(file.getAbsolutePath()), true);
    try
    {
        assert reader.length() <= Integer.MAX_VALUE;

        int replayPosition;
        if (globalPosition.segment < segment)
            replayPosition = 0;
        else if (globalPosition.segment == segment)
            replayPosition = globalPosition.position;
        else
            replayPosition = (int) reader.length();

        if (replayPosition < 0 || replayPosition >= reader.length())
        {
            // replayPosition > reader.length() can happen if some data gets flushed
            // before it is written to the commitlog
            // (see https://issues.apache.org/jira/browse/CASSANDRA-2285)
            logger.debug("skipping replay of fully-flushed {}", file);
            return;
        }

        reader.seek(replayPosition);

        if (logger.isDebugEnabled())
            logger.debug("Replaying " + file + " starting at " + reader.getFilePointer());

        /* read the log, populate RowMutations and apply them */
        while (!reader.isEOF())
        {
            if (logger.isDebugEnabled())
                logger.debug("Reading mutation at " + reader.getFilePointer());

            long claimedCRC32;
            int serializedSize;
            try
            {
                // any of the reads may hit EOF
                serializedSize = reader.readInt();
                if (serializedSize == CommitLog.END_OF_SEGMENT_MARKER)
                {
                    logger.debug("Encountered end of segment marker at " + reader.getFilePointer());
                    break;
                }

                // RowMutation must be at LEAST 10 bytes:
                // 3 each for a non-empty Table and Key (including the 2-byte length from
                // writeUTF/writeWithShortLength) and 4 bytes for column count.
                // This prevents the CRC from being fooled by special-case garbage in the file;
                // see CASSANDRA-2128
                if (serializedSize < 10)
                    break;
                long claimedSizeChecksum = reader.readLong();
                checksum.reset();
                checksum.update(serializedSize);
                if (checksum.getValue() != claimedSizeChecksum)
                    break; // entry wasn't synced correctly/fully. that's ok.

                if (serializedSize > buffer.length)
                    buffer = new byte[(int) (1.2 * serializedSize)];
                reader.readFully(buffer, 0, serializedSize);
                claimedCRC32 = reader.readLong();
            }
            catch (EOFException eof)
            {
                break; // last CL entry didn't get completely written. that's ok.
            }

            checksum.update(buffer, 0, serializedSize);
            if (claimedCRC32 != checksum.getValue())
            {
                // this entry must not have been fsynced. probably the rest is bad too,
                // but just in case there is no harm in trying them (since we still read on an entry boundary)
                continue;
            }

            /* deserialize the commit log entry */
            FastByteArrayInputStream bufIn = new FastByteArrayInputStream(buffer, 0, serializedSize);
            RowMutation rm;
            try
            {
                // assuming version here. We've gone to lengths to make sure what gets written to the CL
                // is in the current version, so do make sure the CL is drained prior to upgrading a node.
                rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn),
                                                          MessagingService.version_,
                                                          IColumnSerializer.Flag.LOCAL);
            }
            catch (UnknownColumnFamilyException ex)
            {
                AtomicInteger i = invalidMutations.get(ex.cfId);
                if (i == null)
                {
                    i = new AtomicInteger(1);
                    invalidMutations.put(ex.cfId, i);
                }
                else
                    i.incrementAndGet();
                continue;
            }

            if (logger.isDebugEnabled())
                logger.debug(String.format("replaying mutation for %s.%s: %s",
                                           rm.getTable(),
                                           ByteBufferUtil.bytesToHex(rm.key()),
                                           "{" + StringUtils.join(rm.getColumnFamilies().iterator(), ", ") + "}"));

            final long entryLocation = reader.getFilePointer();
            final RowMutation frm = rm;
            Runnable runnable = new WrappedRunnable()
            {
                public void runMayThrow() throws IOException
                {
                    if (Schema.instance.getKSMetaData(frm.getTable()) == null)
                        return;
                    if (pointInTimeExceeded(frm))
                        return;

                    final Table table = Table.open(frm.getTable());
                    RowMutation newRm = new RowMutation(frm.getTable(), frm.key());

                    // Rebuild the row mutation, omitting column families that
                    // a) have already been flushed,
                    // b) are part of a cf that was dropped. Keep in mind that the cf.name() is suspect;
                    // do everything based on the cfid instead.
                    for (ColumnFamily columnFamily : frm.getColumnFamilies())
                    {
                        if (Schema.instance.getCF(columnFamily.id()) == null)
                            continue; // null means the cf has been dropped

                        ReplayPosition rp = cfPositions.get(columnFamily.id());

                        // replay if current segment is newer than last flushed one or,
                        // if it is the last known segment, if we are after the replay position
                        if (segment > rp.segment || (segment == rp.segment && entryLocation > rp.position))
                        {
                            newRm.add(columnFamily);
                            replayedCount.incrementAndGet();
                        }
                    }
                    if (!newRm.isEmpty())
                    {
                        Table.open(newRm.getTable()).apply(newRm, false);
                        tablesRecovered.add(table);
                    }
                }
            };
            futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
            if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT)
            {
                FBUtilities.waitOnFutures(futures);
                futures.clear();
            }
        }
    }
    finally
    {
        FileUtils.closeQuietly(reader);
        logger.info("Finished reading " + file);
    }
}
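// The per-column-family replay decision embedded above can be read as a small
// predicate: replay an entry if its segment is newer than the last flushed
// ReplayPosition, or if it sits after the flushed offset within that same segment.
// A minimal sketch; the standalone method name is illustrative, not part of the source.
private static boolean shouldReplay(long entrySegment, long entryLocation, ReplayPosition flushed)
{
    return entrySegment > flushed.segment
           || (entrySegment == flushed.segment && entryLocation > flushed.position);
}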
@Test
public void testUncheckedTombstoneSizeTieredCompaction() throws Exception
{
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.metadata.compactionStrategyOptions.put("tombstone_compaction_interval", "1");
    store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "false");
    store.reload();
    store.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getName());

    // disable compaction while flushing
    store.disableAutoCompaction();

    // populate sstable1 with keys [0..9]
    populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); // ttl=3s
    store.forceBlockingFlush();

    // populate sstable2 with keys [10..19] (keys do not overlap with sstable1)
    long timestamp2 = populate(KEYSPACE1, CF_STANDARD1, 10, 19, 3); // ttl=3s
    store.forceBlockingFlush();

    assertEquals(2, store.getSSTables().size());

    Iterator<SSTableReader> it = store.getSSTables().iterator();
    long originalSize1 = it.next().uncompressedLength();
    long originalSize2 = it.next().uncompressedLength();

    // wait long enough for the ttl'd data to become tombstone-compaction candidates
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
           || CompactionManager.instance.getActiveCompactions() > 0)
        TimeUnit.SECONDS.sleep(1);

    // even though both sstables were candidates for tombstone compaction,
    // it was not executed because they have an overlapping token range
    assertEquals(2, store.getSSTables().size());
    it = store.getSSTables().iterator();
    long newSize1 = it.next().uncompressedLength();
    long newSize2 = it.next().uncompressedLength();
    assertEquals("candidate sstable should not be tombstone-compacted because its key range overlaps with the other sstable",
                 originalSize1, newSize1);
    assertEquals("candidate sstable should not be tombstone-compacted because its key range overlaps with the other sstable",
                 originalSize2, newSize2);

    // now let's enable the magic property
    store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "true");
    store.reload();

    // submit background task again and wait for it to complete
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
           || CompactionManager.instance.getActiveCompactions() > 0)
        TimeUnit.SECONDS.sleep(1);

    // we still have 2 sstables, since they were not compacted against each other
    assertEquals(2, store.getSSTables().size());
    it = store.getSSTables().iterator();
    newSize1 = it.next().uncompressedLength();
    newSize2 = it.next().uncompressedLength();
    assertTrue("should be less than " + originalSize1 + ", but was " + newSize1, newSize1 < originalSize1);
    assertTrue("should be less than " + originalSize2 + ", but was " + newSize2, newSize2 < originalSize2);

    // make sure the max timestamp of compacted sstables is recorded properly after compaction
    assertMaxTimestamp(store, timestamp2);
}
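// Hedged sketch of the gating logic this test exercises, not the verbatim strategy
// code: a single sstable becomes a tombstone-compaction candidate once its
// droppable-tombstone ratio and age pass the configured thresholds, but it is
// skipped when its range overlaps another sstable unless
// unchecked_tombstone_compaction is enabled. All names here are illustrative.
private static boolean isTombstoneCompactionCandidate(double droppableTombstoneRatio,
                                                      double tombstoneThreshold,
                                                      long sstableAgeSeconds,
                                                      long tombstoneCompactionIntervalSeconds,
                                                      boolean overlapsOtherSSTables,
                                                      boolean uncheckedTombstoneCompaction)
{
    if (droppableTombstoneRatio <= tombstoneThreshold)
        return false; // not enough droppable data to bother
    if (sstableAgeSeconds < tombstoneCompactionIntervalSeconds)
        return false; // recently written or compacted; wait
    // an overlapping sstable may still shadow the tombstoned data, so dropping is
    // unsafe unless the operator explicitly opts out of the overlap check
    return uncheckedTombstoneCompaction || !overlapsOtherSSTables;
}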
@Test
public void testParallelLeveledCompaction() throws Exception
{
    String ksname = "Keyspace1";
    String cfname = "StandardLeveled";
    Keyspace keyspace = Keyspace.open(ksname);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(cfname);
    store.disableAutoCompaction();

    LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) store.getCompactionStrategy();

    ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files

    // Enough data to have a level 1 and 2
    int rows = 128;
    int columns = 10;

    // Add enough data to trigger multiple sstables per level
    for (int r = 0; r < rows; r++)
    {
        DecoratedKey key = Util.dk(String.valueOf(r));
        RowMutation rm = new RowMutation(ksname, key.key);
        for (int c = 0; c < columns; c++)
        {
            rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
        }
        rm.apply();
        store.forceBlockingFlush();
    }

    // Execute LCS in parallel
    ExecutorService executor = new ThreadPoolExecutor(4, 4, Long.MAX_VALUE, TimeUnit.SECONDS,
                                                      new LinkedBlockingDeque<Runnable>());
    List<Runnable> tasks = new ArrayList<Runnable>();
    while (true)
    {
        while (true)
        {
            final AbstractCompactionTask t = lcs.getMaximalTask(Integer.MIN_VALUE);
            if (t == null)
                break;
            tasks.add(new Runnable()
            {
                public void run()
                {
                    t.execute(null);
                }
            });
        }
        if (tasks.isEmpty())
            break;

        List<Future<?>> futures = new ArrayList<Future<?>>(tasks.size());
        for (Runnable r : tasks)
            futures.add(executor.submit(r));
        FBUtilities.waitOnFutures(futures);

        tasks.clear();
    }

    // Assert all SSTables are lined up correctly.
    LeveledManifest manifest = lcs.manifest;
    int levels = manifest.getLevelCount();
    for (int level = 0; level < levels; level++)
    {
        List<SSTableReader> sstables = manifest.getLevel(level);
        // score check
        assert (double) SSTable.getTotalBytes(sstables) / manifest.maxBytesForLevel(level) < 1.00;
        // overlap check for levels greater than 0
        if (level > 0)
        {
            for (SSTableReader sstable : sstables)
            {
                Set<SSTableReader> overlaps = LeveledManifest.overlapping(sstable, sstables);
                assert overlaps.size() == 1 && overlaps.contains(sstable);
            }
        }
        // each sstable's own level metadata should agree with the level the manifest placed it in
        for (SSTableReader sstable : sstables)
        {
            assert sstable.getSSTableLevel() == level;
        }
    }
}
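// The score check above divides a level's total bytes by manifest.maxBytesForLevel(level).
// A hedged sketch of how LCS conventionally sizes levels (level N targets roughly
// maxSSTableSize * 10^N); the exact constants, including the L0 allowance, are
// assumptions rather than the verbatim manifest code.
private static long maxBytesForLevel(int level, long maxSSTableSizeBytes)
{
    if (level == 0)
        return 4 * maxSSTableSizeBytes; // L0 is typically allowed a small backlog of flushed sstables
    return (long) Math.pow(10, level) * maxSSTableSizeBytes;
}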