public ColumnFamilyStore testSingleSSTableCompaction(String strategyClassName) throws Exception {
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.setCompactionStrategyClass(strategyClassName);

    // disable compaction while flushing
    store.disableAutoCompaction();

    long timestamp = populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); // ttl=3s

    store.forceBlockingFlush();
    assertEquals(1, store.getSSTables().size());
    long originalSize = store.getSSTables().iterator().next().uncompressedLength();

    // wait enough to force single compaction
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
        || CompactionManager.instance.getActiveCompactions() > 0) TimeUnit.SECONDS.sleep(1);

    // and sstable with ttl should be compacted
    assertEquals(1, store.getSSTables().size());
    long size = store.getSSTables().iterator().next().uncompressedLength();
    assertTrue("should be less than " + originalSize + ", but was " + size, size < originalSize);

    // make sure max timestamp of compacted sstables is recorded properly after compaction.
    assertMaxTimestamp(store, timestamp);

    return store;
  }
Exemple #2
0
  public int blockForWrites() throws IOException {
    for (Map.Entry<Integer, AtomicInteger> entry : invalidMutations.entrySet())
      logger.info(
          String.format(
              "Skipped %d mutations from unknown (probably removed) CF with id %s",
              entry.getValue().intValue(), entry.getKey()));

    // wait for all the writes to finish on the mutation stage
    FBUtilities.waitOnFutures(futures);
    logger.debug("Finished waiting on mutations from recovery");

    // flush replayed tables
    futures.clear();
    for (Table table : tablesRecovered) futures.addAll(table.flush());
    FBUtilities.waitOnFutures(futures);
    return replayedCount.get();
  }
  public ColumnFamilyStore testSingleSSTableCompaction(String strategyClassName) throws Exception {
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.setCompactionStrategyClass(strategyClassName);

    // disable compaction while flushing
    store.disableAutoCompaction();

    long timestamp = System.currentTimeMillis();
    for (int i = 0; i < 10; i++) {
      DecoratedKey key = Util.dk(Integer.toString(i));
      RowMutation rm = new RowMutation(KEYSPACE1, key.key);
      for (int j = 0; j < 10; j++)
        rm.add(
            "Standard1",
            ByteBufferUtil.bytes(Integer.toString(j)),
            ByteBufferUtil.EMPTY_BYTE_BUFFER,
            timestamp,
            j > 0
                ? 3
                : 0); // let first column never expire, since deleting all columns does not produce
                      // sstable
      rm.apply();
    }
    store.forceBlockingFlush();
    assertEquals(1, store.getSSTables().size());
    long originalSize = store.getSSTables().iterator().next().uncompressedLength();

    // wait enough to force single compaction
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
        || CompactionManager.instance.getActiveCompactions() > 0) TimeUnit.SECONDS.sleep(1);

    // and sstable with ttl should be compacted
    assertEquals(1, store.getSSTables().size());
    long size = store.getSSTables().iterator().next().uncompressedLength();
    assertTrue("should be less than " + originalSize + ", but was " + size, size < originalSize);

    // make sure max timestamp of compacted sstables is recorded properly after compaction.
    assertMaxTimestamp(store, timestamp);

    return store;
  }
  public static void recover(File[] clogs) throws IOException {
    Set<Table> tablesRecovered = new HashSet<Table>();
    List<Future<?>> futures = new ArrayList<Future<?>>();
    byte[] bytes = new byte[4096];
    Map<Integer, AtomicInteger> invalidMutations = new HashMap<Integer, AtomicInteger>();

    for (File file : clogs) {
      int bufferSize = (int) Math.min(file.length(), 32 * 1024 * 1024);
      BufferedRandomAccessFile reader =
          new BufferedRandomAccessFile(file.getAbsolutePath(), "r", bufferSize);

      try {
        CommitLogHeader clHeader = null;
        int replayPosition = 0;
        String headerPath = CommitLogHeader.getHeaderPathFromSegmentPath(file.getAbsolutePath());
        try {
          clHeader = CommitLogHeader.readCommitLogHeader(headerPath);
          replayPosition = clHeader.getReplayPosition();
        } catch (IOException ioe) {
          logger.info(
              headerPath
                  + " incomplete, missing or corrupt.  Everything is ok, don't panic.  CommitLog will be replayed from the beginning");
          logger.debug("exception was", ioe);
        }
        if (replayPosition < 0) {
          logger.debug("skipping replay of fully-flushed {}", file);
          continue;
        }
        reader.seek(replayPosition);

        if (logger.isDebugEnabled())
          logger.debug("Replaying " + file + " starting at " + reader.getFilePointer());

        /* read the logs populate RowMutation and apply */
        while (!reader.isEOF()) {
          if (logger.isDebugEnabled())
            logger.debug("Reading mutation at " + reader.getFilePointer());

          long claimedCRC32;

          Checksum checksum = new CRC32();
          int serializedSize;
          try {
            // any of the reads may hit EOF
            serializedSize = reader.readInt();
            long claimedSizeChecksum = reader.readLong();
            checksum.update(serializedSize);
            if (checksum.getValue() != claimedSizeChecksum || serializedSize <= 0)
              break; // entry wasn't synced correctly/fully.  that's ok.

            if (serializedSize > bytes.length) bytes = new byte[(int) (1.2 * serializedSize)];
            reader.readFully(bytes, 0, serializedSize);
            claimedCRC32 = reader.readLong();
          } catch (EOFException eof) {
            break; // last CL entry didn't get completely written.  that's ok.
          }

          checksum.update(bytes, 0, serializedSize);
          if (claimedCRC32 != checksum.getValue()) {
            // this entry must not have been fsynced.  probably the rest is bad too,
            // but just in case there is no harm in trying them (since we still read on an entry
            // boundary)
            continue;
          }

          /* deserialize the commit log entry */
          ByteArrayInputStream bufIn = new ByteArrayInputStream(bytes, 0, serializedSize);
          RowMutation rm = null;
          try {
            rm = RowMutation.serializer().deserialize(new DataInputStream(bufIn));
          } catch (UnserializableColumnFamilyException ex) {
            AtomicInteger i = invalidMutations.get(ex.cfId);
            if (i == null) {
              i = new AtomicInteger(1);
              invalidMutations.put(ex.cfId, i);
            } else i.incrementAndGet();
            continue;
          }

          if (logger.isDebugEnabled())
            logger.debug(
                String.format(
                    "replaying mutation for %s.%s: %s",
                    rm.getTable(),
                    rm.key(),
                    "{" + StringUtils.join(rm.getColumnFamilies(), ", ") + "}"));
          final Table table = Table.open(rm.getTable());
          tablesRecovered.add(table);
          final Collection<ColumnFamily> columnFamilies =
              new ArrayList<ColumnFamily>(rm.getColumnFamilies());
          final long entryLocation = reader.getFilePointer();
          final CommitLogHeader finalHeader = clHeader;
          final RowMutation frm = rm;
          Runnable runnable =
              new WrappedRunnable() {
                public void runMayThrow() throws IOException {
                  RowMutation newRm = new RowMutation(frm.getTable(), frm.key());

                  // Rebuild the row mutation, omitting column families that a) have already been
                  // flushed,
                  // b) are part of a cf that was dropped. Keep in mind that the cf.name() is
                  // suspect. do every
                  // thing based on the cfid instead.
                  for (ColumnFamily columnFamily : columnFamilies) {
                    if (CFMetaData.getCF(columnFamily.id()) == null)
                      // null means the cf has been dropped
                      continue;

                    if (finalHeader == null
                        || (finalHeader.isDirty(columnFamily.id())
                            && entryLocation >= finalHeader.getPosition(columnFamily.id())))
                      newRm.add(columnFamily);
                  }
                  if (!newRm.isEmpty()) {
                    Table.open(newRm.getTable()).apply(newRm, null, false);
                  }
                }
              };
          futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
          if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT) {
            FBUtilities.waitOnFutures(futures);
            futures.clear();
          }
        }
      } finally {
        reader.close();
        logger.info("Finished reading " + file);
      }
    }

    for (Map.Entry<Integer, AtomicInteger> entry : invalidMutations.entrySet())
      logger.info(
          String.format(
              "Skipped %d mutations from unknown (probably removed) CF with id %d",
              entry.getValue().intValue(), entry.getKey()));

    // wait for all the writes to finish on the mutation stage
    FBUtilities.waitOnFutures(futures);
    logger.debug("Finished waiting on mutations from recovery");

    // flush replayed tables
    futures.clear();
    for (Table table : tablesRecovered) futures.addAll(table.flush());
    FBUtilities.waitOnFutures(futures);
    logger.info("Recovery complete");
  }
Exemple #5
0
  public void recover(File file) throws IOException {
    logger.info("Replaying " + file.getPath());
    final long segment = CommitLogSegment.idFromFilename(file.getName());
    RandomAccessReader reader = RandomAccessReader.open(new File(file.getAbsolutePath()), true);
    try {
      assert reader.length() <= Integer.MAX_VALUE;
      int replayPosition;
      if (globalPosition.segment < segment) replayPosition = 0;
      else if (globalPosition.segment == segment) replayPosition = globalPosition.position;
      else replayPosition = (int) reader.length();

      if (replayPosition < 0 || replayPosition >= reader.length()) {
        // replayPosition > reader.length() can happen if some data gets flushed before it is
        // written to the commitlog
        // (see https://issues.apache.org/jira/browse/CASSANDRA-2285)
        logger.debug("skipping replay of fully-flushed {}", file);
        return;
      }

      reader.seek(replayPosition);

      if (logger.isDebugEnabled())
        logger.debug("Replaying " + file + " starting at " + reader.getFilePointer());

      /* read the logs populate RowMutation and apply */
      while (!reader.isEOF()) {
        if (logger.isDebugEnabled()) logger.debug("Reading mutation at " + reader.getFilePointer());

        long claimedCRC32;
        int serializedSize;
        try {
          // any of the reads may hit EOF
          serializedSize = reader.readInt();
          if (serializedSize == CommitLog.END_OF_SEGMENT_MARKER) {
            logger.debug("Encountered end of segment marker at " + reader.getFilePointer());
            break;
          }

          // RowMutation must be at LEAST 10 bytes:
          // 3 each for a non-empty Table and Key (including the
          // 2-byte length from writeUTF/writeWithShortLength) and 4 bytes for column count.
          // This prevents CRC by being fooled by special-case garbage in the file; see
          // CASSANDRA-2128
          if (serializedSize < 10) break;
          long claimedSizeChecksum = reader.readLong();
          checksum.reset();
          checksum.update(serializedSize);
          if (checksum.getValue() != claimedSizeChecksum)
            break; // entry wasn't synced correctly/fully. that's
          // ok.

          if (serializedSize > buffer.length) buffer = new byte[(int) (1.2 * serializedSize)];
          reader.readFully(buffer, 0, serializedSize);
          claimedCRC32 = reader.readLong();
        } catch (EOFException eof) {
          break; // last CL entry didn't get completely written. that's ok.
        }

        checksum.update(buffer, 0, serializedSize);
        if (claimedCRC32 != checksum.getValue()) {
          // this entry must not have been fsynced. probably the rest is bad too,
          // but just in case there is no harm in trying them (since we still read on an entry
          // boundary)
          continue;
        }

        /* deserialize the commit log entry */
        FastByteArrayInputStream bufIn = new FastByteArrayInputStream(buffer, 0, serializedSize);
        RowMutation rm;
        try {
          // assuming version here. We've gone to lengths to make sure what gets written to the CL
          // is in
          // the current version. so do make sure the CL is drained prior to upgrading a node.
          rm =
              RowMutation.serializer()
                  .deserialize(
                      new DataInputStream(bufIn),
                      MessagingService.version_,
                      IColumnSerializer.Flag.LOCAL);
        } catch (UnknownColumnFamilyException ex) {
          AtomicInteger i = invalidMutations.get(ex.cfId);
          if (i == null) {
            i = new AtomicInteger(1);
            invalidMutations.put(ex.cfId, i);
          } else i.incrementAndGet();
          continue;
        }

        if (logger.isDebugEnabled())
          logger.debug(
              String.format(
                  "replaying mutation for %s.%s: %s",
                  rm.getTable(),
                  ByteBufferUtil.bytesToHex(rm.key()),
                  "{" + StringUtils.join(rm.getColumnFamilies().iterator(), ", ") + "}"));

        final long entryLocation = reader.getFilePointer();
        final RowMutation frm = rm;
        Runnable runnable =
            new WrappedRunnable() {
              public void runMayThrow() throws IOException {
                if (Schema.instance.getKSMetaData(frm.getTable()) == null) return;
                if (pointInTimeExceeded(frm)) return;

                final Table table = Table.open(frm.getTable());
                RowMutation newRm = new RowMutation(frm.getTable(), frm.key());

                // Rebuild the row mutation, omitting column families that
                // a) have already been flushed,
                // b) are part of a cf that was dropped. Keep in mind that the cf.name() is suspect.
                // do every thing based on the cfid instead.
                for (ColumnFamily columnFamily : frm.getColumnFamilies()) {
                  if (Schema.instance.getCF(columnFamily.id()) == null)
                    // null means the cf has been dropped
                    continue;

                  ReplayPosition rp = cfPositions.get(columnFamily.id());

                  // replay if current segment is newer than last flushed one or,
                  // if it is the last known segment, if we are after the replay position
                  if (segment > rp.segment
                      || (segment == rp.segment && entryLocation > rp.position)) {
                    newRm.add(columnFamily);
                    replayedCount.incrementAndGet();
                  }
                }
                if (!newRm.isEmpty()) {
                  Table.open(newRm.getTable()).apply(newRm, false);
                  tablesRecovered.add(table);
                }
              }
            };
        futures.add(StageManager.getStage(Stage.MUTATION).submit(runnable));
        if (futures.size() > MAX_OUTSTANDING_REPLAY_COUNT) {
          FBUtilities.waitOnFutures(futures);
          futures.clear();
        }
      }
    } finally {
      FileUtils.closeQuietly(reader);
      logger.info("Finished reading " + file);
    }
  }
  @Test
  public void testUncheckedTombstoneSizeTieredCompaction() throws Exception {
    Keyspace keyspace = Keyspace.open(KEYSPACE1);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
    store.clearUnsafe();
    store.metadata.gcGraceSeconds(1);
    store.metadata.compactionStrategyOptions.put("tombstone_compaction_interval", "1");
    store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "false");
    store.reload();
    store.setCompactionStrategyClass(SizeTieredCompactionStrategy.class.getName());

    // disable compaction while flushing
    store.disableAutoCompaction();

    // Populate sstable1 with with keys [0..9]
    populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); // ttl=3s
    store.forceBlockingFlush();

    // Populate sstable2 with with keys [10..19] (keys do not overlap with SSTable1)
    long timestamp2 = populate(KEYSPACE1, CF_STANDARD1, 10, 19, 3); // ttl=3s
    store.forceBlockingFlush();

    assertEquals(2, store.getSSTables().size());

    Iterator<SSTableReader> it = store.getSSTables().iterator();
    long originalSize1 = it.next().uncompressedLength();
    long originalSize2 = it.next().uncompressedLength();

    // wait enough to force single compaction
    TimeUnit.SECONDS.sleep(5);

    // enable compaction, submit background and wait for it to complete
    store.enableAutoCompaction();
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
        || CompactionManager.instance.getActiveCompactions() > 0) TimeUnit.SECONDS.sleep(1);

    // even though both sstables were candidate for tombstone compaction
    // it was not executed because they have an overlapping token range
    assertEquals(2, store.getSSTables().size());
    it = store.getSSTables().iterator();
    long newSize1 = it.next().uncompressedLength();
    long newSize2 = it.next().uncompressedLength();
    assertEquals(
        "candidate sstable should not be tombstone-compacted because its key range overlap with other sstable",
        originalSize1,
        newSize1);
    assertEquals(
        "candidate sstable should not be tombstone-compacted because its key range overlap with other sstable",
        originalSize2,
        newSize2);

    // now let's enable the magic property
    store.metadata.compactionStrategyOptions.put("unchecked_tombstone_compaction", "true");
    store.reload();

    // submit background task again and wait for it to complete
    FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
    while (CompactionManager.instance.getPendingTasks() > 0
        || CompactionManager.instance.getActiveCompactions() > 0) TimeUnit.SECONDS.sleep(1);

    // we still have 2 sstables, since they were not compacted against each other
    assertEquals(2, store.getSSTables().size());
    it = store.getSSTables().iterator();
    newSize1 = it.next().uncompressedLength();
    newSize2 = it.next().uncompressedLength();
    assertTrue(
        "should be less than " + originalSize1 + ", but was " + newSize1, newSize1 < originalSize1);
    assertTrue(
        "should be less than " + originalSize2 + ", but was " + newSize2, newSize2 < originalSize2);

    // make sure max timestamp of compacted sstables is recorded properly after compaction.
    assertMaxTimestamp(store, timestamp2);
  }
  @Test
  public void testParallelLeveledCompaction() throws Exception {
    String ksname = "Keyspace1";
    String cfname = "StandardLeveled";
    Keyspace keyspace = Keyspace.open(ksname);
    ColumnFamilyStore store = keyspace.getColumnFamilyStore(cfname);
    store.disableAutoCompaction();

    LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) store.getCompactionStrategy();

    ByteBuffer value =
        ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files

    // Enough data to have a level 1 and 2
    int rows = 128;
    int columns = 10;

    // Adds enough data to trigger multiple sstable per level
    for (int r = 0; r < rows; r++) {
      DecoratedKey key = Util.dk(String.valueOf(r));
      RowMutation rm = new RowMutation(ksname, key.key);
      for (int c = 0; c < columns; c++) {
        rm.add(cfname, ByteBufferUtil.bytes("column" + c), value, 0);
      }
      rm.apply();
      store.forceBlockingFlush();
    }

    // Execute LCS in parallel
    ExecutorService executor =
        new ThreadPoolExecutor(
            4, 4, Long.MAX_VALUE, TimeUnit.SECONDS, new LinkedBlockingDeque<Runnable>());
    List<Runnable> tasks = new ArrayList<Runnable>();
    while (true) {
      while (true) {
        final AbstractCompactionTask t = lcs.getMaximalTask(Integer.MIN_VALUE);
        if (t == null) break;
        tasks.add(
            new Runnable() {
              public void run() {
                t.execute(null);
              }
            });
      }
      if (tasks.isEmpty()) break;

      List<Future<?>> futures = new ArrayList<Future<?>>(tasks.size());
      for (Runnable r : tasks) futures.add(executor.submit(r));
      FBUtilities.waitOnFutures(futures);

      tasks.clear();
    }

    // Assert all SSTables are lined up correctly.
    LeveledManifest manifest = lcs.manifest;
    int levels = manifest.getLevelCount();
    for (int level = 0; level < levels; level++) {
      List<SSTableReader> sstables = manifest.getLevel(level);
      // score check
      assert (double) SSTable.getTotalBytes(sstables) / manifest.maxBytesForLevel(level) < 1.00;
      // overlap check for levels greater than 0
      if (level > 0) {
        for (SSTableReader sstable : sstables) {
          Set<SSTableReader> overlaps = LeveledManifest.overlapping(sstable, sstables);
          assert overlaps.size() == 1 && overlaps.contains(sstable);
        }
      }
    }
    for (SSTableReader sstable : store.getSSTables()) {
      assert sstable.getSSTableLevel() == sstable.getSSTableLevel();
    }
  }