/**
 * Validates the shape of a {@link KeyRange} request.
 *
 * <p>The range must specify either both keys or both tokens (never a mix, never neither), and
 * its row count must be positive. When keys are given, the start key's token must not sort after
 * the end key's token unless the end token is the partitioner's minimum token.
 *
 * @param range the range to validate
 * @throws InvalidRequestException if any of the above rules is violated
 */
public static void validateKeyRange(KeyRange range) throws InvalidRequestException {
    final boolean hasStartKey = range.start_key != null;
    final boolean hasEndKey = range.end_key != null;
    if (hasStartKey != hasEndKey) {
        throw new InvalidRequestException(
                "start key and end key must either both be non-null, or both be null");
    }

    final boolean hasStartToken = range.start_token != null;
    final boolean hasEndToken = range.end_token != null;
    if (hasStartToken != hasEndToken) {
        throw new InvalidRequestException(
                "start token and end token must either both be non-null, or both be null");
    }

    // Keys and tokens are mutually exclusive, and one of the two must be present.
    if (hasStartKey == hasStartToken) {
        throw new InvalidRequestException(
                "exactly one of {start key, end key} or {start token, end token} must be specified");
    }

    if (hasStartKey) {
        IPartitioner partitioner = StorageService.getPartitioner();
        Token left = partitioner.getToken(range.start_key);
        Token right = partitioner.getToken(range.end_key);
        boolean outOfOrder =
                left.compareTo(right) > 0 && !right.equals(partitioner.getMinimumToken());
        if (outOfOrder) {
            String message =
                    (partitioner instanceof RandomPartitioner)
                            ? "start key's md5 sorts after end key's md5. this is not allowed; you probably should not specify end key at all, under RandomPartitioner"
                            : "start key must sort before (or equal to) finish key in your partitioner!";
            throw new InvalidRequestException(message);
        }
    }

    if (range.count <= 0) {
        throw new InvalidRequestException("maxRows must be positive");
    }
}
/**
 * Returns an iterator over the Tokens in the given ring, starting with the token for the node
 * owning start (which does not have to be a Token in the ring).
 *
 * <p>If the ring is empty, the iterator yields only the partitioner's minimum token when {@code
 * includeMin} is true, and nothing otherwise. For a non-empty ring the iteration wraps around
 * the end of the list and stops once the cursor comes back to the starting index.
 *
 * @param ring the ring tokens to iterate over
 * @param start the token used to pick the starting index (resolved by {@code firstTokenIndex})
 * @param includeMin True if the minimum token should be returned in the ring even if it has no
 *     owner.
 * @return an iterator over the ring tokens, optionally including the minimum token
 */
public static Iterator<Token> ringIterator(
        final ArrayList<Token> ring, Token start, boolean includeMin) {
    if (ring.isEmpty())
        return includeMin
                ? Iterators.singletonIterator(StorageService.getPartitioner().getMinimumToken())
                : Iterators.<Token>emptyIterator();

    // The minimum token is only injected when it was requested and the first ring token is not
    // already the minimum.
    final boolean insertMin = includeMin && !ring.get(0).isMinimum();
    // NOTE(review): firstTokenIndex presumably may return -1 when insertMin is set — confirm.
    final int startIndex = firstTokenIndex(ring, start, insertMin);
    return new AbstractIterator<Token>() {
        // Cursor: a ring index, -1 for the injected minimum token, or below -1 once exhausted.
        int j = startIndex;

        protected Token computeNext() {
            if (j < -1) return endOfData();
            try {
                // return minimum for index == -1
                if (j == -1) return StorageService.getPartitioner().getMinimumToken();
                // return ring token for other indexes
                return ring.get(j);
            } finally {
                // The cursor is advanced after the return value has been chosen.
                j++;
                // Past the last ring token: wrap to the injected minimum (-1) or to index 0.
                if (j == ring.size()) j = insertMin ? -1 : 0;
                if (j == startIndex)
                    // end iteration
                    j = -2;
            }
        }
    };
}
/**
 * Writes the serialized rows for the given keys to a new SSTable, in the order supplied.
 *
 * <p>When the writer's bloom filter writer reports that it blooms columns, each row is also fed
 * through an {@code ObservingColumnFamilyDeserializer} built on that bloom filter writer.
 *
 * @param sortedKeys the keys to write, in the order they should be appended
 * @return a reader opened on the SSTable that was written
 * @throws IOException if writing fails
 */
private SSTableReader writeSortedContents(List<DecoratedKey> sortedKeys) throws IOException {
    logger.info("Writing " + this);
    SSTableWriter writer =
            new SSTableWriter(
                    cfs.getFlushPath(), sortedKeys.size(), StorageService.getPartitioner());

    // The observer and its reusable input streams are only created when columns are bloomed.
    ObservingColumnFamilyDeserializer observer = null;
    ReentrantByteArrayInputStream bin = null;
    DataInputStream din = null;
    if (writer.getBloomFilterWriter().isBloomColumns()) {
        writer.getBloomFilterWriter().setEstimatedColumnCount(sortedKeys.size() * 10);
        observer = new ObservingColumnFamilyDeserializer(writer.getBloomFilterWriter());
        bin = new ReentrantByteArrayInputStream(new byte[0]);
        din = new DataInputStream(bin);
    }

    for (DecoratedKey key : sortedKeys) {
        byte[] bytes = columnFamilies.get(key);
        assert bytes.length > 0;
        writer.append(key, bytes);
        if (observer == null) {
            continue;
        }
        // Re-point the shared stream at this row before handing it to the observer.
        bin.reset(bytes);
        observer.deserialize(key, din);
    }

    SSTableReader reader = writer.closeAndOpenReader();
    logger.info("Completed flushing " + writer.getFilename());
    return reader;
}
/**
 * Runs a range slice on the given store using bounds whose two ends are both the partitioner's
 * minimum token, an {@code IdentityQueryFilter}, and a row limit of 10000.
 *
 * @param cfs the column family store to query
 * @param superColumn the super column passed through to the store's range slice
 * @return the rows returned by the store
 */
public static List<Row> getRangeSlice(ColumnFamilyStore cfs, ByteBuffer superColumn)
        throws IOException, ExecutionException, InterruptedException {
    Token minimum = StorageService.getPartitioner().getMinimumToken();
    Bounds<Token> bounds = new Bounds<Token>(minimum, minimum);
    return cfs.getRangeSlice(
            superColumn, bounds.toRowBounds(), 10000, new IdentityQueryFilter(), null);
}
/**
 * Creates a pager and, when a paging state is supplied, resumes from it.
 *
 * @param command the range slice command to page over
 * @param consistencyLevel the consistency level for the query
 * @param localQuery whether the query is local
 * @param state the state to resume from, or {@code null} to start fresh
 */
RangeNamesQueryPager(
        RangeSliceCommand command,
        ConsistencyLevel consistencyLevel,
        boolean localQuery,
        PagingState state) {
    this(command, consistencyLevel, localQuery);

    if (state == null) {
        return;
    }

    // Resume: remember the last key handed out and restore the remaining count.
    lastReturnedKey = StorageService.getPartitioner().decorateKey(state.partitionKey);
    restoreState(state.remaining, true);
}
@Test public void testCleanupWithIndexes() throws IOException, ExecutionException, InterruptedException { Keyspace keyspace = Keyspace.open(KEYSPACE1); ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF1); List<Row> rows; // insert data and verify we get it back w/ range query fillCF(cfs, LOOPS); rows = Util.getRangeSlice(cfs); assertEquals(LOOPS, rows.size()); SecondaryIndex index = cfs.indexManager.getIndexForColumn(COLUMN); long start = System.nanoTime(); while (!index.isIndexBuilt(COLUMN) && System.nanoTime() - start < TimeUnit.SECONDS.toNanos(10)) Thread.sleep(10); // verify we get it back w/ index query too IndexExpression expr = new IndexExpression(COLUMN, IndexExpression.Operator.EQ, VALUE); List<IndexExpression> clause = Arrays.asList(expr); IDiskAtomFilter filter = new IdentityQueryFilter(); IPartitioner p = StorageService.getPartitioner(); Range<RowPosition> range = Util.range("", ""); rows = keyspace.getColumnFamilyStore(CF1).search(range, clause, filter, Integer.MAX_VALUE); assertEquals(LOOPS, rows.size()); // we don't allow cleanup when the local host has no range to avoid wipping up all data when a // node has not join the ring. // So to make sure cleanup erase everything here, we give the localhost the tiniest possible // range. 
TokenMetadata tmd = StorageService.instance.getTokenMetadata(); byte[] tk1 = new byte[1], tk2 = new byte[1]; tk1[0] = 2; tk2[0] = 1; tmd.updateNormalToken(new BytesToken(tk1), InetAddress.getByName("127.0.0.1")); tmd.updateNormalToken(new BytesToken(tk2), InetAddress.getByName("127.0.0.2")); CompactionManager.instance.performCleanup(cfs, new CounterId.OneShotRenewer()); // row data should be gone rows = Util.getRangeSlice(cfs); assertEquals(0, rows.size()); // not only should it be gone but there should be no data on disk, not even tombstones assert cfs.getSSTables().isEmpty(); // 2ary indexes should result in no results, too (although tombstones won't be gone until // compacted) rows = cfs.search(range, clause, filter, Integer.MAX_VALUE); assertEquals(0, rows.size()); }
/**
 * Runs a range slice on the given store using row bounds whose two ends are both the
 * partitioner's minimum token, with a limit of 10000.
 *
 * @param cfs the column family store to query
 * @param superColumn when {@code null} an {@code IdentityQueryFilter} is used; otherwise a
 *     {@code SliceQueryFilter} from {@code SuperColumns.startOf(superColumn)} to {@code
 *     SuperColumns.endOf(superColumn)}
 * @return the rows returned by the store
 */
public static List<Row> getRangeSlice(ColumnFamilyStore cfs, ByteBuffer superColumn) {
    IDiskAtomFilter filter;
    if (superColumn == null) {
        filter = new IdentityQueryFilter();
    } else {
        filter =
                new SliceQueryFilter(
                        SuperColumns.startOf(superColumn),
                        SuperColumns.endOf(superColumn),
                        false,
                        Integer.MAX_VALUE);
    }

    Token minimum = StorageService.getPartitioner().getMinimumToken();
    return cfs.getRangeSlice(Bounds.makeRowBounds(minimum, minimum), null, filter, 10000);
}
/** Creates initial set of nodes and tokens. Nodes are added to StorageService as 'normal' */ public static void createInitialRing( StorageService ss, IPartitioner partitioner, List<Token> endpointTokens, List<Token> keyTokens, List<InetAddress> hosts, List<UUID> hostIds, int howMany) throws UnknownHostException { // Expand pool of host IDs as necessary for (int i = hostIdPool.size(); i < howMany; i++) hostIdPool.add(UUID.randomUUID()); for (int i = 0; i < howMany; i++) { endpointTokens.add(new BigIntegerToken(String.valueOf(10 * i))); keyTokens.add(new BigIntegerToken(String.valueOf(10 * i + 5))); hostIds.add(hostIdPool.get(i)); } for (int i = 0; i < endpointTokens.size(); i++) { InetAddress ep = InetAddress.getByName("127.0.0." + String.valueOf(i + 1)); Gossiper.instance.initializeNodeUnsafe(ep, hostIds.get(i), 1); Gossiper.instance.injectApplicationState( ep, ApplicationState.TOKENS, new VersionedValue.VersionedValueFactory(partitioner) .tokens(Collections.singleton(endpointTokens.get(i)))); ss.onChange( ep, ApplicationState.STATUS, new VersionedValue.VersionedValueFactory(partitioner) .normal(Collections.singleton(endpointTokens.get(i)))); hosts.add(ep); } // check that all nodes are in token metadata for (int i = 0; i < endpointTokens.size(); ++i) assertTrue(ss.getTokenMetadata().isMember(hosts.get(i))); }
public void init(String keyspace) { outputHandler.output( String.format( "Starting client (and waiting %d seconds for gossip) ...", StorageService.RING_DELAY / 1000)); try { // Init gossip StorageService.instance.initClient(); Set<InetAddress> hosts = Gossiper.instance.getLiveMembers(); hosts.remove(FBUtilities.getLocalAddress()); if (hosts.isEmpty()) throw new IllegalStateException( "Cannot load any sstable, no live member found in the cluster"); // Query endpoint to ranges map and schemas from thrift String host = hosts.iterator().next().toString().substring(1); int port = DatabaseDescriptor.getRpcPort(); Cassandra.Client client = createThriftClient(host, port); List<TokenRange> tokenRanges = client.describe_ring(keyspace); List<KsDef> ksDefs = client.describe_keyspaces(); Token.TokenFactory tkFactory = StorageService.getPartitioner().getTokenFactory(); try { for (TokenRange tr : tokenRanges) { Range range = new Range(tkFactory.fromString(tr.start_token), tkFactory.fromString(tr.end_token)); for (String ep : tr.endpoints) { addRangeForEndpoint(range, InetAddress.getByName(ep)); } } } catch (UnknownHostException e) { throw new RuntimeException("Got an unknow host from describe_ring()", e); } for (KsDef ksDef : ksDefs) { Set<String> cfs = new HashSet<String>(); for (CfDef cfDef : ksDef.cf_defs) cfs.add(cfDef.name); knownCfs.put(ksDef.name, cfs); } } catch (Exception e) { throw new RuntimeException(e); } }
private void testRangeSliceCommandWrite() throws IOException { IPartitioner part = StorageService.getPartitioner(); AbstractBounds<RowPosition> bounds = new Range<Token>(part.getRandomToken(), part.getRandomToken()).toRowBounds(); RangeSliceCommand namesCmd = new RangeSliceCommand(statics.KS, "Standard1", statics.readTs, namesPred, bounds, 100); MessageOut<RangeSliceCommand> namesCmdMsg = namesCmd.createMessage(); RangeSliceCommand emptyRangeCmd = new RangeSliceCommand(statics.KS, "Standard1", statics.readTs, emptyRangePred, bounds, 100); MessageOut<RangeSliceCommand> emptyRangeCmdMsg = emptyRangeCmd.createMessage(); RangeSliceCommand regRangeCmd = new RangeSliceCommand( statics.KS, "Standard1", statics.readTs, nonEmptyRangePred, bounds, 100); MessageOut<RangeSliceCommand> regRangeCmdMsg = regRangeCmd.createMessage(); RangeSliceCommand namesCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, namesSCPred, bounds, 100); MessageOut<RangeSliceCommand> namesCmdSupMsg = namesCmdSup.createMessage(); RangeSliceCommand emptyRangeCmdSup = new RangeSliceCommand(statics.KS, "Super1", statics.readTs, emptyRangePred, bounds, 100); MessageOut<RangeSliceCommand> emptyRangeCmdSupMsg = emptyRangeCmdSup.createMessage(); RangeSliceCommand regRangeCmdSup = new RangeSliceCommand( statics.KS, "Super1", statics.readTs, nonEmptyRangeSCPred, bounds, 100); MessageOut<RangeSliceCommand> regRangeCmdSupMsg = regRangeCmdSup.createMessage(); DataOutputStream out = getOutput("db.RangeSliceCommand.bin"); namesCmdMsg.serialize(out, getVersion()); emptyRangeCmdMsg.serialize(out, getVersion()); regRangeCmdMsg.serialize(out, getVersion()); namesCmdSupMsg.serialize(out, getVersion()); emptyRangeCmdSupMsg.serialize(out, getVersion()); regRangeCmdSupMsg.serialize(out, getVersion()); out.close(); // test serializedSize testSerializedSize(namesCmd, RangeSliceCommand.serializer); testSerializedSize(emptyRangeCmd, RangeSliceCommand.serializer); testSerializedSize(regRangeCmd, 
RangeSliceCommand.serializer); testSerializedSize(namesCmdSup, RangeSliceCommand.serializer); testSerializedSize(emptyRangeCmdSup, RangeSliceCommand.serializer); testSerializedSize(regRangeCmdSup, RangeSliceCommand.serializer); }
/**
 * Returns keys from the given column family by delegating a {@code RangeCommand} to {@code
 * StorageProxy.getKeyRange}.
 *
 * @param tablename the table to query
 * @param columnFamily the column family to query
 * @param startWith passed to the range command as its start
 * @param stopAt passed to the range command as its stop
 * @param maxResults maximum number of results; must be positive
 * @return the keys returned by the storage proxy
 * @throws InvalidRequestException if the partitioner is not an {@code
 *     OrderPreservingPartitioner} or {@code maxResults} is not positive
 * @throws TException declared by the interface
 */
public List<String> get_key_range(
        String tablename, String columnFamily, String startWith, String stopAt, int maxResults)
        throws InvalidRequestException, TException {
    if (logger.isDebugEnabled()) {
        logger.debug("get_key_range");
    }
    ThriftValidation.validateCommand(tablename, columnFamily);

    boolean orderPreserving =
            StorageService.getPartitioner() instanceof OrderPreservingPartitioner;
    if (!orderPreserving) {
        throw new InvalidRequestException(
                "range queries may only be performed against an order-preserving partitioner");
    }
    if (maxResults <= 0) {
        throw new InvalidRequestException("maxResults must be positive");
    }

    try {
        RangeCommand command =
                new RangeCommand(tablename, columnFamily, startWith, stopAt, maxResults);
        return StorageProxy.getKeyRange(command);
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
/* * The start function initializes the server and start's listening on the * specified port. */ public void start() throws IOException { LogUtil.init(); // LogUtil.setLogLevel("com.facebook", "DEBUG"); // Start the storage service storageService.start(); }
/** * Retrieves a local subBlock * * @param blockId row key * @param sblockId SubBlock column name * @param offset inside the sblock * @return a local sublock * @throws TException */ private LocalBlock getLocalSubBlock( String subBlockCFName, ByteBuffer blockId, ByteBuffer sblockId, int offset) throws TException { DecoratedKey<Token<?>> decoratedKey = new DecoratedKey<Token<?>>(StorageService.getPartitioner().getToken(blockId), blockId); Table table = Table.open(cfsKeyspace); ColumnFamilyStore sblockStore = table.getColumnFamilyStore(subBlockCFName); Collection<SSTableReader> sstables = sblockStore.getSSTables(); for (SSTableReader sstable : sstables) { long position = sstable.getPosition(decoratedKey, Operator.EQ); if (position == -1) continue; String filename = sstable.descriptor.filenameFor(Component.DATA); RandomAccessFile raf = null; int mappedLength = -1; MappedByteBuffer mappedData = null; MappedFileDataInput file = null; try { raf = new RandomAccessFile(filename, "r"); assert position < raf.length(); mappedLength = (raf.length() - position) < Integer.MAX_VALUE ? (int) (raf.length() - position) : Integer.MAX_VALUE; mappedData = raf.getChannel().map(FileChannel.MapMode.READ_ONLY, position, mappedLength); file = new MappedFileDataInput(mappedData, filename, 0); if (file == null) continue; // Verify key was found in data file DecoratedKey keyInDisk = SSTableReader.decodeKey( sstable.partitioner, sstable.descriptor, ByteBufferUtil.readWithShortLength(file)); assert keyInDisk.equals(decoratedKey) : String.format("%s != %s in %s", keyInDisk, decoratedKey, file.getPath()); long rowSize = SSTableReader.readRowSize(file, sstable.descriptor); assert rowSize > 0; assert rowSize < mappedLength; Filter bf = IndexHelper.defreezeBloomFilter(file, sstable.descriptor.usesOldBloomFilter); // verify this column in in this version of the row. 
if (!bf.isPresent(sblockId)) continue; List<IndexHelper.IndexInfo> indexList = IndexHelper.deserializeIndex(file); // we can stop early if bloom filter says none of the // columns actually exist -- but, // we can't stop before initializing the cf above, in // case there's a relevant tombstone ColumnFamilySerializer serializer = ColumnFamily.serializer(); try { ColumnFamily cf = serializer.deserializeFromSSTableNoColumns( ColumnFamily.create(sstable.metadata), file); if (cf.isMarkedForDelete()) continue; } catch (Exception e) { e.printStackTrace(); throw new IOException( serializer + " failed to deserialize " + sstable.getColumnFamilyName() + " with " + sstable.metadata + " from " + file, e); } Integer sblockLength = null; if (indexList == null) sblockLength = seekToSubColumn(sstable.metadata, file, sblockId); else sblockLength = seekToSubColumn(sstable.metadata, file, sblockId, indexList); if (sblockLength == null || sblockLength < 0) continue; int bytesReadFromStart = mappedLength - (int) file.bytesRemaining(); if (logger.isDebugEnabled()) logger.debug("BlockLength = " + sblockLength + " Availible " + file.bytesRemaining()); assert offset <= sblockLength : String.format("%d > %d", offset, sblockLength); long dataOffset = position + bytesReadFromStart; if (file.bytesRemaining() == 0 || sblockLength == 0) continue; return new LocalBlock(file.getPath(), dataOffset + offset, sblockLength - offset); } catch (IOException e) { throw new TException(e); } finally { FileUtils.closeQuietly(raf); } } return null; }
/** Creates a server bound to the instance returned by {@code StorageService.instance()}. */
public CassandraServer() {
    this.storageService = StorageService.instance();
}
/**
 * Builds a row position for the given key using the partitioner returned by {@code
 * StorageService.getPartitioner()}.
 *
 * @param key the key to convert
 * @return the row position produced by the two-argument {@code rp} overload
 */
public static RowPosition rp(String key) {
    RowPosition position = rp(key, StorageService.getPartitioner());
    return position;
}
/**
 * Creates bounds between {@code left} and {@code right}, delegating to the three-argument
 * constructor with the partitioner returned by {@code StorageService.getPartitioner()}.
 *
 * @param left the left end of the bounds
 * @param right the right end of the bounds
 */
public IncludingExcludingBounds(T left, T right) {
    this(left, right, StorageService.getPartitioner());
}
/**
 * Decorates the given raw key with the partitioner returned by {@code
 * StorageService.getPartitioner()}.
 *
 * @param key the raw key bytes
 * @return the decorated key
 */
public static DecoratedKey dk(ByteBuffer key) {
    DecoratedKey decorated = StorageService.getPartitioner().decorateKey(key);
    return decorated;
}
/**
 * Reads a row from the input: first a short-length-prefixed key, which is decorated with the
 * current partitioner, then the column family.
 *
 * @param in the input to read from
 * @param version the version passed to the column family serializer
 * @param flag the flag passed to the column family serializer
 * @return the deserialized row
 * @throws IOException if reading fails
 */
public Row deserialize(DataInput in, int version, ColumnSerializer.Flag flag)
        throws IOException {
    Row row =
            new Row(
                    StorageService.getPartitioner()
                            .decorateKey(ByteBufferUtil.readWithShortLength(in)),
                    ColumnFamily.serializer.deserialize(in, flag, version));
    return row;
}
/**
 * Creates a range between {@code left} and {@code right}, delegating to the three-argument
 * constructor with the partitioner returned by {@code StorageService.getPartitioner()}.
 *
 * @param left the left token of the range
 * @param right the right token of the range
 */
public Range(Token left, Token right) {
    this(left, right, StorageService.getPartitioner());
}
@Test public void testRangeTombstones() throws IOException, ExecutionException, InterruptedException { Keyspace keyspace = Keyspace.open(KEYSPACE1); ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard2"); cfs.clearUnsafe(); // disable compaction while flushing cfs.disableAutoCompaction(); final CFMetaData cfmeta = cfs.metadata; Directories dir = cfs.directories; ArrayList<DecoratedKey> keys = new ArrayList<DecoratedKey>(); for (int i = 0; i < 4; i++) { keys.add(Util.dk("" + i)); } ArrayBackedSortedColumns cf = ArrayBackedSortedColumns.factory.create(cfmeta); cf.addColumn(Util.column("01", "a", 1)); // this must not resurrect cf.addColumn(Util.column("a", "a", 3)); cf.deletionInfo() .add( new RangeTombstone( Util.cellname("0"), Util.cellname("b"), 2, (int) (System.currentTimeMillis() / 1000)), cfmeta.comparator); SSTableWriter writer = new SSTableWriter( cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()), 0, 0, cfs.metadata, StorageService.getPartitioner(), new MetadataCollector(cfs.metadata.comparator)); writer.append(Util.dk("0"), cf); writer.append(Util.dk("1"), cf); writer.append(Util.dk("3"), cf); cfs.addSSTable(writer.closeAndOpenReader()); writer = new SSTableWriter( cfs.getTempSSTablePath(dir.getDirectoryForNewSSTables()), 0, 0, cfs.metadata, StorageService.getPartitioner(), new MetadataCollector(cfs.metadata.comparator)); writer.append(Util.dk("0"), cf); writer.append(Util.dk("1"), cf); writer.append(Util.dk("2"), cf); writer.append(Util.dk("3"), cf); cfs.addSSTable(writer.closeAndOpenReader()); Collection<SSTableReader> toCompact = cfs.getSSTables(); assert toCompact.size() == 2; // Force compaction on first sstables. Since each row is in only one sstable, we will be using // EchoedRow. 
Util.compact(cfs, toCompact); assertEquals(1, cfs.getSSTables().size()); // Now assert we do have the 4 keys assertEquals(4, Util.getRangeSlice(cfs).size()); ArrayList<DecoratedKey> k = new ArrayList<DecoratedKey>(); for (Row r : Util.getRangeSlice(cfs)) { k.add(r.key); assertEquals(ByteBufferUtil.bytes("a"), r.cf.getColumn(Util.cellname("a")).value()); assertNull(r.cf.getColumn(Util.cellname("01"))); assertEquals(3, r.cf.getColumn(Util.cellname("a")).timestamp()); } for (SSTableReader sstable : cfs.getSSTables()) { StatsMetadata stats = sstable.getSSTableMetadata(); assertEquals(ByteBufferUtil.bytes("0"), stats.minColumnNames.get(0)); assertEquals(ByteBufferUtil.bytes("b"), stats.maxColumnNames.get(0)); } assertEquals(keys, k); }
/**
 * Returns the token that the partitioner from {@code StorageService.getPartitioner()} assigns
 * to the bytes of the given string key.
 *
 * @param key the key as a string
 * @return the token for that key
 */
public static Token token(String key) {
    Token result = StorageService.getPartitioner().getToken(ByteBufferUtil.bytes(key));
    return result;
}
/**
 * An in-memory table of pre-serialized rows, keyed by decorated key, that is flushed to an
 * SSTable once the accumulated size reaches a configured threshold.
 */
public class BinaryMemtable implements IFlushable {
    private static final Logger logger = Logger.getLogger(BinaryMemtable.class);

    /** Flush threshold in bytes, derived from the configured BMT threshold. */
    private final int threshold = DatabaseDescriptor.getBMTThreshold() * 1024 * 1024;

    /** Running total of buffer and key lengths added through {@link #resolve}. */
    private final AtomicInteger currentSize = new AtomicInteger(0);

    /* Table and ColumnFamily name are used to determine the ColumnFamilyStore */
    /** Set once a flush has been submitted for this memtable; accessed under {@link #lock}. */
    private boolean isFrozen = false;

    private final Map<DecoratedKey, byte[]> columnFamilies =
            new NonBlockingHashMap<DecoratedKey, byte[]>();

    /* Lock and Condition for notifying new clients about Memtable switches */
    private final Lock lock = new ReentrantLock();
    Condition condition;

    /** Partitioner used both to decorate incoming keys and to write the flushed SSTable. */
    private final IPartitioner partitioner = StorageService.getPartitioner();

    private final ColumnFamilyStore cfs;

    /**
     * Creates an empty memtable for the given store.
     *
     * @param cfs the column family store this memtable flushes into
     */
    public BinaryMemtable(ColumnFamilyStore cfs) {
        this.cfs = cfs;
        condition = lock.newCondition();
    }

    /** Returns true once the accumulated size has reached the threshold. */
    boolean isThresholdViolated() {
        return currentSize.get() >= threshold;
    }

    /*
     * This version is used by the external clients to put data into
     * the memtable. This version will respect the threshold and flush
     * the memtable to disk when the size exceeds the threshold.
     */
    void put(String key, byte[] buffer) {
        if (isThresholdViolated()) {
            lock.lock();
            try {
                if (!isFrozen) {
                    // First writer past the threshold: freeze, flush, and switch memtables.
                    isFrozen = true;
                    cfs.submitFlush(this);
                    cfs.switchBinaryMemtable(key, buffer);
                } else {
                    // Already frozen: hand the write back to the store.
                    cfs.applyBinary(key, buffer);
                }
            } finally {
                lock.unlock();
            }
        } else {
            resolve(key, buffer);
        }
    }

    /** Returns true if no rows have been added. */
    public boolean isClean() {
        return columnFamilies.isEmpty();
    }

    /** Stores the buffer under the decorated key and adds its size to the running total. */
    private void resolve(String key, byte[] buffer) {
        columnFamilies.put(partitioner.decorateKey(key), buffer);
        currentSize.addAndGet(buffer.length + key.length());
    }

    /** Returns the stored keys sorted by their natural ordering. */
    private List<DecoratedKey> getSortedKeys() {
        assert !columnFamilies.isEmpty();
        logger.info("Sorting " + this);
        List<DecoratedKey> keys = new ArrayList<DecoratedKey>(columnFamilies.keySet());
        Collections.sort(keys);
        return keys;
    }

    /**
     * Writes the rows for the given keys to a new SSTable, in the order supplied.
     *
     * @param sortedKeys the keys to write, in the order they should be appended
     * @return a reader opened on the SSTable that was written
     * @throws IOException if writing fails
     */
    private SSTableReader writeSortedContents(List<DecoratedKey> sortedKeys) throws IOException {
        logger.info("Writing " + this);
        String path = cfs.getFlushPath();
        // Use the same partitioner instance that decorated the keys held in this memtable.
        SSTableWriter writer = new SSTableWriter(path, sortedKeys.size(), partitioner);

        boolean bloomColumns = writer.getBloomFilterWriter().isBloomColumns();
        ObservingColumnFamilyDeserializer observer = null;
        DataInputStream din = null;
        ReentrantByteArrayInputStream bin = null;
        if (bloomColumns) {
            writer.getBloomFilterWriter().setEstimatedColumnCount(sortedKeys.size() * 10);
            observer = new ObservingColumnFamilyDeserializer(writer.getBloomFilterWriter());
            bin = new ReentrantByteArrayInputStream(new byte[0]);
            din = new DataInputStream(bin);
        }

        for (DecoratedKey key : sortedKeys) {
            byte[] bytes = columnFamilies.get(key);
            assert bytes.length > 0;
            writer.append(key, bytes);
            if (observer != null) {
                // Re-point the shared stream at this row before handing it to the observer.
                bin.reset(bytes);
                observer.deserialize(key, din);
            }
        }
        SSTableReader sstable = writer.closeAndOpenReader();
        logger.info("Completed flushing " + writer.getFilename());
        return sstable;
    }

    /**
     * Sorts the keys on the {@code sorter} executor, then writes the SSTable on the {@code
     * writer} executor, adds it to the store and signals the given condition.
     *
     * @param condition the condition signalled after the SSTable has been added
     * @param sorter executor on which the keys are sorted
     * @param writer executor on which the SSTable is written
     */
    public void flushAndSignal(
            final Condition condition, ExecutorService sorter, final ExecutorService writer) {
        sorter.submit(
                new Runnable() {
                    public void run() {
                        final List<DecoratedKey> sortedKeys = getSortedKeys();
                        writer.submit(
                                new WrappedRunnable() {
                                    public void runMayThrow() throws IOException {
                                        cfs.addSSTable(writeSortedContents(sortedKeys));
                                        // NOTE(review): Condition.signalAll() normally requires
                                        // the caller to hold the condition's lock — confirm the
                                        // condition passed in tolerates being signalled here.
                                        condition.signalAll();
                                    }
                                });
                    }
                });
    }
}
/**
 * Converts the string key to bytes with {@code type.fromString} and decorates the result with
 * the partitioner returned by {@code StorageService.getPartitioner()}.
 *
 * @param key the key in string form
 * @param type the type used to convert the string
 * @return the decorated key
 */
public static DecoratedKey dk(String key, AbstractType type) {
    DecoratedKey decorated = StorageService.getPartitioner().decorateKey(type.fromString(key));
    return decorated;
}
/**
 * Decorates the bytes of the given string key with the partitioner returned by {@code
 * StorageService.getPartitioner()}.
 *
 * @param key the key in string form
 * @return the decorated key
 */
public static DecoratedKey dk(String key) {
    DecoratedKey decorated =
            StorageService.getPartitioner().decorateKey(ByteBufferUtil.bytes(key));
    return decorated;
}
/**
 * Creates a row from a raw key by decorating it with the partitioner returned by {@code
 * StorageService.getPartitioner()} and delegating to the decorated-key constructor.
 *
 * @param key the raw key bytes
 * @param updates the column family for this row
 */
public Row(ByteBuffer key, ColumnFamily updates) {
    this(StorageService.getPartitioner().decorateKey(key), updates);
}