/** Sends local partition map to the oldest node in the topology. */
private void sendPartitions() {
    ClusterNode oldestNode = this.oldestNode.get();

    try {
        sendLocalPartitions(oldestNode, exchId);
    }
    catch (ClusterTopologyCheckedException ignore) {
        if (log.isDebugEnabled())
            log.debug("Oldest node left during partition exchange [nodeId=" + oldestNode.id() +
                ", exchId=" + exchId + ']');
    }
    catch (IgniteCheckedException e) {
        scheduleRecheck();

        U.error(log, "Failed to send local partitions to oldest node (will retry after timeout) " +
            "[oldestNodeId=" + oldestNode.id() + ", exchId=" + exchId + ']', e);
    }
}
/** Rechecks exchange progress: re-requests partitions from remaining nodes or resends the local map. */
private void recheck() {
    // If this is the oldest node.
    if (oldestNode.get().id().equals(cctx.localNodeId())) {
        Collection<UUID> remaining = remaining();

        if (!remaining.isEmpty()) {
            try {
                cctx.io().safeSend(cctx.discovery().nodes(remaining),
                    new GridDhtPartitionsSingleRequest(exchId), SYSTEM_POOL, null);
            }
            catch (IgniteCheckedException e) {
                U.error(log, "Failed to request partitions from nodes [exchangeId=" + exchId +
                    ", nodes=" + remaining + ']', e);
            }
        }
        // Resend full partition map because last attempt failed.
        else {
            if (spreadPartitions())
                onDone(exchId.topologyVersion());
        }
    }
    else
        sendPartitions();

    // Schedule another send.
    scheduleRecheck();
}
/** @return Nodes to execute on. */
private Collection<ClusterNode> nodes() {
    CacheMode cacheMode = cctx.config().getCacheMode();

    switch (cacheMode) {
        case LOCAL:
            if (prj != null)
                U.warn(log, "Ignoring query projection because it's executed over LOCAL cache " +
                    "(only local node will be queried): " + this);

            return Collections.singletonList(cctx.localNode());

        case REPLICATED:
            if (prj != null || partition() != null)
                return nodes(cctx, prj, partition());

            return cctx.affinityNode() ? Collections.singletonList(cctx.localNode()) :
                Collections.singletonList(F.rand(nodes(cctx, null, partition())));

        case PARTITIONED:
            return nodes(cctx, prj, partition());

        default:
            throw new IllegalStateException("Unknown cache distribution mode: " + cacheMode);
    }
}
/**
 * @param cacheCtx Cache context.
 * @return {@code True} if local node can calculate affinity on its own for this partition map exchange.
 */
private boolean canCalculateAffinity(GridCacheContext cacheCtx) {
    AffinityFunction affFunc = cacheCtx.config().getAffinity();

    // Do not request affinity from remote nodes if affinity function is not centralized.
    if (!U.hasAnnotation(affFunc, AffinityCentralizedFunction.class))
        return true;

    // If local node did not initiate exchange or local node is the only cache node in grid.
    Collection<ClusterNode> affNodes = CU.affinityNodes(cacheCtx, exchId.topologyVersion());

    return !exchId.nodeId().equals(cctx.localNodeId()) ||
        (affNodes.size() == 1 && affNodes.contains(cctx.localNode()));
}
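/*
 * Hedged illustration (not Ignite code): canCalculateAffinity() above hinges on detecting a class-level
 * annotation on the configured affinity function. A minimal standalone sketch of such a check, walking the
 * class hierarchy with plain reflection, could look like the following. All names here are hypothetical
 * stand-ins, not the actual U.hasAnnotation implementation.
 */
import java.lang.annotation.Annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

class AnnotationCheckSketch {
    /** Marker annotation standing in for AffinityCentralizedFunction. */
    @Retention(RetentionPolicy.RUNTIME)
    @Target(ElementType.TYPE)
    @interface CentralizedMarker {}

    /** Example: a function type flagged as requiring centralized affinity calculation. */
    @CentralizedMarker
    static class CentralizedAffinityFunction {}

    /** Returns {@code true} if the object's class (or any superclass) carries the given annotation. */
    static boolean hasAnnotation(Object obj, Class<? extends Annotation> annCls) {
        if (obj == null)
            return false;

        for (Class<?> c = obj.getClass(); c != null && c != Object.class; c = c.getSuperclass()) {
            if (c.isAnnotationPresent(annCls))
                return true;
        }

        return false;
    }

    // Usage: hasAnnotation(new CentralizedAffinityFunction(), CentralizedMarker.class) returns true.
}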
/**
 * @param nodeId Node ID.
 * @param retryCnt Number of retries.
 */
private void sendAllPartitions(final UUID nodeId, final int retryCnt) {
    ClusterNode n = cctx.node(nodeId);

    try {
        if (n != null)
            sendAllPartitions(F.asList(n), exchId);
    }
    catch (IgniteCheckedException e) {
        if (e instanceof ClusterTopologyCheckedException || !cctx.discovery().alive(n)) {
            log.debug("Failed to send full partition map to node, node left grid " +
                "[rmtNode=" + nodeId + ", exchangeId=" + exchId + ']');

            return;
        }

        if (retryCnt > 0) {
            long timeout = cctx.gridConfig().getNetworkSendRetryDelay();

            LT.error(log, e, "Failed to send full partition map to node (will retry after timeout) " +
                "[node=" + nodeId + ", exchangeId=" + exchId + ", timeout=" + timeout + ']');

            cctx.time().addTimeoutObject(new GridTimeoutObjectAdapter(timeout) {
                @Override public void onTimeout() {
                    sendAllPartitions(nodeId, retryCnt - 1);
                }
            });
        }
        else
            U.error(log, "Failed to send full partition map [node=" + n + ", exchangeId=" + exchId + ']', e);
    }
}
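/*
 * Hedged illustration (not Ignite code): sendAllPartitions() above retries by re-registering itself with a
 * grid timeout object. A minimal standalone sketch of the same bounded retry-with-delay pattern, using only
 * the JDK ScheduledExecutorService, might look like this. All names below are hypothetical.
 */
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

class RetryWithDelaySketch {
    private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /** Attempts the send; on failure re-schedules itself until the retry budget is exhausted. */
    void sendWithRetry(Runnable send, int retryCnt, long delayMs) {
        try {
            send.run();
        }
        catch (RuntimeException e) {
            if (retryCnt > 0)
                // Try again after the configured delay, consuming one retry.
                timer.schedule(() -> sendWithRetry(send, retryCnt - 1, delayMs), delayMs, TimeUnit.MILLISECONDS);
            else
                System.err.println("Giving up after retries: " + e.getMessage());
        }
    }
}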
/** Dumps pending transactions, explicit locks and cache futures that may block partition release. */
private void dumpPendingObjects() {
    U.warn(log, "Failed to wait for partition release future. Dumping pending objects that might be the cause: " +
        cctx.localNodeId());

    U.warn(log, "Pending transactions:");

    for (IgniteInternalTx tx : cctx.tm().activeTransactions())
        U.warn(log, ">>> " + tx);

    U.warn(log, "Pending explicit locks:");

    for (GridCacheExplicitLockSpan lockSpan : cctx.mvcc().activeExplicitLocks())
        U.warn(log, ">>> " + lockSpan);

    U.warn(log, "Pending cache futures:");

    for (GridCacheFuture<?> fut : cctx.mvcc().activeFutures())
        U.warn(log, ">>> " + fut);

    U.warn(log, "Pending atomic cache futures:");

    for (GridCacheFuture<?> fut : cctx.mvcc().atomicFutures())
        U.warn(log, ">>> " + fut);
}
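/*
 * Hedged illustration (not Ignite code): dumpPendingObjects() above is invoked from timed wait loops in
 * init() further below, which block on internal futures bounded by 2 * network timeout per attempt and dump
 * diagnostics on every timeout instead of failing. A minimal standalone sketch of that wait-loop pattern
 * over a JDK CompletableFuture follows; the names are hypothetical.
 */
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

class TimedWaitLoopSketch {
    /** Blocks until the future completes, logging a diagnostic dump on every timeout interval. */
    static <T> T awaitWithDiagnostics(CompletableFuture<T> fut, long timeoutMs, Runnable dumpDiagnostics)
        throws InterruptedException, ExecutionException {
        while (true) {
            try {
                return fut.get(timeoutMs, TimeUnit.MILLISECONDS);
            }
            catch (TimeoutException ignored) {
                // Future is still not done: print what might be blocking it and keep waiting.
                dumpDiagnostics.run();
            }
        }
    }
}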
/** @return {@code True} if succeeded. */
private boolean spreadPartitions() {
    try {
        sendAllPartitions(rmtNodes, exchId);

        return true;
    }
    catch (IgniteCheckedException e) {
        scheduleRecheck();

        if (!X.hasCause(e, InterruptedException.class))
            U.error(log, "Failed to send full partition map to nodes (will retry after timeout) " +
                "[nodes=" + F.nodeId8s(rmtNodes) + ", exchangeId=" + exchId + ']', e);

        return false;
    }
}
/**
 * Starts activity.
 *
 * @throws IgniteInterruptedCheckedException If interrupted.
 */
public void init() throws IgniteInterruptedCheckedException {
    if (isDone())
        return;

    if (init.compareAndSet(false, true)) {
        if (isDone())
            return;

        try {
            // Wait for event to occur to make sure that discovery
            // will return corresponding nodes.
            U.await(evtLatch);

            assert discoEvt != null : this;
            assert !dummy && !forcePreload : this;

            ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx, exchId.topologyVersion());

            oldestNode.set(oldest);

            startCaches();

            // True if client node joined or failed.
            boolean clientNodeEvt;

            if (F.isEmpty(reqs)) {
                int type = discoEvt.type();

                assert type == EVT_NODE_JOINED || type == EVT_NODE_LEFT || type == EVT_NODE_FAILED : discoEvt;

                clientNodeEvt = CU.clientNode(discoEvt.eventNode());
            }
            else {
                assert discoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT : discoEvt;

                boolean clientOnlyStart = true;

                for (DynamicCacheChangeRequest req : reqs) {
                    if (!req.clientStartOnly()) {
                        clientOnlyStart = false;

                        break;
                    }
                }

                clientNodeEvt = clientOnlyStart;
            }

            if (clientNodeEvt) {
                ClusterNode node = discoEvt.eventNode();

                // Client need to initialize affinity for local join event or for stated client caches.
                if (!node.isLocal()) {
                    for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                        if (cacheCtx.isLocal())
                            continue;

                        GridDhtPartitionTopology top = cacheCtx.topology();

                        top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId()));

                        if (cacheCtx.affinity().affinityTopologyVersion() == AffinityTopologyVersion.NONE) {
                            initTopology(cacheCtx);

                            top.beforeExchange(this);
                        }
                        else
                            cacheCtx.affinity().clientEventTopologyChange(discoEvt, exchId.topologyVersion());
                    }

                    if (exchId.isLeft())
                        cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion());

                    onDone(exchId.topologyVersion());

                    skipPreload = cctx.kernalContext().clientNode();

                    return;
                }
            }

            if (cctx.kernalContext().clientNode()) {
                skipPreload = true;

                for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                    if (cacheCtx.isLocal())
                        continue;

                    GridDhtPartitionTopology top = cacheCtx.topology();

                    top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId()));
                }

                for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                    if (cacheCtx.isLocal())
                        continue;

                    initTopology(cacheCtx);
                }

                if (oldestNode.get() != null) {
                    rmtNodes = new ConcurrentLinkedQueue<>(
                        CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion()));

                    rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes)));

                    ready.set(true);

                    initFut.onDone(true);

                    if (log.isDebugEnabled())
                        log.debug("Initialized future: " + this);

                    sendPartitions();
                }
                else
                    onDone(exchId.topologyVersion());

                return;
            }

            assert oldestNode.get() != null;

            for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                if (isCacheAdded(cacheCtx.cacheId(), exchId.topologyVersion())) {
                    if (cacheCtx.discovery().cacheAffinityNodes(cacheCtx.name(), topologyVersion()).isEmpty())
                        U.quietAndWarn(log, "No server nodes found for cache client: " + cacheCtx.namex());
                }

                cacheCtx.preloader().onExchangeFutureAdded();
            }

            List<String> cachesWithoutNodes = null;

            if (exchId.isLeft()) {
                for (String name : cctx.cache().cacheNames()) {
                    if (cctx.discovery().cacheAffinityNodes(name, topologyVersion()).isEmpty()) {
                        if (cachesWithoutNodes == null)
                            cachesWithoutNodes = new ArrayList<>();

                        cachesWithoutNodes.add(name);

                        // Fire event even if there is no client cache started.
                        if (cctx.gridEvents().isRecordable(EventType.EVT_CACHE_NODES_LEFT)) {
                            Event evt = new CacheEvent(
                                name,
                                cctx.localNode(),
                                cctx.localNode(),
                                "All server nodes have left the cluster.",
                                EventType.EVT_CACHE_NODES_LEFT,
                                0,
                                false,
                                null,
                                null,
                                null,
                                null,
                                false,
                                null,
                                false,
                                null,
                                null,
                                null);

                            cctx.gridEvents().record(evt);
                        }
                    }
                }
            }

            if (cachesWithoutNodes != null) {
                StringBuilder sb = new StringBuilder(
                    "All server nodes for the following caches have left the cluster: ");

                for (int i = 0; i < cachesWithoutNodes.size(); i++) {
                    String cache = cachesWithoutNodes.get(i);

                    sb.append('\'').append(cache).append('\'');

                    if (i != cachesWithoutNodes.size() - 1)
                        sb.append(", ");
                }

                U.quietAndWarn(log, sb.toString());

                U.quietAndWarn(log, "Must have server nodes for caches to operate.");
            }

            assert discoEvt != null;

            assert exchId.nodeId().equals(discoEvt.eventNode().id());

            for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                GridClientPartitionTopology clientTop = cctx.exchange().clearClientTopology(cacheCtx.cacheId());

                long updSeq = clientTop == null ? -1 : clientTop.lastUpdateSequence();

                // Update before waiting for locks.
                if (!cacheCtx.isLocal())
                    cacheCtx.topology().updateTopologyVersion(exchId, this, updSeq, stopping(cacheCtx.cacheId()));
            }

            // Grab all alive remote nodes with order of equal or less than last joined node.
            rmtNodes = new ConcurrentLinkedQueue<>(
                CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion()));

            rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes)));

            for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> m : singleMsgs.entrySet())
                // If received any messages, process them.
                onReceive(m.getKey(), m.getValue());

            for (Map.Entry<UUID, GridDhtPartitionsFullMessage> m : fullMsgs.entrySet())
                // If received any messages, process them.
                onReceive(m.getKey(), m.getValue());

            AffinityTopologyVersion topVer = exchId.topologyVersion();

            for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                if (cacheCtx.isLocal())
                    continue;

                // Must initialize topology after we get discovery event.
                initTopology(cacheCtx);

                cacheCtx.preloader().updateLastExchangeFuture(this);
            }

            IgniteInternalFuture<?> partReleaseFut = cctx.partitionReleaseFuture(topVer);

            // Assign to class variable so it will be included into toString() method.
            this.partReleaseFut = partReleaseFut;

            if (log.isDebugEnabled())
                log.debug("Before waiting for partition release future: " + this);

            while (true) {
                try {
                    partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS);

                    break;
                }
                catch (IgniteFutureTimeoutCheckedException ignored) {
                    // Print pending transactions and locks that might have led to hang.
                    dumpPendingObjects();
                }
            }

            if (log.isDebugEnabled())
                log.debug("After waiting for partition release future: " + this);

            if (!F.isEmpty(reqs))
                blockGateways();

            if (exchId.isLeft())
                cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion());

            IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion());

            while (true) {
                try {
                    locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS);

                    break;
                }
                catch (IgniteFutureTimeoutCheckedException ignored) {
                    U.warn(log, "Failed to wait for locks release future. " +
                        "Dumping pending objects that might be the cause: " + cctx.localNodeId());

                    U.warn(log, "Locked entries:");

                    Map<IgniteTxKey, Collection<GridCacheMvccCandidate>> locks =
                        cctx.mvcc().unfinishedLocks(exchId.topologyVersion());

                    for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet())
                        U.warn(log, "Locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']');
                }
            }

            for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
                if (cacheCtx.isLocal())
                    continue;

                // Notify replication manager.
                GridCacheContext drCacheCtx = cacheCtx.isNear() ? cacheCtx.near().dht().context() : cacheCtx;

                if (drCacheCtx.isDrEnabled())
                    drCacheCtx.dr().beforeExchange(topVer, exchId.isLeft());

                // Partition release future is done so we can flush the write-behind store.
                cacheCtx.store().forceFlush();

                // Process queued undeploys prior to sending/spreading map.
                cacheCtx.preloader().unwindUndeploys();

                GridDhtPartitionTopology top = cacheCtx.topology();

                assert topVer.equals(top.topologyVersion()) :
                    "Topology version is updated only in this class instances inside single ExchangeWorker thread.";

                top.beforeExchange(this);
            }

            for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) {
                top.updateTopologyVersion(exchId, this, -1, stopping(top.cacheId()));

                top.beforeExchange(this);
            }
        }
        catch (IgniteInterruptedCheckedException e) {
            onDone(e);

            throw e;
        }
        catch (Throwable e) {
            U.error(log, "Failed to reinitialize local partitions (preloading will be stopped): " + exchId, e);

            onDone(e);

            if (e instanceof Error)
                throw (Error)e;

            return;
        }

        if (F.isEmpty(rmtIds)) {
            onDone(exchId.topologyVersion());

            return;
        }

        ready.set(true);

        initFut.onDone(true);

        if (log.isDebugEnabled())
            log.debug("Initialized future: " + this);

        // If this node is not oldest.
        if (!oldestNode.get().id().equals(cctx.localNodeId()))
            sendPartitions();
        else {
            boolean allReceived = allReceived();

            if (allReceived && replied.compareAndSet(false, true)) {
                if (spreadPartitions())
                    onDone(exchId.topologyVersion());
            }
        }

        scheduleRecheck();
    }
    else
        assert false : "Skipped init future: " + this;
}
/** Key partition. */
public class GridDhtLocalPartition implements Comparable<GridDhtLocalPartition>, GridReservable {
    /** Maximum size for delete queue. */
    public static final int MAX_DELETE_QUEUE_SIZE =
        Integer.getInteger(IGNITE_ATOMIC_CACHE_DELETE_HISTORY_SIZE, 200_000);

    /** Static logger to avoid re-creation. */
    private static final AtomicReference<IgniteLogger> logRef = new AtomicReference<>();

    /** Logger. */
    private static volatile IgniteLogger log;

    /** Partition ID. */
    private final int id;

    /** State. */
    @GridToStringExclude
    private final AtomicStampedReference<GridDhtPartitionState> state =
        new AtomicStampedReference<>(MOVING, 0);

    /** Rent future. */
    @GridToStringExclude
    private final GridFutureAdapter<?> rent;

    /** Entries map. */
    private final ConcurrentMap<KeyCacheObject, GridDhtCacheEntry> map;

    /** Context. */
    private final GridCacheContext cctx;

    /** Create time. */
    @GridToStringExclude
    private final long createTime = U.currentTimeMillis();

    /** Eviction history. */
    private volatile Map<KeyCacheObject, GridCacheVersion> evictHist = new HashMap<>();

    /** Lock. */
    private final ReentrantLock lock = new ReentrantLock();

    /** Public size counter. */
    private final LongAdder8 mapPubSize = new LongAdder8();

    /** Remove queue. */
    private final GridCircularBuffer<T2<KeyCacheObject, GridCacheVersion>> rmvQueue;

    /** Group reservations. */
    private final CopyOnWriteArrayList<GridDhtPartitionsReservation> reservations =
        new CopyOnWriteArrayList<>();

    /**
     * @param cctx Context.
     * @param id Partition ID.
     */
    @SuppressWarnings("ExternalizableWithoutPublicNoArgConstructor")
    GridDhtLocalPartition(GridCacheContext cctx, int id) {
        assert cctx != null;

        this.id = id;
        this.cctx = cctx;

        log = U.logger(cctx.kernalContext(), logRef, this);

        rent = new GridFutureAdapter<Object>() {
            @Override public String toString() {
                return "PartitionRentFuture [part=" + GridDhtLocalPartition.this + ", map=" + map + ']';
            }
        };

        map = new ConcurrentHashMap8<>(cctx.config().getStartSize() / cctx.affinity().partitions());

        int delQueueSize = CU.isSystemCache(cctx.name()) ? 100 :
            Math.max(MAX_DELETE_QUEUE_SIZE / cctx.affinity().partitions(), 20);

        rmvQueue = new GridCircularBuffer<>(U.ceilPow2(delQueueSize));
    }

    /**
     * Adds group reservation to this partition.
     *
     * @param r Reservation.
     * @return {@code false} If such reservation already added.
     */
    public boolean addReservation(GridDhtPartitionsReservation r) {
        assert state.getReference() != EVICTED : "we can reserve only active partitions";
        assert state.getStamp() != 0 : "partition must be already reserved before adding group reservation";

        return reservations.addIfAbsent(r);
    }

    /** @param r Reservation. */
    public void removeReservation(GridDhtPartitionsReservation r) {
        if (!reservations.remove(r))
            throw new IllegalStateException("Reservation was already removed.");
    }

    /** @return Partition ID. */
    public int id() {
        return id;
    }

    /** @return Create time. */
    long createTime() {
        return createTime;
    }

    /** @return Partition state. */
    public GridDhtPartitionState state() {
        return state.getReference();
    }

    /** @return Reservations. */
    public int reservations() {
        return state.getStamp();
    }

    /** @return Keys belonging to partition. */
    public Set<KeyCacheObject> keySet() {
        return map.keySet();
    }

    /** @return Entries belonging to partition. */
    public Collection<GridDhtCacheEntry> entries() {
        return map.values();
    }

    /** @return {@code True} if partition is empty. */
    public boolean isEmpty() {
        return map.isEmpty();
    }

    /** @return Number of entries in this partition (constant-time method). */
    public int size() {
        return map.size();
    }

    /** Increments public size of the map. */
    public void incrementPublicSize() {
        mapPubSize.increment();
    }

    /** Decrements public size of the map. */
    public void decrementPublicSize() {
        mapPubSize.decrement();
    }

    /** @return Number of public (non-internal) entries in this partition. */
    public int publicSize() {
        return mapPubSize.intValue();
    }

    /** @return If partition is moving or owning or renting. */
    public boolean valid() {
        GridDhtPartitionState state = state();

        return state == MOVING || state == OWNING || state == RENTING;
    }

    /** @param entry Entry to add. */
    void onAdded(GridDhtCacheEntry entry) {
        GridDhtPartitionState state = state();

        if (state == EVICTED)
            throw new GridDhtInvalidPartitionException(id, "Adding entry to invalid partition [part=" + id + ']');

        map.put(entry.key(), entry);

        if (!entry.isInternal())
            mapPubSize.increment();
    }

    /** @param entry Entry to remove. */
    @SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
    void onRemoved(GridDhtCacheEntry entry) {
        assert entry.obsolete();

        // Make sure to remove exactly this entry.
        synchronized (entry) {
            map.remove(entry.key(), entry);

            if (!entry.isInternal() && !entry.deleted())
                mapPubSize.decrement();
        }

        // Attempt to evict.
        tryEvict(true);
    }

    /**
     * @param key Removed key.
     * @param ver Removed version.
     * @throws IgniteCheckedException If failed.
     */
    public void onDeferredDelete(KeyCacheObject key, GridCacheVersion ver) throws IgniteCheckedException {
        try {
            T2<KeyCacheObject, GridCacheVersion> evicted = rmvQueue.add(new T2<>(key, ver));

            if (evicted != null)
                cctx.dht().removeVersionedEntry(evicted.get1(), evicted.get2());
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();

            throw new IgniteInterruptedCheckedException(e);
        }
    }

    /** Locks partition. */
    @SuppressWarnings({"LockAcquiredButNotSafelyReleased"})
    public void lock() {
        lock.lock();
    }

    /** Unlocks partition. */
    public void unlock() {
        lock.unlock();
    }

    /**
     * @param key Key.
     * @param ver Version.
     */
    public void onEntryEvicted(KeyCacheObject key, GridCacheVersion ver) {
        assert key != null;
        assert ver != null;
        assert lock.isHeldByCurrentThread(); // Only one thread can enter this method at a time.

        if (state() != MOVING)
            return;

        Map<KeyCacheObject, GridCacheVersion> evictHist0 = evictHist;

        if (evictHist0 != null) {
            GridCacheVersion ver0 = evictHist0.get(key);

            if (ver0 == null || ver0.isLess(ver)) {
                GridCacheVersion ver1 = evictHist0.put(key, ver);

                assert ver1 == ver0;
            }
        }
    }

    /**
     * Cache preloader should call this method within partition lock.
     *
     * @param key Key.
     * @param ver Version.
     * @return {@code True} if preloading is permitted.
     */
    public boolean preloadingPermitted(KeyCacheObject key, GridCacheVersion ver) {
        assert key != null;
        assert ver != null;
        assert lock.isHeldByCurrentThread(); // Only one thread can enter this method at a time.

        if (state() != MOVING)
            return false;

        Map<KeyCacheObject, GridCacheVersion> evictHist0 = evictHist;

        if (evictHist0 != null) {
            GridCacheVersion ver0 = evictHist0.get(key);

            // Permit preloading if version in history
            // is missing or less than passed in.
            return ver0 == null || ver0.isLess(ver);
        }

        return false;
    }

    /**
     * Reserves a partition so it won't be cleared.
     *
     * @return {@code True} if reserved.
     */
    @Override public boolean reserve() {
        while (true) {
            int reservations = state.getStamp();

            GridDhtPartitionState s = state.getReference();

            if (s == EVICTED)
                return false;

            if (state.compareAndSet(s, s, reservations, reservations + 1))
                return true;
        }
    }

    /** Releases previously reserved partition. */
    @Override public void release() {
        while (true) {
            int reservations = state.getStamp();

            if (reservations == 0)
                return;

            GridDhtPartitionState s = state.getReference();

            assert s != EVICTED;

            // Decrement reservations.
            if (state.compareAndSet(s, s, reservations, --reservations)) {
                tryEvict(true);

                break;
            }
        }
    }

    /** @return {@code True} if transitioned to OWNING state. */
    boolean own() {
        while (true) {
            int reservations = state.getStamp();

            GridDhtPartitionState s = state.getReference();

            if (s == RENTING || s == EVICTED)
                return false;

            if (s == OWNING)
                return true;

            assert s == MOVING;

            if (state.compareAndSet(MOVING, OWNING, reservations, reservations)) {
                if (log.isDebugEnabled())
                    log.debug("Owned partition: " + this);

                // No need to keep history any more.
                evictHist = null;

                return true;
            }
        }
    }

    /**
     * @param updateSeq Update sequence.
     * @return Future to signal that this node is no longer an owner or backup.
     */
    IgniteInternalFuture<?> rent(boolean updateSeq) {
        while (true) {
            int reservations = state.getStamp();

            GridDhtPartitionState s = state.getReference();

            if (s == RENTING || s == EVICTED)
                return rent;

            if (state.compareAndSet(s, RENTING, reservations, reservations)) {
                if (log.isDebugEnabled())
                    log.debug("Moved partition to RENTING state: " + this);

                // Evict asynchronously, as the 'rent' method may be called
                // from within write locks on local partition.
                tryEvictAsync(updateSeq);

                break;
            }
        }

        return rent;
    }

    /**
     * @param updateSeq Update sequence.
     * @return Future for evict attempt.
     */
    IgniteInternalFuture<Boolean> tryEvictAsync(boolean updateSeq) {
        if (map.isEmpty() && !GridQueryProcessor.isEnabled(cctx.config()) &&
            state.compareAndSet(RENTING, EVICTED, 0, 0)) {
            if (log.isDebugEnabled())
                log.debug("Evicted partition: " + this);

            clearSwap();

            if (cctx.isDrEnabled())
                cctx.dr().partitionEvicted(id);

            cctx.dataStructures().onPartitionEvicted(id);

            rent.onDone();

            ((GridDhtPreloader)cctx.preloader()).onPartitionEvicted(this, updateSeq);

            clearDeferredDeletes();

            return new GridFinishedFuture<>(true);
        }

        return cctx.closures().callLocalSafe(
            new GPC<Boolean>() {
                @Override public Boolean call() {
                    return tryEvict(true);
                }
            },
            /*system pool*/true);
    }

    /** @return {@code true} If there is a group reservation. */
    private boolean groupReserved() {
        for (GridDhtPartitionsReservation reservation : reservations) {
            if (!reservation.invalidate())
                return true; // Failed to invalidate reservation -> we are reserved.
        }

        return false;
    }

    /**
     * @param updateSeq Update sequence.
     * @return {@code True} if entry has been transitioned to state EVICTED.
     */
    boolean tryEvict(boolean updateSeq) {
        if (state.getReference() != RENTING || state.getStamp() != 0 || groupReserved())
            return false;

        // Attempt to evict partition entries from cache.
        clearAll();

        if (map.isEmpty() && state.compareAndSet(RENTING, EVICTED, 0, 0)) {
            if (log.isDebugEnabled())
                log.debug("Evicted partition: " + this);

            if (!GridQueryProcessor.isEnabled(cctx.config()))
                clearSwap();

            if (cctx.isDrEnabled())
                cctx.dr().partitionEvicted(id);

            cctx.dataStructures().onPartitionEvicted(id);

            rent.onDone();

            ((GridDhtPreloader)cctx.preloader()).onPartitionEvicted(this, updateSeq);

            clearDeferredDeletes();

            return true;
        }

        return false;
    }

    /** Clears swap entries for evicted partition. */
    private void clearSwap() {
        assert state() == EVICTED;
        assert !GridQueryProcessor.isEnabled(cctx.config()) : "Indexing needs to have unswapped values.";

        try {
            GridCloseableIterator<Map.Entry<byte[], GridCacheSwapEntry>> it = cctx.swap().iterator(id);

            boolean isLocStore = cctx.store().isLocal();

            if (it != null) {
                // We can safely remove these values because no entries will be created for evicted partition.
                while (it.hasNext()) {
                    Map.Entry<byte[], GridCacheSwapEntry> entry = it.next();

                    byte[] keyBytes = entry.getKey();

                    KeyCacheObject key = cctx.toCacheKeyObject(keyBytes);

                    cctx.swap().remove(key);

                    if (isLocStore)
                        cctx.store().remove(null, key.value(cctx.cacheObjectContext(), false));
                }
            }
        }
        catch (IgniteCheckedException e) {
            U.error(log, "Failed to clear swap for evicted partition: " + this, e);
        }
    }

    /** */
    void onUnlock() {
        tryEvict(true);
    }

    /**
     * @param topVer Topology version.
     * @return {@code True} if local node is primary for this partition.
     */
    public boolean primary(AffinityTopologyVersion topVer) {
        return cctx.affinity().primary(cctx.localNode(), id, topVer);
    }

    /** Clears values for this partition. */
    private void clearAll() {
        GridCacheVersion clearVer = cctx.versions().next();

        boolean swap = cctx.isSwapOrOffheapEnabled();

        boolean rec = cctx.events().isRecordable(EVT_CACHE_REBALANCE_OBJECT_UNLOADED);

        Iterator<GridDhtCacheEntry> it = map.values().iterator();

        GridCloseableIterator<Map.Entry<byte[], GridCacheSwapEntry>> swapIt = null;

        if (swap && GridQueryProcessor.isEnabled(cctx.config())) { // Indexing needs to unswap cache values.
            Iterator<GridDhtCacheEntry> unswapIt = null;

            try {
                swapIt = cctx.swap().iterator(id);

                unswapIt = unswapIterator(swapIt);
            }
            catch (Exception e) {
                U.error(log, "Failed to clear swap for evicted partition: " + this, e);
            }

            if (unswapIt != null)
                it = F.concat(it, unswapIt);
        }

        try {
            while (it.hasNext()) {
                GridDhtCacheEntry cached = it.next();

                try {
                    if (cached.clearInternal(clearVer, swap)) {
                        map.remove(cached.key(), cached);

                        if (!cached.isInternal()) {
                            mapPubSize.decrement();

                            if (rec)
                                cctx.events().addEvent(cached.partition(),
                                    cached.key(),
                                    cctx.localNodeId(),
                                    (IgniteUuid)null,
                                    null,
                                    EVT_CACHE_REBALANCE_OBJECT_UNLOADED,
                                    null,
                                    false,
                                    cached.rawGet(),
                                    cached.hasValue(),
                                    null,
                                    null,
                                    null);
                        }
                    }
                }
                catch (IgniteCheckedException e) {
                    U.error(log, "Failed to clear cache entry for evicted partition: " + cached, e);
                }
            }
        }
        finally {
            U.close(swapIt, log);
        }
    }

    /**
     * @param it Swap iterator.
     * @return Unswapping iterator over swapped entries.
     */
    private Iterator<GridDhtCacheEntry> unswapIterator(
        final GridCloseableIterator<Map.Entry<byte[], GridCacheSwapEntry>> it) {
        if (it == null)
            return null;

        return new Iterator<GridDhtCacheEntry>() {
            /** */
            GridDhtCacheEntry lastEntry;

            @Override public boolean hasNext() {
                return it.hasNext();
            }

            @Override public GridDhtCacheEntry next() {
                Map.Entry<byte[], GridCacheSwapEntry> entry = it.next();

                byte[] keyBytes = entry.getKey();

                try {
                    KeyCacheObject key = cctx.toCacheKeyObject(keyBytes);

                    lastEntry = (GridDhtCacheEntry)cctx.cache().entryEx(key, false);

                    lastEntry.unswap(true);

                    return lastEntry;
                }
                catch (IgniteCheckedException e) {
                    throw new CacheException(e);
                }
            }

            @Override public void remove() {
                map.remove(lastEntry.key(), lastEntry);
            }
        };
    }

    /** */
    private void clearDeferredDeletes() {
        rmvQueue.forEach(new CI1<T2<KeyCacheObject, GridCacheVersion>>() {
            @Override public void apply(T2<KeyCacheObject, GridCacheVersion> t) {
                cctx.dht().removeVersionedEntry(t.get1(), t.get2());
            }
        });
    }

    /** {@inheritDoc} */
    @Override public int hashCode() {
        return id;
    }

    /** {@inheritDoc} */
    @SuppressWarnings({"OverlyStrongTypeCast"})
    @Override public boolean equals(Object obj) {
        return obj instanceof GridDhtLocalPartition &&
            (obj == this || ((GridDhtLocalPartition)obj).id() == id);
    }

    /** {@inheritDoc} */
    @Override public int compareTo(@NotNull GridDhtLocalPartition part) {
        if (part == null)
            return 1;

        return Integer.compare(id, part.id());
    }

    /** {@inheritDoc} */
    @Override public String toString() {
        return S.toString(GridDhtLocalPartition.class, this,
            "state", state(),
            "reservations", reservations(),
            "empty", map.isEmpty(),
            "createTime", U.format(createTime),
            "mapPubSize", mapPubSize);
    }
}
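/*
 * Hedged illustration (not Ignite code): reserve(), release(), own() and rent() above pack the partition
 * state (the reference) and the reservation count (the stamp) into one AtomicStampedReference and drive
 * every transition through a CAS retry loop, so state changes and reservation counting stay atomic with
 * respect to each other. A minimal standalone sketch of that pattern, with a simplified state enum and
 * hypothetical names, could look like this.
 */
import java.util.concurrent.atomic.AtomicStampedReference;

class StampedStateSketch {
    enum State { MOVING, OWNING, RENTING, EVICTED }

    /** Reference = current state, stamp = number of outstanding reservations. */
    private final AtomicStampedReference<State> state = new AtomicStampedReference<>(State.MOVING, 0);

    /** Increments the reservation count unless the partition is already evicted. */
    boolean reserve() {
        while (true) {
            int cnt = state.getStamp();
            State s = state.getReference();

            if (s == State.EVICTED)
                return false;

            // Keep the same state, bump the reservation count.
            if (state.compareAndSet(s, s, cnt, cnt + 1))
                return true;
        }
    }

    /** Decrements the reservation count; a no-op when nothing is reserved. */
    void release() {
        while (true) {
            int cnt = state.getStamp();

            if (cnt == 0)
                return;

            State s = state.getReference();

            if (state.compareAndSet(s, s, cnt, cnt - 1))
                return;
        }
    }

    /** Moves MOVING -> OWNING while preserving the reservation count. */
    boolean own() {
        while (true) {
            int cnt = state.getStamp();
            State s = state.getReference();

            if (s == State.RENTING || s == State.EVICTED)
                return false;

            if (s == State.OWNING)
                return true;

            if (state.compareAndSet(State.MOVING, State.OWNING, cnt, cnt))
                return true;
        }
    }
}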
/** {@inheritDoc} */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
@Nullable @Override public GridDhtPartitionMap update(@Nullable GridDhtPartitionExchangeId exchId,
    GridDhtPartitionFullMap partMap) {
    if (log.isDebugEnabled())
        log.debug("Updating full partition map [exchId=" + exchId + ", parts=" + fullMapString() + ']');

    assert partMap != null;

    lock.writeLock().lock();

    try {
        if (stopping)
            return null;

        if (exchId != null && lastExchangeId != null && lastExchangeId.compareTo(exchId) >= 0) {
            if (log.isDebugEnabled())
                log.debug("Stale exchange id for full partition map update (will ignore) [lastExchId=" +
                    lastExchangeId + ", exchId=" + exchId + ']');

            return null;
        }

        if (node2part != null && node2part.compareTo(partMap) >= 0) {
            if (log.isDebugEnabled())
                log.debug("Stale partition map for full partition map update (will ignore) [lastExchId=" +
                    lastExchangeId + ", exchId=" + exchId + ", curMap=" + node2part + ", newMap=" + partMap + ']');

            return null;
        }

        long updateSeq = this.updateSeq.incrementAndGet();

        if (exchId != null)
            lastExchangeId = exchId;

        if (node2part != null) {
            for (GridDhtPartitionMap part : node2part.values()) {
                GridDhtPartitionMap newPart = partMap.get(part.nodeId());

                // If for some nodes current partition has a newer map,
                // then we keep the newer value.
                if (newPart != null && newPart.updateSequence() < part.updateSequence()) {
                    if (log.isDebugEnabled())
                        log.debug("Overriding partition map in full update map [exchId=" + exchId +
                            ", curPart=" + mapString(part) + ", newPart=" + mapString(newPart) + ']');

                    partMap.put(part.nodeId(), part);
                }
            }

            for (Iterator<UUID> it = partMap.keySet().iterator(); it.hasNext(); ) {
                UUID nodeId = it.next();

                if (!cctx.discovery().alive(nodeId)) {
                    if (log.isDebugEnabled())
                        log.debug("Removing left node from full map update [nodeId=" + nodeId +
                            ", partMap=" + partMap + ']');

                    it.remove();
                }
            }
        }

        node2part = partMap;

        Map<Integer, Set<UUID>> p2n = new HashMap<>(cctx.affinity().partitions(), 1.0f);

        for (Map.Entry<UUID, GridDhtPartitionMap> e : partMap.entrySet()) {
            for (Integer p : e.getValue().keySet()) {
                Set<UUID> ids = p2n.get(p);

                if (ids == null)
                    // Initialize HashSet to size 3 in anticipation that there won't be
                    // more than 3 nodes per partition.
                    p2n.put(p, ids = U.newHashSet(3));

                ids.add(e.getKey());
            }
        }

        part2node = p2n;

        boolean changed = checkEvictions(updateSeq);

        consistencyCheck();

        if (log.isDebugEnabled())
            log.debug("Partition map after full update: " + fullMapString());

        return changed ? localPartitionMap() : null;
    }
    finally {
        lock.writeLock().unlock();
    }
}
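/*
 * Hedged illustration (not Ignite code): the full-map update above keeps the locally known per-node map
 * whenever its update sequence is newer than the one arriving in the full map. A minimal standalone sketch
 * of that "merge by highest sequence" rule over plain JDK maps follows; the types are simplified stand-ins.
 */
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

class SequenceMergeSketch {
    /** Simplified stand-in for a per-node partition map carrying an update sequence. */
    static final class NodeMap {
        final long updateSeq;

        NodeMap(long updateSeq) {
            this.updateSeq = updateSeq;
        }
    }

    /** Overwrites entries of the incoming map with current ones that have a strictly newer sequence. */
    static Map<UUID, NodeMap> merge(Map<UUID, NodeMap> current, Map<UUID, NodeMap> incoming) {
        Map<UUID, NodeMap> res = new HashMap<>(incoming);

        for (Map.Entry<UUID, NodeMap> e : current.entrySet()) {
            NodeMap newPart = res.get(e.getKey());

            // Keep the newer (higher-sequence) value, mirroring the full-map update above.
            if (newPart != null && newPart.updateSeq < e.getValue().updateSeq)
                res.put(e.getKey(), e.getValue());
        }

        return res;
    }
}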
/**
 * Updates value for single partition.
 *
 * @param p Partition.
 * @param nodeId Node ID.
 * @param state State.
 * @param updateSeq Update sequence.
 */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
private void updateLocal(int p, UUID nodeId, GridDhtPartitionState state, long updateSeq) {
    assert lock.isWriteLockedByCurrentThread();
    assert nodeId.equals(cctx.nodeId());

    // If a node joins, get topology at the time of the joining node.
    ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldest != null;

    // If this node became the oldest node.
    if (oldest.id().equals(cctx.nodeId())) {
        long seq = node2part.updateSequence();

        if (seq != updateSeq) {
            if (seq > updateSeq) {
                if (this.updateSeq.get() < seq) {
                    // Update global counter if necessary.
                    boolean b = this.updateSeq.compareAndSet(this.updateSeq.get(), seq + 1);

                    assert b : "Invalid update sequence [updateSeq=" + updateSeq + ", seq=" + seq +
                        ", curUpdateSeq=" + this.updateSeq.get() +
                        ", node2part=" + node2part.toFullString() + ']';

                    updateSeq = seq + 1;
                }
                else
                    updateSeq = seq;
            }

            node2part.updateSequence(updateSeq);
        }
    }

    GridDhtPartitionMap map = node2part.get(nodeId);

    if (map == null)
        node2part.put(nodeId, map = new GridDhtPartitionMap(nodeId, updateSeq,
            Collections.<Integer, GridDhtPartitionState>emptyMap(), false));

    map.updateSequence(updateSeq);

    map.put(p, state);

    Set<UUID> ids = part2node.get(p);

    if (ids == null)
        part2node.put(p, ids = U.newHashSet(3));

    ids.add(nodeId);
}
/** {@inheritDoc} */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
@Nullable @Override public GridDhtPartitionMap update(@Nullable GridDhtPartitionExchangeId exchId,
    GridDhtPartitionMap parts) {
    if (log.isDebugEnabled())
        log.debug("Updating single partition map [exchId=" + exchId + ", parts=" + mapString(parts) + ']');

    if (!cctx.discovery().alive(parts.nodeId())) {
        if (log.isDebugEnabled())
            log.debug("Received partition update for non-existing node (will ignore) [exchId=" + exchId +
                ", parts=" + parts + ']');

        return null;
    }

    lock.writeLock().lock();

    try {
        if (stopping)
            return null;

        if (lastExchangeId != null && exchId != null && lastExchangeId.compareTo(exchId) > 0) {
            if (log.isDebugEnabled())
                log.debug("Stale exchange id for single partition map update (will ignore) [lastExchId=" +
                    lastExchangeId + ", exchId=" + exchId + ']');

            return null;
        }

        if (exchId != null)
            lastExchangeId = exchId;

        if (node2part == null)
            // Create invalid partition map.
            node2part = new GridDhtPartitionFullMap();

        GridDhtPartitionMap cur = node2part.get(parts.nodeId());

        if (cur != null && cur.updateSequence() >= parts.updateSequence()) {
            if (log.isDebugEnabled())
                log.debug("Stale update sequence for single partition map update (will ignore) [exchId=" + exchId +
                    ", curSeq=" + cur.updateSequence() + ", newSeq=" + parts.updateSequence() + ']');

            return null;
        }

        long updateSeq = this.updateSeq.incrementAndGet();

        node2part = new GridDhtPartitionFullMap(node2part, updateSeq);

        boolean changed = false;

        if (cur == null || !cur.equals(parts))
            changed = true;

        node2part.put(parts.nodeId(), parts);

        part2node = new HashMap<>(part2node);

        // Add new mappings.
        for (Integer p : parts.keySet()) {
            Set<UUID> ids = part2node.get(p);

            if (ids == null)
                // Initialize HashSet to size 3 in anticipation that there won't be
                // more than 3 nodes per partition.
                part2node.put(p, ids = U.newHashSet(3));

            changed |= ids.add(parts.nodeId());
        }

        // Remove obsolete mappings.
        if (cur != null) {
            for (Integer p : F.view(cur.keySet(), F0.notIn(parts.keySet()))) {
                Set<UUID> ids = part2node.get(p);

                if (ids != null)
                    changed |= ids.remove(parts.nodeId());
            }
        }

        changed |= checkEvictions(updateSeq);

        consistencyCheck();

        if (log.isDebugEnabled())
            log.debug("Partition map after single update: " + fullMapString());

        return changed ? localPartitionMap() : null;
    }
    finally {
        lock.writeLock().unlock();
    }
}
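/*
 * Hedged illustration (not Ignite code): both update() methods above maintain part2node, a reverse index
 * from partition ID to the set of node IDs that host it, derived from the node -> partitions mapping.
 * A minimal standalone sketch of building such an index with plain JDK collections follows; the method and
 * variable names are hypothetical.
 */
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

class ReverseIndexSketch {
    /** Builds a partition -> node IDs index from a node -> partition IDs map. */
    static Map<Integer, Set<UUID>> buildPart2Node(Map<UUID, Set<Integer>> node2part) {
        Map<Integer, Set<UUID>> part2node = new HashMap<>();

        for (Map.Entry<UUID, Set<Integer>> e : node2part.entrySet()) {
            for (Integer p : e.getValue()) {
                // Small initial capacity: typically only a few owners/backups per partition.
                Set<UUID> ids = part2node.computeIfAbsent(p, k -> new HashSet<>(4));

                ids.add(e.getKey());
            }
        }

        return part2node;
    }
}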