/** * Waits for renting partitions. * * @return {@code True} if mapping was changed. * @throws IgniteCheckedException If failed. */ private boolean waitForRent() throws IgniteCheckedException { boolean changed = false; // Synchronously wait for all renting partitions to complete. for (Iterator<GridDhtLocalPartition> it = locParts.values().iterator(); it.hasNext(); ) { GridDhtLocalPartition p = it.next(); GridDhtPartitionState state = p.state(); if (state == RENTING || state == EVICTED) { if (log.isDebugEnabled()) log.debug("Waiting for renting partition: " + p); // Wait for partition to empty out. p.rent(true).get(); if (log.isDebugEnabled()) log.debug("Finished waiting for renting partition: " + p); // Remove evicted partition. it.remove(); changed = true; } } return changed; }
/** * @param p Partition number. * @param topVer Topology version. * @param create Create flag. * @param updateSeq Update sequence. * @return Local partition. */ private GridDhtLocalPartition localPartition( int p, AffinityTopologyVersion topVer, boolean create, boolean updateSeq) { while (true) { boolean belongs = cctx.affinity().localNode(p, topVer); GridDhtLocalPartition loc = locParts.get(p); if (loc != null && loc.state() == EVICTED) { locParts.remove(p, loc); if (!create) return null; if (!belongs) throw new GridDhtInvalidPartitionException( p, "Adding entry to evicted partition [part=" + p + ", topVer=" + topVer + ", this.topVer=" + this.topVer + ']'); continue; } if (loc == null && create) { if (!belongs) throw new GridDhtInvalidPartitionException( p, "Creating partition which does not belong [part=" + p + ", topVer=" + topVer + ", this.topVer=" + this.topVer + ']'); lock.writeLock().lock(); try { GridDhtLocalPartition old = locParts.putIfAbsent(p, loc = new GridDhtLocalPartition(cctx, p)); if (old != null) loc = old; else { if (updateSeq) this.updateSeq.incrementAndGet(); if (log.isDebugEnabled()) log.debug("Created local partition: " + loc); } } finally { lock.writeLock().unlock(); } } return loc; } }
/** {@inheritDoc} */ @Override public void onRemoved(GridDhtCacheEntry e) { /* * Make sure not to acquire any locks here as this method * may be called from sensitive synchronization blocks. * =================================================== */ GridDhtLocalPartition loc = localPartition(cctx.affinity().partition(e.key()), topologyVersion(), false); if (loc != null) loc.onRemoved(e); }
/** {@inheritDoc} */ @Override public void printMemoryStats(int threshold) { X.println( ">>> Cache partition topology stats [grid=" + cctx.gridName() + ", cache=" + cctx.name() + ']'); for (GridDhtLocalPartition part : locParts.values()) { int size = part.size(); if (size >= threshold) X.println(">>> Local partition [part=" + part.id() + ", size=" + size + ']'); } }
/** {@inheritDoc} */ @Override public GridDhtLocalPartition onAdded(AffinityTopologyVersion topVer, GridDhtCacheEntry e) { /* * Make sure not to acquire any locks here as this method * may be called from sensitive synchronization blocks. * =================================================== */ int p = cctx.affinity().partition(e.key()); GridDhtLocalPartition loc = localPartition(p, topVer, true); assert loc != null; loc.onAdded(e); return loc; }
/** {@inheritDoc} */ @Override public void onEvicted(GridDhtLocalPartition part, boolean updateSeq) { assert updateSeq || lock.isWriteLockedByCurrentThread(); lock.writeLock().lock(); try { if (stopping) return; assert part.state() == EVICTED; long seq = updateSeq ? this.updateSeq.incrementAndGet() : this.updateSeq.get(); updateLocal(part.id(), cctx.localNodeId(), part.state(), seq); consistencyCheck(); } finally { lock.writeLock().unlock(); } }
/** {@inheritDoc} */ @Override public boolean own(GridDhtLocalPartition part) { ClusterNode loc = cctx.localNode(); lock.writeLock().lock(); try { if (part.own()) { updateLocal(part.id(), loc.id(), part.state(), updateSeq.incrementAndGet()); consistencyCheck(); return true; } consistencyCheck(); return false; } finally { lock.writeLock().unlock(); } }
/** {@inheritDoc} */ @Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { boolean changed = waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); AffinityTopologyVersion topVer = exchFut.topologyVersion(); lock.writeLock().lock(); try { if (stopping) return false; assert topVer.equals(exchFut.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']'; if (log.isDebugEnabled()) log.debug( "Partition map before afterExchange [exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); if (cctx.affinity().localNode(p, topVer)) { // This partition will be created during next topology event, // which obviously has not happened at this point. if (locPart == null) { if (log.isDebugEnabled()) log.debug("Skipping local partition afterExchange (will not create): " + p); continue; } GridDhtPartitionState state = locPart.state(); if (state == MOVING) { if (cctx.rebalanceEnabled()) { Collection<ClusterNode> owners = owners(p); // If there are no other owners, then become an owner. if (F.isEmpty(owners)) { boolean owned = locPart.own(); assert owned : "Failed to own partition [cacheName" + cctx.name() + ", locPart=" + locPart + ']'; updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) { DiscoveryEvent discoEvt = exchFut.discoveryEvent(); cctx.events() .addPreloadEvent( p, EVT_CACHE_REBALANCE_PART_DATA_LOST, discoEvt.eventNode(), discoEvt.type(), discoEvt.timestamp()); } if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart); } else if (log.isDebugEnabled()) log.debug( "Will not own partition (there are owners to rebalance from) [locPart=" + locPart + ", owners = " + owners + ']'); } else updateLocal(p, loc.id(), locPart.state(), updateSeq); } } else { if (locPart != null) { GridDhtPartitionState state = locPart.state(); if (state == MOVING) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (log.isDebugEnabled()) log.debug("Evicting moving partition (it does not belong to affinity): " + locPart); } } } } consistencyCheck(); } finally { lock.writeLock().unlock(); } return changed; }
/** {@inheritDoc} */ @Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); lock.writeLock().lock(); try { GridDhtPartitionExchangeId exchId = exchFut.exchangeId(); if (stopping) return; assert topVer.equals(exchId.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']'; if (exchId.isLeft()) removeNode(exchId.nodeId()); // In case if node joins, get topology at the time of joining node. ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer); assert oldest != null; if (log.isDebugEnabled()) log.debug( "Partition map beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); // If this is the oldest node. if (oldest.id().equals(loc.id()) || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) { if (node2part == null) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq); if (log.isDebugEnabled()) log.debug( "Created brand new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } else if (!node2part.valid()) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Created new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + node2part + ']'); } else if (!node2part.nodeId().equals(loc.id())) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Copied old map into new map on oldest node (previous oldest node left) [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } } if (cctx.rebalanceEnabled()) { for (int p = 0; p < num; p++) { // If this is the first node in grid. boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion()); if ((oldest.id().equals(loc.id()) && oldest.id().equals(exchId.nodeId()) && exchId.isJoined()) || added) { assert exchId.isJoined() || added; try { GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); assert locPart != null; boolean owned = locPart.own(); assert owned : "Failed to own partition for oldest node [cacheName" + cctx.name() + ", part=" + locPart + ']'; if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition on oldest node (no need to create a partition " + "if it no longer belongs to local node: " + e.partition()); } } // If this is not the first node in grid. else { if (node2part != null && node2part.valid()) { if (cctx.affinity().localNode(p, topVer)) { try { // This will make sure that all non-existing partitions // will be created in MOVING state. GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } // If this node's map is empty, we pre-create local partitions, // so local map will be sent correctly during exchange. else if (cctx.affinity().localNode(p, topVer)) { try { localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to pre-create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } } } else { // If preloader is disabled, then we simply clear out // the partitions this node is not responsible for. for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); boolean belongs = cctx.affinity().localNode(p, topVer); if (locPart != null) { if (!belongs) { GridDhtPartitionState state = locPart.state(); if (state.active()) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); if (log.isDebugEnabled()) log.debug( "Evicting partition with rebalancing disabled " + "(it does not belong to affinity): " + locPart); } } } else if (belongs) { try { // Pre-create partitions. localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition with disabled rebalancer (no need to " + "pre-create a partition if it no longer belongs to local node: " + e.partition()); } } } } if (node2part != null && node2part.valid()) checkEvictions(updateSeq); consistencyCheck(); if (log.isDebugEnabled()) log.debug( "Partition map after beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } finally { lock.writeLock().unlock(); } // Wait for evictions. waitForRent(); }
/** * @param updateSeq Update sequence. * @return Checks if any of the local partitions need to be evicted. */ private boolean checkEvictions(long updateSeq) { assert lock.isWriteLockedByCurrentThread(); boolean changed = false; UUID locId = cctx.nodeId(); for (GridDhtLocalPartition part : locParts.values()) { GridDhtPartitionState state = part.state(); if (state.active()) { int p = part.id(); List<ClusterNode> affNodes = cctx.affinity().nodes(p, topVer); if (!affNodes.contains(cctx.localNode())) { Collection<UUID> nodeIds = F.nodeIds(nodes(p, topVer, OWNING)); // If all affinity nodes are owners, then evict partition from local node. if (nodeIds.containsAll(F.nodeIds(affNodes))) { part.rent(false); updateLocal(part.id(), locId, part.state(), updateSeq); changed = true; if (log.isDebugEnabled()) log.debug("Evicted local partition (all affinity nodes are owners): " + part); } else { int ownerCnt = nodeIds.size(); int affCnt = affNodes.size(); if (ownerCnt > affCnt) { List<ClusterNode> sorted = new ArrayList<>(cctx.discovery().nodes(nodeIds)); // Sort by node orders in ascending order. Collections.sort(sorted, CU.nodeComparator(true)); int diff = sorted.size() - affCnt; for (int i = 0; i < diff; i++) { ClusterNode n = sorted.get(i); if (locId.equals(n.id())) { part.rent(false); updateLocal(part.id(), locId, part.state(), updateSeq); changed = true; if (log.isDebugEnabled()) log.debug( "Evicted local partition (this node is oldest non-affinity node): " + part); break; } } } } } } } return changed; }