/** * Calculates data nodes for replicated caches on unstable topology. * * @param cctx Cache context for main space. * @param extraSpaces Extra spaces. * @return Collection of all data nodes owning all the caches or {@code null} for retry. */ private Collection<ClusterNode> replicatedUnstableDataNodes( final GridCacheContext<?, ?> cctx, List<String> extraSpaces) { assert cctx.isReplicated() : cctx.name() + " must be replicated"; Set<ClusterNode> nodes = replicatedUnstableDataNodes(cctx); if (F.isEmpty(nodes)) return null; // Retry. if (!F.isEmpty(extraSpaces)) { for (String extraSpace : extraSpaces) { GridCacheContext<?, ?> extraCctx = cacheContext(extraSpace); if (extraCctx.isLocal()) continue; if (!extraCctx.isReplicated()) throw new CacheException( "Queries running on replicated cache should not contain JOINs " + "with tables in partitioned caches [rCache=" + cctx.name() + ", pCache=" + extraSpace + "]"); Set<ClusterNode> extraOwners = replicatedUnstableDataNodes(extraCctx); if (F.isEmpty(extraOwners)) return null; // Retry. nodes.retainAll(extraOwners); if (nodes.isEmpty()) return null; // Retry. } } return nodes; }
/** * @param cctx Context. * @param id Partition ID. */ @SuppressWarnings("ExternalizableWithoutPublicNoArgConstructor") GridDhtLocalPartition(GridCacheContext cctx, int id) { assert cctx != null; this.id = id; this.cctx = cctx; log = U.logger(cctx.kernalContext(), logRef, this); rent = new GridFutureAdapter<Object>() { @Override public String toString() { return "PartitionRentFuture [part=" + GridDhtLocalPartition.this + ", map=" + map + ']'; } }; map = new ConcurrentHashMap8<>(cctx.config().getStartSize() / cctx.affinity().partitions()); int delQueueSize = CU.isSystemCache(cctx.name()) ? 100 : Math.max(MAX_DELETE_QUEUE_SIZE / cctx.affinity().partitions(), 20); rmvQueue = new GridCircularBuffer<>(U.ceilPow2(delQueueSize)); }
/** {@inheritDoc} */ @Override public GridDhtPartitionFullMap partitionMap(boolean onlyActive) { lock.readLock().lock(); try { assert node2part != null && node2part.valid() : "Invalid node2part [node2part: " + node2part + ", cache=" + cctx.name() + ", started=" + cctx.started() + ", stopping=" + stopping + ", locNodeId=" + cctx.localNode().id() + ", locName=" + cctx.gridName() + ']'; GridDhtPartitionFullMap m = node2part; return new GridDhtPartitionFullMap( m.nodeId(), m.nodeOrder(), m.updateSequence(), m, onlyActive); } finally { lock.readLock().unlock(); } }
/** @param ctx Context. */ public GridNearAtomicCache(GridCacheContext<K, V> ctx) { super(ctx); int size = CU.isSystemCache(ctx.name()) ? 100 : Integer.getInteger(IGNITE_ATOMIC_CACHE_DELETE_HISTORY_SIZE, 1_000_000); rmvQueue = new GridCircularBuffer<>(U.ceilPow2(size / 10)); }
/** {@inheritDoc} */ @Override public AffinityTopologyVersion topologyVersion() { lock.readLock().lock(); try { assert topVer.topologyVersion() > 0 : "Invalid topology version [topVer=" + topVer + ", cacheName=" + cctx.name() + ']'; return topVer; } finally { lock.readLock().unlock(); } }
/** * Collects all the nodes owning all the partitions for the given replicated cache. * * @param cctx Cache context. * @return Owning nodes or {@code null} if we can't find owners for some partitions. */ private Set<ClusterNode> replicatedUnstableDataNodes(GridCacheContext<?, ?> cctx) { assert cctx.isReplicated() : cctx.name() + " must be replicated"; String space = cctx.name(); Set<ClusterNode> dataNodes = new HashSet<>(dataNodes(space, NONE)); if (dataNodes.isEmpty()) throw new CacheException("Failed to find data nodes for cache: " + space); // Find all the nodes owning all the partitions for replicated cache. for (int p = 0, parts = cctx.affinity().partitions(); p < parts; p++) { List<ClusterNode> owners = cctx.topology().owners(p); if (F.isEmpty(owners)) return null; // Retry. dataNodes.retainAll(owners); if (dataNodes.isEmpty()) return null; // Retry. } return dataNodes; }
/** {@inheritDoc} */ @Override public Collection<ClusterNode> nodes(int p, AffinityTopologyVersion topVer) { Collection<ClusterNode> affNodes = cctx.affinity().nodes(p, topVer); lock.readLock().lock(); try { assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer1=" + topVer + ", topVer2=" + this.topVer + ", cache=" + cctx.name() + ", node2part=" + node2part + ']'; Collection<ClusterNode> nodes = null; Collection<UUID> nodeIds = part2node.get(p); if (!F.isEmpty(nodeIds)) { Collection<UUID> affIds = new HashSet<>(F.viewReadOnly(affNodes, F.node2id())); for (UUID nodeId : nodeIds) { if (!affIds.contains(nodeId) && hasState(p, nodeId, OWNING, MOVING, RENTING)) { ClusterNode n = cctx.discovery().node(nodeId); if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) { if (nodes == null) { nodes = new ArrayList<>(affNodes.size() + 2); nodes.addAll(affNodes); } nodes.add(n); } } } } return nodes != null ? nodes : affNodes; } finally { lock.readLock().unlock(); } }
/** {@inheritDoc} */ @Override public void printMemoryStats(int threshold) { X.println( ">>> Cache partition topology stats [grid=" + cctx.gridName() + ", cache=" + cctx.name() + ']'); for (GridDhtLocalPartition part : locParts.values()) { int size = part.size(); if (size >= threshold) X.println(">>> Local partition [part=" + part.id() + ", size=" + size + ']'); } }
/** * @param p Partition. * @param topVer Topology version ({@code -1} for all nodes). * @param state Partition state. * @param states Additional partition states. * @return List of nodes for the partition. */ private List<ClusterNode> nodes( int p, AffinityTopologyVersion topVer, GridDhtPartitionState state, GridDhtPartitionState... states) { Collection<UUID> allIds = topVer.topologyVersion() > 0 ? F.nodeIds(CU.affinityNodes(cctx, topVer)) : null; lock.readLock().lock(); try { assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer=" + topVer + ", allIds=" + allIds + ", node2part=" + node2part + ", cache=" + cctx.name() + ']'; Collection<UUID> nodeIds = part2node.get(p); // Node IDs can be null if both, primary and backup, nodes disappear. int size = nodeIds == null ? 0 : nodeIds.size(); if (size == 0) return Collections.emptyList(); List<ClusterNode> nodes = new ArrayList<>(size); for (UUID id : nodeIds) { if (topVer.topologyVersion() > 0 && !allIds.contains(id)) continue; if (hasState(p, id, state, states)) { ClusterNode n = cctx.discovery().node(id); if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) nodes.add(n); } } return nodes; } finally { lock.readLock().unlock(); } }
/** {@inheritDoc} */ @Override public boolean onDone(AffinityTopologyVersion res, Throwable err) { Map<Integer, Boolean> m = null; for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.config().getTopologyValidator() != null && !CU.isSystemCache(cacheCtx.name())) { if (m == null) m = new HashMap<>(); m.put( cacheCtx.cacheId(), cacheCtx.config().getTopologyValidator().validate(discoEvt.topologyNodes())); } } cacheValidRes = m != null ? m : Collections.<Integer, Boolean>emptyMap(); cctx.cache().onExchangeDone(exchId.topologyVersion(), reqs, err); cctx.exchange().onExchangeDone(this, err); if (super.onDone(res, err) && !dummy && !forcePreload) { if (log.isDebugEnabled()) log.debug( "Completed partition exchange [localNode=" + cctx.localNodeId() + ", exchange= " + this + ']'); initFut.onDone(err == null); GridTimeoutObject timeoutObj = this.timeoutObj; // Deschedule timeout object. if (timeoutObj != null) cctx.kernalContext().timeout().removeTimeoutObject(timeoutObj); if (exchId.isLeft()) { for (GridCacheContext cacheCtx : cctx.cacheContexts()) cacheCtx.config().getAffinity().removeNode(exchId.nodeId()); } return true; } return dummy; }
/** * Starts activity. * * @throws IgniteInterruptedCheckedException If interrupted. */ public void init() throws IgniteInterruptedCheckedException { if (isDone()) return; if (init.compareAndSet(false, true)) { if (isDone()) return; try { // Wait for event to occur to make sure that discovery // will return corresponding nodes. U.await(evtLatch); assert discoEvt != null : this; assert !dummy && !forcePreload : this; ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx, exchId.topologyVersion()); oldestNode.set(oldest); startCaches(); // True if client node joined or failed. boolean clientNodeEvt; if (F.isEmpty(reqs)) { int type = discoEvt.type(); assert type == EVT_NODE_JOINED || type == EVT_NODE_LEFT || type == EVT_NODE_FAILED : discoEvt; clientNodeEvt = CU.clientNode(discoEvt.eventNode()); } else { assert discoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT : discoEvt; boolean clientOnlyStart = true; for (DynamicCacheChangeRequest req : reqs) { if (!req.clientStartOnly()) { clientOnlyStart = false; break; } } clientNodeEvt = clientOnlyStart; } if (clientNodeEvt) { ClusterNode node = discoEvt.eventNode(); // Client need to initialize affinity for local join event or for stated client caches. if (!node.isLocal()) { for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; GridDhtPartitionTopology top = cacheCtx.topology(); top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId())); if (cacheCtx.affinity().affinityTopologyVersion() == AffinityTopologyVersion.NONE) { initTopology(cacheCtx); top.beforeExchange(this); } else cacheCtx.affinity().clientEventTopologyChange(discoEvt, exchId.topologyVersion()); } if (exchId.isLeft()) cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion()); onDone(exchId.topologyVersion()); skipPreload = cctx.kernalContext().clientNode(); return; } } if (cctx.kernalContext().clientNode()) { skipPreload = true; for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; GridDhtPartitionTopology top = cacheCtx.topology(); top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId())); } for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; initTopology(cacheCtx); } if (oldestNode.get() != null) { rmtNodes = new ConcurrentLinkedQueue<>( CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion())); rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes))); ready.set(true); initFut.onDone(true); if (log.isDebugEnabled()) log.debug("Initialized future: " + this); sendPartitions(); } else onDone(exchId.topologyVersion()); return; } assert oldestNode.get() != null; for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (isCacheAdded(cacheCtx.cacheId(), exchId.topologyVersion())) { if (cacheCtx .discovery() .cacheAffinityNodes(cacheCtx.name(), topologyVersion()) .isEmpty()) U.quietAndWarn(log, "No server nodes found for cache client: " + cacheCtx.namex()); } cacheCtx.preloader().onExchangeFutureAdded(); } List<String> cachesWithoutNodes = null; if (exchId.isLeft()) { for (String name : cctx.cache().cacheNames()) { if (cctx.discovery().cacheAffinityNodes(name, topologyVersion()).isEmpty()) { if (cachesWithoutNodes == null) cachesWithoutNodes = new ArrayList<>(); cachesWithoutNodes.add(name); // Fire event even if there is no client cache started. if (cctx.gridEvents().isRecordable(EventType.EVT_CACHE_NODES_LEFT)) { Event evt = new CacheEvent( name, cctx.localNode(), cctx.localNode(), "All server nodes have left the cluster.", EventType.EVT_CACHE_NODES_LEFT, 0, false, null, null, null, null, false, null, false, null, null, null); cctx.gridEvents().record(evt); } } } } if (cachesWithoutNodes != null) { StringBuilder sb = new StringBuilder( "All server nodes for the following caches have left the cluster: "); for (int i = 0; i < cachesWithoutNodes.size(); i++) { String cache = cachesWithoutNodes.get(i); sb.append('\'').append(cache).append('\''); if (i != cachesWithoutNodes.size() - 1) sb.append(", "); } U.quietAndWarn(log, sb.toString()); U.quietAndWarn(log, "Must have server nodes for caches to operate."); } assert discoEvt != null; assert exchId.nodeId().equals(discoEvt.eventNode().id()); for (GridCacheContext cacheCtx : cctx.cacheContexts()) { GridClientPartitionTopology clientTop = cctx.exchange().clearClientTopology(cacheCtx.cacheId()); long updSeq = clientTop == null ? -1 : clientTop.lastUpdateSequence(); // Update before waiting for locks. if (!cacheCtx.isLocal()) cacheCtx .topology() .updateTopologyVersion(exchId, this, updSeq, stopping(cacheCtx.cacheId())); } // Grab all alive remote nodes with order of equal or less than last joined node. rmtNodes = new ConcurrentLinkedQueue<>( CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion())); rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes))); for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> m : singleMsgs.entrySet()) // If received any messages, process them. onReceive(m.getKey(), m.getValue()); for (Map.Entry<UUID, GridDhtPartitionsFullMessage> m : fullMsgs.entrySet()) // If received any messages, process them. onReceive(m.getKey(), m.getValue()); AffinityTopologyVersion topVer = exchId.topologyVersion(); for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; // Must initialize topology after we get discovery event. initTopology(cacheCtx); cacheCtx.preloader().updateLastExchangeFuture(this); } IgniteInternalFuture<?> partReleaseFut = cctx.partitionReleaseFuture(topVer); // Assign to class variable so it will be included into toString() method. this.partReleaseFut = partReleaseFut; if (log.isDebugEnabled()) log.debug("Before waiting for partition release future: " + this); while (true) { try { partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { // Print pending transactions and locks that might have led to hang. dumpPendingObjects(); } } if (log.isDebugEnabled()) log.debug("After waiting for partition release future: " + this); if (!F.isEmpty(reqs)) blockGateways(); if (exchId.isLeft()) cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion()); IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion()); while (true) { try { locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { U.warn( log, "Failed to wait for locks release future. " + "Dumping pending objects that might be the cause: " + cctx.localNodeId()); U.warn(log, "Locked entries:"); Map<IgniteTxKey, Collection<GridCacheMvccCandidate>> locks = cctx.mvcc().unfinishedLocks(exchId.topologyVersion()); for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet()) U.warn(log, "Locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']'); } } for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; // Notify replication manager. GridCacheContext drCacheCtx = cacheCtx.isNear() ? cacheCtx.near().dht().context() : cacheCtx; if (drCacheCtx.isDrEnabled()) drCacheCtx.dr().beforeExchange(topVer, exchId.isLeft()); // Partition release future is done so we can flush the write-behind store. cacheCtx.store().forceFlush(); // Process queued undeploys prior to sending/spreading map. cacheCtx.preloader().unwindUndeploys(); GridDhtPartitionTopology top = cacheCtx.topology(); assert topVer.equals(top.topologyVersion()) : "Topology version is updated only in this class instances inside single ExchangeWorker thread."; top.beforeExchange(this); } for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) { top.updateTopologyVersion(exchId, this, -1, stopping(top.cacheId())); top.beforeExchange(this); } } catch (IgniteInterruptedCheckedException e) { onDone(e); throw e; } catch (Throwable e) { U.error( log, "Failed to reinitialize local partitions (preloading will be stopped): " + exchId, e); onDone(e); if (e instanceof Error) throw (Error) e; return; } if (F.isEmpty(rmtIds)) { onDone(exchId.topologyVersion()); return; } ready.set(true); initFut.onDone(true); if (log.isDebugEnabled()) log.debug("Initialized future: " + this); // If this node is not oldest. if (!oldestNode.get().id().equals(cctx.localNodeId())) sendPartitions(); else { boolean allReceived = allReceived(); if (allReceived && replied.compareAndSet(false, true)) { if (spreadPartitions()) onDone(exchId.topologyVersion()); } } scheduleRecheck(); } else assert false : "Skipped init future: " + this; }
/** * Calculates partition mapping for partitioned cache on unstable topology. * * @param cctx Cache context for main space. * @param extraSpaces Extra spaces. * @return Partition mapping or {@code null} if we can't calculate it due to repartitioning and we * need to retry. */ @SuppressWarnings("unchecked") private Map<ClusterNode, IntArray> partitionedUnstableDataNodes( final GridCacheContext<?, ?> cctx, List<String> extraSpaces) { assert !cctx.isReplicated() && !cctx.isLocal() : cctx.name() + " must be partitioned"; final int partsCnt = cctx.affinity().partitions(); if (extraSpaces != null) { // Check correct number of partitions for partitioned caches. for (String extraSpace : extraSpaces) { GridCacheContext<?, ?> extraCctx = cacheContext(extraSpace); if (extraCctx.isReplicated() || extraCctx.isLocal()) continue; int parts = extraCctx.affinity().partitions(); if (parts != partsCnt) throw new CacheException( "Number of partitions must be the same for correct collocation [cache1=" + cctx.name() + ", parts1=" + partsCnt + ", cache2=" + extraSpace + ", parts2=" + parts + "]"); } } Set<ClusterNode>[] partLocs = new Set[partsCnt]; // Fill partition locations for main cache. for (int p = 0, parts = cctx.affinity().partitions(); p < parts; p++) { List<ClusterNode> owners = cctx.topology().owners(p); if (F.isEmpty(owners)) { if (!F.isEmpty(dataNodes(cctx.name(), NONE))) return null; // Retry. throw new CacheException( "Failed to find data nodes [cache=" + cctx.name() + ", part=" + p + "]"); } partLocs[p] = new HashSet<>(owners); } if (extraSpaces != null) { // Find owner intersections for each participating partitioned cache partition. // We need this for logical collocation between different partitioned caches with the same // affinity. for (String extraSpace : extraSpaces) { GridCacheContext<?, ?> extraCctx = cacheContext(extraSpace); if (extraCctx.isReplicated() || extraCctx.isLocal()) continue; for (int p = 0, parts = extraCctx.affinity().partitions(); p < parts; p++) { List<ClusterNode> owners = extraCctx.topology().owners(p); if (F.isEmpty(owners)) { if (!F.isEmpty(dataNodes(extraSpace, NONE))) return null; // Retry. throw new CacheException( "Failed to find data nodes [cache=" + extraSpace + ", part=" + p + "]"); } if (partLocs[p] == null) partLocs[p] = new HashSet<>(owners); else { partLocs[p].retainAll(owners); // Intersection of owners. if (partLocs[p].isEmpty()) return null; // Intersection is empty -> retry. } } } // Filter nodes where not all the replicated caches loaded. for (String extraSpace : extraSpaces) { GridCacheContext<?, ?> extraCctx = cacheContext(extraSpace); if (!extraCctx.isReplicated()) continue; Set<ClusterNode> dataNodes = replicatedUnstableDataNodes(extraCctx); if (F.isEmpty(dataNodes)) return null; // Retry. for (Set<ClusterNode> partLoc : partLocs) { partLoc.retainAll(dataNodes); if (partLoc.isEmpty()) return null; // Retry. } } } // Collect the final partitions mapping. Map<ClusterNode, IntArray> res = new HashMap<>(); // Here partitions in all IntArray's will be sorted in ascending order, this is important. for (int p = 0; p < partLocs.length; p++) { Set<ClusterNode> pl = partLocs[p]; assert !F.isEmpty(pl) : pl; ClusterNode n = pl.size() == 1 ? F.first(pl) : F.rand(pl); IntArray parts = res.get(n); if (parts == null) res.put(n, parts = new IntArray()); parts.add(p); } return res; }
/** * @param topVer Topology version. * @param cctx Cache context for main space. * @param extraSpaces Extra spaces. * @return Data nodes or {@code null} if repartitioning started and we need to retry.. */ private Collection<ClusterNode> stableDataNodes( AffinityTopologyVersion topVer, final GridCacheContext<?, ?> cctx, List<String> extraSpaces) { String space = cctx.name(); Set<ClusterNode> nodes = new HashSet<>(dataNodes(space, topVer)); if (F.isEmpty(nodes)) throw new CacheException("Failed to find data nodes for cache: " + space); if (!F.isEmpty(extraSpaces)) { for (String extraSpace : extraSpaces) { GridCacheContext<?, ?> extraCctx = cacheContext(extraSpace); if (extraCctx.isLocal()) continue; // No consistency guaranties for local caches. if (cctx.isReplicated() && !extraCctx.isReplicated()) throw new CacheException( "Queries running on replicated cache should not contain JOINs " + "with partitioned tables [rCache=" + cctx.name() + ", pCache=" + extraSpace + "]"); Collection<ClusterNode> extraNodes = dataNodes(extraSpace, topVer); if (F.isEmpty(extraNodes)) throw new CacheException("Failed to find data nodes for cache: " + extraSpace); if (cctx.isReplicated() && extraCctx.isReplicated()) { nodes.retainAll(extraNodes); if (nodes.isEmpty()) { if (isPreloadingActive(cctx, extraSpaces)) return null; // Retry. else throw new CacheException( "Caches have distinct sets of data nodes [cache1=" + cctx.name() + ", cache2=" + extraSpace + "]"); } } else if (!cctx.isReplicated() && extraCctx.isReplicated()) { if (!extraNodes.containsAll(nodes)) if (isPreloadingActive(cctx, extraSpaces)) return null; // Retry. else throw new CacheException( "Caches have distinct sets of data nodes [cache1=" + cctx.name() + ", cache2=" + extraSpace + "]"); } else if (!cctx.isReplicated() && !extraCctx.isReplicated()) { if (extraNodes.size() != nodes.size() || !nodes.containsAll(extraNodes)) if (isPreloadingActive(cctx, extraSpaces)) return null; // Retry. else throw new CacheException( "Caches have distinct sets of data nodes [cache1=" + cctx.name() + ", cache2=" + extraSpace + "]"); } else throw new IllegalStateException(); } } return nodes; }
/** {@inheritDoc} */ @Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { boolean changed = waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); AffinityTopologyVersion topVer = exchFut.topologyVersion(); lock.writeLock().lock(); try { if (stopping) return false; assert topVer.equals(exchFut.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']'; if (log.isDebugEnabled()) log.debug( "Partition map before afterExchange [exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); if (cctx.affinity().localNode(p, topVer)) { // This partition will be created during next topology event, // which obviously has not happened at this point. if (locPart == null) { if (log.isDebugEnabled()) log.debug("Skipping local partition afterExchange (will not create): " + p); continue; } GridDhtPartitionState state = locPart.state(); if (state == MOVING) { if (cctx.rebalanceEnabled()) { Collection<ClusterNode> owners = owners(p); // If there are no other owners, then become an owner. if (F.isEmpty(owners)) { boolean owned = locPart.own(); assert owned : "Failed to own partition [cacheName" + cctx.name() + ", locPart=" + locPart + ']'; updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) { DiscoveryEvent discoEvt = exchFut.discoveryEvent(); cctx.events() .addPreloadEvent( p, EVT_CACHE_REBALANCE_PART_DATA_LOST, discoEvt.eventNode(), discoEvt.type(), discoEvt.timestamp()); } if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart); } else if (log.isDebugEnabled()) log.debug( "Will not own partition (there are owners to rebalance from) [locPart=" + locPart + ", owners = " + owners + ']'); } else updateLocal(p, loc.id(), locPart.state(), updateSeq); } } else { if (locPart != null) { GridDhtPartitionState state = locPart.state(); if (state == MOVING) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (log.isDebugEnabled()) log.debug("Evicting moving partition (it does not belong to affinity): " + locPart); } } } } consistencyCheck(); } finally { lock.writeLock().unlock(); } return changed; }
/** {@inheritDoc} */ @Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); lock.writeLock().lock(); try { GridDhtPartitionExchangeId exchId = exchFut.exchangeId(); if (stopping) return; assert topVer.equals(exchId.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']'; if (exchId.isLeft()) removeNode(exchId.nodeId()); // In case if node joins, get topology at the time of joining node. ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer); assert oldest != null; if (log.isDebugEnabled()) log.debug( "Partition map beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); // If this is the oldest node. if (oldest.id().equals(loc.id()) || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) { if (node2part == null) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq); if (log.isDebugEnabled()) log.debug( "Created brand new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } else if (!node2part.valid()) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Created new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + node2part + ']'); } else if (!node2part.nodeId().equals(loc.id())) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Copied old map into new map on oldest node (previous oldest node left) [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } } if (cctx.rebalanceEnabled()) { for (int p = 0; p < num; p++) { // If this is the first node in grid. boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion()); if ((oldest.id().equals(loc.id()) && oldest.id().equals(exchId.nodeId()) && exchId.isJoined()) || added) { assert exchId.isJoined() || added; try { GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); assert locPart != null; boolean owned = locPart.own(); assert owned : "Failed to own partition for oldest node [cacheName" + cctx.name() + ", part=" + locPart + ']'; if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition on oldest node (no need to create a partition " + "if it no longer belongs to local node: " + e.partition()); } } // If this is not the first node in grid. else { if (node2part != null && node2part.valid()) { if (cctx.affinity().localNode(p, topVer)) { try { // This will make sure that all non-existing partitions // will be created in MOVING state. GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } // If this node's map is empty, we pre-create local partitions, // so local map will be sent correctly during exchange. else if (cctx.affinity().localNode(p, topVer)) { try { localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to pre-create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } } } else { // If preloader is disabled, then we simply clear out // the partitions this node is not responsible for. for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); boolean belongs = cctx.affinity().localNode(p, topVer); if (locPart != null) { if (!belongs) { GridDhtPartitionState state = locPart.state(); if (state.active()) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); if (log.isDebugEnabled()) log.debug( "Evicting partition with rebalancing disabled " + "(it does not belong to affinity): " + locPart); } } } else if (belongs) { try { // Pre-create partitions. localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition with disabled rebalancer (no need to " + "pre-create a partition if it no longer belongs to local node: " + e.partition()); } } } } if (node2part != null && node2part.valid()) checkEvictions(updateSeq); consistencyCheck(); if (log.isDebugEnabled()) log.debug( "Partition map after beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } finally { lock.writeLock().unlock(); } // Wait for evictions. waitForRent(); }
/** {@inheritDoc} */ @SuppressWarnings("unchecked") @Override public void notifyCallback( UUID nodeId, UUID routineId, Collection<?> objs, GridKernalContext ctx) { assert nodeId != null; assert routineId != null; assert objs != null; assert ctx != null; Collection<CacheContinuousQueryEntry> entries = (Collection<CacheContinuousQueryEntry>) objs; final GridCacheContext cctx = cacheContext(ctx); for (CacheContinuousQueryEntry e : entries) { GridCacheDeploymentManager depMgr = cctx.deploy(); ClassLoader ldr = depMgr.globalLoader(); if (ctx.config().isPeerClassLoadingEnabled()) { GridDeploymentInfo depInfo = e.deployInfo(); if (depInfo != null) { depMgr.p2pContext( nodeId, depInfo.classLoaderId(), depInfo.userVersion(), depInfo.deployMode(), depInfo.participants(), depInfo.localDeploymentOwner()); } } try { e.unmarshal(cctx, ldr); } catch (IgniteCheckedException ex) { U.error(ctx.log(getClass()), "Failed to unmarshal entry.", ex); } } final IgniteCache cache = cctx.kernalContext().cache().jcache(cctx.name()); Collection<CacheContinuousQueryEntry> entries0 = new ArrayList<>(); for (CacheContinuousQueryEntry e : entries) entries0.addAll(handleEvent(ctx, e)); Iterable<CacheEntryEvent<? extends K, ? extends V>> evts = F.viewReadOnly( entries0, new C1<CacheContinuousQueryEntry, CacheEntryEvent<? extends K, ? extends V>>() { @Override public CacheEntryEvent<? extends K, ? extends V> apply(CacheContinuousQueryEntry e) { return new CacheContinuousQueryEvent<>(cache, cctx, e); } }, new IgnitePredicate<CacheContinuousQueryEntry>() { @Override public boolean apply(CacheContinuousQueryEntry entry) { return !entry.isFiltered(); } }); locLsnr.onUpdated(evts); }
/** * @param cctx Cache context. * @param qry Query. * @param keepPortable Keep portable. * @return Cursor. */ public Iterator<List<?>> query( GridCacheContext<?, ?> cctx, GridCacheTwoStepQuery qry, boolean keepPortable) { for (int attempt = 0; ; attempt++) { if (attempt != 0) { try { Thread.sleep(attempt * 10); // Wait for exchange. } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new CacheException("Query was interrupted.", e); } } long qryReqId = reqIdGen.incrementAndGet(); QueryRun r = new QueryRun(); r.pageSize = qry.pageSize() <= 0 ? GridCacheTwoStepQuery.DFLT_PAGE_SIZE : qry.pageSize(); r.idxs = new ArrayList<>(qry.mapQueries().size()); String space = cctx.name(); r.conn = (JdbcConnection) h2.connectionForSpace(space); AffinityTopologyVersion topVer = h2.readyTopologyVersion(); List<String> extraSpaces = extraSpaces(space, qry.spaces()); Collection<ClusterNode> nodes; // Explicit partition mapping for unstable topology. Map<ClusterNode, IntArray> partsMap = null; if (isPreloadingActive(cctx, extraSpaces)) { if (cctx.isReplicated()) nodes = replicatedUnstableDataNodes(cctx, extraSpaces); else { partsMap = partitionedUnstableDataNodes(cctx, extraSpaces); nodes = partsMap == null ? null : partsMap.keySet(); } } else nodes = stableDataNodes(topVer, cctx, extraSpaces); if (nodes == null) continue; // Retry. assert !nodes.isEmpty(); if (cctx.isReplicated() || qry.explain()) { assert qry.explain() || !nodes.contains(ctx.discovery().localNode()) : "We must be on a client node."; // Select random data node to run query on a replicated data or get EXPLAIN PLAN from a // single node. nodes = Collections.singleton(F.rand(nodes)); } int tblIdx = 0; final boolean skipMergeTbl = !qry.explain() && qry.skipMergeTable(); for (GridCacheSqlQuery mapQry : qry.mapQueries()) { GridMergeIndex idx; if (!skipMergeTbl) { GridMergeTable tbl; try { tbl = createMergeTable(r.conn, mapQry, qry.explain()); } catch (IgniteCheckedException e) { throw new IgniteException(e); } idx = tbl.getScanIndex(null); fakeTable(r.conn, tblIdx++).setInnerTable(tbl); } else idx = GridMergeIndexUnsorted.createDummy(); for (ClusterNode node : nodes) idx.addSource(node.id()); r.idxs.add(idx); } r.latch = new CountDownLatch(r.idxs.size() * nodes.size()); runs.put(qryReqId, r); try { if (ctx.clientDisconnected()) { throw new CacheException( "Query was cancelled, client node disconnected.", new IgniteClientDisconnectedException( ctx.cluster().clientReconnectFuture(), "Client node disconnected.")); } Collection<GridCacheSqlQuery> mapQrys = qry.mapQueries(); if (qry.explain()) { mapQrys = new ArrayList<>(qry.mapQueries().size()); for (GridCacheSqlQuery mapQry : qry.mapQueries()) mapQrys.add(new GridCacheSqlQuery("EXPLAIN " + mapQry.query(), mapQry.parameters())); } if (nodes.size() != 1 || !F.first(nodes).isLocal()) { // Marshall params for remotes. Marshaller m = ctx.config().getMarshaller(); for (GridCacheSqlQuery mapQry : mapQrys) mapQry.marshallParams(m); } boolean retry = false; if (send( nodes, new GridQueryRequest(qryReqId, r.pageSize, space, mapQrys, topVer, extraSpaces, null), partsMap)) { awaitAllReplies(r, nodes); Object state = r.state.get(); if (state != null) { if (state instanceof CacheException) { CacheException err = (CacheException) state; if (err.getCause() instanceof IgniteClientDisconnectedException) throw err; throw new CacheException("Failed to run map query remotely.", err); } if (state instanceof AffinityTopologyVersion) { retry = true; // If remote node asks us to retry then we have outdated full partition map. h2.awaitForReadyTopologyVersion((AffinityTopologyVersion) state); } } } else // Send failed. retry = true; Iterator<List<?>> resIter = null; if (!retry) { if (qry.explain()) return explainPlan(r.conn, space, qry); if (skipMergeTbl) { List<List<?>> res = new ArrayList<>(); assert r.idxs.size() == 1 : r.idxs; GridMergeIndex idx = r.idxs.get(0); Cursor cur = idx.findInStream(null, null); while (cur.next()) { Row row = cur.get(); int cols = row.getColumnCount(); List<Object> resRow = new ArrayList<>(cols); for (int c = 0; c < cols; c++) resRow.add(row.getValue(c).getObject()); res.add(resRow); } resIter = res.iterator(); } else { GridCacheSqlQuery rdc = qry.reduceQuery(); // Statement caching is prohibited here because we can't guarantee correct merge index // reuse. ResultSet res = h2.executeSqlQueryWithTimer( space, r.conn, rdc.query(), F.asList(rdc.parameters()), false); resIter = new Iter(res); } } for (GridMergeIndex idx : r.idxs) { if (!idx.fetchedAll()) // We have to explicitly cancel queries on remote nodes. send(nodes, new GridQueryCancelRequest(qryReqId), null); } if (retry) { if (Thread.currentThread().isInterrupted()) throw new IgniteInterruptedCheckedException("Query was interrupted."); continue; } return new GridQueryCacheObjectsIterator(resIter, cctx, keepPortable); } catch (IgniteCheckedException | RuntimeException e) { U.closeQuiet(r.conn); if (e instanceof CacheException) throw (CacheException) e; Throwable cause = e; if (e instanceof IgniteCheckedException) { Throwable disconnectedErr = ((IgniteCheckedException) e).getCause(IgniteClientDisconnectedException.class); if (disconnectedErr != null) cause = disconnectedErr; } throw new CacheException("Failed to run reduce query locally.", cause); } finally { if (!runs.remove(qryReqId, r)) U.warn(log, "Query run was already removed: " + qryReqId); if (!skipMergeTbl) { for (int i = 0, mapQrys = qry.mapQueries().size(); i < mapQrys; i++) fakeTable(null, i).setInnerTable(null); // Drop all merge tables. } } } }