/** * @param nodes Nodes to send the full partition map to. * @param id Exchange ID. * @throws IgniteCheckedException If sending failed. */ private void sendAllPartitions( Collection<? extends ClusterNode> nodes, GridDhtPartitionExchangeId id) throws IgniteCheckedException { GridDhtPartitionsFullMessage m = new GridDhtPartitionsFullMessage(id, lastVer.get(), id.topologyVersion()); for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (!cacheCtx.isLocal()) { AffinityTopologyVersion startTopVer = cacheCtx.startTopologyVersion(); boolean ready = startTopVer == null || startTopVer.compareTo(id.topologyVersion()) <= 0; if (ready) m.addFullPartitionsMap(cacheCtx.cacheId(), cacheCtx.topology().partitionMap(true)); } } // It is important that client topologies be added after contexts. for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) m.addFullPartitionsMap(top.cacheId(), top.partitionMap(true)); if (log.isDebugEnabled()) log.debug( "Sending full partition map [nodeIds=" + F.viewReadOnly(nodes, F.node2id()) + ", exchId=" + exchId + ", msg=" + m + ']'); cctx.io().safeSend(nodes, m, SYSTEM_POOL, null); }
/** * Creates a future that will wait for all explicit locks acquired on given topology version to be * released. * * @param topVer Topology version to wait for. * @return Explicit locks release future. */ public IgniteInternalFuture<?> finishExplicitLocks(AffinityTopologyVersion topVer) { GridCompoundFuture<Object, Object> res = new GridCompoundFuture<>(); for (GridCacheExplicitLockSpan span : pendingExplicit.values()) { AffinityTopologyVersion snapshot = span.topologyVersion(); if (snapshot != null && snapshot.compareTo(topVer) < 0) res.add(span.releaseFuture()); } res.markInitialized(); return res; }
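// A minimal standalone sketch (not Ignite code) of the pattern finishExplicitLocks() above relies on:
// collect the release future of every lock span whose topology snapshot precedes the requested version
// and compose them into a single future. Standard CompletableFuture stands in for GridCompoundFuture;
// LockSpan is a hypothetical stand-in for GridCacheExplicitLockSpan.
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.CompletableFuture;

final class ExplicitLocksSketch {
    /** Hypothetical stand-in for GridCacheExplicitLockSpan. */
    static final class LockSpan {
        final long topVer; // Topology snapshot the span was acquired on (-1 if none yet).
        final CompletableFuture<Void> released = new CompletableFuture<>();

        LockSpan(long topVer) { this.topVer = topVer; }
    }

    /** Completes when every span acquired strictly before {@code topVer} has been released. */
    static CompletableFuture<Void> finishExplicitLocks(Collection<LockSpan> spans, long topVer) {
        List<CompletableFuture<Void>> waitFor = new ArrayList<>();

        for (LockSpan span : spans) {
            // Same condition as the original: a snapshot exists and precedes the requested version.
            if (span.topVer >= 0 && span.topVer < topVer)
                waitFor.add(span.released);
        }

        return CompletableFuture.allOf(waitFor.toArray(new CompletableFuture[0]));
    }
}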
/** {@inheritDoc} */ @Override public Collection<ClusterNode> nodes(int p, AffinityTopologyVersion topVer) { Collection<ClusterNode> affNodes = cctx.affinity().nodes(p, topVer); lock.readLock().lock(); try { assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer1=" + topVer + ", topVer2=" + this.topVer + ", cache=" + cctx.name() + ", node2part=" + node2part + ']'; Collection<ClusterNode> nodes = null; Collection<UUID> nodeIds = part2node.get(p); if (!F.isEmpty(nodeIds)) { Collection<UUID> affIds = new HashSet<>(F.viewReadOnly(affNodes, F.node2id())); for (UUID nodeId : nodeIds) { if (!affIds.contains(nodeId) && hasState(p, nodeId, OWNING, MOVING, RENTING)) { ClusterNode n = cctx.discovery().node(nodeId); if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) { if (nodes == null) { nodes = new ArrayList<>(affNodes.size() + 2); nodes.addAll(affNodes); } nodes.add(n); } } } } return nodes != null ? nodes : affNodes; } finally { lock.readLock().unlock(); } }
/** * @param topVer Topology version. * @param entries Entries. */ FinishLockFuture(Iterable<GridDistributedCacheEntry> entries, AffinityTopologyVersion topVer) { assert topVer.compareTo(AffinityTopologyVersion.ZERO) > 0; this.topVer = topVer; for (GridCacheEntryEx entry : entries) { // Either local or near local candidates. try { Collection<GridCacheMvccCandidate> locs = entry.localCandidates(); if (!F.isEmpty(locs)) { Collection<GridCacheMvccCandidate> cands = new ConcurrentLinkedQueue<>(); cands.addAll(F.view(locs, versionFilter())); if (!F.isEmpty(cands)) pendingLocks.put(entry.txKey(), cands); } } catch (GridCacheEntryRemovedException ignored) { if (exchLog.isDebugEnabled()) exchLog.debug( "Got removed entry when adding it to finish lock future (will ignore): " + entry); } } if (exchLog.isDebugEnabled()) exchLog.debug("Pending lock set [topVer=" + topVer + ", locks=" + pendingLocks + ']'); }
/** * @param p Partition. * @param topVer Topology version ({@code -1} for all nodes). * @param state Partition state to match. * @param states Additional partition states to match. * @return List of nodes for the partition. */ private List<ClusterNode> nodes( int p, AffinityTopologyVersion topVer, GridDhtPartitionState state, GridDhtPartitionState... states) { Collection<UUID> allIds = topVer.topologyVersion() > 0 ? F.nodeIds(CU.affinityNodes(cctx, topVer)) : null; lock.readLock().lock(); try { assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer=" + topVer + ", allIds=" + allIds + ", node2part=" + node2part + ", cache=" + cctx.name() + ']'; Collection<UUID> nodeIds = part2node.get(p); // Node IDs can be null if both primary and backup nodes disappear. int size = nodeIds == null ? 0 : nodeIds.size(); if (size == 0) return Collections.emptyList(); List<ClusterNode> nodes = new ArrayList<>(size); for (UUID id : nodeIds) { if (topVer.topologyVersion() > 0 && !allIds.contains(id)) continue; if (hasState(p, id, state, states)) { ClusterNode n = cctx.discovery().node(id); if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) nodes.add(n); } } return nodes; } finally { lock.readLock().unlock(); } }
/** * @param log Logger. * @param topVer Topology version. * @param initCntr Initial update counter. */ public PartitionRecovery( IgniteLogger log, AffinityTopologyVersion topVer, @Nullable Long initCntr) { assert topVer.topologyVersion() > 0 : topVer; this.log = log; if (initCntr != null) { this.lastFiredEvt = initCntr; curTop = topVer; } }
/** {@inheritDoc} */ @Override public AffinityTopologyVersion topologyVersion() { lock.readLock().lock(); try { assert topVer.topologyVersion() > 0 : "Invalid topology version [topVer=" + topVer + ", cacheName=" + cctx.name() + ']'; return topVer; } finally { lock.readLock().unlock(); } }
/** * @param keyFilter Key filter. * @param topVer Topology version. * @return Future that signals when all locks for given partitions will be released. */ private IgniteInternalFuture<?> finishLocks( @Nullable final IgnitePredicate<KeyCacheObject> keyFilter, AffinityTopologyVersion topVer) { assert topVer.topologyVersion() != 0; if (topVer.equals(AffinityTopologyVersion.NONE)) return new GridFinishedFuture(); final FinishLockFuture finishFut = new FinishLockFuture( keyFilter == null ? locked() : F.view( locked(), new P1<GridDistributedCacheEntry>() { @Override public boolean apply(GridDistributedCacheEntry e) { return F.isAll(e.key(), keyFilter); } }), topVer); finishFuts.add(finishFut); finishFut.listen( new CI1<IgniteInternalFuture<?>>() { @Override public void apply(IgniteInternalFuture<?> e) { finishFuts.remove(finishFut); // This call is required to make sure that the concurrent queue // clears memory occupied by internal nodes. finishFuts.peek(); } }); finishFut.recheck(); return finishFut; }
/** @return Filter. */ private IgnitePredicate<GridCacheMvccCandidate> versionFilter() { assert topVer.topologyVersion() > 0; return new P1<GridCacheMvccCandidate>() { @Override public boolean apply(GridCacheMvccCandidate c) { assert c.nearLocal() || c.dhtLocal(); // Wait for explicit locks. return c.topologyVersion().equals(AffinityTopologyVersion.ZERO) || c.topologyVersion().compareTo(topVer) < 0; } }; }
/** * @param expVer Expected topology version. * @param curVer Current topology version. * @return {@code True} if cache affinity changed and operation should be remapped. */ protected final boolean needRemap( AffinityTopologyVersion expVer, AffinityTopologyVersion curVer) { if (expVer.equals(curVer)) return false; Collection<ClusterNode> cacheNodes0 = ctx.discovery().cacheAffinityNodes(ctx.name(), expVer); Collection<ClusterNode> cacheNodes1 = ctx.discovery().cacheAffinityNodes(ctx.name(), curVer); if (!cacheNodes0.equals(cacheNodes1) || ctx.affinity().affinityTopologyVersion().compareTo(curVer) < 0) return true; try { List<List<ClusterNode>> aff1 = ctx.affinity().assignments(expVer); List<List<ClusterNode>> aff2 = ctx.affinity().assignments(curVer); return !aff1.equals(aff2); } catch (IllegalStateException e) { return true; } }
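// A minimal standalone sketch (not the Ignite API) of the needRemap() decision above, expressed over
// plain collections: node names stand in for ClusterNode, longs for AffinityTopologyVersion, and the
// assignment lists for the per-partition assignments returned by the affinity manager.
import java.util.List;
import java.util.Set;

final class NeedRemapSketch {
    /** @return {@code true} if affinity changed between the expected and current versions. */
    static boolean needRemap(long expVer, long curVer, long locAffVer,
        Set<String> affNodesAtExpVer, Set<String> affNodesAtCurVer,
        List<List<String>> assignAtExpVer, List<List<String>> assignAtCurVer) {
        if (expVer == curVer)
            return false;

        // Different affinity node sets, or local affinity lags behind the current version: remap.
        if (!affNodesAtExpVer.equals(affNodesAtCurVer) || locAffVer < curVer)
            return true;

        // Same node set: remap only if the per-partition assignments actually differ.
        return !assignAtExpVer.equals(assignAtCurVer);
    }
}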
/** @param e Failure exception. */ @SuppressWarnings("UnusedParameters") synchronized void onNodeLeft(ClusterTopologyCheckedException e) { if (remapped) return; remapped = true; if (log.isDebugEnabled()) log.debug("Remote node left grid while sending or waiting for reply (will retry): " + this); // Try getting from existing nodes. if (!canRemap) { map(keys.keySet(), F.t(node, keys), topVer); onDone(Collections.<K, V>emptyMap()); } else { final AffinityTopologyVersion updTopVer = new AffinityTopologyVersion( Math.max(topVer.topologyVersion() + 1, cctx.discovery().topologyVersion())); cctx.affinity() .affinityReadyFuture(updTopVer) .listen( new CI1<IgniteInternalFuture<AffinityTopologyVersion>>() { @Override public void apply(IgniteInternalFuture<AffinityTopologyVersion> fut) { try { fut.get(); // Remap. map(keys.keySet(), F.t(node, keys), updTopVer); onDone(Collections.<K, V>emptyMap()); } catch (IgniteCheckedException e) { GridPartitionedGetFuture.this.onDone(e); } } }); } }
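// A minimal standalone sketch (not Ignite code) of the retry step in onNodeLeft() above: the version
// to remap on is the maximum of "one past the version the failed request was mapped on" and the current
// discovery version, and the remap runs only once affinity for that version is ready. LongFunction and
// CompletableFuture stand in for the affinity-ready future; Runnable stands in for the remap action.
import java.util.concurrent.CompletableFuture;
import java.util.function.LongFunction;

final class RemapOnNodeLeftSketch {
    static CompletableFuture<Void> remapWhenReady(long mappedTopVer, long discoTopVer,
        LongFunction<CompletableFuture<Long>> affinityReadyFor, Runnable remap) {
        // Wait for at least the next version after the one the failed request used.
        long updTopVer = Math.max(mappedTopVer + 1, discoTopVer);

        // Remap only once affinity for that version has been calculated.
        return affinityReadyFor.apply(updTopVer).thenAccept(readyVer -> remap.run());
    }
}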
/** * Adds continuous query entry and collects entries that are ready to be fired. * * @param entry Cache continuous query entry. * @return Collection of entries which will be fired. */ public Collection<CacheContinuousQueryEntry> collectEntries(CacheContinuousQueryEntry entry) { assert entry != null; List<CacheContinuousQueryEntry> entries; synchronized (pendingEvts) { // Received first event. if (curTop == AffinityTopologyVersion.NONE) { lastFiredEvt = entry.updateCounter(); curTop = entry.topologyVersion(); return F.asList(entry); } if (curTop.compareTo(entry.topologyVersion()) < 0) { if (entry.updateCounter() == 1L && !entry.isBackup()) { entries = new ArrayList<>(pendingEvts.size()); for (CacheContinuousQueryEntry evt : pendingEvts.values()) { if (evt != HOLE && !evt.isFiltered()) entries.add(evt); } pendingEvts.clear(); curTop = entry.topologyVersion(); lastFiredEvt = entry.updateCounter(); entries.add(entry); return entries; } curTop = entry.topologyVersion(); } // Check for duplicates. if (entry.updateCounter() > lastFiredEvt) { pendingEvts.put(entry.updateCounter(), entry); // Put filtered events. if (entry.filteredEvents() != null) { for (long cntr : entry.filteredEvents()) { if (cntr > lastFiredEvt) pendingEvts.put(cntr, HOLE); } } } else { if (log.isDebugEnabled()) log.debug("Skip duplicate continuous query message: " + entry); return Collections.emptyList(); } if (pendingEvts.isEmpty()) return Collections.emptyList(); Iterator<Map.Entry<Long, CacheContinuousQueryEntry>> iter = pendingEvts.entrySet().iterator(); entries = new ArrayList<>(); if (pendingEvts.size() >= MAX_BUFF_SIZE) { for (int i = 0; i < MAX_BUFF_SIZE - (MAX_BUFF_SIZE / 10); i++) { Map.Entry<Long, CacheContinuousQueryEntry> e = iter.next(); if (e.getValue() != HOLE && !e.getValue().isFiltered()) entries.add(e.getValue()); lastFiredEvt = e.getKey(); iter.remove(); } } else { // Elements are consecutive; fire the gap-free prefix. while (iter.hasNext()) { Map.Entry<Long, CacheContinuousQueryEntry> e = iter.next(); if (e.getKey() == lastFiredEvt + 1) { ++lastFiredEvt; if (e.getValue() != HOLE && !e.getValue().isFiltered()) entries.add(e.getValue()); iter.remove(); } else break; } } } return entries; }
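// A simplified, standalone model (not Ignite code) of the update-counter ordering buffer that
// collectEntries() above implements: events are keyed by their partition update counter, filtered
// counters are recorded as HOLE placeholders so they do not block the sequence, and only the gap-free
// prefix after the last fired counter is released. Topology changes and the MAX_BUFF_SIZE overflow
// handling are intentionally left out. For example, with lastFired == 1, collect(3, null) returns
// nothing and a later collect(2, null) releases [2, 3].
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

final class CounterBufferSketch {
    /** Placeholder for a counter that was filtered out on the originating node. */
    private static final Long HOLE = Long.MIN_VALUE;

    /** Pending events ordered by update counter. */
    private final TreeMap<Long, Long> pending = new TreeMap<>();

    /** Last counter handed out to listeners. */
    private long lastFired;

    CounterBufferSketch(long initCntr) {
        lastFired = initCntr;
    }

    /** Buffers {@code cntr} and returns the contiguous run of counters that may now be fired. */
    List<Long> collect(long cntr, long[] filteredCntrs) {
        if (cntr <= lastFired)
            return Collections.emptyList(); // Duplicate: already delivered.

        pending.put(cntr, cntr);

        if (filteredCntrs != null) {
            for (long f : filteredCntrs) {
                if (f > lastFired)
                    pending.put(f, HOLE);
            }
        }

        List<Long> fire = new ArrayList<>();

        // Release only the gap-free prefix starting right after the last fired counter.
        for (Iterator<Map.Entry<Long, Long>> it = pending.entrySet().iterator(); it.hasNext(); ) {
            Map.Entry<Long, Long> e = it.next();

            if (e.getKey() != lastFired + 1)
                break;

            lastFired++;

            if (!HOLE.equals(e.getValue()))
                fire.add(e.getValue());

            it.remove();
        }

        return fire;
    }
}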
/** {@inheritDoc} */ @Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { boolean changed = waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); AffinityTopologyVersion topVer = exchFut.topologyVersion(); lock.writeLock().lock(); try { if (stopping) return false; assert topVer.equals(exchFut.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']'; if (log.isDebugEnabled()) log.debug( "Partition map before afterExchange [exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); if (cctx.affinity().localNode(p, topVer)) { // This partition will be created during next topology event, // which obviously has not happened at this point. if (locPart == null) { if (log.isDebugEnabled()) log.debug("Skipping local partition afterExchange (will not create): " + p); continue; } GridDhtPartitionState state = locPart.state(); if (state == MOVING) { if (cctx.rebalanceEnabled()) { Collection<ClusterNode> owners = owners(p); // If there are no other owners, then become an owner. if (F.isEmpty(owners)) { boolean owned = locPart.own(); assert owned : "Failed to own partition [cacheName=" + cctx.name() + ", locPart=" + locPart + ']'; updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) { DiscoveryEvent discoEvt = exchFut.discoveryEvent(); cctx.events() .addPreloadEvent( p, EVT_CACHE_REBALANCE_PART_DATA_LOST, discoEvt.eventNode(), discoEvt.type(), discoEvt.timestamp()); } if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart); } else if (log.isDebugEnabled()) log.debug( "Will not own partition (there are owners to rebalance from) [locPart=" + locPart + ", owners=" + owners + ']'); } else updateLocal(p, loc.id(), locPart.state(), updateSeq); } } else { if (locPart != null) { GridDhtPartitionState state = locPart.state(); if (state == MOVING) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); changed = true; if (log.isDebugEnabled()) log.debug("Evicting moving partition (it does not belong to affinity): " + locPart); } } } } consistencyCheck(); } finally { lock.writeLock().unlock(); } return changed; }
/** @throws InterruptedException If interrupted. */ @SuppressWarnings("BusyWait") protected void awaitPartitionMapExchange() throws InterruptedException { for (Ignite g : G.allGrids()) { IgniteKernal g0 = (IgniteKernal) g; for (IgniteCacheProxy<?, ?> c : g0.context().cache().jcaches()) { CacheConfiguration cfg = c.context().config(); if (cfg.getCacheMode() == PARTITIONED && cfg.getRebalanceMode() != NONE && g.cluster().nodes().size() > 1) { AffinityFunction aff = cfg.getAffinity(); GridDhtCacheAdapter<?, ?> dht = dht(c); GridDhtPartitionTopology top = dht.topology(); for (int p = 0; p < aff.partitions(); p++) { long start = 0; for (int i = 0; ; i++) { boolean match = false; AffinityTopologyVersion readyVer = dht.context().shared().exchange().readyAffinityVersion(); if (readyVer.topologyVersion() > 0 && c.context().started()) { // Must map on updated version of topology. Collection<ClusterNode> affNodes = g0.affinity(cfg.getName()).mapPartitionToPrimaryAndBackups(p); int exp = affNodes.size(); GridDhtTopologyFuture topFut = top.topologyVersionFuture(); Collection<ClusterNode> owners = (topFut != null && topFut.isDone()) ? top.nodes(p, AffinityTopologyVersion.NONE) : Collections.<ClusterNode>emptyList(); int actual = owners.size(); if (affNodes.size() != owners.size() || !affNodes.containsAll(owners)) { LT.warn( log(), null, "Waiting for topology map update [" + "grid=" + g.name() + ", cache=" + cfg.getName() + ", cacheId=" + dht.context().cacheId() + ", topVer=" + top.topologyVersion() + ", topFut=" + topFut + ", p=" + p + ", affNodesCnt=" + exp + ", ownersCnt=" + actual + ", affNodes=" + affNodes + ", owners=" + owners + ", locNode=" + g.cluster().localNode() + ']'); } else match = true; } else { LT.warn( log(), null, "Waiting for topology map update [" + "grid=" + g.name() + ", cache=" + cfg.getName() + ", cacheId=" + dht.context().cacheId() + ", topVer=" + top.topologyVersion() + ", started=" + dht.context().started() + ", p=" + p + ", readVer=" + readyVer + ", locNode=" + g.cluster().localNode() + ']'); } if (!match) { if (i == 0) start = System.currentTimeMillis(); if (System.currentTimeMillis() - start > 30_000) throw new IgniteException( "Timeout of waiting for topology map update [" + "grid=" + g.name() + ", cache=" + cfg.getName() + ", cacheId=" + dht.context().cacheId() + ", topVer=" + top.topologyVersion() + ", p=" + p + ", readVer=" + readyVer + ", locNode=" + g.cluster().localNode() + ']'); Thread.sleep(200); // Busy wait. continue; } if (i > 0) log() .warning( "Finished waiting for topology map update [grid=" + g.name() + ", p=" + p + ", duration=" + (System.currentTimeMillis() - start) + "ms]"); break; } } } } } }
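// A minimal standalone sketch (not Ignite code) of the polling pattern awaitPartitionMapExchange()
// applies per partition: re-evaluate a condition every 200 ms and give up after a fixed timeout.
// The 30-second limit in the helper above corresponds to the timeoutMs argument here.
import java.util.function.BooleanSupplier;

final class BusyWaitSketch {
    /** Polls {@code cond} every 200 ms until it holds; fails after {@code timeoutMs}. */
    static void waitForCondition(BooleanSupplier cond, long timeoutMs) throws InterruptedException {
        long start = System.currentTimeMillis();

        while (!cond.getAsBoolean()) {
            if (System.currentTimeMillis() - start > timeoutMs)
                throw new IllegalStateException("Timed out waiting for condition after " + timeoutMs + " ms");

            Thread.sleep(200); // Busy wait, matching the poll interval of the original helper.
        }
    }
}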
/** * Starts activity. * * @throws IgniteInterruptedCheckedException If interrupted. */ public void init() throws IgniteInterruptedCheckedException { if (isDone()) return; if (init.compareAndSet(false, true)) { if (isDone()) return; try { // Wait for event to occur to make sure that discovery // will return corresponding nodes. U.await(evtLatch); assert discoEvt != null : this; assert !dummy && !forcePreload : this; ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx, exchId.topologyVersion()); oldestNode.set(oldest); startCaches(); // True if the event was caused by a client node (join/leave/fail) or a client-only cache start. boolean clientNodeEvt; if (F.isEmpty(reqs)) { int type = discoEvt.type(); assert type == EVT_NODE_JOINED || type == EVT_NODE_LEFT || type == EVT_NODE_FAILED : discoEvt; clientNodeEvt = CU.clientNode(discoEvt.eventNode()); } else { assert discoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT : discoEvt; boolean clientOnlyStart = true; for (DynamicCacheChangeRequest req : reqs) { if (!req.clientStartOnly()) { clientOnlyStart = false; break; } } clientNodeEvt = clientOnlyStart; } if (clientNodeEvt) { ClusterNode node = discoEvt.eventNode(); // Client nodes need to initialize affinity for the local join event or for started client caches. if (!node.isLocal()) { for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; GridDhtPartitionTopology top = cacheCtx.topology(); top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId())); if (cacheCtx.affinity().affinityTopologyVersion() == AffinityTopologyVersion.NONE) { initTopology(cacheCtx); top.beforeExchange(this); } else cacheCtx.affinity().clientEventTopologyChange(discoEvt, exchId.topologyVersion()); } if (exchId.isLeft()) cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion()); onDone(exchId.topologyVersion()); skipPreload = cctx.kernalContext().clientNode(); return; } } if (cctx.kernalContext().clientNode()) { skipPreload = true; for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; GridDhtPartitionTopology top = cacheCtx.topology(); top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId())); } for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; initTopology(cacheCtx); } if (oldestNode.get() != null) { rmtNodes = new ConcurrentLinkedQueue<>( CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion())); rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes))); ready.set(true); initFut.onDone(true); if (log.isDebugEnabled()) log.debug("Initialized future: " + this); sendPartitions(); } else onDone(exchId.topologyVersion()); return; } assert oldestNode.get() != null; for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (isCacheAdded(cacheCtx.cacheId(), exchId.topologyVersion())) { if (cacheCtx .discovery() .cacheAffinityNodes(cacheCtx.name(), topologyVersion()) .isEmpty()) U.quietAndWarn(log, "No server nodes found for cache client: " + cacheCtx.namex()); } cacheCtx.preloader().onExchangeFutureAdded(); } List<String> cachesWithoutNodes = null; if (exchId.isLeft()) { for (String name : cctx.cache().cacheNames()) { if (cctx.discovery().cacheAffinityNodes(name, topologyVersion()).isEmpty()) { if (cachesWithoutNodes == null) cachesWithoutNodes = new ArrayList<>(); cachesWithoutNodes.add(name); // Fire event even if there is no client cache started.
if (cctx.gridEvents().isRecordable(EventType.EVT_CACHE_NODES_LEFT)) { Event evt = new CacheEvent( name, cctx.localNode(), cctx.localNode(), "All server nodes have left the cluster.", EventType.EVT_CACHE_NODES_LEFT, 0, false, null, null, null, null, false, null, false, null, null, null); cctx.gridEvents().record(evt); } } } } if (cachesWithoutNodes != null) { StringBuilder sb = new StringBuilder( "All server nodes for the following caches have left the cluster: "); for (int i = 0; i < cachesWithoutNodes.size(); i++) { String cache = cachesWithoutNodes.get(i); sb.append('\'').append(cache).append('\''); if (i != cachesWithoutNodes.size() - 1) sb.append(", "); } U.quietAndWarn(log, sb.toString()); U.quietAndWarn(log, "Must have server nodes for caches to operate."); } assert discoEvt != null; assert exchId.nodeId().equals(discoEvt.eventNode().id()); for (GridCacheContext cacheCtx : cctx.cacheContexts()) { GridClientPartitionTopology clientTop = cctx.exchange().clearClientTopology(cacheCtx.cacheId()); long updSeq = clientTop == null ? -1 : clientTop.lastUpdateSequence(); // Update before waiting for locks. if (!cacheCtx.isLocal()) cacheCtx .topology() .updateTopologyVersion(exchId, this, updSeq, stopping(cacheCtx.cacheId())); } // Grab all alive remote nodes with order of equal or less than last joined node. rmtNodes = new ConcurrentLinkedQueue<>( CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion())); rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes))); for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> m : singleMsgs.entrySet()) // If received any messages, process them. onReceive(m.getKey(), m.getValue()); for (Map.Entry<UUID, GridDhtPartitionsFullMessage> m : fullMsgs.entrySet()) // If received any messages, process them. onReceive(m.getKey(), m.getValue()); AffinityTopologyVersion topVer = exchId.topologyVersion(); for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; // Must initialize topology after we get discovery event. initTopology(cacheCtx); cacheCtx.preloader().updateLastExchangeFuture(this); } IgniteInternalFuture<?> partReleaseFut = cctx.partitionReleaseFuture(topVer); // Assign to class variable so it will be included into toString() method. this.partReleaseFut = partReleaseFut; if (log.isDebugEnabled()) log.debug("Before waiting for partition release future: " + this); while (true) { try { partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { // Print pending transactions and locks that might have led to hang. dumpPendingObjects(); } } if (log.isDebugEnabled()) log.debug("After waiting for partition release future: " + this); if (!F.isEmpty(reqs)) blockGateways(); if (exchId.isLeft()) cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion()); IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion()); while (true) { try { locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { U.warn( log, "Failed to wait for locks release future. 
" + "Dumping pending objects that might be the cause: " + cctx.localNodeId()); U.warn(log, "Locked entries:"); Map<IgniteTxKey, Collection<GridCacheMvccCandidate>> locks = cctx.mvcc().unfinishedLocks(exchId.topologyVersion()); for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet()) U.warn(log, "Locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']'); } } for (GridCacheContext cacheCtx : cctx.cacheContexts()) { if (cacheCtx.isLocal()) continue; // Notify replication manager. GridCacheContext drCacheCtx = cacheCtx.isNear() ? cacheCtx.near().dht().context() : cacheCtx; if (drCacheCtx.isDrEnabled()) drCacheCtx.dr().beforeExchange(topVer, exchId.isLeft()); // Partition release future is done so we can flush the write-behind store. cacheCtx.store().forceFlush(); // Process queued undeploys prior to sending/spreading map. cacheCtx.preloader().unwindUndeploys(); GridDhtPartitionTopology top = cacheCtx.topology(); assert topVer.equals(top.topologyVersion()) : "Topology version is updated only in this class instances inside single ExchangeWorker thread."; top.beforeExchange(this); } for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) { top.updateTopologyVersion(exchId, this, -1, stopping(top.cacheId())); top.beforeExchange(this); } } catch (IgniteInterruptedCheckedException e) { onDone(e); throw e; } catch (Throwable e) { U.error( log, "Failed to reinitialize local partitions (preloading will be stopped): " + exchId, e); onDone(e); if (e instanceof Error) throw (Error) e; return; } if (F.isEmpty(rmtIds)) { onDone(exchId.topologyVersion()); return; } ready.set(true); initFut.onDone(true); if (log.isDebugEnabled()) log.debug("Initialized future: " + this); // If this node is not oldest. if (!oldestNode.get().id().equals(cctx.localNodeId())) sendPartitions(); else { boolean allReceived = allReceived(); if (allReceived && replied.compareAndSet(false, true)) { if (spreadPartitions()) onDone(exchId.topologyVersion()); } } scheduleRecheck(); } else assert false : "Skipped init future: " + this; }
/** * @param keys Keys. * @param mapped Mappings to check for duplicates. * @param topVer Topology version on which keys should be mapped. */ private void map( Collection<KeyCacheObject> keys, Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mapped, AffinityTopologyVersion topVer) { Collection<ClusterNode> cacheNodes = CU.affinityNodes(cctx, topVer); if (cacheNodes.isEmpty()) { onDone( new ClusterTopologyServerNotFoundException( "Failed to map keys for cache " + "(all partition nodes left the grid) [topVer=" + topVer + ", cache=" + cctx.name() + ']')); return; } Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mappings = U.newHashMap(cacheNodes.size()); final int keysSize = keys.size(); Map<K, V> locVals = U.newHashMap(keysSize); boolean hasRmtNodes = false; // Assign keys to primary nodes. for (KeyCacheObject key : keys) hasRmtNodes |= map(key, mappings, locVals, topVer, mapped); if (isDone()) return; if (!locVals.isEmpty()) add(new GridFinishedFuture<>(locVals)); if (hasRmtNodes) { if (!trackable) { trackable = true; cctx.mvcc().addFuture(this, futId); } } // Create mini futures. for (Map.Entry<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> entry : mappings.entrySet()) { final ClusterNode n = entry.getKey(); final LinkedHashMap<KeyCacheObject, Boolean> mappedKeys = entry.getValue(); assert !mappedKeys.isEmpty(); // If this is the primary or backup node for the keys. if (n.isLocal()) { final GridDhtFuture<Collection<GridCacheEntryInfo>> fut = cache() .getDhtAsync( n.id(), -1, mappedKeys, readThrough, topVer, subjId, taskName == null ? 0 : taskName.hashCode(), expiryPlc, skipVals); final Collection<Integer> invalidParts = fut.invalidPartitions(); if (!F.isEmpty(invalidParts)) { Collection<KeyCacheObject> remapKeys = new ArrayList<>(keysSize); for (KeyCacheObject key : keys) { if (key != null && invalidParts.contains(cctx.affinity().partition(key))) remapKeys.add(key); } AffinityTopologyVersion updTopVer = cctx.discovery().topologyVersionEx(); assert updTopVer.compareTo(topVer) > 0 : "Got invalid partitions for local node but topology version did " + "not change [topVer=" + topVer + ", updTopVer=" + updTopVer + ", invalidParts=" + invalidParts + ']'; // Remap recursively. map(remapKeys, mappings, updTopVer); } // Add new future. add( fut.chain( new C1<IgniteInternalFuture<Collection<GridCacheEntryInfo>>, Map<K, V>>() { @Override public Map<K, V> apply(IgniteInternalFuture<Collection<GridCacheEntryInfo>> fut) { try { return createResultMap(fut.get()); } catch (Exception e) { U.error(log, "Failed to get values from dht cache [fut=" + fut + "]", e); onDone(e); return Collections.emptyMap(); } } })); } else { MiniFuture fut = new MiniFuture(n, mappedKeys, topVer); GridCacheMessage req = new GridNearGetRequest( cctx.cacheId(), futId, fut.futureId(), n.version().compareTo(SINGLE_GET_MSG_SINCE) >= 0 ? null : DUMMY_VER, mappedKeys, readThrough, topVer, subjId, taskName == null ? 0 : taskName.hashCode(), expiryPlc != null ? expiryPlc.forAccess() : -1L, skipVals, cctx.deploymentEnabled()); add(fut); // Append new future. try { cctx.io().send(n, req, cctx.ioPolicy()); } catch (IgniteCheckedException e) { // Fail the whole thing. if (e instanceof ClusterTopologyCheckedException) fut.onNodeLeft((ClusterTopologyCheckedException) e); else fut.onResult(e); } } } }
/** * @param topVer Topology version. * @return Future that signals when all locks for given partitions are released. */ @SuppressWarnings({"unchecked"}) public IgniteInternalFuture<?> finishLocks(AffinityTopologyVersion topVer) { assert topVer.compareTo(AffinityTopologyVersion.ZERO) > 0; return finishLocks(null, topVer); }
/** * @param key Key. * @param part Partition. * @param locVals Local values. * @return {@code True} if there is no need to search the value further. */ private boolean localGet(KeyCacheObject key, int part, Map<K, V> locVals) { assert cctx.affinityNode() : this; GridDhtCacheAdapter<K, V> cache = cache(); while (true) { GridCacheEntryEx entry; try { entry = cache.context().isSwapOrOffheapEnabled() ? cache.entryEx(key) : cache.peekEx(key); // If our DHT cache has the value, peek it. if (entry != null) { boolean isNew = entry.isNewLocked(); CacheObject v = null; GridCacheVersion ver = null; if (needVer) { T2<CacheObject, GridCacheVersion> res = entry.innerGetVersioned( null, null, /*swap*/ true, /*unmarshal*/ true, /*update-metrics*/ false, /*event*/ !skipVals, subjId, null, taskName, expiryPlc, !deserializeBinary); if (res != null) { v = res.get1(); ver = res.get2(); } } else { v = entry.innerGet( null, null, /*swap*/ true, /*read-through*/ false, /*update-metrics*/ false, /*event*/ !skipVals, /*temporary*/ false, subjId, null, taskName, expiryPlc, !deserializeBinary); } cache.context().evicts().touch(entry, topVer); // Entry was not in memory or in swap, so we remove it from cache. if (v == null) { if (isNew && entry.markObsoleteIfEmpty(ver)) cache.removeEntry(entry); } else { cctx.addResult( locVals, key, v, skipVals, keepCacheObjects, deserializeBinary, true, ver); return true; } } boolean topStable = cctx.isReplicated() || topVer.equals(cctx.topology().topologyVersion()); // Entry not found, do not continue search if topology did not change and there is no store. if (!cctx.readThroughConfigured() && (topStable || partitionOwned(part))) { if (!skipVals && cctx.config().isStatisticsEnabled()) cache.metrics0().onRead(false); return true; } return false; } catch (GridCacheEntryRemovedException ignored) { // No-op, will retry. } catch (GridDhtInvalidPartitionException ignored) { return false; } catch (IgniteCheckedException e) { onDone(e); return true; } } }
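// A minimal standalone sketch (not Ignite code) of the retry loop localGet() above is built around:
// peek a local entry and, if it was removed concurrently (GridCacheEntryRemovedException in the
// original), loop and read a fresh entry instead of failing the get. LocalStore and
// EntryRemovedException are hypothetical stand-ins.
final class LocalGetRetrySketch {
    /** Hypothetical stand-in for GridCacheEntryRemovedException. */
    static final class EntryRemovedException extends Exception {
    }

    /** Hypothetical stand-in for the local cache view. */
    interface LocalStore<K, V> {
        /**
         * @return Value, or {@code null} if absent locally.
         * @throws EntryRemovedException If the entry was removed concurrently.
         */
        V peek(K key) throws EntryRemovedException;
    }

    /** @return Locally stored value, or {@code null} if the caller must fall back to a remote get. */
    static <K, V> V localGet(LocalStore<K, V> store, K key) {
        while (true) {
            try {
                return store.peek(key);
            }
            catch (EntryRemovedException ignored) {
                // No-op, will retry with a fresh entry, exactly as the original loop does.
            }
        }
    }
}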
/** {@inheritDoc} */ @Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException { waitForRent(); ClusterNode loc = cctx.localNode(); int num = cctx.affinity().partitions(); lock.writeLock().lock(); try { GridDhtPartitionExchangeId exchId = exchFut.exchangeId(); if (stopping) return; assert topVer.equals(exchId.topologyVersion()) : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']'; if (exchId.isLeft()) removeNode(exchId.nodeId()); // In case a node joins, get topology at the time of the joining node. ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer); assert oldest != null; if (log.isDebugEnabled()) log.debug( "Partition map beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); long updateSeq = this.updateSeq.incrementAndGet(); // If this is the oldest node. if (oldest.id().equals(loc.id()) || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) { if (node2part == null) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq); if (log.isDebugEnabled()) log.debug( "Created brand new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } else if (!node2part.valid()) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Created new full topology map on oldest node [exchId=" + exchId + ", fullMap=" + node2part + ']'); } else if (!node2part.nodeId().equals(loc.id())) { node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false); if (log.isDebugEnabled()) log.debug( "Copied old map into new map on oldest node (previous oldest node left) [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } } if (cctx.rebalanceEnabled()) { for (int p = 0; p < num; p++) { // If this is the first node in the grid. boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion()); if ((oldest.id().equals(loc.id()) && oldest.id().equals(exchId.nodeId()) && exchId.isJoined()) || added) { assert exchId.isJoined() || added; try { GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); assert locPart != null; boolean owned = locPart.own(); assert owned : "Failed to own partition for oldest node [cacheName=" + cctx.name() + ", part=" + locPart + ']'; if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition on oldest node (no need to create a partition " + "if it no longer belongs to local node: " + e.partition()); } } // If this is not the first node in the grid. else { if (node2part != null && node2part.valid()) { if (cctx.affinity().localNode(p, topVer)) { try { // This will make sure that all non-existing partitions // will be created in MOVING state. GridDhtLocalPartition locPart = localPartition(p, topVer, true, false); updateLocal(p, loc.id(), locPart.state(), updateSeq); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } // If this node's map is empty, we pre-create local partitions, // so local map will be sent correctly during exchange.
else if (cctx.affinity().localNode(p, topVer)) { try { localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition (no need to pre-create a partition if it " + "no longer belongs to local node: " + e.partition()); } } } } } else { // If preloader is disabled, then we simply clear out // the partitions this node is not responsible for. for (int p = 0; p < num; p++) { GridDhtLocalPartition locPart = localPartition(p, topVer, false, false); boolean belongs = cctx.affinity().localNode(p, topVer); if (locPart != null) { if (!belongs) { GridDhtPartitionState state = locPart.state(); if (state.active()) { locPart.rent(false); updateLocal(p, loc.id(), locPart.state(), updateSeq); if (log.isDebugEnabled()) log.debug( "Evicting partition with rebalancing disabled " + "(it does not belong to affinity): " + locPart); } } } else if (belongs) { try { // Pre-create partitions. localPartition(p, topVer, true, false); } catch (GridDhtInvalidPartitionException e) { if (log.isDebugEnabled()) log.debug( "Ignoring invalid partition with disabled rebalancer (no need to " + "pre-create a partition if it no longer belongs to local node: " + e.partition()); } } } } if (node2part != null && node2part.valid()) checkEvictions(updateSeq); consistencyCheck(); if (log.isDebugEnabled()) log.debug( "Partition map after beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']'); } finally { lock.writeLock().unlock(); } // Wait for evictions. waitForRent(); }
/** @param res Result callback. */ @SuppressWarnings("ThrowableResultOfMethodCallIgnored") void onResult(final GridNearGetResponse res) { final Collection<Integer> invalidParts = res.invalidPartitions(); // If error happened on remote node, fail the whole future. if (res.error() != null) { onDone(res.error()); return; } // Remap invalid partitions. if (!F.isEmpty(invalidParts)) { AffinityTopologyVersion rmtTopVer = res.topologyVersion(); assert !rmtTopVer.equals(AffinityTopologyVersion.ZERO); if (rmtTopVer.compareTo(topVer) <= 0) { // Fail the whole get future. onDone( new IgniteCheckedException( "Failed to process invalid partitions response (remote node reported " + "invalid partitions but remote topology version does not differ from local) " + "[topVer=" + topVer + ", rmtTopVer=" + rmtTopVer + ", invalidParts=" + invalidParts + ", nodeId=" + node.id() + ']')); return; } if (log.isDebugEnabled()) log.debug( "Remapping mini get future [invalidParts=" + invalidParts + ", fut=" + this + ']'); if (!canRemap) { map( F.view( keys.keySet(), new P1<KeyCacheObject>() { @Override public boolean apply(KeyCacheObject key) { return invalidParts.contains(cctx.affinity().partition(key)); } }), F.t(node, keys), topVer); onDone(createResultMap(res.entries())); return; } // Need to wait for next topology version to remap. IgniteInternalFuture<AffinityTopologyVersion> topFut = cctx.affinity().affinityReadyFuture(rmtTopVer); topFut.listen( new CIX1<IgniteInternalFuture<AffinityTopologyVersion>>() { @SuppressWarnings("unchecked") @Override public void applyx(IgniteInternalFuture<AffinityTopologyVersion> fut) throws IgniteCheckedException { AffinityTopologyVersion topVer = fut.get(); // This will append new futures to compound list. map( F.view( keys.keySet(), new P1<KeyCacheObject>() { @Override public boolean apply(KeyCacheObject key) { return invalidParts.contains(cctx.affinity().partition(key)); } }), F.t(node, keys), topVer); onDone(createResultMap(res.entries())); } }); } else { try { onDone(createResultMap(res.entries())); } catch (Exception e) { onDone(e); } } }
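// A minimal standalone sketch (not Ignite code) of the decision onResult() above makes when a response
// reports invalid partitions: if the remote topology version is not newer than the local one, remapping
// cannot help and the whole get fails; otherwise wait for affinity on the newer version and remap on it.
// Longs stand in for AffinityTopologyVersion, CompletableFuture for the affinity-ready future.
import java.util.concurrent.CompletableFuture;
import java.util.function.LongFunction;

final class InvalidPartitionsSketch {
    /** @return Future completing with the version to remap on, or failing if a remap cannot help. */
    static CompletableFuture<Long> remapVersion(long locTopVer, long rmtTopVer,
        LongFunction<CompletableFuture<Long>> affinityReadyFor) {
        if (rmtTopVer <= locTopVer) {
            // Remote topology is not newer: the invalid partitions cannot be explained by a topology
            // change, so fail instead of remapping forever.
            CompletableFuture<Long> fail = new CompletableFuture<>();

            fail.completeExceptionally(new IllegalStateException("Invalid partitions reported but " +
                "topology version did not change [locTopVer=" + locTopVer + ", rmtTopVer=" + rmtTopVer + ']'));

            return fail;
        }

        // Wait until affinity for the newer remote version is calculated, then remap on it.
        return affinityReadyFor.apply(rmtTopVer).thenApply(readyVer -> rmtTopVer);
    }
}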