/**
 * Adds key mapping to DHT mapping.
 *
 * @param key Key to add.
 * @param node Node this key is mapped to.
 */
public void addKeyMapping(IgniteTxKey key, ClusterNode node) {
    GridDistributedTxMapping m = mappings.get(node.id());

    if (m == null)
        mappings.put(m = new GridDistributedTxMapping(node));

    IgniteTxEntry txEntry = entry(key);

    assert txEntry != null;

    txEntry.nodeId(node.id());

    m.add(txEntry);

    if (log.isDebugEnabled())
        log.debug("Added mappings to transaction [locId=" + cctx.localNodeId() + ", key=" + key +
            ", node=" + node + ", tx=" + this + ']');
}
/** @param m Mapping. */
private void finish(GridDistributedTxMapping m) {
    ClusterNode n = m.node();

    assert !m.empty();

    GridNearTxFinishRequest req = new GridNearTxFinishRequest(
        futId,
        tx.xidVersion(),
        tx.threadId(),
        commit,
        tx.isInvalidate(),
        tx.system(),
        tx.ioPolicy(),
        tx.syncCommit(),
        tx.syncRollback(),
        m.explicitLock(),
        tx.storeEnabled(),
        tx.topologyVersion(),
        null,
        null,
        null,
        tx.size(),
        tx.subjectId(),
        tx.taskNameHash(),
        tx.activeCachesDeploymentEnabled());

    // If this is the primary node for the keys.
    if (n.isLocal()) {
        req.miniId(IgniteUuid.randomUuid());

        IgniteInternalFuture<IgniteInternalTx> fut = cctx.tm().txHandler().finish(n.id(), tx, req);

        // Add new future.
        if (fut != null)
            add(fut);
    }
    else {
        FinishMiniFuture fut = new FinishMiniFuture(m);

        req.miniId(fut.futureId());

        add(fut); // Append new future.

        if (tx.pessimistic())
            cctx.tm().beforeFinishRemote(n.id(), tx.threadId());

        try {
            cctx.io().send(n, req, tx.ioPolicy());

            // If we don't wait for result, then mark future as done.
            if (!isSync() && !m.explicitLock())
                fut.onDone();
        }
        catch (ClusterTopologyCheckedException e) {
            // Remove previous mapping.
            mappings.remove(m.node().id());

            fut.onNodeLeft(n.id());
        }
        catch (IgniteCheckedException e) {
            // Fail the whole thing.
            fut.onDone(e);
        }
    }
}
private void sendPartitions() {
    ClusterNode oldestNode = this.oldestNode.get();

    try {
        sendLocalPartitions(oldestNode, exchId);
    }
    catch (ClusterTopologyCheckedException ignore) {
        if (log.isDebugEnabled())
            log.debug("Oldest node left during partition exchange [nodeId=" + oldestNode.id() +
                ", exchId=" + exchId + ']');
    }
    catch (IgniteCheckedException e) {
        scheduleRecheck();

        U.error(log, "Failed to send local partitions to oldest node (will retry after timeout) [oldestNodeId=" +
            oldestNode.id() + ", exchId=" + exchId + ']', e);
    }
}
/** {@inheritDoc} */
@Override public UUID localNodeId() {
    if (locNode != null)
        return locNode.id();

    if (discoMgr != null)
        locNode = discoMgr.localNode();

    return locNode != null ? locNode.id() : config().getNodeId();
}
/**
 * Set source nodes.
 *
 * @param nodes Nodes.
 */
public void setSources(Collection<ClusterNode> nodes) {
    assert remainingRows == null;

    remainingRows = U.newHashMap(nodes.size());

    for (ClusterNode node : nodes) {
        if (remainingRows.put(node.id(), new Counter()) != null)
            throw new IllegalStateException("Duplicate node id: " + node.id());
    }
}
/**
 * Converts collection of rich nodes to block location data.
 *
 * @param nodes Collection of affinity nodes.
 */
private void convertFromNodes(Collection<ClusterNode> nodes) {
    Collection<String> names = new LinkedHashSet<>();
    Collection<String> hosts = new LinkedHashSet<>();
    Collection<UUID> nodeIds = new ArrayList<>(nodes.size());

    for (final ClusterNode node : nodes) {
        // Normalize host names into Hadoop-expected format.
        try {
            Collection<InetAddress> addrs = U.toInetAddresses(node);

            for (InetAddress addr : addrs) {
                if (addr.getHostName() == null)
                    names.add(addr.getHostAddress() + ":" + 9001);
                else {
                    names.add(addr.getHostName() + ":" + 9001); // hostname:portNumber
                    hosts.add(addr.getHostName());
                }
            }
        }
        catch (IgniteCheckedException ignored) {
            names.addAll(node.addresses());
        }

        nodeIds.add(node.id());
    }

    this.nodeIds = nodeIds;
    this.names = names;
    this.hosts = hosts;
}
/** @param maps Mappings. */
void addEntryMapping(@Nullable Collection<GridDistributedTxMapping> maps) {
    if (!F.isEmpty(maps)) {
        for (GridDistributedTxMapping map : maps) {
            ClusterNode n = map.node();

            GridDistributedTxMapping m = mappings.get(n.id());

            if (m == null) {
                mappings.put(m = new GridDistributedTxMapping(n));

                m.near(map.near());

                if (map.explicitLock())
                    m.markExplicitLock();
            }

            for (IgniteTxEntry entry : map.entries())
                m.add(entry);
        }

        if (log.isDebugEnabled())
            log.debug("Added mappings to transaction [locId=" + cctx.localNodeId() + ", mappings=" + maps +
                ", tx=" + this + ']');
    }
}
/**
 * Sends query request.
 *
 * @param fut Distributed future.
 * @param req Request.
 * @param nodes Nodes.
 * @throws IgniteCheckedException In case of error.
 */
@SuppressWarnings("unchecked")
private void sendRequest(final GridCacheDistributedQueryFuture<?, ?, ?> fut,
    final GridCacheQueryRequest req,
    Collection<ClusterNode> nodes) throws IgniteCheckedException {
    assert fut != null;
    assert req != null;
    assert nodes != null;

    final UUID locNodeId = cctx.localNodeId();

    ClusterNode locNode = null;

    Collection<ClusterNode> rmtNodes = null;

    for (ClusterNode n : nodes) {
        if (n.id().equals(locNodeId))
            locNode = n;
        else {
            if (rmtNodes == null)
                rmtNodes = new ArrayList<>(nodes.size());

            rmtNodes.add(n);
        }
    }

    // Request should be sent to remote nodes before the query is processed on the local node.
    // For example, a remote reducer has state, so we should not serialize and send
    // a reducer that has already been changed by the local node.
    if (!F.isEmpty(rmtNodes)) {
        cctx.io().safeSend(rmtNodes, req, cctx.ioPolicy(), new P1<ClusterNode>() {
            @Override public boolean apply(ClusterNode node) {
                fut.onNodeLeft(node.id());

                return !fut.isDone();
            }
        });
    }

    if (locNode != null) {
        cctx.closures().callLocalSafe(new Callable<Object>() {
            @Override public Object call() throws Exception {
                req.beforeLocalExecution(cctx);

                processQueryRequest(locNodeId, req);

                return null;
            }
        });
    }
}
/** @param futs Futures to complete. */
private void completeOnNodeLeft(GridNioFuture<?>[] futs) {
    for (GridNioFuture<?> msg : futs) {
        IOException e = new IOException("Failed to send message, node has left: " + node.id());

        ((GridNioFutureImpl)msg).onDone(e);

        if (msg.ackClosure() != null)
            msg.ackClosure().apply(new IgniteException(e));
    }
}
/** {@inheritDoc} */
@Override public Map<? extends ComputeJob, ClusterNode> map(List<ClusterNode> subgrid, UUID arg) {
    assert arg != null;
    assert subgrid.size() > 1 : "Test requires at least 2 nodes. One with load and another one to steal.";

    int jobsNum = subgrid.size();

    Map<GridStealingLoadTestJob, ClusterNode> map = new HashMap<>(jobsNum);

    stealingNodeId = arg;

    Iterator<ClusterNode> iter = subgrid.iterator();

    Collection<UUID> assigned = new ArrayList<>(subgrid.size());

    for (int i = 0; i < jobsNum; i++) {
        ClusterNode node = null;

        boolean nextNodeFound = false;

        while (iter.hasNext() && !nextNodeFound) {
            node = iter.next();

            // Do not map jobs to the stealing node.
            if (!node.id().equals(stealingNodeId))
                nextNodeFound = true;

            // Recycle iterator.
            if (!iter.hasNext())
                iter = subgrid.iterator();
        }

        assert node != null;

        assigned.add(node.id());

        map.put(new GridStealingLoadTestJob(), node);
    }

    taskSes.setAttribute("nodes", assigned);

    return map;
}
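// A minimal, self-contained sketch (not Ignite API) of the assignment loop above: distribute
// jobs round-robin over all nodes except one excluded ("stealing") node, recycling the iterator
// when it runs out. Names here (SkipNodeAssignSketch, assign) are illustrative only; like the
// original, it assumes at least one node other than the excluded one is present.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SkipNodeAssignSketch {
    /** @return Per-job node assignments, never containing {@code excluded}. */
    static <T> List<T> assign(int jobs, List<T> nodes, T excluded) {
        List<T> res = new ArrayList<>(jobs);

        Iterator<T> it = nodes.iterator();

        for (int i = 0; i < jobs; i++) {
            T pick = null;

            while (pick == null) {
                if (!it.hasNext())
                    it = nodes.iterator(); // Recycle iterator.

                T node = it.next();

                // Do not assign jobs to the excluded node.
                if (!node.equals(excluded))
                    pick = node;
            }

            res.add(pick);
        }

        return res;
    }
}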
/** {@inheritDoc} */
@Override boolean onNodeLeft(UUID nodeId) {
    if (nodeId.equals(backup.id())) {
        readyNearMappingFromBackup(m);

        onDone(new ClusterTopologyCheckedException("Remote node left grid: " + nodeId));

        return true;
    }

    return false;
}
/** @param nodeId Node to remove. */
private void removeNode(UUID nodeId) {
    assert nodeId != null;
    assert lock.writeLock().isHeldByCurrentThread();

    ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldest != null;

    ClusterNode loc = cctx.localNode();

    if (node2part != null) {
        if (oldest.equals(loc) && !node2part.nodeId().equals(loc.id())) {
            updateSeq.setIfGreater(node2part.updateSequence());

            node2part = new GridDhtPartitionFullMap(loc.id(), loc.order(), updateSeq.incrementAndGet(),
                node2part, false);
        }
        else
            node2part = new GridDhtPartitionFullMap(node2part, node2part.updateSequence());

        part2node = new HashMap<>(part2node);

        GridDhtPartitionMap parts = node2part.remove(nodeId);

        if (parts != null) {
            for (Integer p : parts.keySet()) {
                Set<UUID> nodeIds = part2node.get(p);

                if (nodeIds != null) {
                    nodeIds.remove(nodeId);

                    if (nodeIds.isEmpty())
                        part2node.remove(p);
                }
            }
        }

        consistencyCheck();
    }
}
/**
 * @param r Query run.
 * @param nodes Nodes to check periodically if they are alive.
 * @throws IgniteInterruptedCheckedException If interrupted.
 */
private void awaitAllReplies(QueryRun r, Collection<ClusterNode> nodes)
    throws IgniteInterruptedCheckedException {
    while (!U.await(r.latch, 500, TimeUnit.MILLISECONDS)) {
        for (ClusterNode node : nodes) {
            if (!ctx.discovery().alive(node)) {
                handleNodeLeft(r, node.id());

                assert r.latch.getCount() == 0;

                return;
            }
        }
    }
}
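// A minimal, self-contained sketch (not Ignite code) of the await pattern above: poll a latch
// with a timeout and bail out early when any awaited node is reported dead. The 'alive'
// predicate is a hypothetical stand-in for a discovery-based liveness check.
import java.util.Collection;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;

public class AwaitRepliesSketch {
    /** @return {@code true} if all replies arrived, {@code false} if some node died first. */
    static boolean awaitAllReplies(CountDownLatch latch, Collection<UUID> nodes, Predicate<UUID> alive)
        throws InterruptedException {
        while (!latch.await(500, TimeUnit.MILLISECONDS)) {
            for (UUID node : nodes) {
                if (!alive.test(node))
                    return false; // Node left; the caller should fail or retry the run.
            }
        }

        return true;
    }
}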
/** @return {@code True} if the local node is the job update leader (the node with the smallest order). */
public boolean jobUpdateLeader() {
    long minOrder = Long.MAX_VALUE;
    ClusterNode minOrderNode = null;

    for (ClusterNode node : nodes()) {
        if (node.order() < minOrder) {
            minOrder = node.order();
            minOrderNode = node;
        }
    }

    assert minOrderNode != null;

    return localNodeId().equals(minOrderNode.id());
}
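// A minimal, self-contained sketch (not Ignite API) of the same minimum-order leader election:
// the node with the smallest join order wins. NodeInfo is a hypothetical stand-in for
// ClusterNode; like the original, the method assumes the node collection is non-empty.
import java.util.Collection;
import java.util.List;
import java.util.UUID;

public class MinOrderLeaderSketch {
    /** Hypothetical stand-in for ClusterNode: node ID plus monotonically increasing join order. */
    record NodeInfo(UUID id, long order) {}

    /** @return {@code true} if {@code locId} is the ID of the node with the smallest order. */
    static boolean isLeader(UUID locId, Collection<NodeInfo> nodes) {
        NodeInfo minOrderNode = null;

        for (NodeInfo node : nodes) {
            if (minOrderNode == null || node.order() < minOrderNode.order())
                minOrderNode = node;
        }

        assert minOrderNode != null;

        return locId.equals(minOrderNode.id());
    }

    public static void main(String[] args) {
        UUID a = UUID.randomUUID(), b = UUID.randomUUID();

        List<NodeInfo> nodes = List.of(new NodeInfo(a, 1), new NodeInfo(b, 2));

        System.out.println(isLeader(a, nodes)); // true: 'a' joined first.
        System.out.println(isLeader(b, nodes)); // false
    }
}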
/** {@inheritDoc} */
@Override public String toString() {
    ClusterNode oldestNode = this.oldestNode.get();

    return S.toString(GridDhtPartitionsExchangeFuture.class, this,
        "oldest", oldestNode == null ? "null" : oldestNode.id(),
        "oldestOrder", oldestNode == null ? "null" : oldestNode.order(),
        "evtLatch", evtLatch == null ? "null" : evtLatch.getCount(),
        "remaining", remaining(),
        "super", super.toString());
}
/**
 * Sends delete message to all meta cache nodes in the grid.
 *
 * @param msg Message to send.
 */
private void sendDeleteMessage(IgfsDeleteMessage msg) {
    assert msg != null;

    Collection<ClusterNode> nodes = meta.metaCacheNodes();

    for (ClusterNode node : nodes) {
        try {
            igfsCtx.send(node, topic, msg, GridIoPolicy.SYSTEM_POOL);
        }
        catch (IgniteCheckedException e) {
            U.warn(log, "Failed to send IGFS delete message to node [nodeId=" + node.id() +
                ", msg=" + msg + ", err=" + e.getMessage() + ']');
        }
    }
}
/** {@inheritDoc} */
@Override public boolean own(GridDhtLocalPartition part) {
    ClusterNode loc = cctx.localNode();

    lock.writeLock().lock();

    try {
        if (part.own()) {
            updateLocal(part.id(), loc.id(), part.state(), updateSeq.incrementAndGet());

            consistencyCheck();

            return true;
        }

        consistencyCheck();

        return false;
    }
    finally {
        lock.writeLock().unlock();
    }
}
/**
 * @param node Node.
 * @param id Exchange ID.
 * @throws IgniteCheckedException If failed.
 */
private void sendLocalPartitions(ClusterNode node, @Nullable GridDhtPartitionExchangeId id)
    throws IgniteCheckedException {
    GridDhtPartitionsSingleMessage m = new GridDhtPartitionsSingleMessage(id,
        cctx.kernalContext().clientNode(),
        cctx.versions().last());

    for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
        if (!cacheCtx.isLocal())
            m.addLocalPartitionMap(cacheCtx.cacheId(), cacheCtx.topology().localPartitionMap());
    }

    if (log.isDebugEnabled())
        log.debug("Sending local partitions [nodeId=" + node.id() + ", exchId=" + exchId + ", msg=" + m + ']');

    cctx.io().send(node, m, SYSTEM_POOL);
}
/**
 * Adds future.
 *
 * @param fut Future.
 * @return {@code True} if added.
 */
@SuppressWarnings({"SynchronizationOnLocalVariableOrMethodParameter"})
public boolean addFuture(final GridCacheFuture<?> fut) {
    if (fut.isDone()) {
        fut.markNotTrackable();

        return true;
    }

    if (!fut.trackable())
        return true;

    while (true) {
        Collection<GridCacheFuture<?>> old = futs.putIfAbsent(fut.version(),
            new ConcurrentLinkedDeque8<GridCacheFuture<?>>() {
                /** */
                private int hash;

                {
                    // Make sure that we add future to queue before
                    // adding queue to the map of futures.
                    add(fut);
                }

                @Override public int hashCode() {
                    if (hash == 0)
                        hash = System.identityHashCode(this);

                    return hash;
                }

                @Override public boolean equals(Object obj) {
                    return obj == this;
                }
            });

        if (old != null) {
            boolean empty, dup = false;

            synchronized (old) {
                empty = old.isEmpty();

                if (!empty)
                    dup = old.contains(fut);

                if (!empty && !dup)
                    old.add(fut);
            }

            // Future is being removed, so we force-remove here and try again.
            if (empty) {
                if (futs.remove(fut.version(), old)) {
                    if (log.isDebugEnabled())
                        log.debug("Removed future list from futures map for lock version: " + fut.version());
                }

                continue;
            }

            if (dup) {
                if (log.isDebugEnabled())
                    log.debug("Found duplicate future in futures map (will not add): " + fut);

                return false;
            }
        }

        // Handle version mappings.
        if (fut instanceof GridCacheMappedVersion) {
            GridCacheVersion from = ((GridCacheMappedVersion)fut).mappedVersion();

            if (from != null)
                mapVersion(from, fut.version());
        }

        if (log.isDebugEnabled())
            log.debug("Added future to future map: " + fut);

        break;
    }

    // Close the window in case the node is gone before the future got added to
    // the map of futures.
    for (ClusterNode n : fut.nodes()) {
        if (cctx.discovery().node(n.id()) == null)
            fut.onNodeLeft(n.id());
    }

    // Just in case the future was completed before it was added.
    if (fut.isDone())
        removeFuture(fut);
    else
        onFutureAdded(fut);

    return true;
}
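// A minimal, self-contained sketch (not Ignite code) of the register-or-retry idea behind
// addFuture(): optimistically publish a pre-filled queue with putIfAbsent(); if a queue is
// already mapped, append under its monitor, and treat an empty queue as a concurrent removal
// in flight that forces a retry. Class and method names here are illustrative only.
import java.util.Deque;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.ConcurrentMap;

public class RegisterOrRetrySketch<K, V> {
    private final ConcurrentMap<K, Deque<V>> map = new ConcurrentHashMap<>();

    /** @return {@code false} if {@code val} was already registered under {@code key}. */
    public boolean register(K key, V val) {
        while (true) {
            Deque<V> created = new ConcurrentLinkedDeque<>();

            // Add before publishing so the queue is never observed empty by readers.
            created.add(val);

            Deque<V> old = map.putIfAbsent(key, created);

            if (old == null)
                return true;

            synchronized (old) {
                if (old.isEmpty()) {
                    // Concurrent removal in progress: drop the stale mapping and retry.
                    map.remove(key, old);

                    continue;
                }

                if (old.contains(val))
                    return false; // Duplicate registration.

                old.add(val);

                return true;
            }
        }
    }
}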
/** {@inheritDoc} */
@Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException {
    boolean changed = waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    AffinityTopologyVersion topVer = exchFut.topologyVersion();

    lock.writeLock().lock();

    try {
        if (stopping)
            return false;

        assert topVer.equals(exchFut.topologyVersion()) :
            "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']';

        if (log.isDebugEnabled())
            log.debug("Partition map before afterExchange [exchId=" + exchFut.exchangeId() +
                ", fullMap=" + fullMapString() + ']');

        long updateSeq = this.updateSeq.incrementAndGet();

        for (int p = 0; p < num; p++) {
            GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

            if (cctx.affinity().localNode(p, topVer)) {
                // This partition will be created during next topology event,
                // which obviously has not happened at this point.
                if (locPart == null) {
                    if (log.isDebugEnabled())
                        log.debug("Skipping local partition afterExchange (will not create): " + p);

                    continue;
                }

                GridDhtPartitionState state = locPart.state();

                if (state == MOVING) {
                    if (cctx.rebalanceEnabled()) {
                        Collection<ClusterNode> owners = owners(p);

                        // If there are no other owners, then become an owner.
                        if (F.isEmpty(owners)) {
                            boolean owned = locPart.own();

                            assert owned : "Failed to own partition [cacheName=" + cctx.name() +
                                ", locPart=" + locPart + ']';

                            updateLocal(p, loc.id(), locPart.state(), updateSeq);

                            changed = true;

                            if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) {
                                DiscoveryEvent discoEvt = exchFut.discoveryEvent();

                                cctx.events().addPreloadEvent(p,
                                    EVT_CACHE_REBALANCE_PART_DATA_LOST, discoEvt.eventNode(),
                                    discoEvt.type(), discoEvt.timestamp());
                            }

                            if (log.isDebugEnabled())
                                log.debug("Owned partition: " + locPart);
                        }
                        else if (log.isDebugEnabled())
                            log.debug("Will not own partition (there are owners to rebalance from) [locPart=" +
                                locPart + ", owners = " + owners + ']');
                    }
                    else
                        updateLocal(p, loc.id(), locPart.state(), updateSeq);
                }
            }
            else {
                if (locPart != null) {
                    GridDhtPartitionState state = locPart.state();

                    if (state == MOVING) {
                        locPart.rent(false);

                        updateLocal(p, loc.id(), locPart.state(), updateSeq);

                        changed = true;

                        if (log.isDebugEnabled())
                            log.debug("Evicting moving partition (it does not belong to affinity): " + locPart);
                    }
                }
            }
        }

        consistencyCheck();
    }
    finally {
        lock.writeLock().unlock();
    }

    return changed;
}
/** {@inheritDoc} */
@Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException {
    waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    lock.writeLock().lock();

    try {
        GridDhtPartitionExchangeId exchId = exchFut.exchangeId();

        if (stopping)
            return;

        assert topVer.equals(exchId.topologyVersion()) :
            "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']';

        if (exchId.isLeft())
            removeNode(exchId.nodeId());

        // In case a node joins, get topology at the time of the joining node.
        ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

        assert oldest != null;

        if (log.isDebugEnabled())
            log.debug("Partition map beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']');

        long updateSeq = this.updateSeq.incrementAndGet();

        // If this is the oldest node.
        if (oldest.id().equals(loc.id()) || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) {
            if (node2part == null) {
                node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq);

                if (log.isDebugEnabled())
                    log.debug("Created brand new full topology map on oldest node [exchId=" + exchId +
                        ", fullMap=" + fullMapString() + ']');
            }
            else if (!node2part.valid()) {
                node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

                if (log.isDebugEnabled())
                    log.debug("Created new full topology map on oldest node [exchId=" + exchId +
                        ", fullMap=" + node2part + ']');
            }
            else if (!node2part.nodeId().equals(loc.id())) {
                node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

                if (log.isDebugEnabled())
                    log.debug("Copied old map into new map on oldest node (previous oldest node left) [exchId=" +
                        exchId + ", fullMap=" + fullMapString() + ']');
            }
        }

        if (cctx.rebalanceEnabled()) {
            for (int p = 0; p < num; p++) {
                // If this is the first node in grid.
                boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion());

                if ((oldest.id().equals(loc.id()) && oldest.id().equals(exchId.nodeId()) && exchId.isJoined()) ||
                    added) {
                    assert exchId.isJoined() || added;

                    try {
                        GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

                        assert locPart != null;

                        boolean owned = locPart.own();

                        assert owned : "Failed to own partition for oldest node [cacheName=" + cctx.name() +
                            ", part=" + locPart + ']';

                        if (log.isDebugEnabled())
                            log.debug("Owned partition for oldest node: " + locPart);

                        updateLocal(p, loc.id(), locPart.state(), updateSeq);
                    }
                    catch (GridDhtInvalidPartitionException e) {
                        if (log.isDebugEnabled())
                            log.debug("Ignoring invalid partition on oldest node (no need to create a partition " +
                                "if it no longer belongs to local node): " + e.partition());
                    }
                }
                // If this is not the first node in grid.
                else {
                    if (node2part != null && node2part.valid()) {
                        if (cctx.affinity().localNode(p, topVer)) {
                            try {
                                // This will make sure that all non-existing partitions
                                // will be created in MOVING state.
                                GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

                                updateLocal(p, loc.id(), locPart.state(), updateSeq);
                            }
                            catch (GridDhtInvalidPartitionException e) {
                                if (log.isDebugEnabled())
                                    log.debug("Ignoring invalid partition (no need to create a partition if it " +
                                        "no longer belongs to local node): " + e.partition());
                            }
                        }
                    }
                    // If this node's map is empty, we pre-create local partitions,
                    // so local map will be sent correctly during exchange.
                    else if (cctx.affinity().localNode(p, topVer)) {
                        try {
                            localPartition(p, topVer, true, false);
                        }
                        catch (GridDhtInvalidPartitionException e) {
                            if (log.isDebugEnabled())
                                log.debug("Ignoring invalid partition (no need to pre-create a partition if it " +
                                    "no longer belongs to local node): " + e.partition());
                        }
                    }
                }
            }
        }
        else {
            // If preloader is disabled, then we simply clear out
            // the partitions this node is not responsible for.
            for (int p = 0; p < num; p++) {
                GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

                boolean belongs = cctx.affinity().localNode(p, topVer);

                if (locPart != null) {
                    if (!belongs) {
                        GridDhtPartitionState state = locPart.state();

                        if (state.active()) {
                            locPart.rent(false);

                            updateLocal(p, loc.id(), locPart.state(), updateSeq);

                            if (log.isDebugEnabled())
                                log.debug("Evicting partition with rebalancing disabled " +
                                    "(it does not belong to affinity): " + locPart);
                        }
                    }
                }
                else if (belongs) {
                    try {
                        // Pre-create partitions.
                        localPartition(p, topVer, true, false);
                    }
                    catch (GridDhtInvalidPartitionException e) {
                        if (log.isDebugEnabled())
                            log.debug("Ignoring invalid partition with disabled rebalancer (no need to " +
                                "pre-create a partition if it no longer belongs to local node): " + e.partition());
                    }
                }
            }
        }

        if (node2part != null && node2part.valid())
            checkEvictions(updateSeq);

        consistencyCheck();

        if (log.isDebugEnabled())
            log.debug("Partition map after beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']');
    }
    finally {
        lock.writeLock().unlock();
    }

    // Wait for evictions.
    waitForRent();
}
private void checkBackup() {
    GridDistributedTxMapping mapping = mappings.singleMapping();

    if (mapping != null) {
        UUID nodeId = mapping.node().id();

        Collection<UUID> backups = tx.transactionNodes().get(nodeId);

        if (!F.isEmpty(backups)) {
            assert backups.size() == 1;

            UUID backupId = F.first(backups);

            ClusterNode backup = cctx.discovery().node(backupId);

            // Nothing to do if backup has left the grid.
            if (backup == null) {
                readyNearMappingFromBackup(mapping);

                ClusterTopologyCheckedException cause =
                    new ClusterTopologyCheckedException("Backup node left grid: " + backupId);

                cause.retryReadyFuture(cctx.nextAffinityReadyFuture(tx.topologyVersion()));

                onDone(new IgniteTxRollbackCheckedException("Failed to commit transaction " +
                    "(backup has left grid): " + tx.xidVersion(), cause));
            }
            else {
                final CheckBackupMiniFuture mini = new CheckBackupMiniFuture(backup, mapping);

                add(mini);

                if (backup.isLocal()) {
                    boolean committed = !cctx.tm().addRolledbackTx(tx);

                    readyNearMappingFromBackup(mapping);

                    if (committed) {
                        if (tx.syncCommit()) {
                            GridCacheVersion nearXidVer = tx.nearXidVersion();

                            assert nearXidVer != null : tx;

                            IgniteInternalFuture<?> fut = cctx.tm().remoteTxFinishFuture(nearXidVer);

                            fut.listen(new CI1<IgniteInternalFuture<?>>() {
                                @Override public void apply(IgniteInternalFuture<?> fut) {
                                    mini.onDone(tx);
                                }
                            });

                            return;
                        }

                        mini.onDone(tx);
                    }
                    else {
                        ClusterTopologyCheckedException cause =
                            new ClusterTopologyCheckedException("Primary node left grid: " + nodeId);

                        cause.retryReadyFuture(cctx.nextAffinityReadyFuture(tx.topologyVersion()));

                        mini.onDone(new IgniteTxRollbackCheckedException("Failed to commit transaction " +
                            "(transaction has been rolled back on backup node): " + tx.xidVersion(), cause));
                    }
                }
                else {
                    GridDhtTxFinishRequest finishReq = checkCommittedRequest(mini.futureId());

                    // Preserve old behavior, otherwise response is not sent.
                    if (WAIT_REMOTE_TXS_SINCE.compareTo(backup.version()) > 0)
                        finishReq.syncCommit(true);

                    try {
                        if (FINISH_NEAR_ONE_PHASE_SINCE.compareTo(backup.version()) <= 0)
                            cctx.io().send(backup, finishReq, tx.ioPolicy());
                        else {
                            mini.onDone(new IgniteTxHeuristicCheckedException("Failed to check for tx commit on " +
                                "the backup node (node has an old Ignite version) [rmtNodeId=" + backup.id() +
                                ", ver=" + backup.version() + ']'));
                        }
                    }
                    catch (ClusterTopologyCheckedException e) {
                        mini.onNodeLeft(backupId);
                    }
                    catch (IgniteCheckedException e) {
                        mini.onDone(e);
                    }
                }
            }
        }
        else
            readyNearMappingFromBackup(mapping);
    }
}
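// A minimal, self-contained sketch (not Ignite code) of the version-gating pattern used above
// (WAIT_REMOTE_TXS_SINCE / FINISH_NEAR_ONE_PHASE_SINCE): only rely on newer protocol behavior
// when the remote node's product version is at least the version that introduced it. The Ver
// record and FEATURE_SINCE constant are hypothetical stand-ins for the real version type.
public class VersionGateSketch {
    /** Hypothetical product version, ordered by (major, minor, maintenance). */
    record Ver(int major, int minor, int maint) implements Comparable<Ver> {
        @Override public int compareTo(Ver o) {
            int c = Integer.compare(major, o.major);

            if (c != 0)
                return c;

            c = Integer.compare(minor, o.minor);

            return c != 0 ? c : Integer.compare(maint, o.maint);
        }
    }

    /** Version that introduced the hypothetical feature. */
    static final Ver FEATURE_SINCE = new Ver(1, 5, 0);

    /** @return {@code true} if the remote node understands the newer behavior. */
    static boolean supportsFeature(Ver rmtVer) {
        return FEATURE_SINCE.compareTo(rmtVer) <= 0;
    }
}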
/**
 * @param cctx Cache context.
 * @param qry Query.
 * @param keepPortable Keep portable.
 * @return Cursor.
 */
public Iterator<List<?>> query(GridCacheContext<?, ?> cctx, GridCacheTwoStepQuery qry, boolean keepPortable) {
    for (int attempt = 0;; attempt++) {
        if (attempt != 0) {
            try {
                Thread.sleep(attempt * 10); // Wait for exchange.
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();

                throw new CacheException("Query was interrupted.", e);
            }
        }

        long qryReqId = reqIdGen.incrementAndGet();

        QueryRun r = new QueryRun();

        r.pageSize = qry.pageSize() <= 0 ? GridCacheTwoStepQuery.DFLT_PAGE_SIZE : qry.pageSize();

        r.idxs = new ArrayList<>(qry.mapQueries().size());

        String space = cctx.name();

        r.conn = (JdbcConnection)h2.connectionForSpace(space);

        AffinityTopologyVersion topVer = h2.readyTopologyVersion();

        List<String> extraSpaces = extraSpaces(space, qry.spaces());

        Collection<ClusterNode> nodes;

        // Explicit partition mapping for unstable topology.
        Map<ClusterNode, IntArray> partsMap = null;

        if (isPreloadingActive(cctx, extraSpaces)) {
            if (cctx.isReplicated())
                nodes = replicatedUnstableDataNodes(cctx, extraSpaces);
            else {
                partsMap = partitionedUnstableDataNodes(cctx, extraSpaces);

                nodes = partsMap == null ? null : partsMap.keySet();
            }
        }
        else
            nodes = stableDataNodes(topVer, cctx, extraSpaces);

        if (nodes == null)
            continue; // Retry.

        assert !nodes.isEmpty();

        if (cctx.isReplicated() || qry.explain()) {
            assert qry.explain() || !nodes.contains(ctx.discovery().localNode()) : "We must be on a client node.";

            // Select random data node to run query on replicated data or to get EXPLAIN PLAN from a single node.
            nodes = Collections.singleton(F.rand(nodes));
        }

        int tblIdx = 0;

        final boolean skipMergeTbl = !qry.explain() && qry.skipMergeTable();

        for (GridCacheSqlQuery mapQry : qry.mapQueries()) {
            GridMergeIndex idx;

            if (!skipMergeTbl) {
                GridMergeTable tbl;

                try {
                    tbl = createMergeTable(r.conn, mapQry, qry.explain());
                }
                catch (IgniteCheckedException e) {
                    throw new IgniteException(e);
                }

                idx = tbl.getScanIndex(null);

                fakeTable(r.conn, tblIdx++).setInnerTable(tbl);
            }
            else
                idx = GridMergeIndexUnsorted.createDummy();

            for (ClusterNode node : nodes)
                idx.addSource(node.id());

            r.idxs.add(idx);
        }

        r.latch = new CountDownLatch(r.idxs.size() * nodes.size());

        runs.put(qryReqId, r);

        try {
            if (ctx.clientDisconnected()) {
                throw new CacheException("Query was cancelled, client node disconnected.",
                    new IgniteClientDisconnectedException(ctx.cluster().clientReconnectFuture(),
                        "Client node disconnected."));
            }

            Collection<GridCacheSqlQuery> mapQrys = qry.mapQueries();

            if (qry.explain()) {
                mapQrys = new ArrayList<>(qry.mapQueries().size());

                for (GridCacheSqlQuery mapQry : qry.mapQueries())
                    mapQrys.add(new GridCacheSqlQuery("EXPLAIN " + mapQry.query(), mapQry.parameters()));
            }

            if (nodes.size() != 1 || !F.first(nodes).isLocal()) {
                // Marshall params for remotes.
                Marshaller m = ctx.config().getMarshaller();

                for (GridCacheSqlQuery mapQry : mapQrys)
                    mapQry.marshallParams(m);
            }

            boolean retry = false;

            if (send(nodes,
                new GridQueryRequest(qryReqId, r.pageSize, space, mapQrys, topVer, extraSpaces, null),
                partsMap)) {
                awaitAllReplies(r, nodes);

                Object state = r.state.get();

                if (state != null) {
                    if (state instanceof CacheException) {
                        CacheException err = (CacheException)state;

                        if (err.getCause() instanceof IgniteClientDisconnectedException)
                            throw err;

                        throw new CacheException("Failed to run map query remotely.", err);
                    }

                    if (state instanceof AffinityTopologyVersion) {
                        retry = true;

                        // If a remote node asks us to retry, then we have an outdated full partition map.
                        h2.awaitForReadyTopologyVersion((AffinityTopologyVersion)state);
                    }
                }
            }
            else // Send failed.
                retry = true;

            Iterator<List<?>> resIter = null;

            if (!retry) {
                if (qry.explain())
                    return explainPlan(r.conn, space, qry);

                if (skipMergeTbl) {
                    List<List<?>> res = new ArrayList<>();

                    assert r.idxs.size() == 1 : r.idxs;

                    GridMergeIndex idx = r.idxs.get(0);

                    Cursor cur = idx.findInStream(null, null);

                    while (cur.next()) {
                        Row row = cur.get();

                        int cols = row.getColumnCount();

                        List<Object> resRow = new ArrayList<>(cols);

                        for (int c = 0; c < cols; c++)
                            resRow.add(row.getValue(c).getObject());

                        res.add(resRow);
                    }

                    resIter = res.iterator();
                }
                else {
                    GridCacheSqlQuery rdc = qry.reduceQuery();

                    // Statement caching is prohibited here because we can't guarantee correct merge index reuse.
                    ResultSet res = h2.executeSqlQueryWithTimer(space, r.conn, rdc.query(),
                        F.asList(rdc.parameters()), false);

                    resIter = new Iter(res);
                }
            }

            for (GridMergeIndex idx : r.idxs) {
                if (!idx.fetchedAll()) // We have to explicitly cancel queries on remote nodes.
                    send(nodes, new GridQueryCancelRequest(qryReqId), null);
            }

            if (retry) {
                if (Thread.currentThread().isInterrupted())
                    throw new IgniteInterruptedCheckedException("Query was interrupted.");

                continue;
            }

            return new GridQueryCacheObjectsIterator(resIter, cctx, keepPortable);
        }
        catch (IgniteCheckedException | RuntimeException e) {
            U.closeQuiet(r.conn);

            if (e instanceof CacheException)
                throw (CacheException)e;

            Throwable cause = e;

            if (e instanceof IgniteCheckedException) {
                Throwable disconnectedErr =
                    ((IgniteCheckedException)e).getCause(IgniteClientDisconnectedException.class);

                if (disconnectedErr != null)
                    cause = disconnectedErr;
            }

            throw new CacheException("Failed to run reduce query locally.", cause);
        }
        finally {
            if (!runs.remove(qryReqId, r))
                U.warn(log, "Query run was already removed: " + qryReqId);

            if (!skipMergeTbl) {
                for (int i = 0, mapQrys = qry.mapQueries().size(); i < mapQrys; i++)
                    fakeTable(null, i).setInnerTable(null); // Drop all merge tables.
            }
        }
    }
}
/**
 * @param entry Entry to map.
 * @param val Value to write.
 * @param entryProcessor Entry processor.
 * @param ttl TTL (optional).
 * @param conflictExpireTime Conflict expire time (optional).
 * @param conflictVer Conflict version (optional).
 * @param addPrevVal Flag indicating whether the previous value should be added.
 * @param prevVal Previous value.
 * @param updateCntr Partition update counter.
 */
public void addWriteEntry(GridDhtCacheEntry entry,
    @Nullable CacheObject val,
    EntryProcessor<Object, Object, Object> entryProcessor,
    long ttl,
    long conflictExpireTime,
    @Nullable GridCacheVersion conflictVer,
    boolean addPrevVal,
    @Nullable CacheObject prevVal,
    long updateCntr) {
    AffinityTopologyVersion topVer = updateReq.topologyVersion();

    Collection<ClusterNode> dhtNodes = cctx.dht().topology().nodes(entry.partition(), topVer);

    if (log.isDebugEnabled())
        log.debug("Mapping entry to DHT nodes [nodes=" + U.nodeIds(dhtNodes) + ", entry=" + entry + ']');

    CacheWriteSynchronizationMode syncMode = updateReq.writeSynchronizationMode();

    keys.add(entry.key());

    for (ClusterNode node : dhtNodes) {
        UUID nodeId = node.id();

        if (!nodeId.equals(cctx.localNodeId())) {
            GridDhtAtomicUpdateRequest updateReq = mappings.get(nodeId);

            if (updateReq == null) {
                updateReq = new GridDhtAtomicUpdateRequest(
                    cctx.cacheId(),
                    nodeId,
                    futVer,
                    writeVer,
                    syncMode,
                    topVer,
                    forceTransformBackups,
                    this.updateReq.subjectId(),
                    this.updateReq.taskNameHash(),
                    forceTransformBackups ? this.updateReq.invokeArguments() : null,
                    cctx.deploymentEnabled(),
                    this.updateReq.keepBinary());

                mappings.put(nodeId, updateReq);
            }

            updateReq.addWriteValue(entry.key(),
                val,
                entryProcessor,
                ttl,
                conflictExpireTime,
                conflictVer,
                addPrevVal,
                entry.partition(),
                prevVal,
                updateCntr);
        }
        else if (dhtNodes.size() == 1) {
            try {
                cctx.continuousQueries().onEntryUpdated(
                    entry.key(),
                    val,
                    prevVal,
                    entry.key().internal() || !cctx.userCache(),
                    entry.partition(),
                    true,
                    false,
                    updateCntr,
                    updateReq.topologyVersion());
            }
            catch (IgniteCheckedException e) {
                U.warn(log, "Failed to send continuous query message [key=" + entry.key() + ", newVal=" + val +
                    ", err=" + e + "]");
            }
        }
    }
}
/** {@inheritDoc} */ @SuppressWarnings("unchecked") @Override public void onUtilityCacheStarted() throws IgniteCheckedException { IgniteCacheProxy<Object, Object> proxy = ctx.cache().jcache(CU.UTILITY_CACHE_NAME); boolean old = proxy.context().deploy().ignoreOwnership(true); try { metaDataCache = (IgniteCacheProxy) proxy.withNoRetries(); } finally { proxy.context().deploy().ignoreOwnership(old); } if (clientNode) { assert !metaDataCache.context().affinityNode(); metaCacheQryId = metaDataCache .context() .continuousQueries() .executeInternalQuery( new MetaDataEntryListener(), new MetaDataEntryFilter(), false, true); while (true) { ClusterNode oldestSrvNode = CU.oldestAliveCacheServerNode(ctx.cache().context(), AffinityTopologyVersion.NONE); if (oldestSrvNode == null) break; GridCacheQueryManager qryMgr = metaDataCache.context().queries(); CacheQuery<Map.Entry<PortableMetadataKey, BinaryMetadata>> qry = qryMgr.createScanQuery(new MetaDataPredicate(), null, false); qry.keepAll(false); qry.projection(ctx.cluster().get().forNode(oldestSrvNode)); try { CacheQueryFuture<Map.Entry<PortableMetadataKey, BinaryMetadata>> fut = qry.execute(); Map.Entry<PortableMetadataKey, BinaryMetadata> next; while ((next = fut.next()) != null) { assert next.getKey() != null : next; assert next.getValue() != null : next; addClientCacheMetaData(next.getKey(), next.getValue()); } } catch (IgniteCheckedException e) { if (!ctx.discovery().alive(oldestSrvNode) || !ctx.discovery().pingNode(oldestSrvNode.id())) continue; else throw e; } catch (CacheException e) { if (X.hasCause(e, ClusterTopologyCheckedException.class, ClusterTopologyException.class)) continue; else throw e; } break; } } for (Map.Entry<Integer, BinaryMetadata> e : metaBuf.entrySet()) addMeta(e.getKey(), e.getValue().wrap(portableCtx)); metaBuf.clear(); startLatch.countDown(); }
/**
 * @param keys Keys.
 * @param mapped Mappings to check for duplicates.
 * @param topVer Topology version on which keys should be mapped.
 */
private void map(Collection<KeyCacheObject> keys,
    Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mapped,
    AffinityTopologyVersion topVer) {
    Collection<ClusterNode> cacheNodes = CU.affinityNodes(cctx, topVer);

    if (cacheNodes.isEmpty()) {
        onDone(new ClusterTopologyServerNotFoundException("Failed to map keys for cache " +
            "(all partition nodes left the grid) [topVer=" + topVer + ", cache=" + cctx.name() + ']'));

        return;
    }

    Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mappings = U.newHashMap(cacheNodes.size());

    final int keysSize = keys.size();

    Map<K, V> locVals = U.newHashMap(keysSize);

    boolean hasRmtNodes = false;

    // Assign keys to primary nodes.
    for (KeyCacheObject key : keys)
        hasRmtNodes |= map(key, mappings, locVals, topVer, mapped);

    if (isDone())
        return;

    if (!locVals.isEmpty())
        add(new GridFinishedFuture<>(locVals));

    if (hasRmtNodes) {
        if (!trackable) {
            trackable = true;

            cctx.mvcc().addFuture(this, futId);
        }
    }

    // Create mini futures.
    for (Map.Entry<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> entry : mappings.entrySet()) {
        final ClusterNode n = entry.getKey();

        final LinkedHashMap<KeyCacheObject, Boolean> mappedKeys = entry.getValue();

        assert !mappedKeys.isEmpty();

        // If this is the primary or backup node for the keys.
        if (n.isLocal()) {
            final GridDhtFuture<Collection<GridCacheEntryInfo>> fut = cache().getDhtAsync(n.id(),
                -1,
                mappedKeys,
                readThrough,
                topVer,
                subjId,
                taskName == null ? 0 : taskName.hashCode(),
                expiryPlc,
                skipVals);

            final Collection<Integer> invalidParts = fut.invalidPartitions();

            if (!F.isEmpty(invalidParts)) {
                Collection<KeyCacheObject> remapKeys = new ArrayList<>(keysSize);

                for (KeyCacheObject key : keys) {
                    if (key != null && invalidParts.contains(cctx.affinity().partition(key)))
                        remapKeys.add(key);
                }

                AffinityTopologyVersion updTopVer = cctx.discovery().topologyVersionEx();

                assert updTopVer.compareTo(topVer) > 0 :
                    "Got invalid partitions for local node but topology version did not change [topVer=" + topVer +
                    ", updTopVer=" + updTopVer + ", invalidParts=" + invalidParts + ']';

                // Remap recursively.
                map(remapKeys, mappings, updTopVer);
            }

            // Add new future.
            add(fut.chain(new C1<IgniteInternalFuture<Collection<GridCacheEntryInfo>>, Map<K, V>>() {
                @Override public Map<K, V> apply(IgniteInternalFuture<Collection<GridCacheEntryInfo>> fut) {
                    try {
                        return createResultMap(fut.get());
                    }
                    catch (Exception e) {
                        U.error(log, "Failed to get values from dht cache [fut=" + fut + "]", e);

                        onDone(e);

                        return Collections.emptyMap();
                    }
                }
            }));
        }
        else {
            MiniFuture fut = new MiniFuture(n, mappedKeys, topVer);

            GridCacheMessage req = new GridNearGetRequest(
                cctx.cacheId(),
                futId,
                fut.futureId(),
                n.version().compareTo(SINGLE_GET_MSG_SINCE) >= 0 ? null : DUMMY_VER,
                mappedKeys,
                readThrough,
                topVer,
                subjId,
                taskName == null ? 0 : taskName.hashCode(),
                expiryPlc != null ? expiryPlc.forAccess() : -1L,
                skipVals,
                cctx.deploymentEnabled());

            add(fut); // Append new future.

            try {
                cctx.io().send(n, req, cctx.ioPolicy());
            }
            catch (IgniteCheckedException e) {
                // Fail the whole thing.
                if (e instanceof ClusterTopologyCheckedException)
                    fut.onNodeLeft((ClusterTopologyCheckedException)e);
                else
                    fut.onResult(e);
            }
        }
    }
}
/**
 * @param node Node.
 * @param msg Message.
 */
private void onFail(ClusterNode node, GridQueryFailResponse msg) {
    QueryRun r = runs.get(msg.queryRequestId());

    fail(r, node.id(), msg.error());
}
/**
 * Updates value for single partition.
 *
 * @param p Partition.
 * @param nodeId Node ID.
 * @param state State.
 * @param updateSeq Update sequence.
 */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
private void updateLocal(int p, UUID nodeId, GridDhtPartitionState state, long updateSeq) {
    assert lock.isWriteLockedByCurrentThread();
    assert nodeId.equals(cctx.nodeId());

    // In case if node joins, get topology at the time of joining node.
    ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldest != null;

    // If this node became the oldest node.
    if (oldest.id().equals(cctx.nodeId())) {
        long seq = node2part.updateSequence();

        if (seq != updateSeq) {
            if (seq > updateSeq) {
                if (this.updateSeq.get() < seq) {
                    // Update global counter if necessary.
                    boolean b = this.updateSeq.compareAndSet(this.updateSeq.get(), seq + 1);

                    assert b : "Invalid update sequence [updateSeq=" + updateSeq + ", seq=" + seq +
                        ", curUpdateSeq=" + this.updateSeq.get() + ", node2part=" + node2part.toFullString() + ']';

                    updateSeq = seq + 1;
                }
                else
                    updateSeq = seq;
            }

            node2part.updateSequence(updateSeq);
        }
    }

    GridDhtPartitionMap map = node2part.get(nodeId);

    if (map == null)
        node2part.put(nodeId, map = new GridDhtPartitionMap(nodeId, updateSeq,
            Collections.<Integer, GridDhtPartitionState>emptyMap(), false));

    map.updateSequence(updateSeq);

    map.put(p, state);

    Set<UUID> ids = part2node.get(p);

    if (ids == null)
        part2node.put(p, ids = U.newHashSet(3));

    ids.add(nodeId);
}
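// A minimal, self-contained sketch (not Ignite code) of the "advance a counter to at least a
// given value" intent behind the update-sequence handling above. The original performs a single
// compareAndSet while holding the topology write lock; this standalone variant shows the
// general lock-free CAS loop for the same idea.
import java.util.concurrent.atomic.AtomicLong;

public class MonotonicCounterSketch {
    /** Advances {@code ctr} to {@code target} if it is currently smaller; returns the resulting value. */
    static long advanceTo(AtomicLong ctr, long target) {
        while (true) {
            long cur = ctr.get();

            if (cur >= target)
                return cur;

            if (ctr.compareAndSet(cur, target))
                return target;
        }
    }
}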
/** @param res Result callback. */
@SuppressWarnings("ThrowableResultOfMethodCallIgnored")
void onResult(final GridNearGetResponse res) {
    final Collection<Integer> invalidParts = res.invalidPartitions();

    // If error happened on remote node, fail the whole future.
    if (res.error() != null) {
        onDone(res.error());

        return;
    }

    // Remap invalid partitions.
    if (!F.isEmpty(invalidParts)) {
        AffinityTopologyVersion rmtTopVer = res.topologyVersion();

        assert !rmtTopVer.equals(AffinityTopologyVersion.ZERO);

        if (rmtTopVer.compareTo(topVer) <= 0) {
            // Fail the whole get future.
            onDone(new IgniteCheckedException("Failed to process invalid partitions response (remote node reported " +
                "invalid partitions but remote topology version does not differ from local) [topVer=" + topVer +
                ", rmtTopVer=" + rmtTopVer + ", invalidParts=" + invalidParts + ", nodeId=" + node.id() + ']'));

            return;
        }

        if (log.isDebugEnabled())
            log.debug("Remapping mini get future [invalidParts=" + invalidParts + ", fut=" + this + ']');

        if (!canRemap) {
            map(F.view(keys.keySet(), new P1<KeyCacheObject>() {
                @Override public boolean apply(KeyCacheObject key) {
                    return invalidParts.contains(cctx.affinity().partition(key));
                }
            }), F.t(node, keys), topVer);

            onDone(createResultMap(res.entries()));

            return;
        }

        // Need to wait for next topology version to remap.
        IgniteInternalFuture<AffinityTopologyVersion> topFut = cctx.affinity().affinityReadyFuture(rmtTopVer);

        topFut.listen(new CIX1<IgniteInternalFuture<AffinityTopologyVersion>>() {
            @SuppressWarnings("unchecked")
            @Override public void applyx(IgniteInternalFuture<AffinityTopologyVersion> fut)
                throws IgniteCheckedException {
                AffinityTopologyVersion topVer = fut.get();

                // This will append new futures to compound list.
                map(F.view(keys.keySet(), new P1<KeyCacheObject>() {
                    @Override public boolean apply(KeyCacheObject key) {
                        return invalidParts.contains(cctx.affinity().partition(key));
                    }
                }), F.t(node, keys), topVer);

                onDone(createResultMap(res.entries()));
            }
        });
    }
    else {
        try {
            onDone(createResultMap(res.entries()));
        }
        catch (Exception e) {
            onDone(e);
        }
    }
}
/**
 * @param node Node.
 * @param msg Message.
 */
private void onNextPage(final ClusterNode node, GridQueryNextPageResponse msg) {
    final long qryReqId = msg.queryRequestId();
    final int qry = msg.query();

    final QueryRun r = runs.get(qryReqId);

    if (r == null) // Already finished with error or canceled.
        return;

    final int pageSize = r.pageSize;

    GridMergeIndex idx = r.idxs.get(msg.query());

    GridResultPage page;

    try {
        page = new GridResultPage(ctx, node.id(), msg) {
            @Override public void fetchNextPage() {
                Object errState = r.state.get();

                if (errState != null) {
                    CacheException err0 = errState instanceof CacheException ? (CacheException)errState : null;

                    if (err0 != null && err0.getCause() instanceof IgniteClientDisconnectedException)
                        throw err0;

                    CacheException e = new CacheException("Failed to fetch data from node: " + node.id());

                    if (err0 != null)
                        e.addSuppressed(err0);

                    throw e;
                }

                try {
                    GridQueryNextPageRequest msg0 = new GridQueryNextPageRequest(qryReqId, qry, pageSize);

                    if (node.isLocal())
                        h2.mapQueryExecutor().onMessage(ctx.localNodeId(), msg0);
                    else
                        ctx.io().send(node, GridTopic.TOPIC_QUERY, msg0, QUERY_POOL);
                }
                catch (IgniteCheckedException e) {
                    throw new CacheException("Failed to fetch data from node: " + node.id(), e);
                }
            }
        };
    }
    catch (Exception e) {
        U.error(log, "Error in message.", e);

        fail(r, node.id(), "Error in message.");

        return;
    }

    idx.addPage(page);

    if (msg.retry() != null)
        retry(r, msg.retry(), node.id());
    else if (msg.allRows() != -1) // Only the first page contains row count.
        r.latch.countDown();
}