/**
   * @param nodes Nodes to send the full partition map to.
   * @param id Exchange ID.
   * @throws IgniteCheckedException If failed.
   */
  private void sendAllPartitions(
      Collection<? extends ClusterNode> nodes, GridDhtPartitionExchangeId id)
      throws IgniteCheckedException {
    GridDhtPartitionsFullMessage m =
        new GridDhtPartitionsFullMessage(id, lastVer.get(), id.topologyVersion());

    for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
      if (!cacheCtx.isLocal()) {
        AffinityTopologyVersion startTopVer = cacheCtx.startTopologyVersion();

        boolean ready = startTopVer == null || startTopVer.compareTo(id.topologyVersion()) <= 0;

        if (ready)
          m.addFullPartitionsMap(cacheCtx.cacheId(), cacheCtx.topology().partitionMap(true));
      }
    }

    // It is important that client topologies be added after contexts.
    for (GridClientPartitionTopology top : cctx.exchange().clientTopologies())
      m.addFullPartitionsMap(top.cacheId(), top.partitionMap(true));

    if (log.isDebugEnabled())
      log.debug(
          "Sending full partition map [nodeIds="
              + F.viewReadOnly(nodes, F.node2id())
              + ", exchId="
              + exchId
              + ", msg="
              + m
              + ']');

    cctx.io().safeSend(nodes, m, SYSTEM_POOL, null);
  }
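// A minimal standalone sketch of the readiness check above: a cache contributes its
// partition map to the full message only if it had started by the exchange topology
// version. Plain longs stand in for AffinityTopologyVersion; the class name is
// illustrative only, not Ignite API.
final class CacheReadinessSketch {
  /** @return {@code True} if the cache's partition map should be included in the full message. */
  static boolean includeInFullMessage(Long cacheStartTopVer, long exchangeTopVer) {
    // A null start version means the cache does not restrict the topologies it is ready for.
    return cacheStartTopVer == null || cacheStartTopVer <= exchangeTopVer;
  }
}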
  /**
   * Creates a future that will wait for all explicit locks acquired on given topology version to be
   * released.
   *
   * @param topVer Topology version to wait for.
   * @return Explicit locks release future.
   */
  public IgniteInternalFuture<?> finishExplicitLocks(AffinityTopologyVersion topVer) {
    GridCompoundFuture<Object, Object> res = new GridCompoundFuture<>();

    for (GridCacheExplicitLockSpan span : pendingExplicit.values()) {
      AffinityTopologyVersion snapshot = span.topologyVersion();

      if (snapshot != null && snapshot.compareTo(topVer) < 0) res.add(span.releaseFuture());
    }

    res.markInitialized();

    return res;
  }
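// The same "collect release futures and wait on the aggregate" pattern, sketched with
// plain CompletableFutures instead of GridCompoundFuture. LockSpan and its fields are
// hypothetical stand-ins for GridCacheExplicitLockSpan, not Ignite API.
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.CompletableFuture;

final class ExplicitLockWaitSketch {
  static final class LockSpan {
    final long topVer; // Version the locks in this span were acquired on.
    final CompletableFuture<Void> releaseFut = new CompletableFuture<>();

    LockSpan(long topVer) {
      this.topVer = topVer;
    }
  }

  /** Waits only for spans acquired strictly before the given topology version. */
  static CompletableFuture<Void> finishExplicitLocks(Collection<LockSpan> pending, long topVer) {
    List<CompletableFuture<Void>> futs = new ArrayList<>();

    for (LockSpan span : pending) {
      if (span.topVer < topVer) // Mirrors snapshot.compareTo(topVer) < 0.
        futs.add(span.releaseFut);
    }

    return CompletableFuture.allOf(futs.toArray(new CompletableFuture[0]));
  }
}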
  /** {@inheritDoc} */
  @Override
  public Collection<ClusterNode> nodes(int p, AffinityTopologyVersion topVer) {
    Collection<ClusterNode> affNodes = cctx.affinity().nodes(p, topVer);

    lock.readLock().lock();

    try {
      assert node2part != null && node2part.valid()
          : "Invalid node-to-partitions map [topVer1="
              + topVer
              + ", topVer2="
              + this.topVer
              + ", cache="
              + cctx.name()
              + ", node2part="
              + node2part
              + ']';

      Collection<ClusterNode> nodes = null;

      Collection<UUID> nodeIds = part2node.get(p);

      if (!F.isEmpty(nodeIds)) {
        Collection<UUID> affIds = new HashSet<>(F.viewReadOnly(affNodes, F.node2id()));

        for (UUID nodeId : nodeIds) {
          if (!affIds.contains(nodeId) && hasState(p, nodeId, OWNING, MOVING, RENTING)) {
            ClusterNode n = cctx.discovery().node(nodeId);

            if (n != null
                && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) {
              if (nodes == null) {
                nodes = new ArrayList<>(affNodes.size() + 2);

                nodes.addAll(affNodes);
              }

              nodes.add(n);
            }
          }
        }
      }

      return nodes != null ? nodes : affNodes;
    } finally {
      lock.readLock().unlock();
    }
  }
    /**
     * @param entries Entries to collect pending lock candidates from.
     * @param topVer Topology version.
     */
    FinishLockFuture(Iterable<GridDistributedCacheEntry> entries, AffinityTopologyVersion topVer) {
      assert topVer.compareTo(AffinityTopologyVersion.ZERO) > 0;

      this.topVer = topVer;

      for (GridCacheEntryEx entry : entries) {
        // Either local or near local candidates.
        try {
          Collection<GridCacheMvccCandidate> locs = entry.localCandidates();

          if (!F.isEmpty(locs)) {
            Collection<GridCacheMvccCandidate> cands = new ConcurrentLinkedQueue<>();

            cands.addAll(F.view(locs, versionFilter()));

            if (!F.isEmpty(cands)) pendingLocks.put(entry.txKey(), cands);
          }
        } catch (GridCacheEntryRemovedException ignored) {
          if (exchLog.isDebugEnabled())
            exchLog.debug(
                "Got removed entry when adding it to finish lock future (will ignore): " + entry);
        }
      }

      if (exchLog.isDebugEnabled())
        exchLog.debug("Pending lock set [topVer=" + topVer + ", locks=" + pendingLocks + ']');
    }
  /**
   * @param p Partition.
   * @param topVer Topology version ({@code -1} for all nodes).
   * @param state Partition state.
   * @param states Additional partition states.
   * @return List of nodes for the partition.
   */
  private List<ClusterNode> nodes(
      int p,
      AffinityTopologyVersion topVer,
      GridDhtPartitionState state,
      GridDhtPartitionState... states) {
    Collection<UUID> allIds =
        topVer.topologyVersion() > 0 ? F.nodeIds(CU.affinityNodes(cctx, topVer)) : null;

    lock.readLock().lock();

    try {
      assert node2part != null && node2part.valid()
          : "Invalid node-to-partitions map [topVer="
              + topVer
              + ", allIds="
              + allIds
              + ", node2part="
              + node2part
              + ", cache="
              + cctx.name()
              + ']';

      Collection<UUID> nodeIds = part2node.get(p);

      // Node IDs can be null if both primary and backup nodes disappear.
      int size = nodeIds == null ? 0 : nodeIds.size();

      if (size == 0) return Collections.emptyList();

      List<ClusterNode> nodes = new ArrayList<>(size);

      for (UUID id : nodeIds) {
        if (topVer.topologyVersion() > 0 && !allIds.contains(id)) continue;

        if (hasState(p, id, state, states)) {
          ClusterNode n = cctx.discovery().node(id);

          if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion()))
            nodes.add(n);
        }
      }

      return nodes;
    } finally {
      lock.readLock().unlock();
    }
  }
    /**
     * @param log Logger.
     * @param topVer Topology version.
     * @param initCntr Initial update counter.
     */
    public PartitionRecovery(
        IgniteLogger log, AffinityTopologyVersion topVer, @Nullable Long initCntr) {
      assert topVer.topologyVersion() > 0 : topVer;

      this.log = log;

      if (initCntr != null) {
        this.lastFiredEvt = initCntr;

        curTop = topVer;
      }
    }
  /** {@inheritDoc} */
  @Override
  public AffinityTopologyVersion topologyVersion() {
    lock.readLock().lock();

    try {
      assert topVer.topologyVersion() > 0
          : "Invalid topology version [topVer=" + topVer + ", cacheName=" + cctx.name() + ']';

      return topVer;
    } finally {
      lock.readLock().unlock();
    }
  }
  /**
   * @param keyFilter Key filter.
   * @param topVer Topology version.
   * @return Future that signals when all locks for given partitions will be released.
   */
  private IgniteInternalFuture<?> finishLocks(
      @Nullable final IgnitePredicate<KeyCacheObject> keyFilter, AffinityTopologyVersion topVer) {
    assert topVer.topologyVersion() != 0;

    if (topVer.equals(AffinityTopologyVersion.NONE)) return new GridFinishedFuture();

    final FinishLockFuture finishFut =
        new FinishLockFuture(
            keyFilter == null
                ? locked()
                : F.view(
                    locked(),
                    new P1<GridDistributedCacheEntry>() {
                      @Override
                      public boolean apply(GridDistributedCacheEntry e) {
                        return F.isAll(e.key(), keyFilter);
                      }
                    }),
            topVer);

    finishFuts.add(finishFut);

    finishFut.listen(
        new CI1<IgniteInternalFuture<?>>() {
          @Override
          public void apply(IgniteInternalFuture<?> e) {
            finishFuts.remove(finishFut);

            // This call is required to make sure that the concurrent queue
            // clears memory occupied by internal nodes.
            finishFuts.peek();
          }
        });

    finishFut.recheck();

    return finishFut;
  }
    /** @return Filter. */
    private IgnitePredicate<GridCacheMvccCandidate> versionFilter() {
      assert topVer.topologyVersion() > 0;

      return new P1<GridCacheMvccCandidate>() {
        @Override
        public boolean apply(GridCacheMvccCandidate c) {
          assert c.nearLocal() || c.dhtLocal();

          // Wait for explicit locks.
          return c.topologyVersion().equals(AffinityTopologyVersion.ZERO)
              || c.topologyVersion().compareTo(topVer) < 0;
        }
      };
    }
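// The same filter expressed as a standalone java.util.function.Predicate over a
// hypothetical Candidate type: explicit locks (version ZERO, represented here as 0)
// always pass, and so does anything acquired strictly before the exchange version.
// This is an illustrative sketch, not the Ignite implementation.
import java.util.function.Predicate;

final class VersionFilterSketch {
  static final class Candidate {
    final long topVer; // 0 plays the role of AffinityTopologyVersion.ZERO.

    Candidate(long topVer) {
      this.topVer = topVer;
    }
  }

  static Predicate<Candidate> versionFilter(long exchangeTopVer) {
    return c -> c.topVer == 0 || c.topVer < exchangeTopVer;
  }
}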
  /**
   * @param expVer Expected topology version.
   * @param curVer Current topology version.
   * @return {@code True} if cache affinity changed and operation should be remapped.
   */
  protected final boolean needRemap(
      AffinityTopologyVersion expVer, AffinityTopologyVersion curVer) {
    if (expVer.equals(curVer)) return false;

    Collection<ClusterNode> cacheNodes0 = ctx.discovery().cacheAffinityNodes(ctx.name(), expVer);
    Collection<ClusterNode> cacheNodes1 = ctx.discovery().cacheAffinityNodes(ctx.name(), curVer);

    if (!cacheNodes0.equals(cacheNodes1)
        || ctx.affinity().affinityTopologyVersion().compareTo(curVer) < 0) return true;

    try {
      List<List<ClusterNode>> aff1 = ctx.affinity().assignments(expVer);
      List<List<ClusterNode>> aff2 = ctx.affinity().assignments(curVer);

      return !aff1.equals(aff2);
    } catch (IllegalStateException e) {
      return true;
    }
  }
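// Condensed restatement of the remap decision above, with lists of node ids standing in
// for ClusterNode collections and affinity assignments. The equal-version shortcut, the
// readiness check against the local affinity version and the IllegalStateException
// fallback are omitted; an illustrative sketch, not the Ignite implementation.
import java.util.List;

final class NeedRemapSketch {
  static boolean needRemap(
      List<String> nodesAtExpVer,
      List<String> nodesAtCurVer,
      List<List<String>> assignAtExpVer,
      List<List<String>> assignAtCurVer) {
    if (!nodesAtExpVer.equals(nodesAtCurVer))
      return true; // Affinity node set changed between the two versions.

    return !assignAtExpVer.equals(assignAtCurVer); // Same nodes, different assignment.
  }
}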
    /** @param e Failure exception. */
    @SuppressWarnings("UnusedParameters")
    synchronized void onNodeLeft(ClusterTopologyCheckedException e) {
      if (remapped) return;

      remapped = true;

      if (log.isDebugEnabled())
        log.debug("Remote node left grid while sending or waiting for reply (will retry): " + this);

      // Try getting from existing nodes.
      if (!canRemap) {
        map(keys.keySet(), F.t(node, keys), topVer);

        onDone(Collections.<K, V>emptyMap());
      } else {
        final AffinityTopologyVersion updTopVer =
            new AffinityTopologyVersion(
                Math.max(topVer.topologyVersion() + 1, cctx.discovery().topologyVersion()));

        cctx.affinity()
            .affinityReadyFuture(updTopVer)
            .listen(
                new CI1<IgniteInternalFuture<AffinityTopologyVersion>>() {
                  @Override
                  public void apply(IgniteInternalFuture<AffinityTopologyVersion> fut) {
                    try {
                      fut.get();

                      // Remap.
                      map(keys.keySet(), F.t(node, keys), updTopVer);

                      onDone(Collections.<K, V>emptyMap());
                    } catch (IgniteCheckedException e) {
                      GridPartitionedGetFuture.this.onDone(e);
                    }
                  }
                });
      }
    }
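// The retry version chosen above, isolated into one line: at least one version past the
// one the request was mapped on, but never behind the current discovery version. Plain
// longs stand in for AffinityTopologyVersion; the class name is illustrative only.
final class RetryVersionSketch {
  static long nextRetryVersion(long mappedTopVer, long currentDiscoveryTopVer) {
    return Math.max(mappedTopVer + 1, currentDiscoveryTopVer);
  }
}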
    /**
     * Adds a continuous query entry.
     *
     * @param entry Cache continuous query entry.
     * @return Collection of entries to be fired.
     */
    public Collection<CacheContinuousQueryEntry> collectEntries(CacheContinuousQueryEntry entry) {
      assert entry != null;

      List<CacheContinuousQueryEntry> entries;

      synchronized (pendingEvts) {
        // Received first event.
        if (curTop == AffinityTopologyVersion.NONE) {
          lastFiredEvt = entry.updateCounter();

          curTop = entry.topologyVersion();

          return F.asList(entry);
        }

        if (curTop.compareTo(entry.topologyVersion()) < 0) {
          if (entry.updateCounter() == 1L && !entry.isBackup()) {
            entries = new ArrayList<>(pendingEvts.size());

            for (CacheContinuousQueryEntry evt : pendingEvts.values()) {
              if (evt != HOLE && !evt.isFiltered()) entries.add(evt);
            }

            pendingEvts.clear();

            curTop = entry.topologyVersion();

            lastFiredEvt = entry.updateCounter();

            entries.add(entry);

            return entries;
          }

          curTop = entry.topologyVersion();
        }

        // Check duplicate.
        if (entry.updateCounter() > lastFiredEvt) {
          pendingEvts.put(entry.updateCounter(), entry);

          // Put filtered events.
          if (entry.filteredEvents() != null) {
            for (long cnrt : entry.filteredEvents()) {
              if (cnrt > lastFiredEvt) pendingEvts.put(cnrt, HOLE);
            }
          }
        } else {
          if (log.isDebugEnabled()) log.debug("Skip duplicate continuous query message: " + entry);

          return Collections.emptyList();
        }

        if (pendingEvts.isEmpty()) return Collections.emptyList();

        Iterator<Map.Entry<Long, CacheContinuousQueryEntry>> iter =
            pendingEvts.entrySet().iterator();

        entries = new ArrayList<>();

        if (pendingEvts.size() >= MAX_BUFF_SIZE) {
          for (int i = 0; i < MAX_BUFF_SIZE - (MAX_BUFF_SIZE / 10); i++) {
            Map.Entry<Long, CacheContinuousQueryEntry> e = iter.next();

            if (e.getValue() != HOLE && !e.getValue().isFiltered()) entries.add(e.getValue());

            lastFiredEvt = e.getKey();

            iter.remove();
          }
        } else {
          // Elements are consecutive: fire them in order until the first gap.
          while (iter.hasNext()) {
            Map.Entry<Long, CacheContinuousQueryEntry> e = iter.next();

            if (e.getKey() == lastFiredEvt + 1) {
              ++lastFiredEvt;

              if (e.getValue() != HOLE && !e.getValue().isFiltered()) entries.add(e.getValue());

              iter.remove();
            } else break;
          }
        }
      }

      return entries;
    }
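// Standalone sketch of the "drain consecutive counters" branch above, using a
// TreeMap<Long, String> in place of the pending-events map. HOLE markers, filtered
// events and the overflow branch are omitted; names are illustrative only.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

final class CounterDrainSketch {
  /** Fires buffered events while their counters follow lastFired without a gap. */
  static List<String> drain(TreeMap<Long, String> pending, long[] lastFired) {
    List<String> ready = new ArrayList<>();

    Iterator<Map.Entry<Long, String>> it = pending.entrySet().iterator();

    while (it.hasNext()) {
      Map.Entry<Long, String> e = it.next();

      if (e.getKey() == lastFired[0] + 1) { // Gap-free: safe to fire.
        lastFired[0]++;

        ready.add(e.getValue());

        it.remove();
      } else
        break; // Gap found: keep buffering until the missing counter arrives.
    }

    return ready;
  }

  public static void main(String[] args) {
    TreeMap<Long, String> pending = new TreeMap<>();

    pending.put(2L, "evt-2");
    pending.put(3L, "evt-3");
    pending.put(5L, "evt-5"); // Counter 4 is missing, so this one stays buffered.

    long[] lastFired = {1L};

    System.out.println(drain(pending, lastFired)); // Prints [evt-2, evt-3].
  }
}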
  /** {@inheritDoc} */
  @Override
  public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    boolean changed = waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    AffinityTopologyVersion topVer = exchFut.topologyVersion();

    lock.writeLock().lock();

    try {
      if (stopping) return false;

      assert topVer.equals(exchFut.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']';

      if (log.isDebugEnabled())
        log.debug(
            "Partition map before afterExchange [exchId="
                + exchFut.exchangeId()
                + ", fullMap="
                + fullMapString()
                + ']');

      long updateSeq = this.updateSeq.incrementAndGet();

      for (int p = 0; p < num; p++) {
        GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

        if (cctx.affinity().localNode(p, topVer)) {
          // This partition will be created during next topology event,
          // which obviously has not happened at this point.
          if (locPart == null) {
            if (log.isDebugEnabled())
              log.debug("Skipping local partition afterExchange (will not create): " + p);

            continue;
          }

          GridDhtPartitionState state = locPart.state();

          if (state == MOVING) {
            if (cctx.rebalanceEnabled()) {
              Collection<ClusterNode> owners = owners(p);

              // If there are no other owners, then become an owner.
              if (F.isEmpty(owners)) {
                boolean owned = locPart.own();

                assert owned
                    : "Failed to own partition [cacheName"
                        + cctx.name()
                        + ", locPart="
                        + locPart
                        + ']';

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                changed = true;

                if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) {
                  DiscoveryEvent discoEvt = exchFut.discoveryEvent();

                  cctx.events()
                      .addPreloadEvent(
                          p,
                          EVT_CACHE_REBALANCE_PART_DATA_LOST,
                          discoEvt.eventNode(),
                          discoEvt.type(),
                          discoEvt.timestamp());
                }

                if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart);
              } else if (log.isDebugEnabled())
                log.debug(
                    "Will not own partition (there are owners to rebalance from) [locPart="
                        + locPart
                        + ", owners = "
                        + owners
                        + ']');
            } else updateLocal(p, loc.id(), locPart.state(), updateSeq);
          }
        } else {
          if (locPart != null) {
            GridDhtPartitionState state = locPart.state();

            if (state == MOVING) {
              locPart.rent(false);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);

              changed = true;

              if (log.isDebugEnabled())
                log.debug("Evicting moving partition (it does not belong to affinity): " + locPart);
            }
          }
        }
      }

      consistencyCheck();
    } finally {
      lock.writeLock().unlock();
    }

    return changed;
  }
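// Tiny sketch of the MOVING-partition decision inside afterExchange(): with rebalancing
// enabled, the local node owns the partition outright only when there is nobody else to
// rebalance from; otherwise the state is left as it is. Types here are simplified
// placeholders, not Ignite classes.
final class OwnDecisionSketch {
  enum State { MOVING, OWNING }

  static State afterExchange(State current, boolean rebalanceEnabled, int otherOwners) {
    if (current == State.MOVING && rebalanceEnabled && otherOwners == 0)
      return State.OWNING; // No owners to rebalance from, so become an owner.

    return current; // Otherwise keep the current state and rebalance (or do nothing).
  }
}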
  /** @throws InterruptedException If interrupted. */
  @SuppressWarnings("BusyWait")
  protected void awaitPartitionMapExchange() throws InterruptedException {
    for (Ignite g : G.allGrids()) {
      IgniteKernal g0 = (IgniteKernal) g;

      for (IgniteCacheProxy<?, ?> c : g0.context().cache().jcaches()) {
        CacheConfiguration cfg = c.context().config();

        if (cfg.getCacheMode() == PARTITIONED
            && cfg.getRebalanceMode() != NONE
            && g.cluster().nodes().size() > 1) {
          AffinityFunction aff = cfg.getAffinity();

          GridDhtCacheAdapter<?, ?> dht = dht(c);

          GridDhtPartitionTopology top = dht.topology();

          for (int p = 0; p < aff.partitions(); p++) {
            long start = 0;

            for (int i = 0; ; i++) {
              boolean match = false;

              AffinityTopologyVersion readyVer =
                  dht.context().shared().exchange().readyAffinityVersion();

              if (readyVer.topologyVersion() > 0 && c.context().started()) {
                // Must map on updated version of topology.
                Collection<ClusterNode> affNodes =
                    g0.affinity(cfg.getName()).mapPartitionToPrimaryAndBackups(p);

                int exp = affNodes.size();

                GridDhtTopologyFuture topFut = top.topologyVersionFuture();

                Collection<ClusterNode> owners =
                    (topFut != null && topFut.isDone())
                        ? top.nodes(p, AffinityTopologyVersion.NONE)
                        : Collections.<ClusterNode>emptyList();

                int actual = owners.size();

                if (affNodes.size() != owners.size() || !affNodes.containsAll(owners)) {
                  LT.warn(
                      log(),
                      null,
                      "Waiting for topology map update ["
                          + "grid="
                          + g.name()
                          + ", cache="
                          + cfg.getName()
                          + ", cacheId="
                          + dht.context().cacheId()
                          + ", topVer="
                          + top.topologyVersion()
                          + ", topFut="
                          + topFut
                          + ", p="
                          + p
                          + ", affNodesCnt="
                          + exp
                          + ", ownersCnt="
                          + actual
                          + ", affNodes="
                          + affNodes
                          + ", owners="
                          + owners
                          + ", locNode="
                          + g.cluster().localNode()
                          + ']');
                } else match = true;
              } else {
                LT.warn(
                    log(),
                    null,
                    "Waiting for topology map update ["
                        + "grid="
                        + g.name()
                        + ", cache="
                        + cfg.getName()
                        + ", cacheId="
                        + dht.context().cacheId()
                        + ", topVer="
                        + top.topologyVersion()
                        + ", started="
                        + dht.context().started()
                        + ", p="
                        + p
                        + ", readVer="
                        + readyVer
                        + ", locNode="
                        + g.cluster().localNode()
                        + ']');
              }

              if (!match) {
                if (i == 0) start = System.currentTimeMillis();

                if (System.currentTimeMillis() - start > 30_000)
                  throw new IgniteException(
                      "Timeout of waiting for topology map update ["
                          + "grid="
                          + g.name()
                          + ", cache="
                          + cfg.getName()
                          + ", cacheId="
                          + dht.context().cacheId()
                          + ", topVer="
                          + top.topologyVersion()
                          + ", p="
                          + p
                          + ", readVer="
                          + readyVer
                          + ", locNode="
                          + g.cluster().localNode()
                          + ']');

                Thread.sleep(200); // Busy wait.

                continue;
              }

              if (i > 0)
                log()
                    .warning(
                        "Finished waiting for topology map update [grid="
                            + g.name()
                            + ", p="
                            + p
                            + ", duration="
                            + (System.currentTimeMillis() - start)
                            + "ms]");

              break;
            }
          }
        }
      }
    }
  }
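// Generic form of the wait loop above: poll a condition with a hard deadline and a short
// sleep between attempts. The 30s/200ms values in the test helper map onto timeoutMs and
// sleepMs here; waitForCondition itself is a hypothetical helper, not part of the
// framework shown above.
import java.util.function.BooleanSupplier;

final class BusyWaitSketch {
  static void waitForCondition(BooleanSupplier cond, long timeoutMs, long sleepMs)
      throws InterruptedException {
    long start = System.currentTimeMillis();

    while (!cond.getAsBoolean()) {
      if (System.currentTimeMillis() - start > timeoutMs)
        throw new IllegalStateException("Timed out waiting for condition.");

      Thread.sleep(sleepMs); // Busy wait, as in the test helper above.
    }
  }
}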
  /**
   * Starts activity.
   *
   * @throws IgniteInterruptedCheckedException If interrupted.
   */
  public void init() throws IgniteInterruptedCheckedException {
    if (isDone()) return;

    if (init.compareAndSet(false, true)) {
      if (isDone()) return;

      try {
        // Wait for event to occur to make sure that discovery
        // will return corresponding nodes.
        U.await(evtLatch);

        assert discoEvt != null : this;
        assert !dummy && !forcePreload : this;

        ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx, exchId.topologyVersion());

        oldestNode.set(oldest);

        startCaches();

        // True if client node joined or failed.
        boolean clientNodeEvt;

        if (F.isEmpty(reqs)) {
          int type = discoEvt.type();

          assert type == EVT_NODE_JOINED || type == EVT_NODE_LEFT || type == EVT_NODE_FAILED
              : discoEvt;

          clientNodeEvt = CU.clientNode(discoEvt.eventNode());
        } else {
          assert discoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT : discoEvt;

          boolean clientOnlyStart = true;

          for (DynamicCacheChangeRequest req : reqs) {
            if (!req.clientStartOnly()) {
              clientOnlyStart = false;

              break;
            }
          }

          clientNodeEvt = clientOnlyStart;
        }

        if (clientNodeEvt) {
          ClusterNode node = discoEvt.eventNode();

          // Clients need to initialize affinity for a local join event or for started client caches.
          if (!node.isLocal()) {
            for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
              if (cacheCtx.isLocal()) continue;

              GridDhtPartitionTopology top = cacheCtx.topology();

              top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId()));

              if (cacheCtx.affinity().affinityTopologyVersion() == AffinityTopologyVersion.NONE) {
                initTopology(cacheCtx);

                top.beforeExchange(this);
              } else
                cacheCtx.affinity().clientEventTopologyChange(discoEvt, exchId.topologyVersion());
            }

            if (exchId.isLeft())
              cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion());

            onDone(exchId.topologyVersion());

            skipPreload = cctx.kernalContext().clientNode();

            return;
          }
        }

        if (cctx.kernalContext().clientNode()) {
          skipPreload = true;

          for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
            if (cacheCtx.isLocal()) continue;

            GridDhtPartitionTopology top = cacheCtx.topology();

            top.updateTopologyVersion(exchId, this, -1, stopping(cacheCtx.cacheId()));
          }

          for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
            if (cacheCtx.isLocal()) continue;

            initTopology(cacheCtx);
          }

          if (oldestNode.get() != null) {
            rmtNodes =
                new ConcurrentLinkedQueue<>(
                    CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion()));

            rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes)));

            ready.set(true);

            initFut.onDone(true);

            if (log.isDebugEnabled()) log.debug("Initialized future: " + this);

            sendPartitions();
          } else onDone(exchId.topologyVersion());

          return;
        }

        assert oldestNode.get() != null;

        for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
          if (isCacheAdded(cacheCtx.cacheId(), exchId.topologyVersion())) {
            if (cacheCtx
                .discovery()
                .cacheAffinityNodes(cacheCtx.name(), topologyVersion())
                .isEmpty())
              U.quietAndWarn(log, "No server nodes found for cache client: " + cacheCtx.namex());
          }

          cacheCtx.preloader().onExchangeFutureAdded();
        }

        List<String> cachesWithoutNodes = null;

        if (exchId.isLeft()) {
          for (String name : cctx.cache().cacheNames()) {
            if (cctx.discovery().cacheAffinityNodes(name, topologyVersion()).isEmpty()) {
              if (cachesWithoutNodes == null) cachesWithoutNodes = new ArrayList<>();

              cachesWithoutNodes.add(name);

              // Fire event even if there is no client cache started.
              if (cctx.gridEvents().isRecordable(EventType.EVT_CACHE_NODES_LEFT)) {
                Event evt =
                    new CacheEvent(
                        name,
                        cctx.localNode(),
                        cctx.localNode(),
                        "All server nodes have left the cluster.",
                        EventType.EVT_CACHE_NODES_LEFT,
                        0,
                        false,
                        null,
                        null,
                        null,
                        null,
                        false,
                        null,
                        false,
                        null,
                        null,
                        null);

                cctx.gridEvents().record(evt);
              }
            }
          }
        }

        if (cachesWithoutNodes != null) {
          StringBuilder sb =
              new StringBuilder(
                  "All server nodes for the following caches have left the cluster: ");

          for (int i = 0; i < cachesWithoutNodes.size(); i++) {
            String cache = cachesWithoutNodes.get(i);

            sb.append('\'').append(cache).append('\'');

            if (i != cachesWithoutNodes.size() - 1) sb.append(", ");
          }

          U.quietAndWarn(log, sb.toString());

          U.quietAndWarn(log, "Must have server nodes for caches to operate.");
        }

        assert discoEvt != null;

        assert exchId.nodeId().equals(discoEvt.eventNode().id());

        for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
          GridClientPartitionTopology clientTop =
              cctx.exchange().clearClientTopology(cacheCtx.cacheId());

          long updSeq = clientTop == null ? -1 : clientTop.lastUpdateSequence();

          // Update before waiting for locks.
          if (!cacheCtx.isLocal())
            cacheCtx
                .topology()
                .updateTopologyVersion(exchId, this, updSeq, stopping(cacheCtx.cacheId()));
        }

        // Grab all alive remote nodes with order equal to or less than the last joined node's.
        rmtNodes =
            new ConcurrentLinkedQueue<>(
                CU.aliveRemoteServerNodesWithCaches(cctx, exchId.topologyVersion()));

        rmtIds = Collections.unmodifiableSet(new HashSet<>(F.nodeIds(rmtNodes)));

        for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> m : singleMsgs.entrySet())
          // If received any messages, process them.
          onReceive(m.getKey(), m.getValue());

        for (Map.Entry<UUID, GridDhtPartitionsFullMessage> m : fullMsgs.entrySet())
          // If received any messages, process them.
          onReceive(m.getKey(), m.getValue());

        AffinityTopologyVersion topVer = exchId.topologyVersion();

        for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
          if (cacheCtx.isLocal()) continue;

          // Must initialize topology after we get discovery event.
          initTopology(cacheCtx);

          cacheCtx.preloader().updateLastExchangeFuture(this);
        }

        IgniteInternalFuture<?> partReleaseFut = cctx.partitionReleaseFuture(topVer);

        // Assign to class variable so it will be included into toString() method.
        this.partReleaseFut = partReleaseFut;

        if (log.isDebugEnabled()) log.debug("Before waiting for partition release future: " + this);

        while (true) {
          try {
            partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS);

            break;
          } catch (IgniteFutureTimeoutCheckedException ignored) {
            // Print pending transactions and locks that might have led to hang.
            dumpPendingObjects();
          }
        }

        if (log.isDebugEnabled()) log.debug("After waiting for partition release future: " + this);

        if (!F.isEmpty(reqs)) blockGateways();

        if (exchId.isLeft())
          cctx.mvcc().removeExplicitNodeLocks(exchId.nodeId(), exchId.topologyVersion());

        IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion());

        while (true) {
          try {
            locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS);

            break;
          } catch (IgniteFutureTimeoutCheckedException ignored) {
            U.warn(
                log,
                "Failed to wait for locks release future. "
                    + "Dumping pending objects that might be the cause: "
                    + cctx.localNodeId());

            U.warn(log, "Locked entries:");

            Map<IgniteTxKey, Collection<GridCacheMvccCandidate>> locks =
                cctx.mvcc().unfinishedLocks(exchId.topologyVersion());

            for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet())
              U.warn(log, "Locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']');
          }
        }

        for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
          if (cacheCtx.isLocal()) continue;

          // Notify replication manager.
          GridCacheContext drCacheCtx =
              cacheCtx.isNear() ? cacheCtx.near().dht().context() : cacheCtx;

          if (drCacheCtx.isDrEnabled()) drCacheCtx.dr().beforeExchange(topVer, exchId.isLeft());

          // Partition release future is done so we can flush the write-behind store.
          cacheCtx.store().forceFlush();

          // Process queued undeploys prior to sending/spreading map.
          cacheCtx.preloader().unwindUndeploys();

          GridDhtPartitionTopology top = cacheCtx.topology();

          assert topVer.equals(top.topologyVersion())
              : "Topology version is updated only in this class instances inside single ExchangeWorker thread.";

          top.beforeExchange(this);
        }

        for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) {
          top.updateTopologyVersion(exchId, this, -1, stopping(top.cacheId()));

          top.beforeExchange(this);
        }
      } catch (IgniteInterruptedCheckedException e) {
        onDone(e);

        throw e;
      } catch (Throwable e) {
        U.error(
            log,
            "Failed to reinitialize local partitions (preloading will be stopped): " + exchId,
            e);

        onDone(e);

        if (e instanceof Error) throw (Error) e;

        return;
      }

      if (F.isEmpty(rmtIds)) {
        onDone(exchId.topologyVersion());

        return;
      }

      ready.set(true);

      initFut.onDone(true);

      if (log.isDebugEnabled()) log.debug("Initialized future: " + this);

      // If this node is not oldest.
      if (!oldestNode.get().id().equals(cctx.localNodeId())) sendPartitions();
      else {
        boolean allReceived = allReceived();

        if (allReceived && replied.compareAndSet(false, true)) {
          if (spreadPartitions()) onDone(exchId.topologyVersion());
        }
      }

      scheduleRecheck();
    } else assert false : "Skipped init future: " + this;
  }
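// Sketch of the bounded-wait-with-diagnostics loop that init() uses for both the
// partition release future and the locks future, written against a plain
// java.util.concurrent.Future. dumpDiagnostics() is a hypothetical placeholder for the
// pending-transaction/lock dumps above.
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

final class WaitLoopSketch {
  static void awaitWithDiagnostics(Future<?> fut, long timeoutMs) throws Exception {
    while (true) {
      try {
        fut.get(timeoutMs, TimeUnit.MILLISECONDS);

        break;
      } catch (TimeoutException ignored) {
        dumpDiagnostics(); // The exchange future prints pending transactions and locks here.
      }
    }
  }

  private static void dumpDiagnostics() {
    System.out.println("Still waiting; dumping pending objects...");
  }
}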
  /**
   * @param keys Keys.
   * @param mapped Mappings to check for duplicates.
   * @param topVer Topology version on which keys should be mapped.
   */
  private void map(
      Collection<KeyCacheObject> keys,
      Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mapped,
      AffinityTopologyVersion topVer) {
    Collection<ClusterNode> cacheNodes = CU.affinityNodes(cctx, topVer);

    if (cacheNodes.isEmpty()) {
      onDone(
          new ClusterTopologyServerNotFoundException(
              "Failed to map keys for cache "
                  + "(all partition nodes left the grid) [topVer="
                  + topVer
                  + ", cache="
                  + cctx.name()
                  + ']'));

      return;
    }

    Map<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> mappings =
        U.newHashMap(cacheNodes.size());

    final int keysSize = keys.size();

    Map<K, V> locVals = U.newHashMap(keysSize);

    boolean hasRmtNodes = false;

    // Assign keys to primary nodes.
    for (KeyCacheObject key : keys) hasRmtNodes |= map(key, mappings, locVals, topVer, mapped);

    if (isDone()) return;

    if (!locVals.isEmpty()) add(new GridFinishedFuture<>(locVals));

    if (hasRmtNodes) {
      if (!trackable) {
        trackable = true;

        cctx.mvcc().addFuture(this, futId);
      }
    }

    // Create mini futures.
    for (Map.Entry<ClusterNode, LinkedHashMap<KeyCacheObject, Boolean>> entry :
        mappings.entrySet()) {
      final ClusterNode n = entry.getKey();

      final LinkedHashMap<KeyCacheObject, Boolean> mappedKeys = entry.getValue();

      assert !mappedKeys.isEmpty();

      // If this is the primary or backup node for the keys.
      if (n.isLocal()) {
        final GridDhtFuture<Collection<GridCacheEntryInfo>> fut =
            cache()
                .getDhtAsync(
                    n.id(),
                    -1,
                    mappedKeys,
                    readThrough,
                    topVer,
                    subjId,
                    taskName == null ? 0 : taskName.hashCode(),
                    expiryPlc,
                    skipVals);

        final Collection<Integer> invalidParts = fut.invalidPartitions();

        if (!F.isEmpty(invalidParts)) {
          Collection<KeyCacheObject> remapKeys = new ArrayList<>(keysSize);

          for (KeyCacheObject key : keys) {
            if (key != null && invalidParts.contains(cctx.affinity().partition(key)))
              remapKeys.add(key);
          }

          AffinityTopologyVersion updTopVer = cctx.discovery().topologyVersionEx();

          assert updTopVer.compareTo(topVer) > 0
              : "Got invalid partitions for local node but topology version did "
                  + "not change [topVer="
                  + topVer
                  + ", updTopVer="
                  + updTopVer
                  + ", invalidParts="
                  + invalidParts
                  + ']';

          // Remap recursively.
          map(remapKeys, mappings, updTopVer);
        }

        // Add new future.
        add(
            fut.chain(
                new C1<IgniteInternalFuture<Collection<GridCacheEntryInfo>>, Map<K, V>>() {
                  @Override
                  public Map<K, V> apply(IgniteInternalFuture<Collection<GridCacheEntryInfo>> fut) {
                    try {
                      return createResultMap(fut.get());
                    } catch (Exception e) {
                      U.error(log, "Failed to get values from dht cache [fut=" + fut + "]", e);

                      onDone(e);

                      return Collections.emptyMap();
                    }
                  }
                }));
      } else {
        MiniFuture fut = new MiniFuture(n, mappedKeys, topVer);

        GridCacheMessage req =
            new GridNearGetRequest(
                cctx.cacheId(),
                futId,
                fut.futureId(),
                n.version().compareTo(SINGLE_GET_MSG_SINCE) >= 0 ? null : DUMMY_VER,
                mappedKeys,
                readThrough,
                topVer,
                subjId,
                taskName == null ? 0 : taskName.hashCode(),
                expiryPlc != null ? expiryPlc.forAccess() : -1L,
                skipVals,
                cctx.deploymentEnabled());

        add(fut); // Append new future.

        try {
          cctx.io().send(n, req, cctx.ioPolicy());
        } catch (IgniteCheckedException e) {
          // Fail the whole thing.
          if (e instanceof ClusterTopologyCheckedException)
            fut.onNodeLeft((ClusterTopologyCheckedException) e);
          else fut.onResult(e);
        }
      }
    }
  }
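// A minimal sketch of the key-to-node grouping step that map(...) performs before
// creating mini futures, with a hypothetical primary-node function standing in for the
// affinity lookup. Not the Ignite API.
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

final class KeyMappingSketch {
  static <K, N> Map<N, List<K>> groupByPrimary(Collection<K> keys, Function<K, N> primary) {
    Map<N, List<K>> mappings = new HashMap<>();

    for (K key : keys)
      mappings.computeIfAbsent(primary.apply(key), n -> new ArrayList<>()).add(key);

    return mappings;
  }
}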
 /**
  * @param topVer Topology version.
  * @return Future that signals when all locks for given partitions are released.
  */
 @SuppressWarnings({"unchecked"})
 public IgniteInternalFuture<?> finishLocks(AffinityTopologyVersion topVer) {
   assert topVer.compareTo(AffinityTopologyVersion.ZERO) > 0;
   return finishLocks(null, topVer);
 }
  /**
   * @param key Key.
   * @param part Partition.
   * @param locVals Local values.
   * @return {@code True} if there is no need to search for the value further.
   */
  private boolean localGet(KeyCacheObject key, int part, Map<K, V> locVals) {
    assert cctx.affinityNode() : this;

    GridDhtCacheAdapter<K, V> cache = cache();

    while (true) {
      GridCacheEntryEx entry;

      try {
        entry = cache.context().isSwapOrOffheapEnabled() ? cache.entryEx(key) : cache.peekEx(key);

        // If our DHT cache has the value, then we peek it.
        if (entry != null) {
          boolean isNew = entry.isNewLocked();

          CacheObject v = null;
          GridCacheVersion ver = null;

          if (needVer) {
            T2<CacheObject, GridCacheVersion> res =
                entry.innerGetVersioned(
                    null,
                    null,
                    /*swap*/ true,
                    /*unmarshal*/ true,
                    /*update-metrics*/ false,
                    /*event*/ !skipVals,
                    subjId,
                    null,
                    taskName,
                    expiryPlc,
                    !deserializeBinary);

            if (res != null) {
              v = res.get1();
              ver = res.get2();
            }
          } else {
            v =
                entry.innerGet(
                    null,
                    null,
                    /*swap*/ true,
                    /*read-through*/ false,
                    /*update-metrics*/ false,
                    /*event*/ !skipVals,
                    /*temporary*/ false,
                    subjId,
                    null,
                    taskName,
                    expiryPlc,
                    !deserializeBinary);
          }

          cache.context().evicts().touch(entry, topVer);

          // Entry was not in memory or in swap, so we remove it from cache.
          if (v == null) {
            if (isNew && entry.markObsoleteIfEmpty(ver)) cache.removeEntry(entry);
          } else {
            cctx.addResult(
                locVals, key, v, skipVals, keepCacheObjects, deserializeBinary, true, ver);

            return true;
          }
        }

        boolean topStable = cctx.isReplicated() || topVer.equals(cctx.topology().topologyVersion());

        // Entry not found; do not continue the search if the topology did not change and there is no store.
        if (!cctx.readThroughConfigured() && (topStable || partitionOwned(part))) {
          if (!skipVals && cctx.config().isStatisticsEnabled()) cache.metrics0().onRead(false);

          return true;
        }

        return false;
      } catch (GridCacheEntryRemovedException ignored) {
        // No-op, will retry.
      } catch (GridDhtInvalidPartitionException ignored) {
        return false;
      } catch (IgniteCheckedException e) {
        onDone(e);

        return true;
      }
    }
  }
  /** {@inheritDoc} */
  @Override
  public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    lock.writeLock().lock();

    try {
      GridDhtPartitionExchangeId exchId = exchFut.exchangeId();

      if (stopping) return;

      assert topVer.equals(exchId.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']';

      if (exchId.isLeft()) removeNode(exchId.nodeId());

      // In case a node joins, get the topology as of the joining node.
      ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

      assert oldest != null;

      if (log.isDebugEnabled())
        log.debug(
            "Partition map beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');

      long updateSeq = this.updateSeq.incrementAndGet();

      // If this is the oldest node.
      if (oldest.id().equals(loc.id())
          || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) {
        if (node2part == null) {
          node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq);

          if (log.isDebugEnabled())
            log.debug(
                "Created brand new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        } else if (!node2part.valid()) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Created new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + node2part
                    + ']');
        } else if (!node2part.nodeId().equals(loc.id())) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Copied old map into new map on oldest node (previous oldest node left) [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        }
      }

      if (cctx.rebalanceEnabled()) {
        for (int p = 0; p < num; p++) {
          // If this is the first node in grid.
          boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion());

          if ((oldest.id().equals(loc.id())
                  && oldest.id().equals(exchId.nodeId())
                  && exchId.isJoined())
              || added) {
            assert exchId.isJoined() || added;

            try {
              GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

              assert locPart != null;

              boolean owned = locPart.own();

              assert owned
                  : "Failed to own partition for oldest node [cacheName"
                      + cctx.name()
                      + ", part="
                      + locPart
                      + ']';

              if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition on oldest node (no need to create a partition "
                        + "if it no longer belongs to local node: "
                        + e.partition());
            }
          }
          // If this is not the first node in grid.
          else {
            if (node2part != null && node2part.valid()) {
              if (cctx.affinity().localNode(p, topVer)) {
                try {
                  // This will make sure that all non-existing partitions
                  // will be created in MOVING state.
                  GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

                  updateLocal(p, loc.id(), locPart.state(), updateSeq);
                } catch (GridDhtInvalidPartitionException e) {
                  if (log.isDebugEnabled())
                    log.debug(
                        "Ignoring invalid partition (no need to create a partition if it "
                            + "no longer belongs to local node: "
                            + e.partition());
                }
              }
            }
            // If this node's map is empty, we pre-create local partitions,
            // so local map will be sent correctly during exchange.
            else if (cctx.affinity().localNode(p, topVer)) {
              try {
                localPartition(p, topVer, true, false);
              } catch (GridDhtInvalidPartitionException e) {
                if (log.isDebugEnabled())
                  log.debug(
                      "Ignoring invalid partition (no need to pre-create a partition if it "
                          + "no longer belongs to local node: "
                          + e.partition());
              }
            }
          }
        }
      } else {
        // If preloader is disabled, then we simply clear out
        // the partitions this node is not responsible for.
        for (int p = 0; p < num; p++) {
          GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

          boolean belongs = cctx.affinity().localNode(p, topVer);

          if (locPart != null) {
            if (!belongs) {
              GridDhtPartitionState state = locPart.state();

              if (state.active()) {
                locPart.rent(false);

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                if (log.isDebugEnabled())
                  log.debug(
                      "Evicting partition with rebalancing disabled "
                          + "(it does not belong to affinity): "
                          + locPart);
              }
            }
          } else if (belongs) {
            try {
              // Pre-create partitions.
              localPartition(p, topVer, true, false);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition with disabled rebalancer (no need to "
                        + "pre-create a partition if it no longer belongs to local node: "
                        + e.partition());
            }
          }
        }
      }

      if (node2part != null && node2part.valid()) checkEvictions(updateSeq);

      consistencyCheck();

      if (log.isDebugEnabled())
        log.debug(
            "Partition map after beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');
    } finally {
      lock.writeLock().unlock();
    }

    // Wait for evictions.
    waitForRent();
  }
    /** @param res Result callback. */
    @SuppressWarnings("ThrowableResultOfMethodCallIgnored")
    void onResult(final GridNearGetResponse res) {
      final Collection<Integer> invalidParts = res.invalidPartitions();

      // If error happened on remote node, fail the whole future.
      if (res.error() != null) {
        onDone(res.error());

        return;
      }

      // Remap invalid partitions.
      if (!F.isEmpty(invalidParts)) {
        AffinityTopologyVersion rmtTopVer = res.topologyVersion();

        assert !rmtTopVer.equals(AffinityTopologyVersion.ZERO);

        if (rmtTopVer.compareTo(topVer) <= 0) {
          // Fail the whole get future.
          onDone(
              new IgniteCheckedException(
                  "Failed to process invalid partitions response (remote node reported "
                      + "invalid partitions but remote topology version does not differ from local) "
                      + "[topVer="
                      + topVer
                      + ", rmtTopVer="
                      + rmtTopVer
                      + ", invalidParts="
                      + invalidParts
                      + ", nodeId="
                      + node.id()
                      + ']'));

          return;
        }

        if (log.isDebugEnabled())
          log.debug(
              "Remapping mini get future [invalidParts=" + invalidParts + ", fut=" + this + ']');

        if (!canRemap) {
          map(
              F.view(
                  keys.keySet(),
                  new P1<KeyCacheObject>() {
                    @Override
                    public boolean apply(KeyCacheObject key) {
                      return invalidParts.contains(cctx.affinity().partition(key));
                    }
                  }),
              F.t(node, keys),
              topVer);

          onDone(createResultMap(res.entries()));

          return;
        }

        // Need to wait for next topology version to remap.
        IgniteInternalFuture<AffinityTopologyVersion> topFut =
            cctx.affinity().affinityReadyFuture(rmtTopVer);

        topFut.listen(
            new CIX1<IgniteInternalFuture<AffinityTopologyVersion>>() {
              @SuppressWarnings("unchecked")
              @Override
              public void applyx(IgniteInternalFuture<AffinityTopologyVersion> fut)
                  throws IgniteCheckedException {
                AffinityTopologyVersion topVer = fut.get();

                // This will append new futures to compound list.
                map(
                    F.view(
                        keys.keySet(),
                        new P1<KeyCacheObject>() {
                          @Override
                          public boolean apply(KeyCacheObject key) {
                            return invalidParts.contains(cctx.affinity().partition(key));
                          }
                        }),
                    F.t(node, keys),
                    topVer);

                onDone(createResultMap(res.entries()));
              }
            });
      } else {
        try {
          onDone(createResultMap(res.entries()));
        } catch (Exception e) {
          onDone(e);
        }
      }
    }