/**
 * Sends this node's local partition maps to the node currently stored in {@code oldestNode}.
 *
 * <p>If the target left the topology the failure is only logged at debug level. Any other
 * checked failure schedules a recheck and is logged as an error.
 */
private void sendPartitions() {
    final ClusterNode target = oldestNode.get();

    try {
      sendLocalPartitions(target, exchId);
    } catch (ClusterTopologyCheckedException ignored) {
      // The target is gone; nothing is rescheduled from this branch.
      if (log.isDebugEnabled()) {
        final String dbgMsg =
            "Oldest node left during partition exchange [nodeId="
                + target.id()
                + ", exchId="
                + exchId
                + ']';

        log.debug(dbgMsg);
      }
    } catch (IgniteCheckedException e) {
      // Arrange the retry first, then report the failure.
      scheduleRecheck();

      final String errMsg =
          "Failed to send local partitions to oldest node (will retry after timeout) [oldestNodeId="
              + target.id()
              + ", exchId="
              + exchId
              + ']';

      U.error(log, errMsg, e);
    }
  }
  /**
   * Dispatches a query request to the given nodes.
   *
   * <p>Remote nodes receive the request through the cache I/O manager. If the local node is among
   * {@code nodes}, the request is additionally processed locally through the closure processor,
   * after the remote send has been issued.
   *
   * @param fut Distributed future; it is told about nodes reported to the send fallback.
   * @param req Request.
   * @param nodes Target nodes, possibly including the local node.
   * @throws IgniteCheckedException In case of error.
   */
  @SuppressWarnings("unchecked")
  private void sendRequest(
      final GridCacheDistributedQueryFuture<?, ?, ?> fut,
      final GridCacheQueryRequest req,
      Collection<ClusterNode> nodes)
      throws IgniteCheckedException {
    assert fut != null;
    assert req != null;
    assert nodes != null;

    final UUID locId = cctx.localNodeId();

    // Only the presence of the local node matters below, so a flag is enough.
    boolean includesLocal = false;

    Collection<ClusterNode> remotes = null;

    for (ClusterNode candidate : nodes) {
      if (!candidate.id().equals(locId)) {
        // Allocate lazily: many requests target the local node only.
        if (remotes == null) {
          remotes = new ArrayList<>(nodes.size());
        }

        remotes.add(candidate);
      } else {
        includesLocal = true;
      }
    }

    // Remote delivery goes first. Local processing may mutate state carried by the request
    // (for instance a stateful remote reducer), and that mutated state must not be what gets
    // serialized and shipped to the other nodes.
    if (!F.isEmpty(remotes)) {
      P1<ClusterNode> fallback =
          new P1<ClusterNode>() {
            @Override
            public boolean apply(ClusterNode node) {
              fut.onNodeLeft(node.id());

              return !fut.isDone();
            }
          };

      cctx.io().safeSend(remotes, req, cctx.ioPolicy(), fallback);
    }

    if (includesLocal) {
      Callable<Object> locTask =
          new Callable<Object>() {
            @Override
            public Object call() throws Exception {
              req.beforeLocalExecution(cctx);

              processQueryRequest(locId, req);

              return null;
            }
          };

      cctx.closures().callLocalSafe(locTask);
    }
  }
  /**
   * Drops a node from the full partition map and from the partition-to-nodes index.
   *
   * <p>Must be called with the write lock held. Does nothing beyond the assertions when no full
   * map exists yet.
   *
   * @param nodeId Node to remove.
   */
  private void removeNode(UUID nodeId) {
    assert nodeId != null;
    assert lock.writeLock().isHeldByCurrentThread();

    ClusterNode oldestSrv = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldestSrv != null;

    ClusterNode locNode = cctx.localNode();

    if (node2part == null) {
      return;
    }

    // The local node is the oldest one but the current map was stamped by a different node.
    boolean restamp = oldestSrv.equals(locNode) && !node2part.nodeId().equals(locNode.id());

    if (restamp) {
      updateSeq.setIfGreater(node2part.updateSequence());

      node2part =
          new GridDhtPartitionFullMap(
              locNode.id(), locNode.order(), updateSeq.incrementAndGet(), node2part, false);
    } else {
      node2part = new GridDhtPartitionFullMap(node2part, node2part.updateSequence());
    }

    // Replace the index with a fresh top-level map before editing it.
    part2node = new HashMap<>(part2node);

    GridDhtPartitionMap removed = node2part.remove(nodeId);

    if (removed != null) {
      for (Integer part : removed.keySet()) {
        Set<UUID> holders = part2node.get(part);

        if (holders == null) {
          continue;
        }

        holders.remove(nodeId);

        // Do not keep index entries for partitions that no node holds any more.
        if (holders.isEmpty()) {
          part2node.remove(part);
        }
      }
    }

    consistencyCheck();
  }
  /** {@inheritDoc} */
  @Override
  public String toString() {
    ClusterNode oldest = oldestNode.get();

    // A missing oldest node is rendered as the literal string "null" for both fields.
    Object oldestId;
    Object oldestOrder;

    if (oldest != null) {
      oldestId = oldest.id();
      oldestOrder = oldest.order();
    } else {
      oldestId = "null";
      oldestOrder = "null";
    }

    // Same convention for a latch that has not been created.
    Object latchCnt;

    if (evtLatch != null) {
      latchCnt = evtLatch.getCount();
    } else {
      latchCnt = "null";
    }

    return S.toString(
        GridDhtPartitionsExchangeFuture.class,
        this,
        "oldest",
        oldestId,
        "oldestOrder",
        oldestOrder,
        "evtLatch",
        latchCnt,
        "remaining",
        remaining(),
        "super",
        super.toString());
  }
  // Example no. 5
  // 0
  /**
   * Delivers a delete message to every node returned by {@code meta.metaCacheNodes()}.
   *
   * <p>Delivery is best-effort per node: a failed send is reported as a warning and the remaining
   * nodes are still attempted.
   *
   * @param msg Message to send.
   */
  private void sendDeleteMessage(IgfsDeleteMessage msg) {
    assert msg != null;

    for (ClusterNode target : meta.metaCacheNodes()) {
      try {
        igfsCtx.send(target, topic, msg, GridIoPolicy.SYSTEM_POOL);
      } catch (IgniteCheckedException e) {
        String warning =
            "Failed to send IGFS delete message to node [nodeId="
                + target.id()
                + ", msg="
                + msg
                + ", err="
                + e.getMessage()
                + ']';

        U.warn(log, warning);
      }
    }
  }
  /** {@inheritDoc} */
  @Override
  public boolean own(GridDhtLocalPartition part) {
    ClusterNode locNode = cctx.localNode();

    lock.writeLock().lock();

    try {
      boolean owned = part.own();

      // Publish the new state into the local map only when the partition really switched.
      if (owned) {
        updateLocal(part.id(), locNode.id(), part.state(), updateSeq.incrementAndGet());
      }

      // The consistency check runs on both outcomes.
      consistencyCheck();

      return owned;
    } finally {
      lock.writeLock().unlock();
    }
  }
  /**
   * Builds a single-node partitions message from all non-local caches and sends it to the given
   * node through the system pool.
   *
   * @param node Node to send the message to.
   * @param id Exchange ID placed into the message, possibly {@code null}.
   * @throws IgniteCheckedException If failed.
   */
  private void sendLocalPartitions(ClusterNode node, @Nullable GridDhtPartitionExchangeId id)
      throws IgniteCheckedException {
    GridDhtPartitionsSingleMessage partsMsg =
        new GridDhtPartitionsSingleMessage(
            id, cctx.kernalContext().clientNode(), cctx.versions().last());

    for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
      // Local caches contribute no partition map.
      if (cacheCtx.isLocal()) {
        continue;
      }

      partsMsg.addLocalPartitionMap(cacheCtx.cacheId(), cacheCtx.topology().localPartitionMap());
    }

    if (log.isDebugEnabled()) {
      // NOTE(review): this prints the exchId field, not the id parameter used in the message
      // above - confirm that is intended.
      String dbgMsg =
          "Sending local partitions [nodeId="
              + node.id()
              + ", exchId="
              + exchId
              + ", msg="
              + partsMsg
              + ']';

      log.debug(dbgMsg);
    }

    cctx.io().send(node, partsMsg, SYSTEM_POOL);
  }
  /**
   * {@inheritDoc}
   *
   * <p>Walks all partitions under the write lock. A local partition that is still {@code MOVING}
   * either becomes owned (rebalancing enabled, partition belongs to this node by affinity and no
   * other owners are known) or is rented out (it does not belong to this node by affinity).
   *
   * @param exchFut Exchange future; supplies the topology version, the exchange ID and the
   *     discovery event used for the data-lost event.
   * @return {@code false} if the topology is stopping; otherwise {@code true} if the initial
   *     {@code waitForRent()} reported a change or a partition was owned or rented here.
   * @throws IgniteCheckedException If failed.
   */
  @Override
  public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    boolean changed = waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    // Local variable: it shadows the topVer field for the rest of this method.
    AffinityTopologyVersion topVer = exchFut.topologyVersion();

    lock.writeLock().lock();

    try {
      if (stopping) return false;

      // NOTE(review): topVer is the local read from exchFut just above, so this compares the
      // future's version with itself - confirm whether the field was meant here.
      assert topVer.equals(exchFut.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']';

      if (log.isDebugEnabled())
        log.debug(
            "Partition map before afterExchange [exchId="
                + exchFut.exchangeId()
                + ", fullMap="
                + fullMapString()
                + ']');

      // One sequence value is shared by every local update made during this pass.
      long updateSeq = this.updateSeq.incrementAndGet();

      for (int p = 0; p < num; p++) {
        GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

        if (cctx.affinity().localNode(p, topVer)) {
          // This partition will be created during next topology event,
          // which obviously has not happened at this point.
          if (locPart == null) {
            if (log.isDebugEnabled())
              log.debug("Skipping local partition afterExchange (will not create): " + p);

            continue;
          }

          GridDhtPartitionState state = locPart.state();

          if (state == MOVING) {
            if (cctx.rebalanceEnabled()) {
              Collection<ClusterNode> owners = owners(p);

              // If there are no other owners, then become an owner.
              if (F.isEmpty(owners)) {
                boolean owned = locPart.own();

                assert owned
                    : "Failed to own partition [cacheName="
                        + cctx.name()
                        + ", locPart="
                        + locPart
                        + ']';

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                changed = true;

                // The partition was owned with nobody to rebalance from: record data loss
                // if that event type is recordable.
                if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) {
                  DiscoveryEvent discoEvt = exchFut.discoveryEvent();

                  cctx.events()
                      .addPreloadEvent(
                          p,
                          EVT_CACHE_REBALANCE_PART_DATA_LOST,
                          discoEvt.eventNode(),
                          discoEvt.type(),
                          discoEvt.timestamp());
                }

                if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart);
              } else if (log.isDebugEnabled())
                log.debug(
                    "Will not own partition (there are owners to rebalance from) [locPart="
                        + locPart
                        + ", owners = "
                        + owners
                        + ']');
            } else updateLocal(p, loc.id(), locPart.state(), updateSeq);
          }
        } else {
          if (locPart != null) {
            GridDhtPartitionState state = locPart.state();

            // A MOVING partition that affinity no longer assigns to this node is rented out.
            if (state == MOVING) {
              locPart.rent(false);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);

              changed = true;

              if (log.isDebugEnabled())
                log.debug("Evicting moving partition (it does not belong to affinity): " + locPart);
            }
          }
        }
      }

      consistencyCheck();
    } finally {
      lock.writeLock().unlock();
    }

    return changed;
  }
  /**
   * {@inheritDoc}
   *
   * <p>Steps, all but the first and last under the write lock:
   *
   * <ol>
   *   <li>wait for pending rents;
   *   <li>return early if stopping; if the exchange is for a node that left, remove that node
   *       from the maps;
   *   <li>when the local node is the oldest one, or the cache is reported as added at this
   *       topology version, create or re-create the full map stamped with the oldest node;
   *   <li>with rebalancing enabled, create (and in the first-node or cache-added case, own) the
   *       partitions that belong to the local node; with rebalancing disabled, rent out active
   *       partitions that do not belong to it and pre-create the ones that do;
   *   <li>check evictions if the full map is valid, then run the consistency check;
   *   <li>wait for rents again after the lock is released.
   * </ol>
   *
   * @param exchFut Exchange future supplying the exchange ID and cache-added information.
   * @throws IgniteCheckedException If failed.
   */
  @Override
  public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    lock.writeLock().lock();

    try {
      GridDhtPartitionExchangeId exchId = exchFut.exchangeId();

      if (stopping) return;

      assert topVer.equals(exchId.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']';

      if (exchId.isLeft()) removeNode(exchId.nodeId());

      // In case if node joins, get topology at the time of joining node.
      ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

      assert oldest != null;

      if (log.isDebugEnabled())
        log.debug(
            "Partition map beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');

      // One sequence value is shared by every local update made during this pass.
      long updateSeq = this.updateSeq.incrementAndGet();

      // If this is the oldest node.
      if (oldest.id().equals(loc.id())
          || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) {
        if (node2part == null) {
          node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq);

          if (log.isDebugEnabled())
            log.debug(
                "Created brand new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        } else if (!node2part.valid()) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Created new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + node2part
                    + ']');
        } else if (!node2part.nodeId().equals(loc.id())) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Copied old map into new map on oldest node (previous oldest node left) [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        }
      }

      if (cctx.rebalanceEnabled()) {
        for (int p = 0; p < num; p++) {
          // If this is the first node in grid.
          // NOTE(review): the arguments do not depend on p; this could presumably be computed
          // once before the loop if isCacheAdded has no side effects - confirm.
          boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion());

          if ((oldest.id().equals(loc.id())
                  && oldest.id().equals(exchId.nodeId())
                  && exchId.isJoined())
              || added) {
            assert exchId.isJoined() || added;

            try {
              GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

              assert locPart != null;

              boolean owned = locPart.own();

              assert owned
                  : "Failed to own partition for oldest node [cacheName="
                      + cctx.name()
                      + ", part="
                      + locPart
                      + ']';

              if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition on oldest node (no need to create a partition "
                        + "if it no longer belongs to local node: "
                        + e.partition());
            }
          }
          // If this is not the first node in grid.
          else {
            if (node2part != null && node2part.valid()) {
              if (cctx.affinity().localNode(p, topVer)) {
                try {
                  // This will make sure that all non-existing partitions
                  // will be created in MOVING state.
                  GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

                  updateLocal(p, loc.id(), locPart.state(), updateSeq);
                } catch (GridDhtInvalidPartitionException e) {
                  if (log.isDebugEnabled())
                    log.debug(
                        "Ignoring invalid partition (no need to create a partition if it "
                            + "no longer belongs to local node: "
                            + e.partition());
                }
              }
            }
            // If this node's map is empty, we pre-create local partitions,
            // so local map will be sent correctly during exchange.
            else if (cctx.affinity().localNode(p, topVer)) {
              try {
                localPartition(p, topVer, true, false);
              } catch (GridDhtInvalidPartitionException e) {
                if (log.isDebugEnabled())
                  log.debug(
                      "Ignoring invalid partition (no need to pre-create a partition if it "
                          + "no longer belongs to local node: "
                          + e.partition());
              }
            }
          }
        }
      } else {
        // If preloader is disabled, then we simply clear out
        // the partitions this node is not responsible for.
        for (int p = 0; p < num; p++) {
          GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

          boolean belongs = cctx.affinity().localNode(p, topVer);

          if (locPart != null) {
            if (!belongs) {
              GridDhtPartitionState state = locPart.state();

              if (state.active()) {
                locPart.rent(false);

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                if (log.isDebugEnabled())
                  log.debug(
                      "Evicting partition with rebalancing disabled "
                          + "(it does not belong to affinity): "
                          + locPart);
              }
            }
          } else if (belongs) {
            try {
              // Pre-create partitions.
              localPartition(p, topVer, true, false);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition with disabled rebalancer (no need to "
                        + "pre-create a partition if it no longer belongs to local node: "
                        + e.partition());
            }
          }
        }
      }

      // Evictions are evaluated only against a valid full map.
      if (node2part != null && node2part.valid()) checkEvictions(updateSeq);

      consistencyCheck();

      if (log.isDebugEnabled())
        log.debug(
            "Partition map after beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');
    } finally {
      lock.writeLock().unlock();
    }

    // Wait for evictions.
    waitForRent();
  }
  /**
   * Records the state of one local partition in the full map and in the partition-to-nodes index.
   *
   * <p>Must be called with the write lock held, and {@code nodeId} must be the local node ID. When
   * the local node is the oldest one, the sequence actually applied may be raised to catch up
   * with the sequence already stored in the full map.
   *
   * @param p Partition.
   * @param nodeId Node ID.
   * @param state State.
   * @param updateSeq Update sequence.
   */
  @SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
  private void updateLocal(int p, UUID nodeId, GridDhtPartitionState state, long updateSeq) {
    assert lock.isWriteLockedByCurrentThread();
    assert nodeId.equals(cctx.nodeId());

    // In case if node joins, get topology at the time of joining node.
    ClusterNode oldestSrv = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldestSrv != null;

    // Sequence that ends up being written; starts as the requested one.
    long appliedSeq = updateSeq;

    // If this node became the oldest node.
    if (oldestSrv.id().equals(cctx.nodeId())) {
      long mapSeq = node2part.updateSequence();

      if (mapSeq > updateSeq) {
        if (this.updateSeq.get() < mapSeq) {
          // The global counter lags behind the map: move it past the map's sequence.
          boolean advanced = this.updateSeq.compareAndSet(this.updateSeq.get(), mapSeq + 1);

          assert advanced
              : "Invalid update sequence [updateSeq="
                  + updateSeq
                  + ", seq="
                  + mapSeq
                  + ", curUpdateSeq="
                  + this.updateSeq.get()
                  + ", node2part="
                  + node2part.toFullString()
                  + ']';

          appliedSeq = mapSeq + 1;
        } else {
          appliedSeq = mapSeq;
        }
      }

      // The map is re-stamped whenever its sequence differed from the requested one.
      if (mapSeq != updateSeq) {
        node2part.updateSequence(appliedSeq);
      }
    }

    GridDhtPartitionMap nodeMap = node2part.get(nodeId);

    if (nodeMap == null) {
      nodeMap =
          new GridDhtPartitionMap(
              nodeId,
              appliedSeq,
              Collections.<Integer, GridDhtPartitionState>emptyMap(),
              false);

      node2part.put(nodeId, nodeMap);
    }

    nodeMap.updateSequence(appliedSeq);

    nodeMap.put(p, state);

    Set<UUID> holders = part2node.get(p);

    if (holders == null) {
      holders = U.newHashSet(3);

      part2node.put(p, holders);
    }

    holders.add(nodeId);
  }
  /**
   * Looks for active local partitions that the local node is not an affinity node for and rents
   * them out when they are no longer needed here.
   *
   * <p>A partition is rented when every affinity node already owns it, or when there are more
   * owners than affinity nodes and the local node is among the surplus owners with the lowest
   * order. Must be called with the write lock held.
   *
   * @param updateSeq Update sequence.
   * @return {@code true} if at least one local partition was rented out.
   */
  private boolean checkEvictions(long updateSeq) {
    assert lock.isWriteLockedByCurrentThread();

    boolean evictedAny = false;

    UUID locId = cctx.nodeId();

    for (GridDhtLocalPartition part : locParts.values()) {
      if (!part.state().active()) {
        continue;
      }

      int partId = part.id();

      List<ClusterNode> affNodes = cctx.affinity().nodes(partId, topVer);

      // Partitions the local node is an affinity node for are never evicted here.
      if (affNodes.contains(cctx.localNode())) {
        continue;
      }

      Collection<UUID> ownerIds = F.nodeIds(nodes(partId, topVer, OWNING));

      // If all affinity nodes are owners, then evict partition from local node.
      if (ownerIds.containsAll(F.nodeIds(affNodes))) {
        part.rent(false);

        updateLocal(part.id(), locId, part.state(), updateSeq);

        evictedAny = true;

        if (log.isDebugEnabled()) {
          log.debug("Evicted local partition (all affinity nodes are owners): " + part);
        }

        continue;
      }

      int ownerCnt = ownerIds.size();
      int affCnt = affNodes.size();

      if (ownerCnt <= affCnt) {
        continue;
      }

      List<ClusterNode> byOrder = new ArrayList<>(cctx.discovery().nodes(ownerIds));

      // Sort by node orders in ascending order.
      Collections.sort(byOrder, CU.nodeComparator(true));

      // Only the first 'surplus' owners in that order are candidates for eviction.
      int surplus = byOrder.size() - affCnt;

      for (int idx = 0; idx < surplus; idx++) {
        ClusterNode owner = byOrder.get(idx);

        if (!locId.equals(owner.id())) {
          continue;
        }

        part.rent(false);

        updateLocal(part.id(), locId, part.state(), updateSeq);

        evictedAny = true;

        if (log.isDebugEnabled()) {
          log.debug("Evicted local partition (this node is oldest non-affinity node): " + part);
        }

        break;
      }
    }

    return evictedAny;
  }