/**
   * @param p Partition number.
   * @param topVer Topology version.
   * @param create Create flag.
   * @param updateSeq Update sequence.
   * @return Local partition.
   */
  private GridDhtLocalPartition localPartition(
      int p, AffinityTopologyVersion topVer, boolean create, boolean updateSeq) {
    while (true) {
      boolean belongs = cctx.affinity().localNode(p, topVer);

      GridDhtLocalPartition loc = locParts.get(p);

      if (loc != null && loc.state() == EVICTED) {
        locParts.remove(p, loc);

        if (!create) return null;

        if (!belongs)
          throw new GridDhtInvalidPartitionException(
              p,
              "Adding entry to evicted partition [part="
                  + p
                  + ", topVer="
                  + topVer
                  + ", this.topVer="
                  + this.topVer
                  + ']');

        continue;
      }

      if (loc == null && create) {
        if (!belongs)
          throw new GridDhtInvalidPartitionException(
              p,
              "Creating partition which does not belong [part="
                  + p
                  + ", topVer="
                  + topVer
                  + ", this.topVer="
                  + this.topVer
                  + ']');

        lock.writeLock().lock();

        try {
          GridDhtLocalPartition old =
              locParts.putIfAbsent(p, loc = new GridDhtLocalPartition(cctx, p));

          if (old != null) loc = old;
          else {
            if (updateSeq) this.updateSeq.incrementAndGet();

            if (log.isDebugEnabled()) log.debug("Created local partition: " + loc);
          }
        } finally {
          lock.writeLock().unlock();
        }
      }

      return loc;
    }
  }
Example #2
0
    public static void main(String[] args) throws Throwable {
      final ReentrantLock lock = new ReentrantLock();
      lock.lock();

      final ReentrantReadWriteLock rwlock = new ReentrantReadWriteLock();
      final ReentrantReadWriteLock.ReadLock readLock = rwlock.readLock();
      final ReentrantReadWriteLock.WriteLock writeLock = rwlock.writeLock();
      rwlock.writeLock().lock();

      final BlockingQueue<Object> q = new LinkedBlockingQueue<Object>();
      final Semaphore fairSem = new Semaphore(0, true);
      final Semaphore unfairSem = new Semaphore(0, false);
      // final int threads =
      // rnd.nextInt(Runtime.getRuntime().availableProcessors() + 1) + 1;
      final int threads = 3;
      // On Linux, this test runs very slowly for some reason,
      // so use a smaller number of iterations.
      // Solaris can handle 1 << 18.
      // On the other hand, jmap is much slower on Solaris...
      final int iterations = 1 << 8;
      final CyclicBarrier cb = new CyclicBarrier(threads + 1);

      for (int i = 0; i < threads; i++)
        new Thread() {
          public void run() {
            try {
              final Random rnd = new Random();
              for (int j = 0; j < iterations; j++) {
                if (j == iterations / 10 || j == iterations - 1) {
                  cb.await(); // Quiesce
                  cb.await(); // Resume
                }
                // int t = rnd.nextInt(2000);
                int t = rnd.nextInt(900);
                check(!lock.tryLock(t, NANOSECONDS));
                check(!readLock.tryLock(t, NANOSECONDS));
                check(!writeLock.tryLock(t, NANOSECONDS));
                equal(null, q.poll(t, NANOSECONDS));
                check(!fairSem.tryAcquire(t, NANOSECONDS));
                check(!unfairSem.tryAcquire(t, NANOSECONDS));
              }
            } catch (Throwable t) {
              unexpected(t);
            }
          }
        }.start();

      cb.await(); // Quiesce
      rendezvousChild(); // Measure
      cb.await(); // Resume

      cb.await(); // Quiesce
      rendezvousChild(); // Measure
      cb.await(); // Resume

      System.exit(failed);
    }
  /** Checks consistency after all operations. */
  private void consistencyCheck() {
    if (CONSISTENCY_CHECK) {
      assert lock.writeLock().isHeldByCurrentThread();

      if (node2part == null) return;

      for (Map.Entry<UUID, GridDhtPartitionMap> e : node2part.entrySet()) {
        for (Integer p : e.getValue().keySet()) {
          Set<UUID> nodeIds = part2node.get(p);

          assert nodeIds != null
              : "Failed consistency check [part=" + p + ", nodeId=" + e.getKey() + ']';
          assert nodeIds.contains(e.getKey())
              : "Failed consistency check [part="
                  + p
                  + ", nodeId="
                  + e.getKey()
                  + ", nodeIds="
                  + nodeIds
                  + ']';
        }
      }

      for (Map.Entry<Integer, Set<UUID>> e : part2node.entrySet()) {
        for (UUID nodeId : e.getValue()) {
          GridDhtPartitionMap map = node2part.get(nodeId);

          assert map != null
              : "Failed consistency check [part=" + e.getKey() + ", nodeId=" + nodeId + ']';
          assert map.containsKey(e.getKey())
              : "Failed consistency check [part=" + e.getKey() + ", nodeId=" + nodeId + ']';
        }
      }
    }
  }
  public void onReconnected() {
    lock.writeLock().lock();

    try {
      node2part = null;

      part2node = new HashMap<>();

      lastExchangeId = null;

      updateSeq.set(1);

      topReadyFut = null;

      topVer = AffinityTopologyVersion.NONE;
    } finally {
      lock.writeLock().unlock();
    }
  }
  /** {@inheritDoc} */
  @Override
  public void onEvicted(GridDhtLocalPartition part, boolean updateSeq) {
    assert updateSeq || lock.isWriteLockedByCurrentThread();

    lock.writeLock().lock();

    try {
      if (stopping) return;

      assert part.state() == EVICTED;

      long seq = updateSeq ? this.updateSeq.incrementAndGet() : this.updateSeq.get();

      updateLocal(part.id(), cctx.localNodeId(), part.state(), seq);

      consistencyCheck();
    } finally {
      lock.writeLock().unlock();
    }
  }
  /** {@inheritDoc} */
  @Override
  public boolean own(GridDhtLocalPartition part) {
    ClusterNode loc = cctx.localNode();

    lock.writeLock().lock();

    try {
      if (part.own()) {
        updateLocal(part.id(), loc.id(), part.state(), updateSeq.incrementAndGet());

        consistencyCheck();

        return true;
      }

      consistencyCheck();

      return false;
    } finally {
      lock.writeLock().unlock();
    }
  }
  /** {@inheritDoc} */
  @Override
  public void updateTopologyVersion(
      GridDhtPartitionExchangeId exchId,
      GridDhtPartitionsExchangeFuture exchFut,
      long updSeq,
      boolean stopping) {
    lock.writeLock().lock();

    try {
      assert exchId.topologyVersion().compareTo(topVer) > 0
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']';

      this.stopping = stopping;

      topVer = exchId.topologyVersion();

      updateSeq.setIfGreater(updSeq);

      topReadyFut = exchFut;
    } finally {
      lock.writeLock().unlock();
    }
  }
  /** @param nodeId Node to remove. */
  private void removeNode(UUID nodeId) {
    assert nodeId != null;
    assert lock.writeLock().isHeldByCurrentThread();

    ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

    assert oldest != null;

    ClusterNode loc = cctx.localNode();

    if (node2part != null) {
      if (oldest.equals(loc) && !node2part.nodeId().equals(loc.id())) {
        updateSeq.setIfGreater(node2part.updateSequence());

        node2part =
            new GridDhtPartitionFullMap(
                loc.id(), loc.order(), updateSeq.incrementAndGet(), node2part, false);
      } else node2part = new GridDhtPartitionFullMap(node2part, node2part.updateSequence());

      part2node = new HashMap<>(part2node);

      GridDhtPartitionMap parts = node2part.remove(nodeId);

      if (parts != null) {
        for (Integer p : parts.keySet()) {
          Set<UUID> nodeIds = part2node.get(p);

          if (nodeIds != null) {
            nodeIds.remove(nodeId);

            if (nodeIds.isEmpty()) part2node.remove(p);
          }
        }
      }

      consistencyCheck();
    }
  }
  /** {@inheritDoc} */
  @SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
  @Nullable
  @Override
  public GridDhtPartitionMap update(
      @Nullable GridDhtPartitionExchangeId exchId, GridDhtPartitionFullMap partMap) {
    if (log.isDebugEnabled())
      log.debug(
          "Updating full partition map [exchId=" + exchId + ", parts=" + fullMapString() + ']');

    assert partMap != null;

    lock.writeLock().lock();

    try {
      if (stopping) return null;

      if (exchId != null && lastExchangeId != null && lastExchangeId.compareTo(exchId) >= 0) {
        if (log.isDebugEnabled())
          log.debug(
              "Stale exchange id for full partition map update (will ignore) [lastExchId="
                  + lastExchangeId
                  + ", exchId="
                  + exchId
                  + ']');

        return null;
      }

      if (node2part != null && node2part.compareTo(partMap) >= 0) {
        if (log.isDebugEnabled())
          log.debug(
              "Stale partition map for full partition map update (will ignore) [lastExchId="
                  + lastExchangeId
                  + ", exchId="
                  + exchId
                  + ", curMap="
                  + node2part
                  + ", newMap="
                  + partMap
                  + ']');

        return null;
      }

      long updateSeq = this.updateSeq.incrementAndGet();

      if (exchId != null) lastExchangeId = exchId;

      if (node2part != null) {
        for (GridDhtPartitionMap part : node2part.values()) {
          GridDhtPartitionMap newPart = partMap.get(part.nodeId());

          // If for some nodes current partition has a newer map,
          // then we keep the newer value.
          if (newPart != null && newPart.updateSequence() < part.updateSequence()) {
            if (log.isDebugEnabled())
              log.debug(
                  "Overriding partition map in full update map [exchId="
                      + exchId
                      + ", curPart="
                      + mapString(part)
                      + ", newPart="
                      + mapString(newPart)
                      + ']');

            partMap.put(part.nodeId(), part);
          }
        }

        for (Iterator<UUID> it = partMap.keySet().iterator(); it.hasNext(); ) {
          UUID nodeId = it.next();

          if (!cctx.discovery().alive(nodeId)) {
            if (log.isDebugEnabled())
              log.debug(
                  "Removing left node from full map update [nodeId="
                      + nodeId
                      + ", partMap="
                      + partMap
                      + ']');

            it.remove();
          }
        }
      }

      node2part = partMap;

      Map<Integer, Set<UUID>> p2n = new HashMap<>(cctx.affinity().partitions(), 1.0f);

      for (Map.Entry<UUID, GridDhtPartitionMap> e : partMap.entrySet()) {
        for (Integer p : e.getValue().keySet()) {
          Set<UUID> ids = p2n.get(p);

          if (ids == null)
            // Initialize HashSet to size 3 in anticipation that there won't be
            // more than 3 nodes per partitions.
            p2n.put(p, ids = U.newHashSet(3));

          ids.add(e.getKey());
        }
      }

      part2node = p2n;

      boolean changed = checkEvictions(updateSeq);

      consistencyCheck();

      if (log.isDebugEnabled()) log.debug("Partition map after full update: " + fullMapString());

      return changed ? localPartitionMap() : null;
    } finally {
      lock.writeLock().unlock();
    }
  }
  /** {@inheritDoc} */
  @Override
  public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    boolean changed = waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    AffinityTopologyVersion topVer = exchFut.topologyVersion();

    lock.writeLock().lock();

    try {
      if (stopping) return false;

      assert topVer.equals(exchFut.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchFut.exchangeId() + ']';

      if (log.isDebugEnabled())
        log.debug(
            "Partition map before afterExchange [exchId="
                + exchFut.exchangeId()
                + ", fullMap="
                + fullMapString()
                + ']');

      long updateSeq = this.updateSeq.incrementAndGet();

      for (int p = 0; p < num; p++) {
        GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

        if (cctx.affinity().localNode(p, topVer)) {
          // This partition will be created during next topology event,
          // which obviously has not happened at this point.
          if (locPart == null) {
            if (log.isDebugEnabled())
              log.debug("Skipping local partition afterExchange (will not create): " + p);

            continue;
          }

          GridDhtPartitionState state = locPart.state();

          if (state == MOVING) {
            if (cctx.rebalanceEnabled()) {
              Collection<ClusterNode> owners = owners(p);

              // If there are no other owners, then become an owner.
              if (F.isEmpty(owners)) {
                boolean owned = locPart.own();

                assert owned
                    : "Failed to own partition [cacheName"
                        + cctx.name()
                        + ", locPart="
                        + locPart
                        + ']';

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                changed = true;

                if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) {
                  DiscoveryEvent discoEvt = exchFut.discoveryEvent();

                  cctx.events()
                      .addPreloadEvent(
                          p,
                          EVT_CACHE_REBALANCE_PART_DATA_LOST,
                          discoEvt.eventNode(),
                          discoEvt.type(),
                          discoEvt.timestamp());
                }

                if (log.isDebugEnabled()) log.debug("Owned partition: " + locPart);
              } else if (log.isDebugEnabled())
                log.debug(
                    "Will not own partition (there are owners to rebalance from) [locPart="
                        + locPart
                        + ", owners = "
                        + owners
                        + ']');
            } else updateLocal(p, loc.id(), locPart.state(), updateSeq);
          }
        } else {
          if (locPart != null) {
            GridDhtPartitionState state = locPart.state();

            if (state == MOVING) {
              locPart.rent(false);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);

              changed = true;

              if (log.isDebugEnabled())
                log.debug("Evicting moving partition (it does not belong to affinity): " + locPart);
            }
          }
        }
      }

      consistencyCheck();
    } finally {
      lock.writeLock().unlock();
    }

    return changed;
  }
  /** {@inheritDoc} */
  @Override
  public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut)
      throws IgniteCheckedException {
    waitForRent();

    ClusterNode loc = cctx.localNode();

    int num = cctx.affinity().partitions();

    lock.writeLock().lock();

    try {
      GridDhtPartitionExchangeId exchId = exchFut.exchangeId();

      if (stopping) return;

      assert topVer.equals(exchId.topologyVersion())
          : "Invalid topology version [topVer=" + topVer + ", exchId=" + exchId + ']';

      if (exchId.isLeft()) removeNode(exchId.nodeId());

      // In case if node joins, get topology at the time of joining node.
      ClusterNode oldest = CU.oldestAliveCacheServerNode(cctx.shared(), topVer);

      assert oldest != null;

      if (log.isDebugEnabled())
        log.debug(
            "Partition map beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');

      long updateSeq = this.updateSeq.incrementAndGet();

      // If this is the oldest node.
      if (oldest.id().equals(loc.id())
          || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion())) {
        if (node2part == null) {
          node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq);

          if (log.isDebugEnabled())
            log.debug(
                "Created brand new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        } else if (!node2part.valid()) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Created new full topology map on oldest node [exchId="
                    + exchId
                    + ", fullMap="
                    + node2part
                    + ']');
        } else if (!node2part.nodeId().equals(loc.id())) {
          node2part =
              new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);

          if (log.isDebugEnabled())
            log.debug(
                "Copied old map into new map on oldest node (previous oldest node left) [exchId="
                    + exchId
                    + ", fullMap="
                    + fullMapString()
                    + ']');
        }
      }

      if (cctx.rebalanceEnabled()) {
        for (int p = 0; p < num; p++) {
          // If this is the first node in grid.
          boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion());

          if ((oldest.id().equals(loc.id())
                  && oldest.id().equals(exchId.nodeId())
                  && exchId.isJoined())
              || added) {
            assert exchId.isJoined() || added;

            try {
              GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

              assert locPart != null;

              boolean owned = locPart.own();

              assert owned
                  : "Failed to own partition for oldest node [cacheName"
                      + cctx.name()
                      + ", part="
                      + locPart
                      + ']';

              if (log.isDebugEnabled()) log.debug("Owned partition for oldest node: " + locPart);

              updateLocal(p, loc.id(), locPart.state(), updateSeq);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition on oldest node (no need to create a partition "
                        + "if it no longer belongs to local node: "
                        + e.partition());
            }
          }
          // If this is not the first node in grid.
          else {
            if (node2part != null && node2part.valid()) {
              if (cctx.affinity().localNode(p, topVer)) {
                try {
                  // This will make sure that all non-existing partitions
                  // will be created in MOVING state.
                  GridDhtLocalPartition locPart = localPartition(p, topVer, true, false);

                  updateLocal(p, loc.id(), locPart.state(), updateSeq);
                } catch (GridDhtInvalidPartitionException e) {
                  if (log.isDebugEnabled())
                    log.debug(
                        "Ignoring invalid partition (no need to create a partition if it "
                            + "no longer belongs to local node: "
                            + e.partition());
                }
              }
            }
            // If this node's map is empty, we pre-create local partitions,
            // so local map will be sent correctly during exchange.
            else if (cctx.affinity().localNode(p, topVer)) {
              try {
                localPartition(p, topVer, true, false);
              } catch (GridDhtInvalidPartitionException e) {
                if (log.isDebugEnabled())
                  log.debug(
                      "Ignoring invalid partition (no need to pre-create a partition if it "
                          + "no longer belongs to local node: "
                          + e.partition());
              }
            }
          }
        }
      } else {
        // If preloader is disabled, then we simply clear out
        // the partitions this node is not responsible for.
        for (int p = 0; p < num; p++) {
          GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);

          boolean belongs = cctx.affinity().localNode(p, topVer);

          if (locPart != null) {
            if (!belongs) {
              GridDhtPartitionState state = locPart.state();

              if (state.active()) {
                locPart.rent(false);

                updateLocal(p, loc.id(), locPart.state(), updateSeq);

                if (log.isDebugEnabled())
                  log.debug(
                      "Evicting partition with rebalancing disabled "
                          + "(it does not belong to affinity): "
                          + locPart);
              }
            }
          } else if (belongs) {
            try {
              // Pre-create partitions.
              localPartition(p, topVer, true, false);
            } catch (GridDhtInvalidPartitionException e) {
              if (log.isDebugEnabled())
                log.debug(
                    "Ignoring invalid partition with disabled rebalancer (no need to "
                        + "pre-create a partition if it no longer belongs to local node: "
                        + e.partition());
            }
          }
        }
      }

      if (node2part != null && node2part.valid()) checkEvictions(updateSeq);

      consistencyCheck();

      if (log.isDebugEnabled())
        log.debug(
            "Partition map after beforeExchange [exchId="
                + exchId
                + ", fullMap="
                + fullMapString()
                + ']');
    } finally {
      lock.writeLock().unlock();
    }

    // Wait for evictions.
    waitForRent();
  }
  /** {@inheritDoc} */
  @SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
  @Nullable
  @Override
  public GridDhtPartitionMap update(
      @Nullable GridDhtPartitionExchangeId exchId, GridDhtPartitionMap parts) {
    if (log.isDebugEnabled())
      log.debug(
          "Updating single partition map [exchId=" + exchId + ", parts=" + mapString(parts) + ']');

    if (!cctx.discovery().alive(parts.nodeId())) {
      if (log.isDebugEnabled())
        log.debug(
            "Received partition update for non-existing node (will ignore) [exchId="
                + exchId
                + ", parts="
                + parts
                + ']');

      return null;
    }

    lock.writeLock().lock();

    try {
      if (stopping) return null;

      if (lastExchangeId != null && exchId != null && lastExchangeId.compareTo(exchId) > 0) {
        if (log.isDebugEnabled())
          log.debug(
              "Stale exchange id for single partition map update (will ignore) [lastExchId="
                  + lastExchangeId
                  + ", exchId="
                  + exchId
                  + ']');

        return null;
      }

      if (exchId != null) lastExchangeId = exchId;

      if (node2part == null)
        // Create invalid partition map.
        node2part = new GridDhtPartitionFullMap();

      GridDhtPartitionMap cur = node2part.get(parts.nodeId());

      if (cur != null && cur.updateSequence() >= parts.updateSequence()) {
        if (log.isDebugEnabled())
          log.debug(
              "Stale update sequence for single partition map update (will ignore) [exchId="
                  + exchId
                  + ", curSeq="
                  + cur.updateSequence()
                  + ", newSeq="
                  + parts.updateSequence()
                  + ']');

        return null;
      }

      long updateSeq = this.updateSeq.incrementAndGet();

      node2part = new GridDhtPartitionFullMap(node2part, updateSeq);

      boolean changed = false;

      if (cur == null || !cur.equals(parts)) changed = true;

      node2part.put(parts.nodeId(), parts);

      part2node = new HashMap<>(part2node);

      // Add new mappings.
      for (Integer p : parts.keySet()) {
        Set<UUID> ids = part2node.get(p);

        if (ids == null)
          // Initialize HashSet to size 3 in anticipation that there won't be
          // more than 3 nodes per partition.
          part2node.put(p, ids = U.newHashSet(3));

        changed |= ids.add(parts.nodeId());
      }

      // Remove obsolete mappings.
      if (cur != null) {
        for (Integer p : F.view(cur.keySet(), F0.notIn(parts.keySet()))) {
          Set<UUID> ids = part2node.get(p);

          if (ids != null) changed |= ids.remove(parts.nodeId());
        }
      }

      changed |= checkEvictions(updateSeq);

      consistencyCheck();

      if (log.isDebugEnabled()) log.debug("Partition map after single update: " + fullMapString());

      return changed ? localPartitionMap() : null;
    } finally {
      lock.writeLock().unlock();
    }
  }