예제 #1
0
  /**
   * Cleans up after primaries that became unassigned.
   *
   * <p>First, every non-primary copy that is still initializing for a shard whose primary is
   * unassigned is handed to {@code applyFailedShard}. Then, for every unassigned primary that
   * still has an active replica, the primary flag is swapped onto that replica; if the replica is
   * relocating, the matching copy on the relocation target node is flagged as primary as well.
   *
   * @param allocation the allocation round whose routing nodes are inspected and mutated
   * @return {@code true} if a shard was failed or a new primary was elected, {@code false} if the
   *     routing nodes were left untouched
   */
  private boolean electPrimariesAndUnassignedDanglingReplicas(RoutingAllocation allocation) {
    boolean changed = false;
    RoutingNodes routingNodes = allocation.routingNodes();
    if (!routingNodes.hasUnassignedPrimaries()) {
      // move out if we don't have unassigned primaries
      return changed;
    }

    // go over and remove dangling replicas that are initializing for primary shards;
    // collect first, fail afterwards, so the routing nodes are not mutated while iterating them
    List<ShardRouting> shardsToFail = Lists.newArrayList();
    for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
      if (shardEntry.primary()) {
        for (MutableShardRouting routing : routingNodes.assignedShards(shardEntry)) {
          if (!routing.primary() && routing.initializing()) {
            shardsToFail.add(routing);
          }
        }
      }
    }
    for (ShardRouting shardToFail : shardsToFail) {
      changed |= applyFailedShard(allocation, shardToFail, false);
    }

    // now, go over and elect a new primary if possible. Note: from this code block on, if one is
    // elected, routingNodes.hasUnassignedPrimaries() will potentially be false
    for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
      if (shardEntry.primary()) {
        MutableShardRouting candidate = allocation.routingNodes().activeReplica(shardEntry);
        if (candidate != null) {
          routingNodes.swapPrimaryFlag(shardEntry, candidate);
          // the swap itself modifies the routing, so report it regardless of whether the
          // elected replica happens to be relocating
          changed = true;
          if (candidate.relocatingNodeId() != null) {
            // its also relocating, make sure to move the other routing to primary
            RoutingNode node = routingNodes.node(candidate.relocatingNodeId());
            if (node != null) {
              for (MutableShardRouting shardRouting : node) {
                if (shardRouting.shardId().equals(candidate.shardId()) && !shardRouting.primary()) {
                  routingNodes.swapPrimaryFlag(shardRouting);
                  break;
                }
              }
            }
          }
        }
      }
    }

    return changed;
  }
  /**
   * Explicitly allocates an unassigned copy of {@code shardId} to the node resolved from {@code
   * node}.
   *
   * <p>Among the unassigned copies of the shard, a primary copy is preferred over a replica. The
   * (shard, node) pair is registered through {@code addIgnoreDisable} before the allocation
   * deciders are consulted.
   *
   * @param allocation the allocation round to apply this command to
   * @throws ElasticSearchIllegalArgumentException if the shard has no unassigned copy, if the
   *     selected copy is a primary and {@code allowPrimary} is not set, if the target node has no
   *     entry in the routing nodes, or if the deciders do not allow the allocation
   */
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);

    MutableShardRouting shardRouting = null;
    for (MutableShardRouting routing : allocation.routingNodes().unassigned()) {
      if (routing.shardId().equals(shardId)) {
        // prefer primaries first to allocate
        if (shardRouting == null || routing.primary()) {
          shardRouting = routing;
        }
      }
    }

    if (shardRouting == null) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] failed to find " + shardId + " on the list of unassigned shards");
    }

    if (shardRouting.primary() && !allowPrimary) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] trying to allocate a primary shard " + shardId + ", which is disabled");
    }

    RoutingNode routingNode = allocation.routingNodes().node(discoNode.id());
    if (routingNode == null) {
      // the node resolved fine as a discovery node but has no routing entry to host shards;
      // fail with a descriptive error instead of a NullPointerException below
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] can't allocate "
              + shardId
              + " on node "
              + discoNode
              + ", node is not part of the routing nodes");
    }
    allocation.addIgnoreDisable(shardRouting.shardId(), routingNode.nodeId());
    if (!allocation.deciders().canAllocate(shardRouting, routingNode, allocation).allowed()) {
      throw new ElasticSearchIllegalArgumentException(
          "[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed");
    }
    // go over and remove it from the unassigned; identity comparison is intentional, we want
    // exactly the instance selected above
    for (Iterator<MutableShardRouting> it = allocation.routingNodes().unassigned().iterator();
        it.hasNext(); ) {
      if (it.next() != shardRouting) {
        continue;
      }
      it.remove();
      routingNode.add(shardRouting);
      break;
    }
  }
  /**
   * Cancels the allocation (or the ongoing relocation/recovery) of {@code shardId} on the node
   * resolved from {@code node}.
   *
   * <ul>
   *   <li>Copy is initializing as a relocation target: it is removed and the relocation on the
   *       source node is cancelled.
   *   <li>Copy is relocating away: it is moved to unassigned and the initializing copy on the
   *       target node is removed.
   *   <li>Copy is not relocating: it is removed and a fresh unassigned routing (version + 1) is
   *       queued in its place.
   * </ul>
   *
   * @param allocation the allocation round to apply this command to
   * @throws ElasticSearchIllegalArgumentException if the shard is not found on the node (including
   *     the case where the node has no routing entry at all), or if the copy is a primary and
   *     {@code allowPrimary} is not set
   */
  @Override
  public void execute(RoutingAllocation allocation) throws ElasticSearchException {
    DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    boolean found = false;
    RoutingNodes.RoutingNodeIterator it =
        allocation.routingNodes().routingNodeIter(discoNode.id());
    // routingNodeIter may yield null for a node without a routing entry; in that case the shard
    // simply cannot be found there and we fall through to the "failed to find" error below
    while (it != null && it.hasNext()) {
      MutableShardRouting shardRouting = it.next();
      if (!shardRouting.shardId().equals(shardId)) {
        continue;
      }
      found = true;
      if (shardRouting.relocatingNodeId() != null) {
        if (shardRouting.initializing()) {
          // the shard is initializing and recovering from another node, simply cancel the recovery
          it.remove();
          // and cancel the relocating state from the shard its being relocated from
          RoutingNode relocatingFromNode =
              allocation.routingNodes().node(shardRouting.relocatingNodeId());
          if (relocatingFromNode != null) {
            for (MutableShardRouting fromShardRouting : relocatingFromNode) {
              if (fromShardRouting.shardId().equals(shardRouting.shardId())
                  && fromShardRouting.state() == RELOCATING) {
                allocation.routingNodes().cancelRelocation(fromShardRouting);
                break;
              }
            }
          }
        } else if (shardRouting.relocating()) {

          // the shard is relocating to another node, cancel the recovery on the other node, and
          // deallocate this one
          if (!allowPrimary && shardRouting.primary()) {
            // can't cancel a primary shard being initialized
            throw new ElasticSearchIllegalArgumentException(
                "[cancel_allocation] can't cancel "
                    + shardId
                    + " on node "
                    + discoNode
                    + ", shard is primary and initializing its state");
          }
          it.moveToUnassigned();
          // now, go and find the shard that is initializing on the target node, and cancel it as
          // well...
          RoutingNodes.RoutingNodeIterator initializingNode =
              allocation.routingNodes().routingNodeIter(shardRouting.relocatingNodeId());
          if (initializingNode != null) {
            while (initializingNode.hasNext()) {
              MutableShardRouting initializingShardRouting = initializingNode.next();
              if (initializingShardRouting.shardId().equals(shardRouting.shardId())
                  && initializingShardRouting.state() == INITIALIZING) {
                initializingNode.remove();
              }
            }
          }
        }
      } else {
        // the shard is not relocating, its either started, or initializing, just cancel it and move
        // on...
        if (!allowPrimary && shardRouting.primary()) {
          // can't cancel a primary shard being initialized
          throw new ElasticSearchIllegalArgumentException(
              "[cancel_allocation] can't cancel "
                  + shardId
                  + " on node "
                  + discoNode
                  + ", shard is primary and started");
        }
        it.remove();
        // re-queue the shard as a brand new unassigned routing with a bumped version
        allocation
            .routingNodes()
            .unassigned()
            .add(
                new MutableShardRouting(
                    shardRouting.index(),
                    shardRouting.id(),
                    null,
                    shardRouting.primary(),
                    ShardRoutingState.UNASSIGNED,
                    shardRouting.version() + 1));
      }
    }
    if (!found) {
      throw new ElasticSearchIllegalArgumentException(
          "[cancel_allocation] can't cancel "
              + shardId
              + ", failed to find it on node "
              + discoNode);
    }
  }
  /**
   * Tries to place unassigned shards on nodes that already hold data for them.
   *
   * <p>Pass 1 (primaries): for each unassigned primary that was previously allocated, the
   * per-node shard versions from {@code buildShardStates} are collected (a version of {@code -1}
   * is treated as "no copy"). If enough copies were found according to the {@code
   * INDEX_RECOVERY_INITIAL_SHARDS} setting, the primary is assigned to a node holding the highest
   * version. If every such node answers NO, allocation is forced onto one of them; if any
   * answers THROTTLE (and none accepts), the shard is moved to the ignored-unassigned list.
   *
   * <p>Pass 2 (remaining unassigned shards): for replicas, the node whose store shares the most
   * bytes with the active primary's store is chosen, and the shard is assigned there (or moved to
   * the ignored-unassigned list when that node is throttled).
   *
   * @param allocation the allocation round whose routing nodes are inspected and mutated
   * @return {@code true} if at least one shard was assigned, {@code false} otherwise
   */
  public boolean allocateUnassigned(RoutingAllocation allocation) {
    boolean changed = false;
    DiscoveryNodes nodes = allocation.nodes();
    RoutingNodes routingNodes = allocation.routingNodes();

    // First, handle primaries, they must find a place to be allocated on here
    Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      if (!shard.primary()) {
        continue;
      }

      // this is an API allocation, ignore since we know there is no data...
      if (!routingNodes
          .routingTable()
          .index(shard.index())
          .shard(shard.id())
          .primaryAllocatedPostApi()) {
        continue;
      }

      ObjectLongOpenHashMap<DiscoveryNode> nodesState = buildShardStates(nodes, shard);

      int numberOfAllocationsFound = 0;
      long highestVersion = -1;
      Set<DiscoveryNode> nodesWithHighestVersion = Sets.newHashSet();
      // walk the open-addressing map through its backing arrays; only slots flagged in
      // "allocated" hold an entry
      final boolean[] states = nodesState.allocated;
      final Object[] keys = nodesState.keys;
      final long[] values = nodesState.values;
      for (int i = 0; i < states.length; i++) {
        if (!states[i]) {
          continue;
        }

        DiscoveryNode node = (DiscoveryNode) keys[i];
        long version = values[i];
        // since we don't check in NO allocation, we need to double check here
        if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
          continue;
        }
        if (version != -1) {
          numberOfAllocationsFound++;
          if (highestVersion == -1) {
            nodesWithHighestVersion.add(node);
            highestVersion = version;
          } else {
            if (version > highestVersion) {
              nodesWithHighestVersion.clear();
              nodesWithHighestVersion.add(node);
              highestVersion = version;
            } else if (version == highestVersion) {
              nodesWithHighestVersion.add(node);
            }
          }
        }
      }

      // check if the counts meets the minimum set
      int requiredAllocation = 1;
      // if we restore from a repository one copy is more then enough
      if (shard.restoreSource() == null) {
        // declared outside the try so the catch block can report the value that was actually
        // resolved for this index, not just the node-level default held in the field
        String resolvedInitialShards = this.initialShards;
        try {
          IndexMetaData indexMetaData = routingNodes.metaData().index(shard.index());
          resolvedInitialShards =
              indexMetaData
                  .settings()
                  .get(
                      INDEX_RECOVERY_INITIAL_SHARDS,
                      settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
          if ("quorum".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
            }
          } else if ("quorum-1".equals(resolvedInitialShards)
              || "half".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 2) {
              requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
            }
          } else if ("one".equals(resolvedInitialShards)) {
            requiredAllocation = 1;
          } else if ("full".equals(resolvedInitialShards) || "all".equals(resolvedInitialShards)) {
            requiredAllocation = indexMetaData.numberOfReplicas() + 1;
          } else if ("full-1".equals(resolvedInitialShards)
              || "all-1".equals(resolvedInitialShards)) {
            if (indexMetaData.numberOfReplicas() > 1) {
              requiredAllocation = indexMetaData.numberOfReplicas();
            }
          } else {
            // anything else must be an explicit number of copies
            requiredAllocation = Integer.parseInt(resolvedInitialShards);
          }
        } catch (Exception e) {
          // requiredAllocation keeps its default of 1 in this case
          logger.warn(
              "[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}",
              shard.index(),
              shard.id(),
              resolvedInitialShards,
              shard);
        }
      }

      // not enough found for this shard, continue...
      if (numberOfAllocationsFound < requiredAllocation) {
        // if we are restoring this shard we still can allocate
        if (shard.restoreSource() == null) {
          // we can't really allocate, so ignore it and continue
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]",
                shard.index(),
                shard.id(),
                numberOfAllocationsFound,
                requiredAllocation);
          }
        } else if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: missing local data, will restore from [{}]",
              shard.index(),
              shard.id(),
              shard.restoreSource());
        }
        continue;
      }

      Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
      Set<DiscoveryNode> noNodes = Sets.newHashSet();
      for (DiscoveryNode discoNode : nodesWithHighestVersion) {
        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          throttledNodes.add(discoNode);
        } else if (decision.type() == Decision.Type.NO) {
          noNodes.add(discoNode);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();

          // found a node, so no throttling, no "no", and break out of the loop
          throttledNodes.clear();
          noNodes.clear();
          break;
        }
      }
      if (throttledNodes.isEmpty()) {
        // if we have a node that we "can't" allocate to, force allocation, since this is our master
        // data!
        if (!noNodes.isEmpty()) {
          DiscoveryNode discoNode = noNodes.iterator().next();
          // non-null: only nodes with a routing entry were added to noNodes above
          RoutingNode node = routingNodes.node(discoNode.id());
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: forcing allocating [{}] to [{}] on primary allocation",
                shard.index(),
                shard.id(),
                shard,
                discoNode);
          }
          // we found a match
          changed = true;
          // make sure we create one with the version from the recovered state
          allocation
              .routingNodes()
              .assign(new MutableShardRouting(shard, highestVersion), node.nodeId());
          unassignedIterator.remove();
        }
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
              shard.index(),
              shard.id(),
              shard,
              throttledNodes);
        }
        // we are throttling this, but we have enough to allocate to this node, ignore it for now
        unassignedIterator.remove();
        routingNodes.ignoredUnassigned().add(shard);
      }
    }

    if (!routingNodes.hasUnassigned()) {
      return changed;
    }

    // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was
    // allocated on
    unassignedIterator = routingNodes.unassigned().iterator();
    while (unassignedIterator.hasNext()) {
      MutableShardRouting shard = unassignedIterator.next();

      // pre-check if it can be allocated to any node that currently exists, so we won't list the
      // store for it for nothing
      boolean canBeAllocatedToAtLeastOneNode = false;
      for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
        RoutingNode node = routingNodes.node(cursor.value.id());
        if (node == null) {
          continue;
        }
        // if we can't allocate it on a node, ignore it, for example, this handles
        // cases for only allocating a replica after a primary
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.YES) {
          canBeAllocatedToAtLeastOneNode = true;
          break;
        }
      }

      if (!canBeAllocatedToAtLeastOneNode) {
        continue;
      }

      Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
          buildShardStores(nodes, shard);

      // best candidate so far: the node sharing the most bytes with the primary's store
      long lastSizeMatched = 0;
      DiscoveryNode lastDiscoNodeMatched = null;
      RoutingNode lastNodeMatched = null;

      for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>
          nodeStoreEntry : shardStores.entrySet()) {
        DiscoveryNode discoNode = nodeStoreEntry.getKey();
        TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData =
            nodeStoreEntry.getValue();
        logger.trace("{}: checking node [{}]", shard, discoNode);

        if (storeFilesMetaData == null) {
          // already allocated on that node...
          continue;
        }

        RoutingNode node = routingNodes.node(discoNode.id());
        if (node == null) {
          continue;
        }

        // check if we can allocate on that node...
        // we only check for NO, since if this node is THROTTLING and it has enough "same data"
        // then we will try and assign it next time
        Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
        if (decision.type() == Decision.Type.NO) {
          continue;
        }

        // if it is already allocated, we can't assign to it...
        if (storeFilesMetaData.allocated()) {
          continue;
        }

        if (!shard.primary()) {
          MutableShardRouting primaryShard = routingNodes.activePrimary(shard);
          if (primaryShard != null) {
            assert primaryShard.active();
            DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
            if (primaryNode != null) {
              TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore =
                  shardStores.get(primaryNode);
              if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                long sizeMatched = 0;

                // sum up the bytes of files that exist on the primary and are the same there
                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                  if (primaryNodeStore.fileExists(storeFileMetaData.name())
                      && primaryNodeStore
                          .file(storeFileMetaData.name())
                          .isSame(storeFileMetaData)) {
                    sizeMatched += storeFileMetaData.length();
                  }
                }
                logger.trace(
                    "{}: node [{}] has [{}/{}] bytes of re-usable data",
                    shard,
                    discoNode.name(),
                    new ByteSizeValue(sizeMatched),
                    sizeMatched);
                if (sizeMatched > lastSizeMatched) {
                  lastSizeMatched = sizeMatched;
                  lastDiscoNodeMatched = discoNode;
                  lastNodeMatched = node;
                }
              }
            }
          }
        }
      }

      if (lastNodeMatched != null) {
        // we only check on THROTTLE since we checked before before on NO
        Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we are throttling this, but we have enough to allocate to this node, ignore it for now
          unassignedIterator.remove();
          routingNodes.ignoredUnassigned().add(shard);
        } else {
          if (logger.isDebugEnabled()) {
            logger.debug(
                "[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]",
                shard.index(),
                shard.id(),
                shard,
                lastDiscoNodeMatched,
                new ByteSizeValue(lastSizeMatched));
          }
          // we found a match
          changed = true;
          allocation.routingNodes().assign(shard, lastNodeMatched.nodeId());
          unassignedIterator.remove();
        }
      }
    }
    return changed;
  }