private void sendFailShard(ShardRouting shardRouting, String indexUUID, String message, @Nullable Throwable failure) {
    try {
        logger.warn("[{}] marking and sending shard failed due to [{}]", failure, shardRouting.shardId(), message);
        failedShards.put(shardRouting.shardId(), new FailedShard(shardRouting.version()));
        shardStateAction.shardFailed(shardRouting, indexUUID,
                "shard failure [" + message + "]" + (failure == null ? "" : "[" + detailedMessage(failure) + "]"));
    } catch (Throwable e1) {
        logger.warn("[{}][{}] failed to mark shard as failed (because of [{}])", e1,
                shardRouting.getIndex(), shardRouting.getId(), message);
    }
}
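// A minimal sketch of the FailedShard entry recorded above, inferred from how
// cleanFailedShards reads it (failedShard.version / failedShard.timestamp): it pins
// the routing version at failure time and stamps the wall-clock time used by the
// 60-minute timeout check below. The real class may differ; this is illustrative only.
private static class FailedShard {
    public final long version;
    public final long timestamp;

    FailedShard(long version) {
        this.version = version;
        this.timestamp = System.currentTimeMillis();
    }
}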
private void cleanFailedShards(final ClusterChangedEvent event) {
    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
            event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        failedShards.clear();
        return;
    }
    DiscoveryNodes nodes = event.state().nodes();
    long now = System.currentTimeMillis();
    String localNodeId = nodes.localNodeId();
    Iterator<Map.Entry<ShardId, FailedShard>> iterator = failedShards.entrySet().iterator();
    shards:
    while (iterator.hasNext()) {
        Map.Entry<ShardId, FailedShard> entry = iterator.next();
        FailedShard failedShard = entry.getValue();
        IndexRoutingTable indexRoutingTable = routingTable.index(entry.getKey().getIndex());
        if (indexRoutingTable != null) {
            IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(entry.getKey().id());
            if (shardRoutingTable != null) {
                for (ShardRouting shardRouting : shardRoutingTable.assignedShards()) {
                    if (localNodeId.equals(shardRouting.currentNodeId())) {
                        // the timeout makes sure we don't keep dangling failed shards around
                        // for some reason; it's just another safety layer
                        if (shardRouting.version() == failedShard.version
                                && ((now - failedShard.timestamp) < TimeValue.timeValueMinutes(60).millis())) {
                            // it's the same failed shard - keep it as long as it hasn't timed out
                            continue shards;
                        } else {
                            // different version or expired, remove it
                            break;
                        }
                    }
                }
            }
        }
        iterator.remove();
    }
}
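// The keep/remove test above, extracted into a hypothetical helper for illustration
// (shouldKeepFailedShard is not a method in the original class): an entry survives only
// while the routing version still matches the recorded one and the failure is younger
// than the 60-minute safety timeout.
static boolean shouldKeepFailedShard(long entryVersion, long entryTimestamp,
                                     long routingVersion, long nowMillis) {
    long timeoutMillis = TimeValue.timeValueMinutes(60).millis();
    return routingVersion == entryVersion && (nowMillis - entryTimestamp) < timeoutMillis;
}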
private void applyNewOrUpdatedShards(final ClusterChangedEvent event) {
    if (!indicesService.changesAllowed()) {
        return;
    }
    RoutingTable routingTable = event.state().routingTable();
    RoutingNodes.RoutingNodeIterator routingNode =
            event.state().readOnlyRoutingNodes().routingNodeIter(event.state().nodes().localNodeId());
    if (routingNode == null) {
        failedShards.clear();
        return;
    }
    DiscoveryNodes nodes = event.state().nodes();
    for (final ShardRouting shardRouting : routingNode) {
        final IndexService indexService = indicesService.indexService(shardRouting.index());
        if (indexService == null) {
            // got deleted on us, ignore
            continue;
        }
        final IndexMetaData indexMetaData = event.state().metaData().index(shardRouting.index());
        if (indexMetaData == null) {
            // the index was deleted from the metadata; we will clean it up later in the
            // apply-deleted method call
            continue;
        }
        final int shardId = shardRouting.id();
        if (!indexService.hasShard(shardId) && shardRouting.started()) {
            if (failedShards.containsKey(shardRouting.shardId())) {
                if (nodes.masterNode() != null) {
                    shardStateAction.resendShardFailed(shardRouting, indexMetaData.getIndexUUID(),
                            "master " + nodes.masterNode() + " marked shard as started, but shard has previously failed. resending shard failure.",
                            nodes.masterNode());
                }
            } else {
                // the master thinks we are started, but we don't have this shard at all; mark it as failed
                sendFailShard(shardRouting, indexMetaData.getIndexUUID(),
                        "master [" + nodes.masterNode() + "] marked shard as started, but shard has not been created, mark shard as failed",
                        null);
            }
            continue;
        }
        IndexShard indexShard = indexService.shard(shardId);
        if (indexShard != null) {
            ShardRouting currentRoutingEntry = indexShard.routingEntry();
            // if both the current and the incoming routing entry are initializing but are still not
            // equal, a different "shard" is being allocated here. for example: a shard that recovered
            // from one node and now needs to recover to another node, or a replica that was allocated
            // and is now allocating as a primary because the primary failed on another node
            boolean shardHasBeenRemoved = false;
            if (currentRoutingEntry.initializing() && shardRouting.initializing()
                    && !currentRoutingEntry.equals(shardRouting)) {
                logger.debug("[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])",
                        shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                // closing the shard will also cancel any ongoing recovery.
                indexService.removeShard(shardRouting.id(),
                        "removing shard (different instance of it allocated on this node)");
                shardHasBeenRemoved = true;
            } else if (isPeerRecovery(shardRouting)) {
                final DiscoveryNode sourceNode = findSourceNodeForPeerRecovery(routingTable, nodes, shardRouting);
                // check if there is an ongoing recovery and, if its source node is not the one just
                // computed, cancel the recovery so it can restart from the new source
                final Predicate<RecoveryStatus> shouldCancel = new Predicate<RecoveryStatus>() {
                    @Override
                    public boolean apply(@Nullable RecoveryStatus status) {
                        // guard against the @Nullable parameter before dereferencing it
                        return status != null && status.sourceNode().equals(sourceNode) == false;
                    }
                };
                if (recoveryTarget.cancelRecoveriesForShard(indexShard.shardId(), "recovery source node changed", shouldCancel)) {
                    logger.debug("[{}][{}] removing shard (recovery source changed, current [{}], global [{}])",
                            shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                    // closing the shard will also cancel any ongoing recovery.
                    indexService.removeShard(shardRouting.id(), "removing shard (recovery source node changed)");
                    shardHasBeenRemoved = true;
                }
            }
            if (shardHasBeenRemoved == false
                    && (shardRouting.equals(indexShard.routingEntry()) == false
                            || shardRouting.version() > indexShard.routingEntry().version())) {
                if (shardRouting.primary() && indexShard.routingEntry().primary() == false
                        && shardRouting.initializing() && indexShard.allowsPrimaryPromotion() == false) {
                    logger.debug("{} reinitialize shard on primary promotion", indexShard.shardId());
                    indexService.removeShard(shardId, "promoted to primary");
                } else {
                    // if the shard was removed by id above, we don't need to update the routing entry here!
                    indexShard.updateRoutingEntry(shardRouting,
                            event.state().blocks().disableStatePersistence() == false);
                }
            }
        }
        if (shardRouting.initializing()) {
            applyInitializingShard(event.state(), indexMetaData, shardRouting);
        }
    }
}
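// A hedged sketch of the cancellation criterion used by the shouldCancel Predicate above,
// lifted into a hypothetical helper (not part of the original class) so it could be unit
// tested in isolation: an ongoing recovery is cancelled only when it is pulling from a
// node other than the newly computed source. The anonymous Predicate could simply
// delegate to it: return recoverySourceChanged(status, sourceNode);
static boolean recoverySourceChanged(@Nullable RecoveryStatus status, DiscoveryNode newSource) {
    // a null status has no source to compare against; treat it as unchanged
    return status != null && status.sourceNode().equals(newSource) == false;
}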