/**
 * Tries to lock all local shards for the given index. If any of the shard locks can't be acquired
 * a {@link LockObtainFailedException} is thrown and all previously acquired locks are released.
 *
 * @param index the index to lock shards for
 * @param settings the settings of the index, used to determine the number of shards
 * @param lockTimeoutMS how long to wait for acquiring the index's shard locks
 * @return the {@link ShardLock} instances for this index
 * @throws IOException if an IOException occurs
 */
public List<ShardLock> lockAllForIndex(Index index, @IndexSettings Settings settings, long lockTimeoutMS)
    throws IOException {
  final Integer numShards = settings.getAsInt(IndexMetaData.SETTING_NUMBER_OF_SHARDS, null);
  if (numShards == null || numShards <= 0) {
    throw new IllegalArgumentException("settings must contain a non-null, positive number of shards");
  }
  logger.trace("locking all shards for index {} - [{}]", index, numShards);
  List<ShardLock> allLocks = new ArrayList<>(numShards);
  boolean success = false;
  long startTimeNS = System.nanoTime();
  try {
    for (int i = 0; i < numShards; i++) {
      long timeoutLeftMS =
          Math.max(0, lockTimeoutMS - TimeValue.nsecToMSec(System.nanoTime() - startTimeNS));
      allLocks.add(shardLock(new ShardId(index, i), timeoutLeftMS));
    }
    success = true;
  } finally {
    if (success == false) {
      logger.trace("unable to lock all shards for index {}", index);
      IOUtils.closeWhileHandlingException(allLocks);
    }
  }
  return allLocks;
}
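/*
 * A usage sketch (not from the original source): take every shard lock for an index before
 * doing work that needs exclusive access, and always release the locks afterwards. The method
 * name and the 5 second timeout are illustrative assumptions; ShardLock is Closeable, so the
 * whole list can be released with IOUtils, exactly as the failure path above does.
 */
void withAllShardsLocked(Index index, Settings indexSettings) throws IOException {
  List<ShardLock> locks = lockAllForIndex(index, indexSettings, TimeValue.timeValueSeconds(5).millis());
  try {
    // ... work that requires exclusive access to every local shard of the index ...
  } finally {
    IOUtils.closeWhileHandlingException(locks); // closing a ShardLock releases it
  }
}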
<T> void runTasksForExecutor(ClusterStateTaskExecutor<T> executor) {
  final ArrayList<UpdateTask<T>> toExecute = new ArrayList<>();
  final Map<String, ArrayList<T>> processTasksBySource = new HashMap<>();
  synchronized (updateTasksPerExecutor) {
    List<UpdateTask> pending = updateTasksPerExecutor.remove(executor);
    if (pending != null) {
      for (UpdateTask<T> task : pending) {
        if (task.processed.getAndSet(true) == false) {
          logger.trace("will process {}", task.toString(executor));
          toExecute.add(task);
          processTasksBySource.computeIfAbsent(task.source, s -> new ArrayList<>()).add(task.task);
        } else {
          logger.trace("skipping {}, already processed", task.toString(executor));
        }
      }
    }
  }
  if (toExecute.isEmpty()) {
    return;
  }
  final String tasksSummary =
      processTasksBySource
          .entrySet()
          .stream()
          .map(
              entry -> {
                String tasks = executor.describeTasks(entry.getValue());
                return tasks.isEmpty() ? entry.getKey() : entry.getKey() + "[" + tasks + "]";
              })
          .reduce((s1, s2) -> s1 + ", " + s2)
          .orElse("");

  if (!lifecycle.started()) {
    logger.debug("processing [{}]: ignoring, cluster_service not started", tasksSummary);
    return;
  }
  logger.debug("processing [{}]: execute", tasksSummary);
  ClusterState previousClusterState = clusterState;
  if (!previousClusterState.nodes().isLocalNodeElectedMaster() && executor.runOnlyOnMaster()) {
    logger.debug("failing [{}]: local node is no longer master", tasksSummary);
    toExecute.forEach(task -> task.listener.onNoLongerMaster(task.source));
    return;
  }
  ClusterStateTaskExecutor.BatchResult<T> batchResult;
  long startTimeNS = currentTimeInNanos();
  try {
    List<T> inputs = toExecute.stream().map(updateTask -> updateTask.task).collect(Collectors.toList());
    batchResult = executor.execute(previousClusterState, inputs);
  } catch (Exception e) {
    TimeValue executionTime =
        TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
    if (logger.isTraceEnabled()) {
      logger.trace(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "failed to execute cluster state update in [{}], state:\nversion [{}], source [{}]\n{}{}{}",
                      executionTime,
                      previousClusterState.version(),
                      tasksSummary,
                      previousClusterState.nodes().prettyPrint(),
                      previousClusterState.routingTable().prettyPrint(),
                      previousClusterState.getRoutingNodes().prettyPrint()),
          e);
    }
    warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    batchResult =
        ClusterStateTaskExecutor.BatchResult.<T>builder()
            .failures(toExecute.stream().map(updateTask -> updateTask.task)::iterator, e)
            .build(previousClusterState);
  }

  assert batchResult.executionResults != null;
  assert batchResult.executionResults.size() == toExecute.size()
      : String.format(
          Locale.ROOT,
          "expected [%d] task result%s but was [%d]",
          toExecute.size(),
          toExecute.size() == 1 ? "" : "s",
          batchResult.executionResults.size());
  boolean assertsEnabled = false;
  assert (assertsEnabled = true);
  if (assertsEnabled) {
    for (UpdateTask<T> updateTask : toExecute) {
      assert batchResult.executionResults.containsKey(updateTask.task)
          : "missing task result for " + updateTask.toString(executor);
    }
  }

  ClusterState newClusterState = batchResult.resultingState;
  final ArrayList<UpdateTask<T>> processedListeners = new ArrayList<>();
  // fail all tasks that have failed and extract those that are waiting for results
  for (UpdateTask<T> updateTask : toExecute) {
    assert batchResult.executionResults.containsKey(updateTask.task)
        : "missing " + updateTask.toString(executor);
    final ClusterStateTaskExecutor.TaskResult executionResult =
        batchResult.executionResults.get(updateTask.task);
    executionResult.handle(
        () -> processedListeners.add(updateTask),
        ex -> {
          logger.debug(
              (Supplier<?>)
                  () ->
                      new ParameterizedMessage(
                          "cluster state update task {} failed", updateTask.toString(executor)),
              ex);
          updateTask.listener.onFailure(updateTask.source, ex);
        });
  }

  if (previousClusterState == newClusterState) {
    for (UpdateTask<T> task : processedListeners) {
      if (task.listener instanceof AckedClusterStateTaskListener) {
        // no need to wait for ack if nothing changed, the update can be counted as acknowledged
        ((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
      }
      task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
    }
    TimeValue executionTime =
        TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
    logger.debug("processing [{}]: took [{}] no change in cluster_state", tasksSummary, executionTime);
    warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
    return;
  }

  try {
    ArrayList<Discovery.AckListener> ackListeners = new ArrayList<>();
    if (newClusterState.nodes().isLocalNodeElectedMaster()) {
      // only the master controls the version numbers
      Builder builder = ClusterState.builder(newClusterState).incrementVersion();
      if (previousClusterState.routingTable() != newClusterState.routingTable()) {
        builder.routingTable(
            RoutingTable.builder(newClusterState.routingTable())
                .version(newClusterState.routingTable().version() + 1)
                .build());
      }
      if (previousClusterState.metaData() != newClusterState.metaData()) {
        builder.metaData(
            MetaData.builder(newClusterState.metaData())
                .version(newClusterState.metaData().version() + 1));
      }
      newClusterState = builder.build();
      for (UpdateTask<T> task : processedListeners) {
        if (task.listener instanceof AckedClusterStateTaskListener) {
          final AckedClusterStateTaskListener ackedListener =
              (AckedClusterStateTaskListener) task.listener;
          if (ackedListener.ackTimeout() == null || ackedListener.ackTimeout().millis() == 0) {
            ackedListener.onAckTimeout();
          } else {
            try {
              ackListeners.add(
                  new AckCountDownListener(
                      ackedListener, newClusterState.version(), newClusterState.nodes(), threadPool));
            } catch (EsRejectedExecutionException ex) {
              if (logger.isDebugEnabled()) {
                logger.debug("Couldn't schedule timeout thread - node might be shutting down", ex);
              }
              // time out straight away, otherwise we could wait forever as the timeout thread
              // has not started
              ackedListener.onAckTimeout();
            }
          }
        }
      }
    }

    final Discovery.AckListener ackListener = new DelegetingAckListener(ackListeners);

    newClusterState.status(ClusterState.ClusterStateStatus.BEING_APPLIED);

    if (logger.isTraceEnabled()) {
      logger.trace("cluster state updated, source [{}]\n{}", tasksSummary, newClusterState.prettyPrint());
    } else if (logger.isDebugEnabled()) {
      logger.debug(
          "cluster state updated, version [{}], source [{}]", newClusterState.version(), tasksSummary);
    }

    ClusterChangedEvent clusterChangedEvent =
        new ClusterChangedEvent(tasksSummary, newClusterState, previousClusterState);
    // new cluster state, notify all listeners
    final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
    if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
      String summary = nodesDelta.shortSummary();
      if (summary.length() > 0) {
        logger.info("{}, reason: {}", summary, tasksSummary);
      }
    }

    nodeConnectionsService.connectToAddedNodes(clusterChangedEvent);

    // if we are the master, publish the new state to all nodes
    // we publish here before we send a notification to all the listeners, since if it fails
    // we don't want to notify
    if (newClusterState.nodes().isLocalNodeElectedMaster()) {
      logger.debug("publishing cluster state version [{}]", newClusterState.version());
      try {
        clusterStatePublisher.accept(clusterChangedEvent, ackListener);
      } catch (Discovery.FailedToCommitClusterStateException t) {
        final long version = newClusterState.version();
        logger.warn(
            (Supplier<?>)
                () ->
                    new ParameterizedMessage(
                        "failing [{}]: failed to commit cluster state version [{}]", tasksSummary, version),
            t);
        processedListeners.forEach(task -> task.listener.onFailure(task.source, t));
        return;
      }
    }

    // update the current cluster state
    clusterState = newClusterState;
    logger.debug("set local cluster state to version {}", newClusterState.version());
    try {
      // nothing to do until we actually recover from the gateway or any other block indicates
      // that we need to disable state persistence
      if (clusterChangedEvent.state().blocks().disableStatePersistence() == false
          && clusterChangedEvent.metaDataChanged()) {
        final Settings incomingSettings = clusterChangedEvent.state().metaData().settings();
        clusterSettings.applySettings(incomingSettings);
      }
    } catch (Exception ex) {
      logger.warn("failed to apply cluster settings", ex);
    }
    for (ClusterStateListener listener : preAppliedListeners) {
      try {
        listener.clusterChanged(clusterChangedEvent);
      } catch (Exception ex) {
        logger.warn("failed to notify ClusterStateListener", ex);
      }
    }

    nodeConnectionsService.disconnectFromRemovedNodes(clusterChangedEvent);

    newClusterState.status(ClusterState.ClusterStateStatus.APPLIED);

    for (ClusterStateListener listener : postAppliedListeners) {
      try {
        listener.clusterChanged(clusterChangedEvent);
      } catch (Exception ex) {
        logger.warn("failed to notify ClusterStateListener", ex);
      }
    }

    // manual ack only from the master at the end of the publish
    if (newClusterState.nodes().isLocalNodeElectedMaster()) {
      try {
        ackListener.onNodeAck(newClusterState.nodes().getLocalNode(), null);
      } catch (Exception e) {
        final DiscoveryNode localNode = newClusterState.nodes().getLocalNode();
        logger.debug(
            (Supplier<?>)
                () -> new ParameterizedMessage("error while processing ack for master node [{}]", localNode),
            e);
      }
    }

    for (UpdateTask<T> task : processedListeners) {
      task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState);
    }

    try {
      executor.clusterStatePublished(clusterChangedEvent);
    } catch (Exception e) {
      logger.error(
          (Supplier<?>)
              () ->
                  new ParameterizedMessage(
                      "exception thrown while notifying executor of new cluster state publication [{}]",
                      tasksSummary),
          e);
    }

    TimeValue executionTime =
        TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
    logger.debug(
        "processing [{}]: took [{}] done applying updated cluster_state (version: {}, uuid: {})",
        tasksSummary,
        executionTime,
        newClusterState.version(),
        newClusterState.stateUUID());
    warnAboutSlowTaskIfNeeded(executionTime, tasksSummary);
  } catch (Exception e) {
    TimeValue executionTime =
        TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS)));
    final long version = newClusterState.version();
    final String stateUUID = newClusterState.stateUUID();
    final String prettyPrint = newClusterState.prettyPrint();
    logger.warn(
        (Supplier<?>)
            () ->
                new ParameterizedMessage(
                    "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}",
                    executionTime,
                    version,
                    stateUUID,
                    tasksSummary,
                    prettyPrint),
        e);
    // TODO: do we want to call updateTask.onFailure here?
  }
}
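/*
 * A minimal executor sketch (not from the original source) showing the contract that
 * runTasksForExecutor drives: execute() receives the batched task payloads and must return a
 * BatchResult that accounts for every task. Only the execute(), runOnlyOnMaster(),
 * describeTasks() and clusterStatePublished() calls are confirmed by the code above; the
 * successes(...) builder method is assumed by analogy with the failures(...) call in the catch
 * block, and the class name and exact parameter types are illustrative.
 */
class LoggingNoopExecutor implements ClusterStateTaskExecutor<String> {

  @Override
  public ClusterStateTaskExecutor.BatchResult<String> execute(ClusterState currentState, List<String> tasks) {
    // Mark every task in the batch as successful and return the state unchanged;
    // runTasksForExecutor then takes the "no change in cluster_state" fast path above.
    return ClusterStateTaskExecutor.BatchResult.<String>builder().successes(tasks).build(currentState);
  }

  @Override
  public boolean runOnlyOnMaster() {
    // on a non-master node the whole batch is failed via onNoLongerMaster, as seen above
    return true;
  }

  @Override
  public String describeTasks(List<String> tasks) {
    // contributes to the "source[task, task, ...]" summary used in the log messages above
    return String.join(", ", tasks);
  }

  @Override
  public void clusterStatePublished(ClusterChangedEvent clusterChangedEvent) {
    // invoked after the new cluster state has been published and applied
  }
}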
/**
 * Prepares an update request by converting it into an index request, a delete request, or an
 * update response (no action).
 */
@SuppressWarnings("unchecked")
protected Result prepare(ShardId shardId, UpdateRequest request, final GetResult getResult) {
  long getDateNS = System.nanoTime();
  if (!getResult.isExists()) {
    if (request.upsertRequest() == null && !request.docAsUpsert()) {
      throw new DocumentMissingException(shardId, request.type(), request.id());
    }
    IndexRequest indexRequest = request.docAsUpsert() ? request.doc() : request.upsertRequest();
    TimeValue ttl = indexRequest.ttl();
    if (request.scriptedUpsert() && request.script() != null) {
      // Run the script to perform the create logic
      IndexRequest upsert = request.upsertRequest();
      Map<String, Object> upsertDoc = upsert.sourceAsMap();
      Map<String, Object> ctx = new HashMap<>(2);
      // Tell the script that this is a create and not an update
      ctx.put("op", "create");
      ctx.put("_source", upsertDoc);
      ctx = executeScript(request.script, ctx);

      // Allow the script to set TTL using ctx._ttl
      if (ttl == null) {
        ttl = getTTLFromScriptContext(ctx);
      }

      // Allow the script to abort the create by setting "op" to "none"
      String scriptOpChoice = (String) ctx.get("op");

      // The only valid options for an upsert script are "create" (the default) or "none",
      // which aborts the upsert
      if (!"create".equals(scriptOpChoice)) {
        if (!"none".equals(scriptOpChoice)) {
          logger.warn(
              "Used upsert operation [{}] for script [{}], doing nothing...",
              scriptOpChoice,
              request.script.getScript());
        }
        UpdateResponse update =
            new UpdateResponse(
                shardId, getResult.getType(), getResult.getId(), getResult.getVersion(), false);
        update.setGetResult(getResult);
        return new Result(update, Operation.NONE, upsertDoc, XContentType.JSON);
      }
      indexRequest.source((Map) ctx.get("_source"));
    }

    indexRequest
        .index(request.index())
        .type(request.type())
        .id(request.id())
        // it has to be a "create"!
        .create(true)
        .ttl(ttl)
        .refresh(request.refresh())
        .routing(request.routing())
        .parent(request.parent())
        .consistencyLevel(request.consistencyLevel());
    if (request.versionType() != VersionType.INTERNAL) {
      // in all but the internal versioning mode, we want to create the new document using the
      // given version
      indexRequest.version(request.version()).versionType(request.versionType());
    }
    return new Result(indexRequest, Operation.UPSERT, null, null);
  }

  long updateVersion = getResult.getVersion();

  if (request.versionType() != VersionType.INTERNAL) {
    assert request.versionType() == VersionType.FORCE;
    updateVersion = request.version(); // remember, match_any is excluded by the conflict test
  }

  if (getResult.internalSourceRef() == null) {
    // no source, there is nothing we can do, so throw a failure
    throw new DocumentSourceMissingException(shardId, request.type(), request.id());
  }

  Tuple<XContentType, Map<String, Object>> sourceAndContent =
      XContentHelper.convertToMap(getResult.internalSourceRef(), true);
  String operation = null;
  String timestamp = null;
  TimeValue ttl = null;
  final Map<String, Object> updatedSourceAsMap;
  final XContentType updateSourceContentType = sourceAndContent.v1();
  String routing =
      getResult.getFields().containsKey(RoutingFieldMapper.NAME)
          ? getResult.field(RoutingFieldMapper.NAME).getValue().toString()
          : null;
  String parent =
      getResult.getFields().containsKey(ParentFieldMapper.NAME)
          ? getResult.field(ParentFieldMapper.NAME).getValue().toString()
          : null;

  if (request.script() == null && request.doc() != null) {
    IndexRequest indexRequest = request.doc();
    updatedSourceAsMap = sourceAndContent.v2();
    if (indexRequest.ttl() != null) {
      ttl = indexRequest.ttl();
    }
    timestamp = indexRequest.timestamp();
    if (indexRequest.routing() != null) {
      routing = indexRequest.routing();
    }
    if (indexRequest.parent() != null) {
      parent = indexRequest.parent();
    }
    boolean noop =
        !XContentHelper.update(updatedSourceAsMap, indexRequest.sourceAsMap(), request.detectNoop());
    // noop could still be true even if detectNoop isn't, because update() treats empty maps as
    // noops. BUT we can only actually turn the update into a noop if detectNoop is true, to
    // preserve backwards compatibility and to handle cases where users are repopulating
    // multi-fields or adding synonyms, etc.
    if (request.detectNoop() && noop) {
      operation = "none";
    }
  } else {
    // apply the script to update the source
    Map<String, Object> ctx = new HashMap<>(16);
    Long originalTtl =
        getResult.getFields().containsKey(TTLFieldMapper.NAME)
            ? (Long) getResult.field(TTLFieldMapper.NAME).getValue()
            : null;
    Long originalTimestamp =
        getResult.getFields().containsKey(TimestampFieldMapper.NAME)
            ? (Long) getResult.field(TimestampFieldMapper.NAME).getValue()
            : null;
    ctx.put("_index", getResult.getIndex());
    ctx.put("_type", getResult.getType());
    ctx.put("_id", getResult.getId());
    ctx.put("_version", getResult.getVersion());
    ctx.put("_routing", routing);
    ctx.put("_parent", parent);
    ctx.put("_timestamp", originalTimestamp);
    ctx.put("_ttl", originalTtl);
    ctx.put("_source", sourceAndContent.v2());

    ctx = executeScript(request.script, ctx);

    operation = (String) ctx.get("op");

    Object fetchedTimestamp = ctx.get("_timestamp");
    if (fetchedTimestamp != null) {
      timestamp = fetchedTimestamp.toString();
    } else if (originalTimestamp != null) {
      // No timestamp has been given in the update script, so we keep the previous timestamp
      // if there is one
      timestamp = originalTimestamp.toString();
    }

    ttl = getTTLFromScriptContext(ctx);

    updatedSourceAsMap = (Map<String, Object>) ctx.get("_source");
  }

  // No TTL has been given in the update script, so we keep the previous TTL value if there is one
  if (ttl == null) {
    Long ttlAsLong =
        getResult.getFields().containsKey(TTLFieldMapper.NAME)
            ? (Long) getResult.field(TTLFieldMapper.NAME).getValue()
            : null;
    if (ttlAsLong != null) {
      // an approximation of the exact remaining TTL value; could be improved
      ttl = new TimeValue(ttlAsLong - TimeValue.nsecToMSec(System.nanoTime() - getDateNS));
    }
  }

  if (operation == null || "index".equals(operation)) {
    final IndexRequest indexRequest =
        Requests.indexRequest(request.index())
            .type(request.type())
            .id(request.id())
            .routing(routing)
            .parent(parent)
            .source(updatedSourceAsMap, updateSourceContentType)
            .version(updateVersion)
            .versionType(request.versionType())
            .consistencyLevel(request.consistencyLevel())
            .timestamp(timestamp)
            .ttl(ttl)
            .refresh(request.refresh());
    return new Result(indexRequest, Operation.INDEX, updatedSourceAsMap, updateSourceContentType);
  } else if ("delete".equals(operation)) {
    DeleteRequest deleteRequest =
        Requests.deleteRequest(request.index())
            .type(request.type())
            .id(request.id())
            .routing(routing)
            .parent(parent)
            .version(updateVersion)
            .versionType(request.versionType())
            .consistencyLevel(request.consistencyLevel());
    return new Result(deleteRequest, Operation.DELETE, updatedSourceAsMap, updateSourceContentType);
  } else if ("none".equals(operation)) {
    UpdateResponse update =
        new UpdateResponse(
            shardId, getResult.getType(), getResult.getId(), getResult.getVersion(), false);
    update.setGetResult(
        extractGetResult(
            request,
            request.index(),
            getResult.getVersion(),
            updatedSourceAsMap,
            updateSourceContentType,
            getResult.internalSourceRef()));
    return new Result(update, Operation.NONE, updatedSourceAsMap, updateSourceContentType);
  } else {
    logger.warn(
        "Used update operation [{}] for script [{}], doing nothing...",
        operation,
        request.script.getScript());
    UpdateResponse update =
        new UpdateResponse(
            shardId, getResult.getType(), getResult.getId(), getResult.getVersion(), false);
    return new Result(update, Operation.NONE, updatedSourceAsMap, updateSourceContentType);
  }
}
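/*
 * A client-side sketch (not from the original source) of an update request that drives the
 * scripted-upsert branch of prepare() above: the single script runs against an existing
 * document's ctx._source, and, because scriptedUpsert is true, also against the upsert
 * document with ctx.op preset to "create". The index/type/field names and the plain
 * new Script(String) constructor are illustrative assumptions tied to this API generation.
 */
UpdateRequest scriptedCounterUpdate() {
  Map<String, Object> upsertDoc = new HashMap<>();
  upsertDoc.put("counter", 0);
  return new UpdateRequest("orders", "order", "1")
      // a script may also set ctx.op to "none" or "delete", which prepare() maps to
      // Operation.NONE or a DeleteRequest respectively
      .script(new Script("ctx._source.counter += 1"))
      .upsert(new IndexRequest("orders", "order", "1").source(upsertDoc))
      .scriptedUpsert(true);
}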