private NodeState buildLocalNodeState(NodeState body) {
    if (body == null) {
        body = new NodeState();
    }
    body.id = getHost().getId();
    body.status = NodeStatus.SYNCHRONIZING;
    body.groupReference = UriUtils.buildPublicUri(getHost(), getSelfLink());
    body.documentSelfLink = UriUtils.buildUriPath(getSelfLink(), body.id);
    body.documentKind = Utils.buildKind(NodeState.class);
    body.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    return body;
}
/**
 * A request sent to an enrolled service at the end of this transaction to clear the service's
 * transaction id.
 */
public static class ClearTransactionRequest {
    public static final String KIND = Utils.buildKind(ClearTransactionRequest.class);
    public String kind;
    public TransactionOutcome transactionOutcome;
    public boolean isUpdated;
    public long originalVersion;
}
/** Request for enrolling a service in this transaction */
public static class EnrollRequest {
    public static final String KIND = Utils.buildKind(EnrollRequest.class);
    public String kind = KIND;
    public String serviceSelfLink;
    public Action action;
    public long previousVersion;
}
public static class JoinPeerRequest {
    public static final String KIND = Utils.buildKind(JoinPeerRequest.class);

    public static JoinPeerRequest create(URI peerToJoin, Integer synchQuorum) {
        JoinPeerRequest r = new JoinPeerRequest();
        r.memberGroupReference = peerToJoin;
        r.synchQuorum = synchQuorum;
        r.kind = KIND;
        return r;
    }

    /** Member of the group we wish to join through */
    public URI memberGroupReference;

    /**
     * Optional node join options. If specified, the node state representing the local node
     * will be updated with these options. Further, these options determine join behavior.
     */
    public EnumSet<NodeOption> localNodeOptions;

    /** Minimum number of nodes that must be enumerated, after join, before synchronization starts */
    public Integer synchQuorum;

    public String kind;
}
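A minimal sketch of how a caller might submit this request; the peer address and the `host` variable (a running ServiceHost) are illustrative assumptions, not taken from this code:

// Illustrative only: ask the local default node group to join a peer's group.
URI peerNodeGroup = URI.create("http://peer-host:8000" + ServiceUriPaths.DEFAULT_NODE_GROUP);
JoinPeerRequest joinBody = JoinPeerRequest.create(peerNodeGroup, 2);
host.sendRequest(Operation
    .createPost(UriUtils.buildUri(host, ServiceUriPaths.DEFAULT_NODE_GROUP))
    .setBody(joinBody)
    .setReferer(host.getUri()));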
public static class UpdateQuorumRequest {
    public static final String KIND = Utils.buildKind(UpdateQuorumRequest.class);

    public static UpdateQuorumRequest create(boolean isGroupUpdate) {
        UpdateQuorumRequest r = new UpdateQuorumRequest();
        r.isGroupUpdate = isGroupUpdate;
        r.kind = KIND;
        return r;
    }

    public UpdateQuorumRequest setMembershipQuorum(int count) {
        this.membershipQuorum = count;
        return this;
    }

    public UpdateQuorumRequest setSynchQuorum(int count) {
        this.synchQuorum = count;
        return this;
    }

    public boolean isGroupUpdate;
    public Integer membershipQuorum;
    public Integer synchQuorum;
    public String kind;
}
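A hedged usage sketch of the builder above; the quorum values and the `host` variable are illustrative:

// Illustrative only: raise membership and synch quorum to 3 on a 5-node group;
// isGroupUpdate = true asks the receiving node group service to fan the PATCH
// out to all peers.
UpdateQuorumRequest quorumBody = UpdateQuorumRequest.create(true)
    .setMembershipQuorum(3)
    .setSynchQuorum(3);
host.sendRequest(Operation
    .createPatch(UriUtils.buildUri(host, ServiceUriPaths.DEFAULT_NODE_GROUP))
    .setBody(quorumBody)
    .setReferer(host.getUri()));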
private ResourceAggregateMetric processInput(Operation op) {
    if (!op.hasBody()) {
        throw new IllegalArgumentException("body is required");
    }
    ResourceAggregateMetric state = op.getBody(ResourceAggregateMetric.class);
    Utils.validateState(getStateDescription(), state);
    return state;
}
public static Service createFactory() {
    // workaround for GSON issue https://github.com/google/gson/issues/764
    // We serialize the complex type once, on service creation, to avoid a possible GSON race
    ResourceGroupState st = new ResourceGroupState();
    st.query = QueryTask.Query.Builder.create().addFieldClause("one", "one").build();
    Utils.toJson(st);
    return FactoryService.createIdempotent(ResourceGroupService.class);
}
/**
 * The {@link RoleState} represents a role. A role applies to users contained in its user group,
 * to HTTP verbs in the set of applicable verbs, and to resources in its resource group.
 */
public static class RoleState extends ServiceDocument {
    public static final String KIND = Utils.buildKind(RoleState.class);
    public static final String FIELD_NAME_USER_GROUP_LINK = "userGroupLink";
    public static final String FIELD_NAME_RESOURCE_GROUP_LINK = "resourceGroupLink";

    public String userGroupLink;
    public String resourceGroupLink;
    public Set<Action> verbs;
    public Policy policy;
    public int priority;
}
/** Request for committing or aborting this transaction */
public static class EndTransactionRequest {
    public static final String KIND = Utils.buildKind(EndTransactionRequest.class);

    public enum TransactionOutcome {
        COMMIT, ABORT
    }

    public String kind = KIND;
    public TransactionOutcome transactionOutcome;
}
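A hedged sketch of sending this request; `txServiceUri` and `host` are placeholders for the coordinating transaction service instance URI and a running ServiceHost:

// Illustrative only: commit the transaction by PATCHing its transaction service.
EndTransactionRequest endBody = new EndTransactionRequest();
endBody.transactionOutcome = EndTransactionRequest.TransactionOutcome.COMMIT;
host.sendRequest(Operation.createPatch(txServiceUri)
    .setBody(endBody)
    .setReferer(host.getUri()));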
/**
 * Build a state object that can be used to submit a stage progress self patch.
 *
 * @param stage Supplies the stage that the current service instance is moving to.
 * @param subStage Supplies the sub-stage that the current service instance is moving to.
 * @param e Supplies the exception that the current service instance encountered, if any.
 * @return Returns a patch state object that the current service instance is moving to.
 */
private State buildPatch(TaskState.TaskStage stage, TaskState.SubStage subStage, Throwable e) {
    State s = new State();
    s.taskInfo = new TaskState();
    s.taskInfo.stage = stage;
    s.taskInfo.subStage = subStage;
    if (e != null) {
        s.taskInfo.failure = Utils.toServiceErrorResponse(e);
    }
    return s;
}
private DeploymentService.State getState() {
    if (deploymentLink != null) {
        URI serviceUri = UriUtils.buildUri(xenonHost, deploymentLink);
        Operation getOperation = Operation.createGet(serviceUri)
            .setReferer(this.xenonHost.getUri());
        OperationLatch operationLatch = new OperationLatch(getOperation);
        xenonHost.sendRequest(getOperation);

        Operation completedOperation;
        try {
            completedOperation = operationLatch.awaitOperationCompletion(TimeUnit.SECONDS.toMicros(90));
        } catch (Throwable e) {
            logger.error("SysConfig get failed!! ", e);
            throw new RuntimeException(e);
        }
        return completedOperation.getBody(DeploymentService.State.class);
    } else {
        QueryTask.Query kindClause = new QueryTask.Query()
            .setTermPropertyName(ServiceDocument.FIELD_NAME_KIND)
            .setTermMatchValue(Utils.buildKind(DeploymentService.State.class));

        QueryTask.QuerySpecification querySpecification = new QueryTask.QuerySpecification();
        querySpecification.query = kindClause;

        Operation broadcastOp = xenonHost
            .getCloudStoreHelper()
            .createBroadcastPost(
                ServiceUriPaths.CORE_LOCAL_QUERY_TASKS, ServiceUriPaths.DEFAULT_NODE_SELECTOR)
            .setBody(QueryTask.create(querySpecification).setDirect(true));

        OperationLatch operationLatch = new OperationLatch(broadcastOp);
        xenonHost.sendRequest(broadcastOp);

        Operation completedOperation;
        try {
            completedOperation = operationLatch.awaitOperationCompletion(TimeUnit.SECONDS.toMicros(90));
        } catch (Throwable e) {
            logger.error("SysConfig broadcast query failed!! ", e);
            throw new RuntimeException(e);
        }

        Collection<String> documentLinks =
            QueryTaskUtils.getBroadcastQueryDocumentLinks(completedOperation);
        if (documentLinks.isEmpty()) {
            return null;
        }
        this.deploymentLink = documentLinks.iterator().next();
        return getState();
    }
}
private void assertSwagger(Swagger swagger) {
    assertEquals("/", swagger.getBasePath());

    assertEquals(INFO_DESCRIPTION, swagger.getInfo().getDescription());
    assertEquals(INFO_TERMS_OF_SERVICE, swagger.getInfo().getTermsOfService());

    // excluded prefixes
    assertNull(swagger.getPath(ServiceUriPaths.CORE_AUTHZ_USERS));
    assertNull(swagger.getPath(ServiceUriPaths.CORE_AUTHZ_ROLES));

    assertNotNull(swagger.getPath(ServiceUriPaths.CORE_PROCESSES));
    assertNotNull(swagger.getPath(ServiceUriPaths.CORE_CREDENTIALS));

    Path p = swagger.getPath("/cars");
    assertNotNull(p);
    assertNotNull(p.getPost());
    assertNotNull(p.getGet());

    assertNotNull(swagger.getPath("/cars/template"));
    assertNotNull(swagger.getPath("/cars/available"));
    assertNotNull(swagger.getPath("/cars/config"));
    assertNotNull(swagger.getPath("/cars/stats"));
    assertNotNull(swagger.getPath("/cars/subscriptions"));

    assertNotNull(swagger.getPath("/cars/{id}/template"));
    assertNotNull(swagger.getPath("/cars/{id}/available"));
    assertNotNull(swagger.getPath("/cars/{id}/config"));
    assertNotNull(swagger.getPath("/cars/{id}/stats"));
    assertNotNull(swagger.getPath("/cars/{id}/subscriptions"));

    p = swagger.getPath("/cars/{id}");
    assertNotNull(p);
    assertNull(p.getPost());
    assertNull(p.getPatch());
    assertNotNull(p.getGet());
    assertNotNull(p.getPut());

    p = swagger.getPath("/tokens");
    assertNotNull(p);
    assertNotNull(p.getGet());
    assertNotNull(p.getGet().getResponses());
    assertNotNull(p.getPost());
    assertNotNull(p.getPost().getParameters());
    assertNull(p.getPatch());
    assertNull(p.getDelete());

    Model model = swagger.getDefinitions().get(Utils.buildKind(UserToken.class));
    Map<String, Property> properties = model.getProperties();
    assertNull(properties.get(UserToken.FIELD_NAME_INTERNAL_ID));
}
public static class CheckConvergenceRequest {
    public static final String KIND = Utils.buildKind(CheckConvergenceRequest.class);

    public long membershipUpdateTimeMicros;

    public static CheckConvergenceRequest create(long membershipUpdateTime) {
        CheckConvergenceRequest r = new CheckConvergenceRequest();
        r.membershipUpdateTimeMicros = membershipUpdateTime;
        r.kind = KIND;
        return r;
    }

    public String kind;
}
@Before
public void setUp() throws Exception {
    try {
        this.baseAccountId = Utils.getNowMicrosUtc();
        this.host.setTransactionService(null);
        if (this.host.getServiceStage(SimpleTransactionFactoryService.SELF_LINK) == null) {
            this.host.startServiceAndWait(
                SimpleTransactionFactoryService.class, SimpleTransactionFactoryService.SELF_LINK);
            this.host.startServiceAndWait(
                BankAccountFactoryService.class, BankAccountFactoryService.SELF_LINK);
        }
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
/**
 * Creates datastore documents in cloudstore.
 *
 * <p>This method creates datastore documents with datastore IDs
 * 00000000-0000-0000-0000-000000000000, 00000000-0000-0000-0000-000000000001, and so on.
 *
 * @param cloudstore CloudStore test environment to create documents in.
 * @param numDatastores The number of datastore documents to create.
 */
public static void loadDatastores(TestEnvironment cloudstore, int numDatastores) throws Throwable {
    for (int i = 0; i < numDatastores; i++) {
        DatastoreService.State datastore = new DatastoreService.State();
        String datastoreId = new UUID(0, i).toString();
        datastore.id = datastoreId;
        datastore.name = datastoreId;
        datastore.documentSelfLink = datastoreId;
        datastore.type = "SHARED_VMFS";
        // TODO(mmutsuzaki) Support datastore tags.
        datastore.tags = new HashSet<>();
        Operation result = cloudstore.sendPostAndWait(DatastoreServiceFactory.SELF_LINK, datastore);
        assertThat(result.getStatusCode(), is(200));
        logger.debug("Created a datastore document: {}", Utils.toJson(datastore));
    }
}
/**
 * This method queries for the document link of the cluster configuration for the Kubernetes
 * cluster.
 *
 * @param currentState Supplies the current task state object.
 */
private void queryClusterConfiguration(final KubernetesClusterCreateTask currentState) {
    QueryTask.Query kindClause = new QueryTask.Query()
        .setTermPropertyName(ServiceDocument.FIELD_NAME_KIND)
        .setTermMatchValue(Utils.buildKind(ClusterConfigurationService.State.class));

    QueryTask.Query idClause = new QueryTask.Query()
        .setTermPropertyName(ClusterConfigurationService.State.FIELD_NAME_SELF_LINK)
        .setTermMatchValue(
            ClusterConfigurationServiceFactory.SELF_LINK + "/"
                + ClusterType.KUBERNETES.toString().toLowerCase());

    QueryTask.QuerySpecification querySpecification = new QueryTask.QuerySpecification();
    querySpecification.query.addBooleanClause(kindClause);
    querySpecification.query.addBooleanClause(idClause);
    QueryTask queryTask = QueryTask.create(querySpecification).setDirect(true);

    sendRequest(
        HostUtils.getCloudStoreHelper(this)
            .createBroadcastPost(
                ServiceUriPaths.CORE_LOCAL_QUERY_TASKS, ServiceUriPaths.DEFAULT_NODE_SELECTOR)
            .setBody(queryTask)
            .setCompletion(
                (Operation operation, Throwable throwable) -> {
                    if (null != throwable) {
                        failTask(throwable);
                        return;
                    }

                    NodeGroupBroadcastResponse queryResponse =
                        operation.getBody(NodeGroupBroadcastResponse.class);
                    Set<String> documentLinks =
                        QueryTaskUtils.getBroadcastQueryResults(queryResponse);
                    if (documentLinks.isEmpty()) {
                        failTask(
                            new IllegalStateException(
                                String.format(
                                    "Cannot find cluster configuration for %s",
                                    ClusterType.KUBERNETES.toString())));
                        return;
                    }

                    retrieveClusterConfiguration(currentState, documentLinks.iterator().next());
                }));
}
/**
 * Creates host documents in cloudstore.
 *
 * @param cloudstore CloudStore test environment to create documents in.
 * @param numHosts The number of host documents to create.
 * @param hostConfigurations A map from {@link HostConfiguration} to the probability that this
 *     host configuration is used in the deployment. The sum of all the values of this map must
 *     be 1.
 * @param numDatastores The number of datastores.
 * @param numDatastoresDistribution Distribution for the number of datastores on each host. This
 *     distribution is expected to generate samples in the range [0, numDatastores].
 * @throws Throwable
 */
public static void loadHosts(
    TestEnvironment cloudstore,
    int numHosts,
    Map<HostConfiguration, Double> hostConfigurations,
    int numDatastores,
    IntegerDistribution numDatastoresDistribution)
    throws Throwable {
    int[] indices = new int[hostConfigurations.size()];
    HostConfiguration[] configs = new HostConfiguration[hostConfigurations.size()];
    double[] probabilities = new double[hostConfigurations.size()];
    int i = 0;
    for (Map.Entry<HostConfiguration, Double> entry : hostConfigurations.entrySet()) {
        indices[i] = i;
        configs[i] = entry.getKey();
        probabilities[i] = entry.getValue();
        i++;
    }
    EnumeratedIntegerDistribution configDistribution =
        new EnumeratedIntegerDistribution(indices, probabilities);

    for (i = 0; i < numHosts; i++) {
        HostService.State host = new HostService.State();
        host.hostAddress = "host" + i;
        host.state = HostState.READY;
        host.userName = "******";
        host.password = "******";
        host.reportedDatastores = new HashSet<>();
        int numDatastoresPerHost = numDatastoresDistribution.sample();
        assertThat(numDatastoresPerHost >= 0, is(true));
        assertThat(numDatastoresPerHost <= numDatastores, is(true));
        while (host.reportedDatastores.size() < numDatastoresPerHost) {
            int randomInt = random.nextInt(numDatastores);
            host.reportedDatastores.add(new UUID(0, randomInt).toString());
        }
        host.reportedNetworks = new HashSet<>();
        host.usageTags = new HashSet<>(Arrays.asList(UsageTag.CLOUD.name()));
        int configIndex = configDistribution.sample();
        host.cpuCount = configs[configIndex].numCpus;
        host.memoryMb = configs[configIndex].memoryMb;
        host.documentSelfLink = new UUID(0, i).toString();
        // TODO(mmutsuzaki) Support availability zones.
        Operation result = cloudstore.sendPostAndWait(HostServiceFactory.SELF_LINK, host);
        assertThat(result.getStatusCode(), is(200));
        logger.debug("Created a host document: {}", Utils.toJson(host));
    }
}
/**
 * This method retrieves the container templates of all the containers that are running on this
 * VM.
 *
 * @param currentState Supplies the current state object.
 */
private void queryContainers(final State currentState) {
    QueryTask.Query kindClause = new QueryTask.Query()
        .setTermPropertyName(ServiceDocument.FIELD_NAME_KIND)
        .setTermMatchValue(Utils.buildKind(ContainerService.State.class));

    QueryTask.Query nameClause = new QueryTask.Query()
        .setTermPropertyName(ContainerService.State.FIELD_NAME_VM_SERVICE_LINK)
        .setTermMatchValue(currentState.vmServiceLink);

    QueryTask.QuerySpecification querySpecification = new QueryTask.QuerySpecification();
    querySpecification.query.addBooleanClause(kindClause);
    querySpecification.query.addBooleanClause(nameClause);
    QueryTask queryTask = QueryTask.create(querySpecification).setDirect(true);

    sendRequest(
        Operation.createPost(
                UriUtils.buildBroadcastRequestUri(
                    UriUtils.buildUri(getHost(), ServiceUriPaths.CORE_LOCAL_QUERY_TASKS),
                    ServiceUriPaths.DEFAULT_NODE_SELECTOR))
            .setBody(queryTask)
            .setCompletion(
                new Operation.CompletionHandler() {
                    @Override
                    public void handle(Operation operation, Throwable throwable) {
                        if (null != throwable) {
                            failTask(throwable);
                            return;
                        }

                        try {
                            Collection<String> documentLinks =
                                QueryTaskUtils.getBroadcastQueryDocumentLinks(operation);
                            QueryTaskUtils.logQueryResults(
                                CreateManagementVmTaskService.this, documentLinks);
                            checkState(documentLinks.size() > 0);
                            getContainerTemplates(currentState, documentLinks);
                        } catch (Throwable t) {
                            failTask(t);
                        }
                    }
                }));
}
/**
 * Converts the data returned as ServiceDocumentQueryResult from Xenon to a ResourceList, which
 * is used by api-fe.
 *
 * <p>The order of the data is honored.
 *
 * @param documentType The document class returned by the query.
 * @param queryResult The Xenon query result to convert.
 * @param convert A function converting each document to its api-fe representation.
 * @param <T> The api-fe representation type.
 * @param <S> The Xenon document type.
 * @return A ResourceList of converted documents, with paging links preserved.
 */
public static <T, S> ResourceList<T> xenonQueryResultToResourceList(
    Class<S> documentType, ServiceDocumentQueryResult queryResult, Function<S, T> convert) {
    // The document links stored in documentLinks are sorted while documents are not, so
    // the following loop iterates on documentLinks to preserve that order.
    List<T> documents = new ArrayList<>();
    if (queryResult.documentLinks != null) {
        for (String link : queryResult.documentLinks) {
            documents.add(convert.apply(Utils.fromJson(queryResult.documents.get(link), documentType)));
        }
    }

    ResourceList<T> resourceList = new ResourceList<>();
    resourceList.setItems(documents);
    resourceList.setNextPageLink(queryResult.nextPageLink);
    resourceList.setPreviousPageLink(queryResult.prevPageLink);
    return resourceList;
}
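A brief usage sketch; `VmService.State`, `Vm`, and `vmToApi` are hypothetical names for the caller's document type, API type, and conversion function:

// Illustrative only: map one page of query results to an api-fe resource list.
ResourceList<Vm> vms = xenonQueryResultToResourceList(
    VmService.State.class, queryTask.results, state -> vmToApi(state));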
/**
 * This method queries the list of data stores available in this ESX cloud instance and, on query
 * completion, creates a set of ImageCopyService instances and transitions the current service
 * instance to the AWAIT_COMPLETION sub-state.
 *
 * @param current Supplies the current state object.
 */
protected void handleTriggerCopies(final State current) {
    try {
        Set<Datastore> datastoreSet = getZookeeperHostMonitor().getAllDatastores();
        ServiceUtils.logInfo(this, "All target datastores: %s", Utils.toJson(datastoreSet));
        triggerCopyServices(datastoreSet, current);

        // move to next stage
        if (!current.isSelfProgressionDisabled) {
            State patch = ImageReplicatorService.this.buildPatch(
                TaskState.TaskStage.STARTED, TaskState.SubStage.AWAIT_COMPLETION, null);
            patch.dataStoreCount = datastoreSet.size();
            sendSelfPatch(patch);
        }
    } catch (Exception e) {
        failTask(e);
    }
}
private void assertDescriptorJson(Operation o, Throwable e) {
    if (e != null) {
        e.printStackTrace();
        if (e.getMessage().contains("Unparseable JSON body")) {
            // Ignore failure.
            // Expecting GSON classloading issue to be fixed:
            // - https://github.com/google/gson/issues/764
            // - https://www.pivotaltracker.com/story/show/120885303
            Utils.logWarning("GSON initialization failure: %s", e);
            // Stop assertion logic here; the test will finish as success
            return;
        } else {
            fail(e.getMessage());
        }
    }

    try {
        Swagger swagger = Json.mapper().readValue(o.getBody(String.class), Swagger.class);
        assertSwagger(swagger);
    } catch (IOException ioe) {
        fail(ioe.getMessage());
    }
}
/**
 * Issues updates to peer nodes, after a local update has been accepted. If the service supports
 * OWNER_SELECTION, the replication message is the Propose message in the consensus work flow.
 *
 * @param localState Supplies the local node group state.
 * @param outboundOp Supplies the client operation being replicated.
 * @param req Supplies the select-and-forward request carrying the linked state.
 * @param rsp Supplies the owner selection response listing the selected nodes.
 */
void replicateUpdate(
    NodeGroupState localState,
    Operation outboundOp,
    SelectAndForwardRequest req,
    SelectOwnerResponse rsp) {

    int memberCount = localState.nodes.size();
    NodeState selfNode = localState.nodes.get(getHost().getId());
    AtomicInteger successCount = new AtomicInteger(0);

    if (req.serviceOptions.contains(ServiceOption.OWNER_SELECTION)
        && selfNode.membershipQuorum > memberCount) {
        outboundOp.fail(new IllegalStateException("Not enough peers: " + memberCount));
        return;
    }

    if (memberCount == 1) {
        outboundOp.complete();
        return;
    }

    AtomicInteger failureCount = new AtomicInteger();

    // The eligible count can be less than the member count if the parent node selector has
    // a smaller replication factor than group size. We need to use the replication factor
    // as the upper bound for calculating success and failure thresholds
    int eligibleMemberCount = rsp.selectedNodes.size();

    // When quorum is not required, succeed when we replicate to at least one remote node,
    // or, if only the local node is available, succeed immediately.
    int successThreshold = Math.min(2, eligibleMemberCount - 1);
    int failureThreshold = eligibleMemberCount - successThreshold;

    if (req.serviceOptions.contains(ServiceOption.OWNER_SELECTION)) {
        successThreshold = Math.min(eligibleMemberCount, selfNode.membershipQuorum);
        failureThreshold = eligibleMemberCount - successThreshold;

        if (failureThreshold == successThreshold && successThreshold == 1) {
            // degenerate case: node group has just two members and quorum must be one, which
            // means even if the single remote peer is down, we should still succeed.
            failureThreshold = 0;
        }
    }

    final int successThresholdFinal = successThreshold;
    final int failureThresholdFinal = failureThreshold;

    CompletionHandler c =
        (o, e) -> {
            if (e == null
                && o != null
                && o.getStatusCode() >= Operation.STATUS_CODE_FAILURE_THRESHOLD) {
                e = new IllegalStateException("Request failed: " + o.toString());
            }
            int sCount = successCount.get();
            int fCount = failureCount.get();
            if (e != null) {
                logInfo("Replication to %s failed: %s", o.getUri(), e.toString());
                fCount = failureCount.incrementAndGet();
            } else {
                sCount = successCount.incrementAndGet();
            }

            if (sCount == successThresholdFinal) {
                outboundOp.complete();
                return;
            }

            if (fCount == 0) {
                return;
            }

            if (fCount >= failureThresholdFinal || ((fCount + sCount) == memberCount)) {
                String error = String.format(
                    "%s to %s failed. Success: %d, Fail: %d, quorum: %d, threshold: %d",
                    outboundOp.getAction(),
                    outboundOp.getUri().getPath(),
                    sCount,
                    fCount,
                    selfNode.membershipQuorum,
                    failureThresholdFinal);
                logWarning("%s", error);
                outboundOp.fail(new IllegalStateException(error));
            }
        };

    String jsonBody = Utils.toJson(req.linkedState);

    Operation update = Operation.createPost(null)
        .setAction(outboundOp.getAction())
        .setBodyNoCloning(jsonBody)
        .setCompletion(c)
        .setRetryCount(1)
        .setExpiration(outboundOp.getExpirationMicrosUtc())
        .transferRequestHeadersFrom(outboundOp)
        .removePragmaDirective(Operation.PRAGMA_DIRECTIVE_FORWARDED)
        .addPragmaDirective(Operation.PRAGMA_DIRECTIVE_REPLICATED)
        .setReferer(outboundOp.getReferer());

    if (update.getCookies() != null) {
        update.getCookies().clear();
    }

    ServiceClient cl = getHost().getClient();
    String selfId = getHost().getId();

    // trigger completion once, for self node, since it's part of our accounting
    c.handle(null, null);

    rsp.selectedNodes.forEach(
        (m) -> {
            if (m.id.equals(selfId)) {
                return;
            }
            if (m.options.contains(NodeOption.OBSERVER)) {
                return;
            }

            try {
                URI remotePeerService = new URI(
                    m.groupReference.getScheme(),
                    null,
                    m.groupReference.getHost(),
                    m.groupReference.getPort(),
                    outboundOp.getUri().getPath(),
                    outboundOp.getUri().getQuery(),
                    null);
                update.setUri(remotePeerService);
            } catch (Throwable e1) {
            }

            if (NodeState.isUnAvailable(m)) {
                c.handle(update, new IllegalStateException("node is not available"));
                return;
            }

            cl.send(update);
        });
}
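To make the threshold arithmetic above concrete, a worked example with values chosen purely for illustration:

// eligibleMemberCount = 5, selfNode.membershipQuorum = 3
// With OWNER_SELECTION:
//   successThreshold = min(5, 3) = 3  -> complete once 3 replicas (incl. self) succeed
//   failureThreshold = 5 - 3     = 2  -> fail once 2 replicas have failed
// Without OWNER_SELECTION:
//   successThreshold = min(2, 5 - 1) = 2
//   failureThreshold = 5 - 2         = 3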
private KubernetesClusterCreateTask buildPatch(
    TaskState.TaskStage stage, TaskState.SubStage subStage, @Nullable Throwable t) {
    return buildPatch(stage, subStage, null == t ? null : Utils.toServiceErrorResponse(t));
}
/**
 * This method builds a state object which can be used to submit a stage progress self-patch.
 *
 * @param stage Supplies the stage that the current service instance is moving to.
 * @param e Supplies the exception that the current service instance encountered, if any.
 * @return Returns a patch state object that the current service instance is moving to.
 */
@VisibleForTesting
protected State buildPatch(TaskState.TaskStage stage, @Nullable Throwable e) {
    return buildPatch(stage, (e != null) ? Utils.toServiceErrorResponse(e) : null);
}
private void handleUpdateQuorumPatch(Operation patch, NodeGroupState localState) {
    UpdateQuorumRequest bd = patch.getBody(UpdateQuorumRequest.class);
    NodeState self = localState.nodes.get(getHost().getId());

    logInfo("Updating self quorum from %d. Body: %s", self.membershipQuorum, Utils.toJsonHtml(bd));

    if (bd.membershipQuorum != null) {
        self.membershipQuorum = bd.membershipQuorum;
    }
    if (bd.synchQuorum != null) {
        self.synchQuorum = bd.synchQuorum;
    }
    self.documentVersion++;
    self.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localState.membershipUpdateTimeMicros = self.documentUpdateTimeMicros;

    if (!bd.isGroupUpdate) {
        patch.setBodyNoCloning(localState).complete();
        return;
    }

    // TODO use a three phase consensus algorithm to update quorum similar
    // to the steady state replication consensus.

    // Issue N requests to update quorum to all members of the group. If they do not all
    // succeed, then the operation fails and some peers will be left with a quorum level
    // different than the others. That is acceptable. The replication logic can reject a
    // peer if its quorum level is not set at the same level as the owner. The client of
    // this request can also retry...

    bd.isGroupUpdate = false;

    int failureThreshold = (localState.nodes.size() - 1) / 2;
    AtomicInteger pending = new AtomicInteger(localState.nodes.size());
    AtomicInteger failures = new AtomicInteger();
    CompletionHandler c =
        (o, e) -> {
            if (e != null) {
                logWarning("Node %s failed quorum update: %s", o.getUri(), e.toString());
                failures.incrementAndGet();
            }

            int p = pending.decrementAndGet();
            if (p != 0) {
                return;
            }

            if (failures.get() > failureThreshold) {
                patch.fail(new IllegalStateException("Majority of nodes failed request"));
            } else {
                patch.setBodyNoCloning(localState).complete();
            }
        };

    for (NodeState node : localState.nodes.values()) {
        if (!NodeState.isAvailable(node, getHost().getId(), true)) {
            c.handle(null, null);
            continue;
        }
        if (bd.membershipQuorum != null) {
            node.membershipQuorum = bd.membershipQuorum;
        }
        if (bd.synchQuorum != null) {
            node.synchQuorum = bd.synchQuorum;
        }
        node.documentVersion++;
        node.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
        Operation p = Operation.createPatch(node.groupReference).setBody(bd).setCompletion(c);
        sendRequest(p);
    }
}
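A worked example of the failure threshold above, with an illustrative group size:

// localState.nodes.size() = 5  ->  failureThreshold = (5 - 1) / 2 = 2
// The completion handler fails the PATCH only when failures > 2, so the
// group-wide quorum update tolerates up to two failed peers.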
/**
 * Merges current node group state with state that came through a PATCH.
 *
 * <p>PATCH requests are sent from
 *
 * <p>1) the local service to itself, after it has communicated with a peer, during maintenance.
 *
 * <p>2) a remote peer when it is probing this local service, during its maintenance cycle.
 *
 * <p>The key invariants that should not be violated, guaranteeing forward evolution of state
 * even if nodes only talk to a small portion of their peers:
 *
 * <p>- When a status changes, the change is accepted if the remote version is higher.
 *
 * <p>- A local node is the only node that can change its own node entry status, for a PATCH
 * that it receives.
 *
 * <p>- A node should never increment the version of a node entry, for other nodes, unless that
 * node entry is marked UNAVAILABLE.
 *
 * <p>- When a status changes during gossip, the version must be incremented.
 *
 * <p>- Versions always move forward.
 */
private void mergeRemoteAndLocalMembership(
    NodeGroupState localState,
    NodeGroupState remotePeerState,
    EnumSet<NodeGroupChange> changes) {
    if (localState == null) {
        return;
    }

    boolean isSelfPatch = remotePeerState.documentOwner.equals(getHost().getId());
    long now = Utils.getNowMicrosUtc();

    NodeState selfEntry = localState.nodes.get(getHost().getId());

    for (NodeState remoteNodeEntry : remotePeerState.nodes.values()) {
        NodeState l = localState.nodes.get(remoteNodeEntry.id);
        boolean isLocalNode = remoteNodeEntry.id.equals(getHost().getId());

        if (!isSelfPatch && isLocalNode) {
            if (remoteNodeEntry.status != l.status) {
                logWarning(
                    "Peer %s is reporting us as %s, current status: %s",
                    remotePeerState.documentOwner, remoteNodeEntry.status, l.status);
                if (remoteNodeEntry.documentVersion > l.documentVersion) {
                    // increment local version to re-enforce we are alive and well
                    l.documentVersion = remoteNodeEntry.documentVersion;
                    l.documentUpdateTimeMicros = now;
                    changes.add(NodeGroupChange.SELF_CHANGE);
                }
            }
            // the local instance of the node group service is the only one that can update
            // its own status
            continue;
        }

        if (l == null) {
            boolean hasExpired =
                remoteNodeEntry.documentExpirationTimeMicros > 0
                    && remoteNodeEntry.documentExpirationTimeMicros < now;
            if (hasExpired || NodeState.isUnAvailable(remoteNodeEntry)) {
                continue;
            }

            if (!isLocalNode) {
                logInfo(
                    "Adding new peer %s (%s), status %s",
                    remoteNodeEntry.id, remoteNodeEntry.groupReference, remoteNodeEntry.status);
            }
            // we found a new peer, through the gossip PATCH. Add to our state
            localState.nodes.put(remoteNodeEntry.id, remoteNodeEntry);
            changes.add(NodeGroupChange.PEER_ADDED);
            continue;
        }

        boolean needsUpdate = l.status != remoteNodeEntry.status;
        if (needsUpdate) {
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
        }

        if (isSelfPatch && isLocalNode && needsUpdate) {
            // we sent a self PATCH to update our status. Move our version forward
            remoteNodeEntry.documentVersion =
                Math.max(remoteNodeEntry.documentVersion, l.documentVersion) + 1;
        }

        // versions move forward only, ignore stale nodes
        if (remoteNodeEntry.documentVersion < l.documentVersion) {
            logInfo(
                "v:%d - q:%d, v:%d - q:%d , %s - %s (local:%s %d)",
                l.documentVersion,
                l.membershipQuorum,
                remoteNodeEntry.documentVersion,
                remoteNodeEntry.membershipQuorum,
                l.id,
                remotePeerState.documentOwner,
                getHost().getId(),
                selfEntry.documentVersion);
            continue;
        }

        if (remoteNodeEntry.documentVersion == l.documentVersion && needsUpdate) {
            // pick the update with the most recent time, even if that is prone to drift
            // and jitter between nodes
            if (remoteNodeEntry.documentUpdateTimeMicros < l.documentUpdateTimeMicros) {
                logWarning(
                    "Ignoring update for %s from peer %s. Local status: %s, remote status: %s",
                    remoteNodeEntry.id,
                    remotePeerState.documentOwner,
                    l.status,
                    remoteNodeEntry.status);
                continue;
            }
        }

        if (remoteNodeEntry.status == NodeStatus.UNAVAILABLE
            && l.documentExpirationTimeMicros == 0
            && remoteNodeEntry.documentExpirationTimeMicros == 0) {
            remoteNodeEntry.documentExpirationTimeMicros =
                Utils.getNowMicrosUtc() + localState.config.nodeRemovalDelayMicros;
            logInfo(
                "Set expiration at %d for unavailable node %s(%s)",
                remoteNodeEntry.documentExpirationTimeMicros,
                remoteNodeEntry.id,
                remoteNodeEntry.groupReference);
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
            needsUpdate = true;
        }

        if (remoteNodeEntry.status == NodeStatus.UNAVAILABLE && needsUpdate) {
            // nodes increment their own entry version, except if they are unavailable
            remoteNodeEntry.documentVersion++;
        }

        localState.nodes.put(remoteNodeEntry.id, remoteNodeEntry);
    }

    List<String> missingNodes = new ArrayList<>();
    for (NodeState l : localState.nodes.values()) {
        NodeState r = remotePeerState.nodes.get(l.id);
        if (!NodeState.isUnAvailable(l) || l.id.equals(getHost().getId())) {
            continue;
        }

        long expirationMicros = l.documentExpirationTimeMicros;
        if (r != null) {
            expirationMicros =
                Math.max(l.documentExpirationTimeMicros, r.documentExpirationTimeMicros);
        }

        if (expirationMicros > 0 && now > expirationMicros) {
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
            logInfo("Removing expired unavailable node %s(%s)", l.id, l.groupReference);
            missingNodes.add(l.id);
        }
    }

    for (String id : missingNodes) {
        localState.nodes.remove(id);
    }

    boolean isModified = !changes.isEmpty();
    localState.membershipUpdateTimeMicros =
        Math.max(
            remotePeerState.membershipUpdateTimeMicros,
            isModified ? now : localState.membershipUpdateTimeMicros);
    if (isModified) {
        logInfo(
            "State updated, merge with %s, self %s, %d",
            remotePeerState.documentOwner,
            localState.documentOwner,
            localState.membershipUpdateTimeMicros);
    }
}
public static class BankAccountServiceState extends ServiceDocument {
    static final String KIND = Utils.buildKind(BankAccountServiceState.class);

    public double balance;
}
public void handleGossipPatchCompletion(
    Operation maint,
    Operation patch,
    Throwable e,
    NodeGroupState localState,
    NodeGroupState patchBody,
    AtomicInteger remaining,
    NodeState remotePeer) {

    try {
        if (patch == null) {
            return;
        }

        long updateTime = localState.membershipUpdateTimeMicros;
        if (e != null) {
            updateTime =
                remotePeer.status != NodeStatus.UNAVAILABLE ? Utils.getNowMicrosUtc() : updateTime;

            if (remotePeer.status != NodeStatus.UNAVAILABLE) {
                remotePeer.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
                remotePeer.documentVersion++;
            }
            remotePeer.status = NodeStatus.UNAVAILABLE;
        } else {
            NodeGroupState peerState = getBody(patch);
            if (peerState.documentOwner.equals(remotePeer.id)) {
                NodeState remotePeerStateFromRsp = peerState.nodes.get(remotePeer.id);
                if (remotePeerStateFromRsp.documentVersion > remotePeer.documentVersion) {
                    remotePeer = remotePeerStateFromRsp;
                }
            } else if (remotePeer.status != NodeStatus.REPLACED) {
                logWarning(
                    "Peer address %s has changed to id %s from %s",
                    patch.getUri(), peerState.documentOwner, remotePeer.id);
                remotePeer.status = NodeStatus.REPLACED;
                remotePeer.documentVersion++;
                updateTime = Utils.getNowMicrosUtc();
            }
            updateTime = Math.max(updateTime, peerState.membershipUpdateTimeMicros);
        }

        synchronized (patchBody) {
            patchBody.nodes.put(remotePeer.id, remotePeer);
            patchBody.membershipUpdateTimeMicros =
                Math.max(updateTime, patchBody.membershipUpdateTimeMicros);
        }
    } finally {
        int r = remaining.decrementAndGet();
        if (r != 0) {
            return;
        }

        // to merge updated state, issue a self PATCH. It contains NodeState entries for every
        // peer node we just talked to
        sendRequest(Operation.createPatch(getUri()).setBody(patchBody));

        maint.complete();
    }
}
private void performGroupMaintenance(Operation maint, Operation get, Throwable getEx) {
    // we ignore any body associated with the PUT

    if (getEx != null) {
        logWarning("Failure getting state: %s", getEx.toString());
        maint.complete();
        return;
    }

    if (!get.hasBody()) {
        maint.complete();
        return;
    }

    NodeGroupState localState = get.getBody(NodeGroupState.class);

    if (localState == null || localState.nodes == null) {
        maint.complete();
        return;
    }

    if (localState.nodes.size() <= 1) {
        maint.complete();
        return;
    }

    if (getHost().isStopping()) {
        maint.complete();
        return;
    }

    // probe a fixed, random selection of our peers, giving them our view of the group and
    // getting back theirs

    // probe log10 of the peer count (excluding self)
    int peersToProbe = (int) Math.log10(localState.nodes.size() - 1);
    // probe at least N peers
    peersToProbe = Math.max(peersToProbe, MIN_PEER_GOSSIP_COUNT);
    // probe at most the total number of peers
    peersToProbe = Math.min(localState.nodes.size() - 1, peersToProbe);

    AtomicInteger remaining = new AtomicInteger(peersToProbe);
    NodeState[] randomizedPeers = shuffleGroupMembers(localState);
    NodeState localNode = localState.nodes.get(getHost().getId());
    localNode.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localNode.groupReference = UriUtils.buildPublicUri(getHost(), getSelfLink());
    localState.documentOwner = getHost().getId();

    NodeGroupState patchBody = new NodeGroupState();
    patchBody.documentOwner = getHost().getId();
    patchBody.documentUpdateTimeMicros = Utils.getNowMicrosUtc();

    int probeCount = 0;
    for (NodeState peer : randomizedPeers) {
        if (peer == null) {
            continue;
        }

        if (peer.id.equals(getHost().getId())) {
            continue;
        }

        NodeState remotePeer = peer;
        URI peerUri = peer.groupReference;
        // send a gossip PATCH to the peer, with our state

        // perform a health check on N randomly selected peers
        // 1) We issue a PATCH to a peer, with the body set to our view of the group
        // 2a) if the peer is healthy, they will merge our state with theirs and return
        // the merged state in the response. We will then update our state and mark the
        // peer AVAILABLE. We just update the peer node; we don't currently merge their state
        // 2b) if the PATCH failed, we mark the peer UNAVAILABLE
        CompletionHandler ch =
            (o, e) ->
                handleGossipPatchCompletion(
                    maint, o, e, localState, patchBody, remaining, remotePeer);
        Operation patch =
            Operation.createPatch(peerUri)
                .setBody(localState)
                .setRetryCount(0)
                .setExpiration(Utils.getNowMicrosUtc() + getHost().getOperationTimeoutMicros() / 2)
                .forceRemote()
                .setCompletion(ch);

        if (peer.groupReference.equals(localNode.groupReference)
            && peer.status != NodeStatus.REPLACED) {
            // If we just detected this is a peer node that used to listen on our address,
            // but it's obviously no longer around, mark it as REPLACED and do not send a PATCH
            peer.status = NodeStatus.REPLACED;
            peer.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
            peer.documentVersion++;
            ch.handle(null, null);
        } else {
            sendRequest(patch);
        }

        // only probe N peers
        if (++probeCount >= peersToProbe) {
            break;
        }
    }

    if (probeCount == 0) {
        maint.complete();
    }
}
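A worked example of the probe-count arithmetic above (the group size is illustrative; the actual value of MIN_PEER_GOSSIP_COUNT is defined elsewhere in this service):

// localState.nodes.size() = 101  ->  100 peers excluding self
// (int) Math.log10(100) = 2 candidate probes, raised to MIN_PEER_GOSSIP_COUNT
// if that constant is larger, then capped at the 100 available peers.
// Small groups therefore end up gossiping with every peer on each cycle.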
private void handleJoinPost(
    JoinPeerRequest joinBody,
    Operation joinOp,
    NodeGroupState localState,
    NodeGroupState remotePeerState) {

    if (UriUtils.isHostEqual(getHost(), joinBody.memberGroupReference)) {
        logInfo("Skipping self join");
        // we tried joining ourselves; abort
        joinOp.complete();
        return;
    }

    NodeState self = localState.nodes.get(getHost().getId());

    if (joinOp != null) {
        self.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
        self.documentVersion++;

        // at a minimum we need 2 nodes to synch: self plus the node we are joining
        self.synchQuorum = 2;
        if (joinBody.synchQuorum != null) {
            self.synchQuorum = Math.max(self.synchQuorum, joinBody.synchQuorum);
        }

        if (joinBody.localNodeOptions != null) {
            if (!validateNodeOptions(joinOp, joinBody.localNodeOptions)) {
                return;
            }
            self.options = joinBody.localNodeOptions;
        }

        localState.membershipUpdateTimeMicros = self.documentUpdateTimeMicros;

        // complete the join POST, continue with state merge
        joinOp.complete();
    }

    // this method executes in two passes:
    // first pass: get the remote peer state
    // second pass: insert self
    if (remotePeerState == null) {
        // Pass 1, get existing member state
        sendRequest(
            Operation.createGet(joinBody.memberGroupReference)
                .setCompletion(
                    (o, e) -> {
                        if (e != null) {
                            logWarning(
                                "Failure getting peer %s state: %s", o.getUri(), e.toString());
                            return;
                        }

                        NodeGroupState remoteState = getBody(o);
                        handleJoinPost(joinBody, null, localState, remoteState);
                    }));
        return;
    }

    // Pass 2, merge remote group state with ours, send self to peer
    sendRequest(Operation.createPatch(getUri()).setBody(remotePeerState));

    logInfo(
        "Synch quorum: %d. Sending POST to insert self (%s) to peer %s",
        self.synchQuorum, self.groupReference, joinBody.memberGroupReference);

    Operation insertSelfToPeer =
        Operation.createPost(joinBody.memberGroupReference)
            .setBody(self)
            .setCompletion(
                (o, e) -> {
                    if (e != null) {
                        logSevere("Insert POST to %s failed", o.getUri());
                        return;
                    }
                    // we will restart services to synchronize with peers on the next
                    // maintenance interval with a stable group membership
                });
    sendRequest(insertSelfToPeer);
}