private NodeState buildLocalNodeState(NodeState body) {
    if (body == null) {
        body = new NodeState();
    }
    body.id = getHost().getId();
    body.status = NodeStatus.SYNCHRONIZING;
    body.groupReference = UriUtils.buildPublicUri(getHost(), getSelfLink());
    body.documentSelfLink = UriUtils.buildUriPath(getSelfLink(), body.id);
    body.documentKind = Utils.buildKind(NodeState.class);
    body.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    return body;
}
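/*
 * Illustrative usage (a sketch, not from the original source): a POST handler
 * that normalizes an incoming NodeState before completing. The handler name
 * and flow are assumptions; only buildLocalNodeState above is real.
 */
public void handlePostExample(Operation post) {
    NodeState body = post.hasBody() ? post.getBody(NodeState.class) : null;
    // buildLocalNodeState tolerates a null body; it stamps the id, status,
    // self link, kind and update time for this host's entry
    NodeState localEntry = buildLocalNodeState(body);
    post.setBody(localEntry).complete();
}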
@Before
public void setUp() throws Exception {
    try {
        this.baseAccountId = Utils.getNowMicrosUtc();
        this.host.setTransactionService(null);
        if (this.host.getServiceStage(SimpleTransactionFactoryService.SELF_LINK) == null) {
            this.host.startServiceAndWait(
                    SimpleTransactionFactoryService.class,
                    SimpleTransactionFactoryService.SELF_LINK);
            this.host.startServiceAndWait(
                    BankAccountFactoryService.class,
                    BankAccountFactoryService.SELF_LINK);
        }
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
/**
 * Merges current node group state with state that came through a PATCH.
 *
 * <p>PATCH requests are sent from:
 *
 * <ul>
 * <li>the local service to itself, after it has communicated with a peer, during maintenance
 * <li>a remote peer when it is probing this local service, during its maintenance cycle
 * </ul>
 *
 * <p>The key invariants that must not be violated, guaranteeing forward evolution of state even
 * if nodes only talk to a small portion of their peers:
 *
 * <ul>
 * <li>When a status changes, the change is accepted only if the remote version is higher
 * <li>A local node is the only node that can change its own node entry status, for a PATCH that
 * it receives
 * <li>A node must never increment the version of a node entry for other nodes, unless that node
 * entry is marked UNAVAILABLE
 * <li>When a status changes during gossip, the version must be incremented
 * <li>Versions always move forward
 * </ul>
 */
private void mergeRemoteAndLocalMembership(
        NodeGroupState localState,
        NodeGroupState remotePeerState,
        EnumSet<NodeGroupChange> changes) {
    if (localState == null) {
        return;
    }

    boolean isSelfPatch = remotePeerState.documentOwner.equals(getHost().getId());
    long now = Utils.getNowMicrosUtc();

    NodeState selfEntry = localState.nodes.get(getHost().getId());

    for (NodeState remoteNodeEntry : remotePeerState.nodes.values()) {
        NodeState l = localState.nodes.get(remoteNodeEntry.id);
        boolean isLocalNode = remoteNodeEntry.id.equals(getHost().getId());

        if (!isSelfPatch && isLocalNode) {
            if (remoteNodeEntry.status != l.status) {
                logWarning(
                        "Peer %s is reporting us as %s, current status: %s",
                        remotePeerState.documentOwner, remoteNodeEntry.status, l.status);
                if (remoteNodeEntry.documentVersion > l.documentVersion) {
                    // adopt the higher remote version to re-assert we are alive and well
                    l.documentVersion = remoteNodeEntry.documentVersion;
                    l.documentUpdateTimeMicros = now;
                    changes.add(NodeGroupChange.SELF_CHANGE);
                }
            }
            // the local instance of the node group service is the only one that can
            // update its own status
            continue;
        }

        if (l == null) {
            boolean hasExpired =
                    remoteNodeEntry.documentExpirationTimeMicros > 0
                            && remoteNodeEntry.documentExpirationTimeMicros < now;
            if (hasExpired || NodeState.isUnAvailable(remoteNodeEntry)) {
                continue;
            }
            if (!isLocalNode) {
                logInfo(
                        "Adding new peer %s (%s), status %s",
                        remoteNodeEntry.id,
                        remoteNodeEntry.groupReference,
                        remoteNodeEntry.status);
            }
            // we found a new peer through the gossip PATCH; add it to our state
            localState.nodes.put(remoteNodeEntry.id, remoteNodeEntry);
            changes.add(NodeGroupChange.PEER_ADDED);
            continue;
        }

        boolean needsUpdate = l.status != remoteNodeEntry.status;
        if (needsUpdate) {
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
        }

        if (isSelfPatch && isLocalNode && needsUpdate) {
            // we sent a self PATCH to update our status; move our version forward
            remoteNodeEntry.documentVersion =
                    Math.max(remoteNodeEntry.documentVersion, l.documentVersion) + 1;
        }

        // versions move forward only, ignore stale nodes
        if (remoteNodeEntry.documentVersion < l.documentVersion) {
            logInfo(
                    "v:%d - q:%d, v:%d - q:%d , %s - %s (local:%s %d)",
                    l.documentVersion,
                    l.membershipQuorum,
                    remoteNodeEntry.documentVersion,
                    remoteNodeEntry.membershipQuorum,
                    l.id,
                    remotePeerState.documentOwner,
                    getHost().getId(),
                    selfEntry.documentVersion);
            continue;
        }

        if (remoteNodeEntry.documentVersion == l.documentVersion && needsUpdate) {
            // pick the update with the most recent time, even if that is prone to
            // drift and jitter between nodes
            if (remoteNodeEntry.documentUpdateTimeMicros < l.documentUpdateTimeMicros) {
                logWarning(
                        "Ignoring update for %s from peer %s. Local status: %s, remote status: %s",
                        remoteNodeEntry.id,
                        remotePeerState.documentOwner,
                        l.status,
                        remoteNodeEntry.status);
                continue;
            }
        }

        if (remoteNodeEntry.status == NodeStatus.UNAVAILABLE
                && l.documentExpirationTimeMicros == 0
                && remoteNodeEntry.documentExpirationTimeMicros == 0) {
            remoteNodeEntry.documentExpirationTimeMicros =
                    Utils.getNowMicrosUtc() + localState.config.nodeRemovalDelayMicros;
            logInfo(
                    "Set expiration at %d for unavailable node %s(%s)",
                    remoteNodeEntry.documentExpirationTimeMicros,
                    remoteNodeEntry.id,
                    remoteNodeEntry.groupReference);
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
            needsUpdate = true;
        }

        if (remoteNodeEntry.status == NodeStatus.UNAVAILABLE && needsUpdate) {
            // nodes increment their own entry version, except if they are unavailable
            remoteNodeEntry.documentVersion++;
        }

        localState.nodes.put(remoteNodeEntry.id, remoteNodeEntry);
    }

    List<String> missingNodes = new ArrayList<>();
    for (NodeState l : localState.nodes.values()) {
        NodeState r = remotePeerState.nodes.get(l.id);
        if (!NodeState.isUnAvailable(l) || l.id.equals(getHost().getId())) {
            continue;
        }

        long expirationMicros = l.documentExpirationTimeMicros;
        if (r != null) {
            expirationMicros =
                    Math.max(l.documentExpirationTimeMicros, r.documentExpirationTimeMicros);
        }

        if (expirationMicros > 0 && now > expirationMicros) {
            changes.add(NodeGroupChange.PEER_STATUS_CHANGE);
            logInfo("Removing expired unavailable node %s(%s)", l.id, l.groupReference);
            missingNodes.add(l.id);
        }
    }

    for (String id : missingNodes) {
        localState.nodes.remove(id);
    }

    boolean isModified = !changes.isEmpty();
    localState.membershipUpdateTimeMicros =
            Math.max(
                    remotePeerState.membershipUpdateTimeMicros,
                    isModified ? now : localState.membershipUpdateTimeMicros);
    if (isModified) {
        logInfo(
                "State updated, merge with %s, self %s, %d",
                remotePeerState.documentOwner,
                localState.documentOwner,
                localState.membershipUpdateTimeMicros);
    }
}
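/*
 * The version/time ordering applied by the merge above when statuses differ,
 * distilled into a pure helper (an illustrative sketch, not code from the
 * service): a remote entry may replace the local one only if its version is
 * strictly higher, or equal with a more recent update time.
 */
private static boolean remoteEntryWinsExample(NodeState local, NodeState remote) {
    if (remote.documentVersion != local.documentVersion) {
        // versions only move forward; stale (lower-version) entries lose
        return remote.documentVersion > local.documentVersion;
    }
    // tie-break on update time, accepting clock drift and jitter between nodes
    return remote.documentUpdateTimeMicros >= local.documentUpdateTimeMicros;
}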
public void handleGossipPatchCompletion(
        Operation maint,
        Operation patch,
        Throwable e,
        NodeGroupState localState,
        NodeGroupState patchBody,
        AtomicInteger remaining,
        NodeState remotePeer) {
    try {
        if (patch == null) {
            return;
        }

        long updateTime = localState.membershipUpdateTimeMicros;

        if (e != null) {
            if (remotePeer.status != NodeStatus.UNAVAILABLE) {
                updateTime = Utils.getNowMicrosUtc();
                remotePeer.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
                remotePeer.documentVersion++;
            }
            remotePeer.status = NodeStatus.UNAVAILABLE;
        } else {
            NodeGroupState peerState = getBody(patch);
            if (peerState.documentOwner.equals(remotePeer.id)) {
                NodeState remotePeerStateFromRsp = peerState.nodes.get(remotePeer.id);
                if (remotePeerStateFromRsp != null
                        && remotePeerStateFromRsp.documentVersion > remotePeer.documentVersion) {
                    remotePeer = remotePeerStateFromRsp;
                }
            } else if (remotePeer.status != NodeStatus.REPLACED) {
                logWarning(
                        "Peer address %s has changed to id %s from %s",
                        patch.getUri(), peerState.documentOwner, remotePeer.id);
                remotePeer.status = NodeStatus.REPLACED;
                remotePeer.documentVersion++;
                updateTime = Utils.getNowMicrosUtc();
            }
            updateTime = Math.max(updateTime, peerState.membershipUpdateTimeMicros);
        }

        synchronized (patchBody) {
            patchBody.nodes.put(remotePeer.id, remotePeer);
            patchBody.membershipUpdateTimeMicros =
                    Math.max(updateTime, patchBody.membershipUpdateTimeMicros);
        }
    } finally {
        int r = remaining.decrementAndGet();
        if (r != 0) {
            return;
        }

        // to merge updated state, issue a self PATCH. It contains NodeState entries
        // for every peer node we just talked to
        sendRequest(Operation.createPatch(getUri()).setBody(patchBody));
        maint.complete();
    }
}
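/*
 * The completion-join pattern used above, in isolation (an illustrative
 * sketch; the method name and PATCH target are assumptions): N parallel
 * operations share an AtomicInteger, and only the completion that decrements
 * it to zero performs the final step, exactly once.
 */
private void joinCompletionsExample(int peerCount, Operation parent) {
    AtomicInteger remaining = new AtomicInteger(peerCount);
    CompletionHandler joinAll = (o, e) -> {
        // record the per-peer result here, under synchronization if state is shared
        if (remaining.decrementAndGet() == 0) {
            // last responder: publish the merged result and finish the parent
            parent.complete();
        }
    };
    for (int i = 0; i < peerCount; i++) {
        // placeholder target; each probe completes through the shared handler
        sendRequest(Operation.createPatch(getUri())
                .setBody(new NodeGroupState())
                .setCompletion(joinAll));
    }
}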
private void performGroupMaintenance(Operation maint, Operation get, Throwable getEx) {
    // we ignore any body associated with the maintenance request
    if (getEx != null) {
        logWarning("Failure getting state: %s", getEx.toString());
        maint.complete();
        return;
    }

    if (!get.hasBody()) {
        maint.complete();
        return;
    }

    NodeGroupState localState = get.getBody(NodeGroupState.class);

    if (localState == null || localState.nodes == null) {
        maint.complete();
        return;
    }

    if (localState.nodes.size() <= 1) {
        maint.complete();
        return;
    }

    if (getHost().isStopping()) {
        maint.complete();
        return;
    }

    // probe a fixed, random selection of our peers, giving them our view of the group
    // and getting back theirs

    // probe log10 of the group size (excluding self)
    int peersToProbe = (int) Math.log10(localState.nodes.size() - 1);
    // probe at least N peers
    peersToProbe = Math.max(peersToProbe, MIN_PEER_GOSSIP_COUNT);
    // probe at most the total number of peers
    peersToProbe = Math.min(localState.nodes.size() - 1, peersToProbe);

    AtomicInteger remaining = new AtomicInteger(peersToProbe);
    NodeState[] randomizedPeers = shuffleGroupMembers(localState);
    NodeState localNode = localState.nodes.get(getHost().getId());
    localNode.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localNode.groupReference = UriUtils.buildPublicUri(getHost(), getSelfLink());
    localState.documentOwner = getHost().getId();

    NodeGroupState patchBody = new NodeGroupState();
    patchBody.documentOwner = getHost().getId();
    patchBody.documentUpdateTimeMicros = Utils.getNowMicrosUtc();

    int probeCount = 0;
    for (NodeState peer : randomizedPeers) {
        if (peer == null) {
            continue;
        }

        if (peer.id.equals(getHost().getId())) {
            continue;
        }

        NodeState remotePeer = peer;
        URI peerUri = peer.groupReference;

        // send a gossip PATCH to the peer, with our state. This performs a health
        // check against N randomly selected peers:
        // 1) we issue a PATCH to a peer, with the body set to our view of the group
        // 2a) if the peer is healthy, it merges our state with its own and returns
        // the merged state in the response. We then update our state and mark the
        // peer AVAILABLE. We just update the peer node; we don't currently merge
        // their state
        // 2b) if the PATCH failed, we mark the peer UNAVAILABLE
        CompletionHandler ch =
                (o, e) ->
                        handleGossipPatchCompletion(
                                maint, o, e, localState, patchBody, remaining, remotePeer);
        Operation patch =
                Operation.createPatch(peerUri)
                        .setBody(localState)
                        .setRetryCount(0)
                        .setExpiration(
                                Utils.getNowMicrosUtc()
                                        + getHost().getOperationTimeoutMicros() / 2)
                        .forceRemote()
                        .setCompletion(ch);

        if (peer.groupReference.equals(localNode.groupReference)
                && peer.status != NodeStatus.REPLACED) {
            // we just detected a peer node that used to listen on our address but is
            // obviously no longer around. Mark it REPLACED and do not send the PATCH
            peer.status = NodeStatus.REPLACED;
            peer.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
            peer.documentVersion++;
            ch.handle(null, null);
        } else {
            sendRequest(patch);
        }

        // only probe N peers
        if (++probeCount >= peersToProbe) {
            break;
        }
    }

    if (probeCount == 0) {
        maint.complete();
    }
}
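/*
 * The gossip fan-out computed above, as a standalone helper (an illustrative
 * sketch for groups of at least two nodes; MIN_PEER_GOSSIP_COUNT is assumed
 * to be a small constant, 10 here). The log10 term only dominates for very
 * large groups, so in practice the floor sets the fan-out: for example,
 * 3 nodes -> 2 peers, 100 nodes -> 10 peers.
 */
private static int gossipFanOutExample(int groupSize) {
    int peersToProbe = (int) Math.log10(groupSize - 1); // e.g. 100 nodes -> 1
    peersToProbe = Math.max(peersToProbe, 10);          // at least MIN_PEER_GOSSIP_COUNT
    return Math.min(groupSize - 1, peersToProbe);       // never more than all peers
}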
private void handleJoinPost(
        JoinPeerRequest joinBody,
        Operation joinOp,
        NodeGroupState localState,
        NodeGroupState remotePeerState) {
    if (UriUtils.isHostEqual(getHost(), joinBody.memberGroupReference)) {
        logInfo("Skipping self join");
        // we tried joining ourselves; abort
        joinOp.complete();
        return;
    }

    NodeState self = localState.nodes.get(getHost().getId());

    if (joinOp != null) {
        self.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
        self.documentVersion++;

        // at a minimum we need 2 nodes to synch: self plus the node we are joining
        self.synchQuorum = 2;
        if (joinBody.synchQuorum != null) {
            self.synchQuorum = Math.max(self.synchQuorum, joinBody.synchQuorum);
        }

        if (joinBody.localNodeOptions != null) {
            if (!validateNodeOptions(joinOp, joinBody.localNodeOptions)) {
                return;
            }
            self.options = joinBody.localNodeOptions;
        }

        localState.membershipUpdateTimeMicros = self.documentUpdateTimeMicros;

        // complete the join POST, continue with state merge
        joinOp.complete();
    }

    // this method makes two passes:
    // first pass: get the remote peer state
    // second pass: merge it and insert self
    if (remotePeerState == null) {
        // Pass 1: get the existing member state
        sendRequest(
                Operation.createGet(joinBody.memberGroupReference)
                        .setCompletion(
                                (o, e) -> {
                                    if (e != null) {
                                        logWarning(
                                                "Failure getting peer %s state:%s",
                                                o.getUri(), e.toString());
                                        return;
                                    }
                                    NodeGroupState remoteState = getBody(o);
                                    handleJoinPost(joinBody, null, localState, remoteState);
                                }));
        return;
    }

    // Pass 2: merge the remote group state with ours, then send self to the peer
    sendRequest(Operation.createPatch(getUri()).setBody(remotePeerState));

    logInfo(
            "Synch quorum: %d. Sending POST to insert self (%s) to peer %s",
            self.synchQuorum, self.groupReference, joinBody.memberGroupReference);

    Operation insertSelfToPeer =
            Operation.createPost(joinBody.memberGroupReference)
                    .setBody(self)
                    .setCompletion(
                            (o, e) -> {
                                if (e != null) {
                                    logSevere("Insert POST to %s failed", o.getUri());
                                    return;
                                }
                                // we will restart services to synchronize with peers on
                                // the next maintenance interval, once group membership
                                // is stable
                            });
    sendRequest(insertSelfToPeer);
}
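/*
 * Illustrative client-side join (a sketch; only the JoinPeerRequest fields
 * used above are assumed to exist, and the service's POST dispatch may also
 * require a kind field): a host joins an existing group by POSTing a
 * JoinPeerRequest to its own node group service, which then runs the
 * two-pass flow above.
 */
private void joinGroupExample(URI peerGroupUri) {
    JoinPeerRequest join = new JoinPeerRequest();
    join.memberGroupReference = peerGroupUri; // the group we want to join
    join.synchQuorum = 2;                     // self plus the node being joined
    sendRequest(Operation.createPost(getUri()).setBody(join));
}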
private void handleUpdateQuorumPatch(Operation patch, NodeGroupState localState) {
    UpdateQuorumRequest bd = patch.getBody(UpdateQuorumRequest.class);
    NodeState self = localState.nodes.get(getHost().getId());

    logInfo("Updating self quorum from %d. Body: %s",
            self.membershipQuorum, Utils.toJsonHtml(bd));

    if (bd.membershipQuorum != null) {
        self.membershipQuorum = bd.membershipQuorum;
    }
    if (bd.synchQuorum != null) {
        self.synchQuorum = bd.synchQuorum;
    }

    self.documentVersion++;
    self.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localState.membershipUpdateTimeMicros = self.documentUpdateTimeMicros;

    if (!bd.isGroupUpdate) {
        patch.setBodyNoCloning(localState).complete();
        return;
    }

    // TODO use a three-phase consensus algorithm to update quorum, similar
    // to the steady state replication consensus.

    // Issue N requests to update quorum to all members of the group. If they
    // do not all succeed, the operation fails and some peers are left with a
    // quorum level different from the others. That is acceptable: the
    // replication logic can reject a peer if its quorum level is not set at
    // the same level as the owner's, and the client of this request can also
    // retry.
    bd.isGroupUpdate = false;

    int failureThreshold = (localState.nodes.size() - 1) / 2;
    AtomicInteger pending = new AtomicInteger(localState.nodes.size());
    AtomicInteger failures = new AtomicInteger();
    CompletionHandler c =
            (o, e) -> {
                if (e != null) {
                    logWarning("Node %s failed quorum update: %s", o.getUri(), e.toString());
                    failures.incrementAndGet();
                }
                int p = pending.decrementAndGet();
                if (p != 0) {
                    return;
                }
                if (failures.get() > failureThreshold) {
                    patch.fail(new IllegalStateException("Majority of nodes failed request"));
                } else {
                    patch.setBodyNoCloning(localState).complete();
                }
            };

    for (NodeState node : localState.nodes.values()) {
        if (!NodeState.isAvailable(node, getHost().getId(), true)) {
            c.handle(null, null);
            continue;
        }
        if (bd.membershipQuorum != null) {
            node.membershipQuorum = bd.membershipQuorum;
        }
        if (bd.synchQuorum != null) {
            node.synchQuorum = bd.synchQuorum;
        }
        node.documentVersion++;
        node.documentUpdateTimeMicros = Utils.getNowMicrosUtc();

        Operation p = Operation.createPatch(node.groupReference).setBody(bd).setCompletion(c);
        sendRequest(p);
    }
}
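/*
 * Illustrative arithmetic for the failure threshold above (a sketch, not
 * code from the service): the group-wide quorum update tolerates failures
 * from a minority of peers and fails only when strictly more than this
 * many peers fail the PATCH.
 */
private static int quorumUpdateFailureThresholdExample(int groupSize) {
    // integer division: groupSize=3 -> 1, groupSize=4 -> 1, groupSize=5 -> 2
    return (groupSize - 1) / 2;
}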