private void handleUpdateQuorumPatch(Operation patch,
        NodeGroupState localState) {
    UpdateQuorumRequest bd = patch.getBody(UpdateQuorumRequest.class);
    NodeState self = localState.nodes.get(getHost().getId());
    logInfo("Updating self quorum from %d. Body: %s",
            self.membershipQuorum, Utils.toJsonHtml(bd));

    if (bd.membershipQuorum != null) {
        self.membershipQuorum = bd.membershipQuorum;
    }

    if (bd.synchQuorum != null) {
        self.synchQuorum = bd.synchQuorum;
    }

    self.documentVersion++;
    self.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localState.membershipUpdateTimeMicros = self.documentUpdateTimeMicros;

    if (!bd.isGroupUpdate) {
        patch.setBodyNoCloning(localState).complete();
        return;
    }

    // TODO use a three phase consensus algorithm to update quorum, similar
    // to the steady state replication consensus.

    // Issue N requests to update quorum to all members of the group. If they
    // do not all succeed, the operation fails and some peers will be left with
    // a quorum level different than the others. That is acceptable. The
    // replication logic can reject a peer if its quorum level is not set at
    // the same level as the owner. The client of this request can also retry.

    bd.isGroupUpdate = false;

    int failureThreshold = (localState.nodes.size() - 1) / 2;
    AtomicInteger pending = new AtomicInteger(localState.nodes.size());
    AtomicInteger failures = new AtomicInteger();

    CompletionHandler c = (o, e) -> {
        if (e != null) {
            logWarning("Node %s failed quorum update: %s", o.getUri(), e.toString());
            failures.incrementAndGet();
        }

        int p = pending.decrementAndGet();
        if (p != 0) {
            return;
        }

        if (failures.get() > failureThreshold) {
            patch.fail(new IllegalStateException("Majority of nodes failed request"));
        } else {
            patch.setBodyNoCloning(localState).complete();
        }
    };

    for (NodeState node : localState.nodes.values()) {
        if (!NodeState.isAvailable(node, getHost().getId(), true)) {
            c.handle(null, null);
            continue;
        }

        if (bd.membershipQuorum != null) {
            node.membershipQuorum = bd.membershipQuorum;
        }

        if (bd.synchQuorum != null) {
            node.synchQuorum = bd.synchQuorum;
        }

        node.documentVersion++;
        node.documentUpdateTimeMicros = Utils.getNowMicrosUtc();

        Operation p = Operation
                .createPatch(node.groupReference)
                .setBody(bd)
                .setCompletion(c);
        sendRequest(p);
    }
}
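/*
 * Illustrative sketch (not part of the original source): how a client host might trigger the
 * group-wide quorum update handled above. The UpdateQuorumRequest fields mirror the handler;
 * direct instantiation of the request, the default node group path, and the referer/send
 * plumbing are assumptions.
 */
private void exampleQuorumUpdate(ServiceHost host, int newQuorum) {
    UpdateQuorumRequest body = new UpdateQuorumRequest();
    body.membershipQuorum = newQuorum; // applied to self, then fanned out to every member
    body.isGroupUpdate = true;         // ask the receiving node to PATCH all of its peers

    Operation patch = Operation
            .createPatch(UriUtils.buildUri(host, ServiceUriPaths.DEFAULT_NODE_GROUP))
            .setBody(body)
            .setReferer(host.getUri())
            .setCompletion((o, e) -> {
                if (e != null) {
                    // a majority of peers failed to apply the new quorum
                    return;
                }
                // the new quorum is now set on at least a majority of peers
            });
    host.sendRequest(patch);
}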
private void performGroupMaintenance(Operation maint, Operation get,
        Throwable getEx) {
    // we ignore any body associated with the PUT

    if (getEx != null) {
        logWarning("Failure getting state: %s", getEx.toString());
        maint.complete();
        return;
    }

    if (!get.hasBody()) {
        maint.complete();
        return;
    }

    NodeGroupState localState = get.getBody(NodeGroupState.class);

    if (localState == null || localState.nodes == null) {
        maint.complete();
        return;
    }

    if (localState.nodes.size() <= 1) {
        maint.complete();
        return;
    }

    if (getHost().isStopping()) {
        maint.complete();
        return;
    }

    // probe a fixed, random selection of our peers, giving them our view of the group and
    // getting back theirs

    // probe log 10 of peers (exclude self)
    int peersToProbe = (int) Math.log10(localState.nodes.size() - 1);
    // probe at least N peers
    peersToProbe = Math.max(peersToProbe, MIN_PEER_GOSSIP_COUNT);
    // probe at most the total number of peers
    peersToProbe = Math.min(localState.nodes.size() - 1, peersToProbe);

    AtomicInteger remaining = new AtomicInteger(peersToProbe);
    NodeState[] randomizedPeers = shuffleGroupMembers(localState);
    NodeState localNode = localState.nodes.get(getHost().getId());
    localNode.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
    localNode.groupReference = UriUtils.buildPublicUri(getHost(), getSelfLink());
    localState.documentOwner = getHost().getId();

    NodeGroupState patchBody = new NodeGroupState();
    patchBody.documentOwner = getHost().getId();
    patchBody.documentUpdateTimeMicros = Utils.getNowMicrosUtc();

    int probeCount = 0;
    for (NodeState peer : randomizedPeers) {
        if (peer == null) {
            continue;
        }

        if (peer.id.equals(getHost().getId())) {
            continue;
        }

        NodeState remotePeer = peer;
        URI peerUri = peer.groupReference;

        // send a gossip PATCH to the peer, with our state. This is a health check against
        // N randomly selected peers:
        // 1) We issue a PATCH to a peer, with the body set to our view of the group
        // 2a) if the peer is healthy, they will merge our state with theirs and return
        //     the merged state in the response. We then update our state and mark the
        //     peer AVAILABLE. We only update the peer node; we don't currently merge their state
        // 2b) if the PATCH failed, we mark the peer UNAVAILABLE
        CompletionHandler ch = (o, e) -> handleGossipPatchCompletion(
                maint, o, e, localState, patchBody, remaining, remotePeer);
        Operation patch = Operation
                .createPatch(peerUri)
                .setBody(localState)
                .setRetryCount(0)
                .setExpiration(Utils.getNowMicrosUtc()
                        + getHost().getOperationTimeoutMicros() / 2)
                .forceRemote()
                .setCompletion(ch);

        if (peer.groupReference.equals(localNode.groupReference)
                && peer.status != NodeStatus.REPLACED) {
            // We just detected a peer node that used to listen on our address but is
            // obviously no longer around: mark it as REPLACED and do not send the PATCH
            peer.status = NodeStatus.REPLACED;
            peer.documentUpdateTimeMicros = Utils.getNowMicrosUtc();
            peer.documentVersion++;
            ch.handle(null, null);
        } else {
            sendRequest(patch);
        }

        // only probe N peers
        if (++probeCount >= peersToProbe) {
            break;
        }
    }

    if (probeCount == 0) {
        maint.complete();
    }
}
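/*
 * Illustrative sketch (not part of the original source): the peer-sampling arithmetic used
 * above, extracted into a standalone helper. Assuming MIN_PEER_GOSSIP_COUNT is 10, a 5-node
 * group probes all 4 peers, while a 100-node or even a 100,000-node group probes only 10,
 * so per-maintenance gossip fan-out stays small and nearly constant as the group grows.
 */
static int gossipPeerCount(int groupSize, int minPeerGossipCount) {
    // groupSize is assumed > 1, as guarded in performGroupMaintenance above
    int peersToProbe = (int) Math.log10(groupSize - 1); // log10 of peers, excluding self
    peersToProbe = Math.max(peersToProbe, minPeerGossipCount); // probe at least N peers
    return Math.min(groupSize - 1, peersToProbe); // never more than the actual peer count
}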
/**
 * Issues updates to peer nodes, after a local update has been accepted. If the service supports
 * OWNER_SELECTION, the replication message is the Propose message in the consensus work flow.
 *
 * @param localState
 * @param outboundOp
 * @param req
 * @param rsp
 */
void replicateUpdate(NodeGroupState localState,
        Operation outboundOp, SelectAndForwardRequest req, SelectOwnerResponse rsp) {

    int memberCount = localState.nodes.size();
    NodeState selfNode = localState.nodes.get(getHost().getId());
    AtomicInteger successCount = new AtomicInteger(0);

    if (req.serviceOptions.contains(ServiceOption.OWNER_SELECTION)
            && selfNode.membershipQuorum > memberCount) {
        outboundOp.fail(new IllegalStateException("Not enough peers: " + memberCount));
        return;
    }

    if (memberCount == 1) {
        outboundOp.complete();
        return;
    }

    AtomicInteger failureCount = new AtomicInteger();

    // The eligible count can be less than the member count if the parent node selector has
    // a smaller replication factor than the group size. We need to use the replication factor
    // as the upper bound for calculating the success and failure thresholds.
    int eligibleMemberCount = rsp.selectedNodes.size();

    // When quorum is not required, succeed when we replicate to at least one remote node,
    // or, if only the local node is available, succeed immediately.
    int successThreshold = Math.min(2, eligibleMemberCount - 1);
    int failureThreshold = eligibleMemberCount - successThreshold;

    if (req.serviceOptions.contains(ServiceOption.OWNER_SELECTION)) {
        successThreshold = Math.min(eligibleMemberCount, selfNode.membershipQuorum);
        failureThreshold = eligibleMemberCount - successThreshold;

        if (failureThreshold == successThreshold && successThreshold == 1) {
            // degenerate case: the node group has just two members and quorum must be one,
            // which means even if the single remote peer is down, we should still succeed.
            failureThreshold = 0;
        }
    }

    final int successThresholdFinal = successThreshold;
    final int failureThresholdFinal = failureThreshold;

    CompletionHandler c = (o, e) -> {
        if (e == null && o != null
                && o.getStatusCode() >= Operation.STATUS_CODE_FAILURE_THRESHOLD) {
            e = new IllegalStateException("Request failed: " + o.toString());
        }

        int sCount = successCount.get();
        int fCount = failureCount.get();
        if (e != null) {
            logInfo("Replication to %s failed: %s", o.getUri(), e.toString());
            fCount = failureCount.incrementAndGet();
        } else {
            sCount = successCount.incrementAndGet();
        }

        if (sCount == successThresholdFinal) {
            outboundOp.complete();
            return;
        }

        if (fCount == 0) {
            return;
        }

        if (fCount >= failureThresholdFinal || ((fCount + sCount) == memberCount)) {
            String error = String.format(
                    "%s to %s failed. Success: %d, Fail: %d, quorum: %d, threshold: %d",
                    outboundOp.getAction(),
                    outboundOp.getUri().getPath(),
                    sCount,
                    fCount,
                    selfNode.membershipQuorum,
                    failureThresholdFinal);
            logWarning("%s", error);
            outboundOp.fail(new IllegalStateException(error));
        }
    };

    String jsonBody = Utils.toJson(req.linkedState);

    Operation update = Operation.createPost(null)
            .setAction(outboundOp.getAction())
            .setBodyNoCloning(jsonBody)
            .setCompletion(c)
            .setRetryCount(1)
            .setExpiration(outboundOp.getExpirationMicrosUtc())
            .transferRequestHeadersFrom(outboundOp)
            .removePragmaDirective(Operation.PRAGMA_DIRECTIVE_FORWARDED)
            .addPragmaDirective(Operation.PRAGMA_DIRECTIVE_REPLICATED)
            .setReferer(outboundOp.getReferer());

    if (update.getCookies() != null) {
        update.getCookies().clear();
    }

    ServiceClient cl = getHost().getClient();
    String selfId = getHost().getId();

    // trigger completion once, for the self node, since it is part of our accounting
    c.handle(null, null);

    rsp.selectedNodes.forEach((m) -> {
        if (m.id.equals(selfId)) {
            return;
        }

        if (m.options.contains(NodeOption.OBSERVER)) {
            return;
        }

        try {
            URI remotePeerService = new URI(m.groupReference.getScheme(),
                    null, m.groupReference.getHost(), m.groupReference.getPort(),
                    outboundOp.getUri().getPath(), outboundOp.getUri().getQuery(), null);
            update.setUri(remotePeerService);
        } catch (Throwable e1) {
        }

        if (NodeState.isUnAvailable(m)) {
            c.handle(update, new IllegalStateException("node is not available"));
            return;
        }

        cl.send(update);
    });
}
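/*
 * Illustrative sketch (not part of the original source): the thresholds computed by
 * replicateUpdate above for an OWNER_SELECTION service, extracted into a helper. For example,
 * with 5 eligible nodes and a membership quorum of 3, the thresholds are {3, 2}: the operation
 * completes after the local ack plus two remote acks, and fails once two remote replications
 * have failed.
 */
static int[] replicationThresholds(int eligibleMemberCount, int membershipQuorum) {
    int successThreshold = Math.min(eligibleMemberCount, membershipQuorum);
    int failureThreshold = eligibleMemberCount - successThreshold;
    if (failureThreshold == successThreshold && successThreshold == 1) {
        // two-member group with a quorum of one: the local ack alone completes the update,
        // so the single remote peer being down must not fail it
        failureThreshold = 0;
    }
    return new int[] { successThreshold, failureThreshold };
}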