/**
 * Blocks the calling thread until {@code activeNode} is no longer null or roughly
 * {@code timeout} milliseconds have elapsed, whichever comes first.
 *
 * <p>The wait is performed on this object's monitor, so it only ends early when some other code
 * notifies this object (the notifying code is not visible here). The remaining time is
 * recomputed after every wake-up, so a wake-up that finds {@code activeNode} still null simply
 * resumes waiting for the time that is left.
 *
 * <p>If the thread is interrupted while waiting, the wait is abandoned and the thread's
 * interrupt status is restored before returning, so callers can still observe the interruption.
 *
 * @param timeout maximum time to wait, in milliseconds; a non-positive value skips the wait
 */
private synchronized void waitUntilActiveNodeIDNotNull(long timeout) {
  while (activeNode.isNull() && timeout > 0) {
    long start = System.currentTimeMillis();
    try {
      wait(timeout);
    } catch (InterruptedException e) {
      logger.warn("Interrupted while waiting for ACTIVE to declare WON message ! ", e);
      // Catching InterruptedException clears the interrupt flag; set it again so the
      // interruption is not silently lost to code further up the call stack.
      Thread.currentThread().interrupt();
      break;
    }
    // Deduct the time actually spent waiting so the total never restarts from the full timeout.
    timeout = timeout - (System.currentTimeMillis() - start);
  }
  debugInfo(
      "Wait for other active to declare as active over. Declared? activeNodeId.isNull() = "
          + activeNode.isNull()
          + ", activeNode="
          + activeNode);
}
private synchronized void handleElectionWonMessage(L2StateMessage clusterMsg) { debugInfo("Received election_won or election_already_won msg: " + clusterMsg); Enrollment winningEnrollment = clusterMsg.getEnrollment(); if (state == ACTIVE_COORDINATOR) { // Can't get Election Won from another node : Split brain String error = state + " Received Election Won Msg : " + clusterMsg + ". A Terracotta server tried to join the mirror group as a second ACTIVE"; logger.error(error); if (clusterMsg.getType() == L2StateMessage.ELECTION_WON_ALREADY) { sendNGResponse(clusterMsg.messageFrom(), clusterMsg); } groupManager.zapNode( winningEnrollment.getNodeID(), L2HAZapNodeRequestProcessor.SPLIT_BRAIN, error); } else if (activeNode.isNull() || activeNode.equals(winningEnrollment.getNodeID()) || clusterMsg.getType() == L2StateMessage.ELECTION_WON) { // There is no active server for this node or the other node just detected a failure of ACTIVE // server and ran an // election and is sending the results. This can happen if this node for some reason is not // able to detect that // the active is down but the other node did. Go with the new active. setActiveNodeID(winningEnrollment.getNodeID()); moveToPassiveState(winningEnrollment); if (clusterMsg.getType() == L2StateMessage.ELECTION_WON_ALREADY) { sendOKResponse(clusterMsg.messageFrom(), clusterMsg); } } else { // This is done to solve DEV-1532. Node sent ELECTION_WON_ALREADY message but our ACTIVE is // intact. logger.warn( "Conflicting Election Won Msg : " + clusterMsg + " since I already have a ACTIVE Node : " + activeNode + ". Sending NG response"); // The reason we send a response for ELECTION_WON_ALREADY message is that if we don't agree we // don't want the // other server to send us cluster state messages. sendNGResponse(clusterMsg.messageFrom(), clusterMsg); } }
@Override public void startElectionIfNecessary(NodeID disconnectedNode) { Assert.assertFalse(disconnectedNode.equals(getLocalNodeID())); boolean elect = false; synchronized (this) { if (activeNode.equals(disconnectedNode)) { // ACTIVE Node is gone setActiveNodeID(ServerID.NULL_ID); } if (state != PASSIVE_UNINITIALIZED && state != ACTIVE_COORDINATOR && activeNode.isNull()) { elect = true; } } if (elect) { info("Starting Election to determine cluser wide ACTIVE L2"); startElection(); } else { debugInfo("Not starting election even though node left: " + disconnectedNode); } }
/**
 * Runs elections repeatedly until one produces a non-null winner.
 *
 * <p>Each attempt delegates to {@code doElection}. If an attempt fails with an
 * {@link InterruptedException} or a {@code GroupException}, the failure is logged,
 * {@code reset(null)} is called and another election is requested.
 *
 * <p>An interruption does not stop the retry loop, but it is remembered: the thread's interrupt
 * status is restored just before this method exits, so the caller can still see that an
 * interrupt was requested. The flag is deliberately not restored inside the loop, because a
 * pending interrupt would presumably make the next blocking election attempt fail immediately.
 *
 * @param myNodeId the local node's id, passed through to {@code doElection}
 * @param isNew passed through to {@code doElection}
 * @param weightsFactory passed through to {@code doElection}
 * @return the id of the winning node; never an id for which {@code isNull()} is true
 */
@Override
public NodeID runElection(NodeID myNodeId, boolean isNew, WeightGeneratorFactory weightsFactory) {
  NodeID winnerID = ServerID.NULL_ID;
  int count = 0;
  boolean interrupted = false;
  try {
    while (winnerID.isNull()) {
      if (count++ > 0) {
        logger.info("Requesting Re-election !!! count = " + count);
      }
      try {
        winnerID = doElection(myNodeId, isNew, weightsFactory);
      } catch (InterruptedException e) {
        logger.error("Interrupted during election : ", e);
        // Remember the interrupt; it is re-asserted once the election loop has finished.
        interrupted = true;
        reset(null);
      } catch (GroupException e1) {
        logger.error("Error during election : ", e1);
        reset(null);
      }
    }
  } finally {
    if (interrupted) {
      // Catching InterruptedException cleared the flag; restore it for the caller.
      Thread.currentThread().interrupt();
    }
  }
  return winnerID;
}
private synchronized void handleElectionResultMessage(L2StateMessage msg) throws GroupException { if (activeNode.equals(msg.getEnrollment().getNodeID())) { Assert.assertFalse(ServerID.NULL_ID.equals(activeNode)); // This wouldn't normally happen, but we agree - so ack GroupMessage resultAgreed = L2StateMessage.createResultAgreedMessage(msg, msg.getEnrollment()); logger.info("Agreed with Election Result from " + msg.messageFrom() + " : " + resultAgreed); groupManager.sendTo(msg.messageFrom(), resultAgreed); } else if (state == ACTIVE_COORDINATOR || !activeNode.isNull() || (msg.getEnrollment().isANewCandidate() && state != START_STATE)) { // Condition 1 : // Obviously an issue. // Condition 2 : // This shouldn't happen normally, but is possible when there is some weird network error // where A sees B, // B sees A/C and C sees B and A is active and C is trying to run election // Force other node to rerun election so that we can abort // Condition 3 : // We don't want new L2s to win an election when there are old L2s in PASSIVE states. GroupMessage resultConflict = L2StateMessage.createResultConflictMessage( msg, EnrollmentFactory.createTrumpEnrollment(getLocalNodeID(), weightsFactory)); warn( "WARNING :: Active Node = " + activeNode + " , " + state + " received ELECTION_RESULT message from another node : " + msg + " : Forcing re-election " + resultConflict); groupManager.sendTo(msg.messageFrom(), resultConflict); } else { debugInfo("ElectionMgr handling election result msg: " + msg); electionMgr.handleElectionResultMessage(msg); } }