private long assignLeader(int partitionId, List<Long> children) { // We used masterHostId = -1 as a way to force the leader choice to be // the first replica in the list, if we don't have some other mechanism // which has successfully overridden it. int masterHostId = -1; if (m_state.get() == AppointerState.CLUSTER_START) { try { // find master in topo JSONArray parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == partitionId) { masterHostId = aPartition.getInt("master"); } } } catch (JSONException jse) { tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0"); jse.printStackTrace(); masterHostId = -1; // stupid default } } else { // For now, if we're appointing a new leader as a result of a // failure, just pick the first replica in the children list. // Could eventually do something more complex here to try to keep a // semi-balance, but it's unclear that this has much utility until // we add rebalancing on rejoin as well. masterHostId = -1; } long masterHSId = children.get(0); for (Long child : children) { if (CoreUtils.getHostIdFromHSId(child) == masterHostId) { masterHSId = child; break; } } tmLog.info( "Appointing HSId " + CoreUtils.hsIdToString(masterHSId) + " as leader for partition " + partitionId); try { m_iv2appointees.put(partitionId, masterHSId); } catch (Exception e) { VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e); } return masterHSId; }
private Set<Integer> readPriorKnownLiveNodes() { Set<Integer> nodes = new HashSet<Integer>(); try { byte[] data = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null); String jsonString = new String(data, "UTF-8"); tmLog.debug("Read prior known live nodes: " + jsonString); JSONObject jsObj = new JSONObject(jsonString); JSONArray jsonNodes = jsObj.getJSONArray("liveNodes"); for (int ii = 0; ii < jsonNodes.length(); ii++) { nodes.add(jsonNodes.getInt(ii)); } } catch (Exception e) { VoltDB.crashLocalVoltDB( "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e); } return nodes; }
@Override public void run(List<String> children) { List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children); // compute previously unseen HSId set in the callback list Set<Long> newHSIds = new HashSet<Long>(updatedHSIds); newHSIds.removeAll(m_replicas); tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds)); // compute previously seen but now vanished from the callback list HSId set Set<Long> missingHSIds = new HashSet<Long>(m_replicas); missingHSIds.removeAll(updatedHSIds); tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds)); tmLog.debug( "Handling babysitter callback for partition " + m_partitionId + ": children: " + CoreUtils.hsIdCollectionToString(updatedHSIds)); if (m_state.get() == AppointerState.CLUSTER_START) { // We can't yet tolerate a host failure during startup. Crash it all if (missingHSIds.size() > 0) { VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null); } // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor, // but for now we just look to see how many replicas of this partition we actually expect // and gate leader assignment on that many copies showing up. int replicaCount = m_kfactor + 1; JSONArray parts; try { parts = m_topo.getJSONArray("partitions"); for (int p = 0; p < parts.length(); p++) { JSONObject aPartition = parts.getJSONObject(p); int pid = aPartition.getInt("partition_id"); if (pid == m_partitionId) { replicaCount = aPartition.getJSONArray("replicas").length(); } } } catch (JSONException e) { // Ignore and just assume the normal number of replicas } if (children.size() == replicaCount) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } else { tmLog.info( "Waiting on " + ((m_kfactor + 1) - children.size()) + " more nodes " + "for k-safety before startup"); } } else { // Check for k-safety if (!isClusterKSafe()) { VoltDB.crashGlobalVoltDB( "Some partitions have no replicas. Cluster has become unviable.", false, null); } // Check if replay has completed if (m_replayComplete.get() == false) { VoltDB.crashGlobalVoltDB( "Detected node failure during command log replay. Cluster will shut down.", false, null); } // Check to see if there's been a possible network partition and we're not already handling // it if (m_partitionDetectionEnabled && !m_partitionDetected) { doPartitionDetectionActivities(); } // If we survived the above gauntlet of fail, appoint a new leader for this partition. if (missingHSIds.contains(m_currentLeader)) { m_currentLeader = assignLeader(m_partitionId, updatedHSIds); } } m_replicas.clear(); m_replicas.addAll(updatedHSIds); }
// XXX maybe consider an IOException subclass at some point public TableSaveFile( FileChannel dataIn, int readAheadChunks, Integer[] relevantPartitionIds, boolean continueOnCorruptedChunk) throws IOException { try { EELibraryLoader.loadExecutionEngineLibrary(true); if (relevantPartitionIds == null) { m_relevantPartitionIds = null; } else { m_relevantPartitionIds = new HashSet<Integer>(); for (Integer i : relevantPartitionIds) { m_relevantPartitionIds.add(i); } } m_chunkReads = new Semaphore(readAheadChunks); m_saveFile = dataIn; m_continueOnCorruptedChunk = continueOnCorruptedChunk; final PureJavaCrc32 crc = new PureJavaCrc32(); /* * If the CRC check fails because the file wasn't completed */ final PureJavaCrc32 secondCRC = new PureJavaCrc32(); /* * Get the header with the save restore specific information */ final ByteBuffer lengthBuffer = ByteBuffer.allocate(8); while (lengthBuffer.hasRemaining()) { final int read = m_saveFile.read(lengthBuffer); if (read == -1) { throw new EOFException(); } } lengthBuffer.flip(); final int originalCRC = lengthBuffer.getInt(); int length = lengthBuffer.getInt(); crc.update(lengthBuffer.array(), 4, 4); secondCRC.update(lengthBuffer.array(), 4, 4); if (length < 0) { throw new IOException("Corrupted save file has negative header length"); } if (length > 2097152) { throw new IOException("Corrupted save file has unreasonable header length > 2 megs"); } final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length); while (saveRestoreHeader.hasRemaining()) { final int read = m_saveFile.read(saveRestoreHeader); if (read == -1 || read < length) { throw new EOFException(); } } saveRestoreHeader.flip(); crc.update(saveRestoreHeader.array()); secondCRC.update(new byte[] {1}); secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1); /* * Get the template for the VoltTable serialization header. * It will have an extra length value preceded to it so that * it can be sucked straight into a buffer. This will not * contain a row count since that varies from chunk to chunk * and is supplied by the chunk */ lengthBuffer.clear(); lengthBuffer.limit(4); /* * Why this stupidity and no while loop? * Because java is broken and complains about a random final * elsewhere if you do. */ { final int read = m_saveFile.read(lengthBuffer); if (read == -1) { throw new EOFException(); } } crc.update(lengthBuffer.array(), 0, 4); secondCRC.update(lengthBuffer.array(), 0, 4); lengthBuffer.flip(); length = lengthBuffer.getInt(); if (length < 4) { throw new IOException( "Corrupted save file has negative length or too small length for VoltTable header"); } if (length > 2097152) { throw new IOException( "Corrupted save file has unreasonable VoltTable header length > 2 megs"); } m_tableHeader = ByteBuffer.allocate(length + 4); m_tableHeader.putInt(length); while (m_tableHeader.hasRemaining()) { final int read = m_saveFile.read(m_tableHeader); if (read == -1) { throw new EOFException(); } } crc.update(m_tableHeader.array(), 4, length); secondCRC.update(m_tableHeader.array(), 4, length); boolean failedCRCDueToNotCompleted = false; final int actualCRC = (int) crc.getValue(); if (originalCRC != actualCRC) { /* * Check if the CRC mismatch is due to the snapshot not being completed */ final int secondCRCValue = (int) secondCRC.getValue(); if (secondCRCValue == originalCRC) { failedCRCDueToNotCompleted = true; } else { throw new IOException("Checksum mismatch"); } } FastDeserializer fd = new FastDeserializer(saveRestoreHeader); byte completedByte = fd.readByte(); m_completed = failedCRCDueToNotCompleted ? false : (completedByte == 1 ? true : false); for (int ii = 0; ii < 4; ii++) { m_versionNum[ii] = fd.readInt(); } /* * Support the original pre 1.3 header format as well as a new JSON format. * JSON will make it possible to add info to a snapshot header without * breaking backwards compatibility. */ if (m_versionNum[3] == 0) { m_txnId = fd.readLong(); m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId); m_hostId = fd.readInt(); m_hostname = fd.readString(); m_clusterName = fd.readString(); m_databaseName = fd.readString(); m_tableName = fd.readString(); m_isReplicated = fd.readBoolean(); m_isCompressed = false; m_checksumType = ChecksumType.CRC32; if (!m_isReplicated) { m_partitionIds = (int[]) fd.readArray(int.class); if (!m_completed) { for (Integer partitionId : m_partitionIds) { m_corruptedPartitions.add(partitionId); } } m_totalPartitions = fd.readInt(); } else { m_partitionIds = new int[] {0}; m_totalPartitions = 1; if (!m_completed) { m_corruptedPartitions.add(0); } } m_hasVersion2FormatChunks = false; } else { assert (m_versionNum[3] == 1 || m_versionNum[3] == 2); if (m_versionNum[3] >= 2) { m_hasVersion2FormatChunks = true; } else { m_hasVersion2FormatChunks = false; } int numJSONBytes = fd.readInt(); byte jsonBytes[] = new byte[numJSONBytes]; fd.readFully(jsonBytes); String jsonString = new String(jsonBytes, "UTF-8"); JSONObject obj = new JSONObject(jsonString); m_txnId = obj.getLong("txnId"); // Timestamp field added for 3.0, might not be there if (obj.has("timestamp")) { m_timestamp = obj.getLong("timestamp"); } else { // Pre 3.0/IV2 the timestamp was in the transactionid m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId); } m_hostId = obj.getInt("hostId"); m_hostname = obj.getString("hostname"); m_clusterName = obj.getString("clusterName"); m_databaseName = obj.getString("databaseName"); m_tableName = obj.getString("tableName"); m_isReplicated = obj.getBoolean("isReplicated"); m_isCompressed = obj.optBoolean("isCompressed", false); m_checksumType = ChecksumType.valueOf(obj.optString("checksumType", "CRC32")); if (!m_isReplicated) { JSONArray partitionIds = obj.getJSONArray("partitionIds"); m_partitionIds = new int[partitionIds.length()]; for (int ii = 0; ii < m_partitionIds.length; ii++) { m_partitionIds[ii] = partitionIds.getInt(ii); } if (!m_completed) { for (Integer partitionId : m_partitionIds) { m_corruptedPartitions.add(partitionId); } } m_totalPartitions = obj.getInt("numPartitions"); } else { m_partitionIds = new int[] {0}; m_totalPartitions = 1; if (!m_completed) { m_corruptedPartitions.add(0); } } } /* * Several runtime exceptions can be thrown in valid failure cases where * a corrupt save file is being detected. */ } catch (BufferUnderflowException e) { throw new IOException(e); } catch (BufferOverflowException e) { throw new IOException(e); } catch (IndexOutOfBoundsException e) { throw new IOException(e); } catch (JSONException e) { throw new IOException(e); } }