Example No. 1
  private long assignLeader(int partitionId, List<Long> children) {
    // We used masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
      try {
        // find master in topo
        JSONArray parts = m_topo.getJSONArray("partitions");
        for (int p = 0; p < parts.length(); p++) {
          JSONObject aPartition = parts.getJSONObject(p);
          int pid = aPartition.getInt("partition_id");
          if (pid == partitionId) {
            masterHostId = aPartition.getInt("master");
          }
        }
      } catch (JSONException jse) {
        tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0");
        jse.printStackTrace();
        masterHostId = -1; // stupid default
      }
    } else {
      // For now, if we're appointing a new leader as a result of a
      // failure, just pick the first replica in the children list.
      // Could eventually do something more complex here to try to keep a
      // semi-balance, but it's unclear that this has much utility until
      // we add rebalancing on rejoin as well.
      masterHostId = -1;
    }

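    // Default to the first replica; if one of the children lives on the
    // chosen master host, promote that child instead.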
    long masterHSId = children.get(0);
    for (Long child : children) {
      if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
        masterHSId = child;
        break;
      }
    }
    tmLog.info(
        "Appointing HSId "
            + CoreUtils.hsIdToString(masterHSId)
            + " as leader for partition "
            + partitionId);
    try {
      m_iv2appointees.put(partitionId, masterHSId);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
  }
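
For reference, a minimal sketch of the topology JSON that assignLeader walks. The key names ("partitions", "partition_id", "master", "replicas") come straight from the lookups above; the host ids are made up, and the org.json-style calls mirror the API the example already uses.

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

public class TopoSketch {
  public static void main(String[] args) throws JSONException {
    // One entry per partition: the intended master's host id plus the
    // replica host ids (illustrative values only).
    JSONObject partition = new JSONObject()
        .put("partition_id", 0)
        .put("master", 1)
        .put("replicas", new JSONArray().put(1).put(2));
    JSONObject topo = new JSONObject()
        .put("partitions", new JSONArray().put(partition));
    System.out.println(topo.toString());
  }
}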
Example No. 2
 private Set<Integer> readPriorKnownLiveNodes() {
   Set<Integer> nodes = new HashSet<Integer>();
   try {
     byte[] data = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null);
     String jsonString = new String(data, "UTF-8");
     tmLog.debug("Read prior known live nodes: " + jsonString);
     JSONObject jsObj = new JSONObject(jsonString);
     JSONArray jsonNodes = jsObj.getJSONArray("liveNodes");
     for (int ii = 0; ii < jsonNodes.length(); ii++) {
       nodes.add(jsonNodes.getInt(ii));
     }
   } catch (Exception e) {
     VoltDB.crashLocalVoltDB(
         "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes,
         true,
         e);
   }
   return nodes;
 }
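
A hedged sketch of the writer-side counterpart this reader assumes: the live host ids serialized into the same {"liveNodes": [...]} shape and written back to the same ZK path. ZooKeeper.setData and the JSON calls are standard API; the method name writeKnownLiveNodes and the surrounding wiring are assumptions for illustration.

import java.util.Set;
import org.apache.zookeeper.ZooKeeper;
import org.json.JSONArray;
import org.json.JSONObject;

// Hypothetical counterpart to readPriorKnownLiveNodes().
void writeKnownLiveNodes(ZooKeeper zk, Set<Integer> liveHostIds) throws Exception {
  JSONArray jsonNodes = new JSONArray();
  for (int hostId : liveHostIds) {
    jsonNodes.put(hostId);
  }
  JSONObject jsObj = new JSONObject().put("liveNodes", jsonNodes);
  // Version -1 overwrites regardless of the znode's current version.
  zk.setData(VoltZK.lastKnownLiveNodes, jsObj.toString().getBytes("UTF-8"), -1);
}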
Example No. 3
    @Override
    public void run(List<String> children) {
      List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
      // compute previously unseen HSId set in the callback list
      Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
      newHSIds.removeAll(m_replicas);
      tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
      // compute previously seen but now vanished from the callback list HSId set
      Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
      missingHSIds.removeAll(updatedHSIds);
      tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

      tmLog.debug(
          "Handling babysitter callback for partition "
              + m_partitionId
              + ": children: "
              + CoreUtils.hsIdCollectionToString(updatedHSIds));
      if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (!missingHSIds.isEmpty()) {
          VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
          parts = m_topo.getJSONArray("partitions");
          for (int p = 0; p < parts.length(); p++) {
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            if (pid == m_partitionId) {
              replicaCount = aPartition.getJSONArray("replicas").length();
            }
          }
        } catch (JSONException e) {
          // Ignore and just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
          tmLog.info(
              "Waiting on "
                  + (replicaCount - children.size())
                  + " more nodes "
                  + "for k-safety before startup");
        }
      } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
          VoltDB.crashGlobalVoltDB(
              "Some partitions have no replicas.  Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (!m_replayComplete.get()) {
          VoltDB.crashGlobalVoltDB(
              "Detected node failure during command log replay. Cluster will shut down.",
              false,
              null);
        }
        // Check whether there's been a possible network partition that we're
        // not already handling.
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
          doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
      }
      m_replicas.clear();
      m_replicas.addAll(updatedHSIds);
    }
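
The callback's liveness tracking reduces to two set differences over HSIds; a minimal standalone illustration with made-up values:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class ReplicaDiffSketch {
  public static void main(String[] args) {
    Set<Long> previous = new HashSet<Long>(Arrays.asList(1L, 2L, 3L));
    List<Long> updated = Arrays.asList(2L, 3L, 4L);

    Set<Long> newHSIds = new HashSet<Long>(updated);
    newHSIds.removeAll(previous); // {4}: newly seen replicas

    Set<Long> missingHSIds = new HashSet<Long>(previous);
    missingHSIds.removeAll(updated); // {1}: vanished replicas

    System.out.println("new=" + newHSIds + " missing=" + missingHSIds);
  }
}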
Example No. 4
  // XXX maybe consider an IOException subclass at some point
  public TableSaveFile(
      FileChannel dataIn,
      int readAheadChunks,
      Integer[] relevantPartitionIds,
      boolean continueOnCorruptedChunk)
      throws IOException {
    try {
      EELibraryLoader.loadExecutionEngineLibrary(true);
      if (relevantPartitionIds == null) {
        m_relevantPartitionIds = null;
      } else {
        m_relevantPartitionIds = new HashSet<Integer>();
        for (Integer i : relevantPartitionIds) {
          m_relevantPartitionIds.add(i);
        }
      }
      m_chunkReads = new Semaphore(readAheadChunks);
      m_saveFile = dataIn;
      m_continueOnCorruptedChunk = continueOnCorruptedChunk;

      final PureJavaCrc32 crc = new PureJavaCrc32();
      /*
       * A second CRC computed as if the "completed" byte were set, used to
       * detect the case where the CRC check fails only because the file
       * wasn't completed.
       */
      final PureJavaCrc32 secondCRC = new PureJavaCrc32();

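      /*
       * On-disk layout parsed below, as implied by the reads:
       *   [4-byte CRC][4-byte header length][header bytes]
       *   [4-byte VoltTable header length][VoltTable header bytes]
       * The CRC covers everything after the leading CRC field itself.
       */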
      /*
       * Get the header with the save restore specific information
       */
      final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
      while (lengthBuffer.hasRemaining()) {
        final int read = m_saveFile.read(lengthBuffer);
        if (read == -1) {
          throw new EOFException();
        }
      }
      lengthBuffer.flip();
      final int originalCRC = lengthBuffer.getInt();
      int length = lengthBuffer.getInt();
      crc.update(lengthBuffer.array(), 4, 4);
      secondCRC.update(lengthBuffer.array(), 4, 4);

      if (length < 0) {
        throw new IOException("Corrupted save file has negative header length");
      }

      if (length > 2097152) {
        throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
      }

      final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
      while (saveRestoreHeader.hasRemaining()) {
        final int read = m_saveFile.read(saveRestoreHeader);
        // A short read just loops again; only a true EOF is fatal.
        if (read == -1) {
          throw new EOFException();
        }
      }
      saveRestoreHeader.flip();
      crc.update(saveRestoreHeader.array());
      secondCRC.update(new byte[] {1});
      secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);

      /*
       * Get the template for the VoltTable serialization header.
       * It will have an extra length value prepended to it so that
       * it can be read straight into a buffer. This will not
       * contain a row count, since that varies from chunk to chunk
       * and is supplied by each chunk.
       */
      lengthBuffer.clear();
      lengthBuffer.limit(4);
      /*
       * Why this stupidity and no while loop?
       * Because java is broken and complains about a random final
       * elsewhere if you do.
       */
      {
        final int read = m_saveFile.read(lengthBuffer);
        if (read == -1) {
          throw new EOFException();
        }
      }
      crc.update(lengthBuffer.array(), 0, 4);
      secondCRC.update(lengthBuffer.array(), 0, 4);
      lengthBuffer.flip();
      length = lengthBuffer.getInt();

      if (length < 4) {
        throw new IOException(
            "Corrupted save file has a negative or too-small VoltTable header length");
      }

      if (length > 2097152) {
        throw new IOException(
            "Corrupted save file has unreasonable VoltTable header length > 2 megs");
      }

      m_tableHeader = ByteBuffer.allocate(length + 4);
      m_tableHeader.putInt(length);
      while (m_tableHeader.hasRemaining()) {
        final int read = m_saveFile.read(m_tableHeader);
        if (read == -1) {
          throw new EOFException();
        }
      }
      crc.update(m_tableHeader.array(), 4, length);
      secondCRC.update(m_tableHeader.array(), 4, length);

      boolean failedCRCDueToNotCompleted = false;

      final int actualCRC = (int) crc.getValue();
      if (originalCRC != actualCRC) {
        /*
         * Check if the CRC mismatch is due to the snapshot not being completed
         */
        final int secondCRCValue = (int) secondCRC.getValue();
        if (secondCRCValue == originalCRC) {
          failedCRCDueToNotCompleted = true;
        } else {
          throw new IOException("Checksum mismatch");
        }
      }

      FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
      byte completedByte = fd.readByte();
      m_completed = !failedCRCDueToNotCompleted && completedByte == 1;
      for (int ii = 0; ii < 4; ii++) {
        m_versionNum[ii] = fd.readInt();
      }

      /*
       * Support the original pre 1.3 header format as well as a new JSON format.
       * JSON will make it possible to add info to a snapshot header without
       * breaking backwards compatibility.
       */
      if (m_versionNum[3] == 0) {
        m_txnId = fd.readLong();
        m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
        m_hostId = fd.readInt();
        m_hostname = fd.readString();
        m_clusterName = fd.readString();
        m_databaseName = fd.readString();
        m_tableName = fd.readString();
        m_isReplicated = fd.readBoolean();
        m_isCompressed = false;
        m_checksumType = ChecksumType.CRC32;
        if (!m_isReplicated) {
          m_partitionIds = (int[]) fd.readArray(int.class);
          if (!m_completed) {
            for (Integer partitionId : m_partitionIds) {
              m_corruptedPartitions.add(partitionId);
            }
          }
          m_totalPartitions = fd.readInt();
        } else {
          m_partitionIds = new int[] {0};
          m_totalPartitions = 1;
          if (!m_completed) {
            m_corruptedPartitions.add(0);
          }
        }
        m_hasVersion2FormatChunks = false;
      } else {
        assert (m_versionNum[3] == 1 || m_versionNum[3] == 2);
        m_hasVersion2FormatChunks = (m_versionNum[3] >= 2);
        int numJSONBytes = fd.readInt();
        byte[] jsonBytes = new byte[numJSONBytes];
        fd.readFully(jsonBytes);
        String jsonString = new String(jsonBytes, "UTF-8");
        JSONObject obj = new JSONObject(jsonString);

        m_txnId = obj.getLong("txnId");
        // Timestamp field added for 3.0, might not be there
        if (obj.has("timestamp")) {
          m_timestamp = obj.getLong("timestamp");
        } else {
          // Pre 3.0/IV2 the timestamp was in the transactionid
          m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
        }
        m_hostId = obj.getInt("hostId");
        m_hostname = obj.getString("hostname");
        m_clusterName = obj.getString("clusterName");
        m_databaseName = obj.getString("databaseName");
        m_tableName = obj.getString("tableName");
        m_isReplicated = obj.getBoolean("isReplicated");
        m_isCompressed = obj.optBoolean("isCompressed", false);
        m_checksumType = ChecksumType.valueOf(obj.optString("checksumType", "CRC32"));
        if (!m_isReplicated) {
          JSONArray partitionIds = obj.getJSONArray("partitionIds");
          m_partitionIds = new int[partitionIds.length()];
          for (int ii = 0; ii < m_partitionIds.length; ii++) {
            m_partitionIds[ii] = partitionIds.getInt(ii);
          }

          if (!m_completed) {
            for (Integer partitionId : m_partitionIds) {
              m_corruptedPartitions.add(partitionId);
            }
          }
          m_totalPartitions = obj.getInt("numPartitions");
        } else {
          m_partitionIds = new int[] {0};
          m_totalPartitions = 1;
          if (!m_completed) {
            m_corruptedPartitions.add(0);
          }
        }
      }
      /*
       * Several runtime exceptions can be thrown in valid failure cases where
       * a corrupt save file is being detected.
       */
    } catch (BufferUnderflowException
        | BufferOverflowException
        | IndexOutOfBoundsException
        | JSONException e) {
      throw new IOException(e);
    }
  }
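
A hedged usage sketch: open a snapshot file and hand its channel to the constructor. The file name, the read-ahead depth of 3, the null partition filter, and the idea of consuming chunks afterwards are illustrative assumptions; only the constructor signature comes from the code above.

import java.io.FileInputStream;
import java.io.IOException;

public class SaveFileSketch {
  public static void main(String[] args) throws IOException {
    try (FileInputStream fis = new FileInputStream(args[0])) {
      // Read ahead up to 3 chunks, pass null to skip partition filtering,
      // and abort on the first corrupted chunk.
      TableSaveFile save = new TableSaveFile(fis.getChannel(), 3, null, false);
      // ... iterate over the file's chunks here ...
    }
  }
}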