/**
 * Restores this object's column/table members from their JSON form.
 *
 * <p>The column index is always read; the table index is read only when the JSON object carries
 * it. When a table scan is supplied, the table alias, table name and column name (looked up by
 * the column index just read) are taken from it.
 *
 * @param obj JSON object holding the serialized members
 * @param tableScan source of the table/column names; may be {@code null}, in which case the
 *     name members are left untouched
 * @throws JSONException propagated from the JSON accessors
 */
@Override
 protected void loadFromJSONObject(JSONObject obj, StmtTableScan tableScan) throws JSONException {
   m_columnIndex = obj.getInt(Members.COLUMN_IDX);

   // The table index is optional in the serialized form.
   if (obj.has(Members.TABLE_IDX)) {
     m_tableIdx = obj.getInt(Members.TABLE_IDX);
   }

   if (tableScan == null) {
     // No scan to resolve names against.
     return;
   }

   m_tableAlias = tableScan.getTableAlias();
   m_tableName = tableScan.getTableName();
   m_columnName = tableScan.getColumnName(m_columnIndex);
 }
Exemple #2
0
  /**
   * Picks the leader for a partition from its current replicas and records the appointment.
   *
   * <p>During cluster start the host listed as "master" for the partition in the topology is
   * preferred. In any other state, or when the topology cannot be read, or when no replica lives
   * on the preferred host, the first replica in {@code children} is appointed.
   *
   * @param partitionId partition to appoint a leader for
   * @param children HSIds of the partition's replicas; must contain at least one entry because
   *     the first one is read unconditionally as the fallback leader
   * @return the HSId that was appointed leader
   */
  private long assignLeader(int partitionId, List<Long> children) {
    // We use masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    //
    // When appointing a new leader as a result of a failure (any state other
    // than CLUSTER_START) it is left at -1 on purpose: just pick the first
    // replica in the children list. Could eventually do something more complex
    // here to try to keep a semi-balance, but it's unclear that this has much
    // utility until we add rebalancing on rejoin as well.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
      try {
        // find master in topo
        JSONArray parts = m_topo.getJSONArray("partitions");
        for (int p = 0; p < parts.length(); p++) {
          JSONObject aPartition = parts.getJSONObject(p);
          int pid = aPartition.getInt("partition_id");
          if (pid == partitionId) {
            masterHostId = aPartition.getInt("master");
          }
        }
      } catch (JSONException jse) {
        // Route the failure (with its stack trace) through the logger instead of stderr,
        // and report the fallback that is actually applied.
        tmLog.error(
            "Failed to find master for partition "
                + partitionId
                + ", defaulting to the first replica",
            jse);
        masterHostId = -1;
      }
    }

    // Start from the fallback and override it if a replica lives on the preferred host.
    long masterHSId = children.get(0);
    for (Long child : children) {
      if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
        masterHSId = child;
        break;
      }
    }
    tmLog.info(
        "Appointing HSId "
            + CoreUtils.hsIdToString(masterHSId)
            + " as leader for partition "
            + partitionId);
    try {
      m_iv2appointees.put(partitionId, masterHSId);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
  }
  /**
   * Records the number of participating hosts on a snapshot's completion ZK node, so that
   * SnapshotCompletionMonitor can check the node to determine whether the snapshot has finished.
   *
   * <p>The node for {@code txnId} under {@code VoltZK.completed_snapshots} is read, its
   * {@code hostCount} is raised by {@code participantCount + 1}, and the result is written back
   * conditioned on the version that was read. A version conflict restarts the whole
   * read-modify-write cycle. If the node does not exist the method returns without writing.
   *
   * <p>This should only be called when all participants have responded. Some hosts may finish
   * their snapshot before the coordinator logs the participating host count, in which case the
   * host count has already been decremented; adding to the current value (rather than
   * overwriting it) keeps those finished hosts accounted for.
   *
   * @param txnId The snapshot txnId
   * @param participantCount The number of hosts participating in this snapshot
   */
  public static void logParticipatingHostCount(long txnId, int participantCount) {
    final ZooKeeper zooKeeper = VoltDB.instance().getHostMessenger().getZK();
    final String nodePath = VoltZK.completed_snapshots + "/" + txnId;

    // Optimistic update: loop until a write lands on the same version that was read.
    while (true) {
      final Stat nodeStat = new Stat();
      byte[] payload = null;
      try {
        payload = zooKeeper.getData(nodePath, false, nodeStat);
      } catch (KeeperException e) {
        if (e.code() == KeeperException.Code.NONODE) {
          // If snapshot creation failed for some reason, the node won't exist. ignore
          return;
        }
        VoltDB.crashLocalVoltDB("Failed to get snapshot completion node", true, e);
      } catch (InterruptedException e) {
        VoltDB.crashLocalVoltDB("Interrupted getting snapshot completion node", true, e);
      }
      if (payload == null) {
        VoltDB.crashLocalVoltDB("Data should not be null if the node exists", false, null);
      }

      try {
        final JSONObject snapshotInfo = new JSONObject(new String(payload, Charsets.UTF_8));
        if (snapshotInfo.getLong("txnId") != txnId) {
          VoltDB.crashLocalVoltDB("TxnId should match", false, null);
        }

        // +1 because hostCount was initialized to -1
        final int updatedCount = snapshotInfo.getInt("hostCount") + participantCount + 1;
        snapshotInfo.put("hostCount", updatedCount);
        final byte[] serialized = snapshotInfo.toString(4).getBytes(Charsets.UTF_8);
        zooKeeper.setData(nodePath, serialized, nodeStat.getVersion());
      } catch (KeeperException.BadVersionException e) {
        // Someone else modified the node in between; re-read and try again.
        continue;
      } catch (Exception e) {
        VoltDB.crashLocalVoltDB("This ZK call should never fail", true, e);
      }

      return;
    }
  }
Exemple #4
0
    /**
     * Babysitter callback invoked with the current replica children of this partition.
     *
     * <p>Computes which replicas appeared and which vanished relative to the previously seen set.
     * During cluster start a vanished replica crashes the cluster, and a leader is assigned once
     * the number of children equals the expected replica count. After startup the k-safety,
     * replay-completion and partition-detection checks are run, and a new leader is appointed
     * if the current leader is among the vanished replicas. Finally the remembered replica set
     * is replaced by the new one.
     *
     * @param children the child node names reported for this partition
     */
    @Override
    public void run(List<String> children) {
      List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
      // compute previously unseen HSId set in the callback list
      Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
      newHSIds.removeAll(m_replicas);
      tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
      // compute previously seen but now vanished from the callback list HSId set
      Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
      missingHSIds.removeAll(updatedHSIds);
      tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

      tmLog.debug(
          "Handling babysitter callback for partition "
              + m_partitionId
              + ": children: "
              + CoreUtils.hsIdCollectionToString(updatedHSIds));
      if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (missingHSIds.size() > 0) {
          VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        try {
          JSONArray parts = m_topo.getJSONArray("partitions");
          for (int p = 0; p < parts.length(); p++) {
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            if (pid == m_partitionId) {
              replicaCount = aPartition.getJSONArray("replicas").length();
            }
          }
        } catch (JSONException ignored) {
          // Deliberately ignored: just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
          // Report against the same replicaCount that gates leader assignment above, so the
          // number stays right when the topology lists a count other than k_factor + 1.
          tmLog.info(
              "Waiting on "
                  + (replicaCount - children.size())
                  + " more nodes "
                  + "for k-safety before startup");
        }
      } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
          VoltDB.crashGlobalVoltDB(
              "Some partitions have no replicas.  Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (!m_replayComplete.get()) {
          VoltDB.crashGlobalVoltDB(
              "Detected node failure during command log replay. Cluster will shut down.",
              false,
              null);
        }
        // Check to see if there's been a possible network partition and we're not already handling
        // it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
          doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
      }
      // Remember the current children for the next callback's diff.
      m_replicas.clear();
      m_replicas.addAll(updatedHSIds);
    }
Exemple #5
0
 public InstanceId(JSONObject jsObj) throws JSONException {
   m_coord = jsObj.getInt("coord");
   m_timestamp = jsObj.getLong("timestamp");
 }
 /**
  * Restores the column index, table name and column name from their JSON form.
  *
  * @param obj JSON object keyed by the {@code Members} constant names
  * @param db not consulted by this implementation
  * @throws JSONException propagated from the JSON accessors
  */
 @Override
 protected void loadFromJSONObject(JSONObject obj, Database db) throws JSONException {
   final String columnIndexKey = Members.COLUMN_IDX.name();
   m_columnIndex = obj.getInt(columnIndexKey);

   final String tableNameKey = Members.TABLE_NAME.name();
   m_tableName = obj.getString(tableNameKey);

   final String columnNameKey = Members.COLUMN_NAME.name();
   m_columnName = obj.getString(columnNameKey);
 }
Exemple #7
0
  // XXX maybe consider an IOException subclass at some point
  /**
   * Opens a table save file by reading and validating its header.
   *
   * <p>The header is read in three pieces: an 8-byte prefix (CRC followed by the length of the
   * save/restore header), the save/restore header itself, and a length-prefixed VoltTable header
   * template. The CRC stored in the file is checked against the bytes read. If it only matches
   * when the first header byte (the "completed" flag) is treated as 1, the file is accepted but
   * marked as not completed. The save/restore header is then decoded either in the original
   * binary layout (fourth version number 0) or in the JSON layout (fourth version number 1 or 2).
   *
   * @param dataIn channel to read the save file from; stored and read from its current position
   * @param readAheadChunks number of permits given to the chunk-read semaphore
   * @param relevantPartitionIds partition ids to copy into the relevant-partition set, or
   *     {@code null} to store {@code null}
   * @param continueOnCorruptedChunk stored as-is for later chunk processing
   * @throws IOException if the file ends early, a header length is out of range, the checksum
   *     does not match, or the header content cannot be decoded
   */
  public TableSaveFile(
      FileChannel dataIn,
      int readAheadChunks,
      Integer[] relevantPartitionIds,
      boolean continueOnCorruptedChunk)
      throws IOException {
    try {
      EELibraryLoader.loadExecutionEngineLibrary(true);
      if (relevantPartitionIds == null) {
        m_relevantPartitionIds = null;
      } else {
        m_relevantPartitionIds = new HashSet<Integer>();
        for (Integer i : relevantPartitionIds) {
          m_relevantPartitionIds.add(i);
        }
      }
      m_chunkReads = new Semaphore(readAheadChunks);
      m_saveFile = dataIn;
      m_continueOnCorruptedChunk = continueOnCorruptedChunk;

      final PureJavaCrc32 crc = new PureJavaCrc32();
      /*
       * If the CRC check fails because the file wasn't completed
       */
      final PureJavaCrc32 secondCRC = new PureJavaCrc32();

      /*
       * Get the header with the save restore specific information
       */
      final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
      while (lengthBuffer.hasRemaining()) {
        final int read = m_saveFile.read(lengthBuffer);
        if (read == -1) {
          throw new EOFException();
        }
      }
      lengthBuffer.flip();
      final int originalCRC = lengthBuffer.getInt();
      int length = lengthBuffer.getInt();
      // The stored CRC covers everything after the CRC field itself, starting with the length.
      crc.update(lengthBuffer.array(), 4, 4);
      secondCRC.update(lengthBuffer.array(), 4, 4);

      if (length < 0) {
        throw new IOException("Corrupted save file has negative header length");
      }

      if (length > 2097152) {
        throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
      }

      final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
      while (saveRestoreHeader.hasRemaining()) {
        // A channel read may legitimately return fewer bytes than requested; only -1 means
        // end of file, so keep reading until the buffer is full.
        final int read = m_saveFile.read(saveRestoreHeader);
        if (read == -1) {
          throw new EOFException();
        }
      }
      saveRestoreHeader.flip();
      crc.update(saveRestoreHeader.array());
      // Second CRC: same bytes, but with the first byte (the completed flag) forced to 1.
      secondCRC.update(new byte[] {1});
      secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);

      /*
       *  Get the template for the VoltTable serialization header.
       *  It will have an extra length value preceded to it so that
       *  it can be sucked straight into a buffer. This will not
       *  contain a row count since that varies from chunk to chunk
       *  and is supplied by the chunk
       */
      lengthBuffer.clear();
      lengthBuffer.limit(4);
      // Loop like the other header reads so a short read is retried instead of leaving
      // stale bytes in the buffer that would then be fed to the CRC.
      while (lengthBuffer.hasRemaining()) {
        final int read = m_saveFile.read(lengthBuffer);
        if (read == -1) {
          throw new EOFException();
        }
      }
      crc.update(lengthBuffer.array(), 0, 4);
      secondCRC.update(lengthBuffer.array(), 0, 4);
      lengthBuffer.flip();
      length = lengthBuffer.getInt();

      if (length < 4) {
        throw new IOException(
            "Corrupted save file has negative length or too small length for VoltTable header");
      }

      if (length > 2097152) {
        throw new IOException(
            "Corrupted save file has unreasonable VoltTable header length > 2 megs");
      }

      // Keep the length prefix in front of the header bytes in the stored template.
      m_tableHeader = ByteBuffer.allocate(length + 4);
      m_tableHeader.putInt(length);
      while (m_tableHeader.hasRemaining()) {
        final int read = m_saveFile.read(m_tableHeader);
        if (read == -1) {
          throw new EOFException();
        }
      }
      crc.update(m_tableHeader.array(), 4, length);
      secondCRC.update(m_tableHeader.array(), 4, length);

      boolean failedCRCDueToNotCompleted = false;

      final int actualCRC = (int) crc.getValue();
      if (originalCRC != actualCRC) {
        /*
         * Check if the CRC mismatch is due to the snapshot not being completed
         */
        final int secondCRCValue = (int) secondCRC.getValue();
        if (secondCRCValue == originalCRC) {
          failedCRCDueToNotCompleted = true;
        } else {
          throw new IOException("Checksum mismatch");
        }
      }

      FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
      byte completedByte = fd.readByte();
      m_completed = !failedCRCDueToNotCompleted && completedByte == 1;
      for (int ii = 0; ii < 4; ii++) {
        m_versionNum[ii] = fd.readInt();
      }

      /*
       * Support the original pre 1.3 header format as well as a new JSON format.
       * JSON will make it possible to add info to a snapshot header without
       * breaking backwards compatibility.
       */
      if (m_versionNum[3] == 0) {
        m_txnId = fd.readLong();
        m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
        m_hostId = fd.readInt();
        m_hostname = fd.readString();
        m_clusterName = fd.readString();
        m_databaseName = fd.readString();
        m_tableName = fd.readString();
        m_isReplicated = fd.readBoolean();
        m_isCompressed = false;
        m_checksumType = ChecksumType.CRC32;
        if (!m_isReplicated) {
          m_partitionIds = (int[]) fd.readArray(int.class);
          // An incomplete file means every partition it holds is treated as corrupted.
          if (!m_completed) {
            for (Integer partitionId : m_partitionIds) {
              m_corruptedPartitions.add(partitionId);
            }
          }
          m_totalPartitions = fd.readInt();
        } else {
          m_partitionIds = new int[] {0};
          m_totalPartitions = 1;
          if (!m_completed) {
            m_corruptedPartitions.add(0);
          }
        }
        m_hasVersion2FormatChunks = false;
      } else {
        assert (m_versionNum[3] == 1 || m_versionNum[3] == 2);
        if (m_versionNum[3] >= 2) {
          m_hasVersion2FormatChunks = true;
        } else {
          m_hasVersion2FormatChunks = false;
        }
        int numJSONBytes = fd.readInt();
        if (numJSONBytes < 0) {
          // Report as a corrupt file instead of letting NegativeArraySizeException escape.
          throw new IOException("Corrupted save file has negative JSON header length");
        }
        byte jsonBytes[] = new byte[numJSONBytes];
        fd.readFully(jsonBytes);
        String jsonString = new String(jsonBytes, "UTF-8");
        JSONObject obj = new JSONObject(jsonString);

        m_txnId = obj.getLong("txnId");
        // Timestamp field added for 3.0, might not be there
        if (obj.has("timestamp")) {
          m_timestamp = obj.getLong("timestamp");
        } else {
          // Pre 3.0/IV2 the timestamp was in the transactionid
          m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
        }
        m_hostId = obj.getInt("hostId");
        m_hostname = obj.getString("hostname");
        m_clusterName = obj.getString("clusterName");
        m_databaseName = obj.getString("databaseName");
        m_tableName = obj.getString("tableName");
        m_isReplicated = obj.getBoolean("isReplicated");
        m_isCompressed = obj.optBoolean("isCompressed", false);
        m_checksumType = ChecksumType.valueOf(obj.optString("checksumType", "CRC32"));
        if (!m_isReplicated) {
          JSONArray partitionIds = obj.getJSONArray("partitionIds");
          m_partitionIds = new int[partitionIds.length()];
          for (int ii = 0; ii < m_partitionIds.length; ii++) {
            m_partitionIds[ii] = partitionIds.getInt(ii);
          }

          if (!m_completed) {
            for (Integer partitionId : m_partitionIds) {
              m_corruptedPartitions.add(partitionId);
            }
          }
          m_totalPartitions = obj.getInt("numPartitions");
        } else {
          m_partitionIds = new int[] {0};
          m_totalPartitions = 1;
          if (!m_completed) {
            m_corruptedPartitions.add(0);
          }
        }
      }
      /*
       * Several runtime exceptions can be thrown in valid failure cases where
       * a corrupt save file is being detected.
       */
    } catch (BufferUnderflowException e) {
      throw new IOException(e);
    } catch (BufferOverflowException e) {
      throw new IOException(e);
    } catch (IndexOutOfBoundsException e) {
      throw new IOException(e);
    } catch (JSONException e) {
      throw new IOException(e);
    }
  }