Exemplo n.º 1
0
 /**
  * Reacts to the outcome of the partition detection snapshot request.
  *
  * <p>A null response crashes the local node. A response whose status is not {@code
  * ClientResponse.SUCCESS} is logged and the snapshot is requested again with a fresh
  * timestamp-suffixed nonce. In every other case the cluster is crashed globally, with a message
  * stating whether the snapshot request succeeded.
  *
  * @param resp response to the snapshot initiation request
  */
 @Override
 public void handleResponse(ClientResponse resp) {
   // Guard: a missing response is treated as a local fatal error.
   if (resp == null) {
     final String nullResponseMessage =
         "Received a null response to a snapshot initiation request.  "
             + "This should be impossible.";
     VoltDB.crashLocalVoltDB(nullResponseMessage, true, null);
     return;
   }

   // Guard: the request itself failed, so log why and ask for the snapshot again.
   if (resp.getStatus() != ClientResponse.SUCCESS) {
     final String failureMessage =
         "Failed to complete partition detection snapshot, status: "
             + resp.getStatus()
             + ", reason: "
             + resp.getStatusString();
     tmLog.info(failureMessage);
     tmLog.info("Retrying partition detection snapshot...");
     SnapshotUtil.requestSnapshot(
         0L,
         m_partSnapshotSchedule.getPath(),
         m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
         true,
         SnapshotFormat.NATIVE,
         null,
         m_snapshotHandler,
         true);
     return;
   }

   // The request went through; either way the cluster is shut down from here.
   final boolean snapshotSucceeded = SnapshotUtil.didSnapshotRequestSucceed(resp.getResults());
   if (snapshotSucceeded) {
     VoltDB.crashGlobalVoltDB(
         "Partition detection snapshot completed. Shutting down.", false, null);
   } else {
     final String snapshotFailedMessage =
         "Unable to complete partition detection snapshot: " + resp.getResults()[0];
     VoltDB.crashGlobalVoltDB(snapshotFailedMessage, false, null);
   }
 }
Exemplo n.º 2
0
  /**
   * Compares the currently live hosts against the previously recorded set and acts on the
   * partition detection decision.
   *
   * <p>When {@code makePPDDecision} reports a partition, {@code m_partitionDetected} is set and
   * the cluster is either crashed immediately (command logging in use) or a native snapshot is
   * requested with {@code m_snapshotHandler} as the response handler. When no partition is
   * detected and the host set has changed, the new live node list is written out.
   *
   * @throws IllegalStateException if the live host ids cannot be read; the original failure is
   *     attached as the cause
   */
  private void doPartitionDetectionActivities() {
    // We should never re-enter here once we've decided we're partitioned and doomed
    assert (!m_partitionDetected);
    // After everything is resolved, write the new surviving set to ZK
    final List<Integer> currentNodes;
    try {
      currentNodes = m_hostMessenger.getLiveHostIds();
    } catch (Exception e) {
      // Previously this was swallowed, which left the list null and surfaced a few lines later as
      // a NullPointerException with no hint of the real cause.
      throw new IllegalStateException(
          "Unable to read the live host ids for partition detection", e);
    }
    Set<Integer> currentHosts = new HashSet<Integer>(currentNodes);
    Set<Integer> previousHosts = readPriorKnownLiveNodes();

    boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts);

    if (partitionDetectionTriggered) {
      m_partitionDetected = true;
      if (m_usingCommandLog) {
        // Just shut down immediately
        VoltDB.crashGlobalVoltDB(
            "Use of command logging detected, no additional database snapshot will "
                + "be generated.  Please use the 'recover' action to restore the database if necessary.",
            false,
            null);
      } else {
        SnapshotUtil.requestSnapshot(
            0L,
            m_partSnapshotSchedule.getPath(),
            m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
            true,
            SnapshotFormat.NATIVE,
            null,
            m_snapshotHandler,
            true);
      }
    } else if (!currentHosts.equals(previousHosts)) {
      // If the cluster host set has changed, then write the new set to ZK.
      // NOTE: we don't want to update the known live nodes if we've decided that our subcluster is
      // dying, otherwise a poorly timed subsequent failure might reverse this decision.  Any
      // future promoted LeaderAppointer should make their partition detection decision based on
      // the pre-partition cluster state.
      writeKnownLiveNodes(currentNodes);
    }
  }
Exemplo n.º 3
0
  /**
   * Builds the write plan for the requested snapshot format, submits the plan's deferred setup to
   * the snapshot IO work queue, and publishes the resulting per-site task lists.
   *
   * <p>The task list map, the success flag and the result table are all updated while holding
   * {@code m_createLock}. If the plan produced no tasks for this host, an empty task queue is
   * registered for the calling site so that one site still drives snapshot completion.
   *
   * @param file_path snapshot path; resolved through {@code SnapshotUtil.getRealPath} before use
   * @param pathType name of a {@code SnapshotPathType} constant
   * @param file_nonce snapshot nonce handed to the plan
   * @param format snapshot format that selects the write plan implementation
   * @param txnId transaction id handed to the plan and to the deferred setup
   * @param partitionTransactionIds per-partition transaction ids handed to the plan
   * @param jsData JSON data handed to the plan
   * @param context execution context of the calling site
   * @param result result table handed to the plan and stored in {@code m_createResult}
   * @param extraSnapshotData extra digest data handed to the plan
   * @param tracker site tracker handed to the plan
   * @param hashinatorData hashinator data handed to the plan
   * @param timestamp timestamp handed to the plan
   * @throws RuntimeException if {@code format} is not one of NATIVE, CSV, STREAM or INDEX
   */
  private void createSetupIv2(
      String file_path,
      final String pathType,
      final String file_nonce,
      SnapshotFormat format,
      final long txnId,
      final Map<Integer, Long> partitionTransactionIds,
      JSONObject jsData,
      final SystemProcedureExecutionContext context,
      final VoltTable result,
      ExtensibleSnapshotDigestData extraSnapshotData,
      SiteTracker tracker,
      HashinatorSnapshotData hashinatorData,
      long timestamp) {
    // Select the plan implementation that matches the requested format.
    final SnapshotWritePlan writePlan;
    if (format == SnapshotFormat.NATIVE) {
      writePlan = new NativeSnapshotWritePlan();
    } else if (format == SnapshotFormat.CSV) {
      writePlan = new CSVSnapshotWritePlan();
    } else if (format == SnapshotFormat.STREAM) {
      writePlan = new StreamSnapshotWritePlan();
    } else if (format == SnapshotFormat.INDEX) {
      writePlan = new IndexSnapshotWritePlan();
    } else {
      throw new RuntimeException("BAD BAD BAD");
    }

    final String resolvedPath =
        SnapshotUtil.getRealPath(SnapshotPathType.valueOf(pathType), file_path);

    final Callable<Boolean> setupTask =
        writePlan.createSetup(
            resolvedPath,
            pathType,
            file_nonce,
            txnId,
            partitionTransactionIds,
            jsData,
            context,
            result,
            extraSnapshotData,
            tracker,
            hashinatorData,
            timestamp);
    m_deferredSetupFuture =
        VoltDB.instance()
            .submitSnapshotIOWork(
                new DeferredSnapshotSetup(writePlan, setupTask, txnId, partitionTransactionIds));

    synchronized (m_createLock) {
      // Leftover tasks are unexpected at this point: report them, then start from a clean map.
      final boolean hasLingeringTasks = !m_taskListsForHSIds.isEmpty();
      if (hasLingeringTasks) {
        SNAP_LOG.warn("Found lingering snapshot tasks while setting up a snapshot");
      }
      m_taskListsForHSIds.clear();
      m_createSuccess.set(true);
      m_createResult.set(result);

      m_taskListsForHSIds.putAll(writePlan.getTaskListsForHSIds());

      // HACK HACK HACK.  With no tasks planned, this host has nothing to write for this
      // snapshot.  Register an empty task queue for the calling site so that a
      // SnapshotSiteProcessor still exists to do the logSnapshotCompleteToZK.
      if (m_taskListsForHSIds.isEmpty()) {
        SNAP_LOG.debug(
            "Node had no snapshot work to do.  Creating a null task to drive completion.");
        m_taskListsForHSIds.put(context.getSiteId(), new ArrayDeque<SnapshotTableTask>());
      }

      final String plannedHsIds =
          CoreUtils.hsIdCollectionToString(writePlan.getTaskListsForHSIds().keySet());
      SNAP_LOG.debug("Planned tasks: " + plannedHsIds);

      final String createdHsIds = CoreUtils.hsIdCollectionToString(m_taskListsForHSIds.keySet());
      SNAP_LOG.debug("Created tasks for HSIds: " + createdHsIds);
    }
  }
Exemplo n.º 4
0
  /**
   * The only public method: do all the work to start a snapshot. Assumes that a snapshot is
   * feasible, that the caller has validated it can be accomplished, that the caller knows this is a
   * consistent or useful transaction point at which to snapshot.
   *
   * <p>Every local site calls this method. Each one registers its partition transaction id under
   * the shared create lock, then waits on the setup barrier; the barrier action runs the snapshot
   * setup. Afterwards each site picks up its own task list (if any) and waits on the finish
   * barrier for up to 120 seconds.
   *
   * @param file_path path handed to the snapshot setup
   * @param file_nonce nonce handed to the snapshot setup
   * @param format snapshot format handed to the setup and to {@code initiateSnapshots}
   * @param block non-zero for a blocking snapshot, in which case this method completes the
   *     snapshot work and returns a partition results table; zero returns a node results table
   * @param multiPartTxnId multi-partition transaction id of the snapshot
   * @param partitionTxnId transaction id registered for the calling site's partition
   * @param legacyPerPartitionTxnIds transaction ids merged into the per-partition map for any
   *     partition that does not already have an entry
   * @param data opaque data handed to the snapshot setup
   * @param context execution context of the calling site
   * @param hostname host name written into result rows
   * @param hashinatorData hashinator data handed to the snapshot setup
   * @param timestamp timestamp handed to the snapshot setup
   * @return VoltTable describing the results of the snapshot attempt
   */
  public VoltTable startSnapshotting(
      final String file_path,
      final String file_nonce,
      final SnapshotFormat format,
      final byte block,
      final long multiPartTxnId,
      final long partitionTxnId,
      final long legacyPerPartitionTxnIds[],
      final String data,
      final SystemProcedureExecutionContext context,
      final String hostname,
      final HashinatorSnapshotData hashinatorData,
      final long timestamp) {
    TRACE_LOG.trace("Creating snapshot target and handing to EEs");
    final VoltTable result = SnapshotUtil.constructNodeResultsTable();
    final int numLocalSites =
        context.getCluster().getDeployment().get("deployment").getSitesperhost();

    // One site wins the race to create the snapshot targets, populating
    // m_taskListsForSites for the other sites and creating an appropriate
    // number of snapshot permits.
    synchronized (SnapshotSiteProcessor.m_snapshotCreateLock) {
      SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.set(
          new Runnable() {
            @Override
            public void run() {
              // Take ownership of the ids collected so far and start a fresh map for the next
              // snapshot.
              final Map<Integer, Long> partitionTransactionIds = m_partitionLastSeenTransactionIds;
              SNAP_LOG.debug("Last seen partition transaction ids " + partitionTransactionIds);
              m_partitionLastSeenTransactionIds = new HashMap<Integer, Long>();
              partitionTransactionIds.put(TxnEgo.getPartitionId(multiPartTxnId), multiPartTxnId);

              /*
               * Do a quick sanity check that the provided IDs
               * don't conflict with currently active partitions. If they do
               * it isn't fatal we can just skip it.
               */
              for (long txnId : legacyPerPartitionTxnIds) {
                final int legacyPartition = TxnEgo.getPartitionId(txnId);
                if (partitionTransactionIds.containsKey(legacyPartition)) {
                  SNAP_LOG.warn(
                      "While saving a snapshot and propagating legacy "
                          + "transaction ids found an id that matches currently active partition "
                          + partitionTransactionIds.get(legacyPartition));
                } else {
                  partitionTransactionIds.put(legacyPartition, txnId);
                }
              }
              exportSequenceNumbers = SnapshotSiteProcessor.getExportSequenceNumbers();
              createSetupIv2(
                  file_path,
                  file_nonce,
                  format,
                  multiPartTxnId,
                  partitionTransactionIds,
                  data,
                  context,
                  result,
                  exportSequenceNumbers,
                  context.getSiteTrackerForSnapshot(),
                  hashinatorData,
                  timestamp);
            }
          });

      // Create a barrier to use with the current number of sites to wait for
      // or if the barrier is already set up check if it is broken and reset if necessary
      SnapshotSiteProcessor.readySnapshotSetupBarriers(numLocalSites);

      // From within this EE, record the sequence numbers as of the start of the snapshot (now)
      // so that the info can be put in the digest.
      SnapshotSiteProcessor.populateExportSequenceNumbersForExecutionSite(context);
      SNAP_LOG.debug(
          "Registering transaction id "
              + partitionTxnId
              + " for "
              + TxnEgo.getPartitionId(partitionTxnId));
      m_partitionLastSeenTransactionIds.put(TxnEgo.getPartitionId(partitionTxnId), partitionTxnId);
    }

    boolean runPostTasks = false;
    // Remembers an interruption so the thread's interrupt status can be restored on the way out.
    boolean interrupted = false;
    VoltTable earlyResultTable = null;
    try {
      SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.await();
      try {
        synchronized (m_createLock) {
          SNAP_LOG.debug(
              "Found tasks for HSIds: "
                  + CoreUtils.hsIdCollectionToString(m_taskListsForHSIds.keySet()));
          SNAP_LOG.debug("Looking for local HSID: " + CoreUtils.hsIdToString(context.getSiteId()));
          Deque<SnapshotTableTask> taskList = m_taskListsForHSIds.remove(context.getSiteId());
          // If createSetup failed, then the first site to reach here is going
          // to send the results table generated by createSetup, and then empty out the table.
          // All other sites to reach here will send the appropriate empty table.
          // If createSetup was a success but the taskList is null, then we'll use the block
          // switch to figure out what flavor of empty SnapshotSave result table to return.
          if (!m_createSuccess.get()) {
            // There shouldn't be any work for any site if we failed
            assert (m_taskListsForHSIds.isEmpty());
            VoltTable finalresult = m_createResult.get();
            if (finalresult != null) {
              m_createResult.set(null);
              earlyResultTable = finalresult;
            } else {
              // We returned a non-empty NodeResultsTable with the failures in it,
              // every other site needs to return a NodeResultsTable as well.
              earlyResultTable = SnapshotUtil.constructNodeResultsTable();
            }
          } else if (taskList == null) {
            SNAP_LOG.debug("No task for this site, block " + block);
            // This node is participating in the snapshot but this site has nothing to do.
            // Send back an appropriate empty table based on the block flag
            if (block != 0) {
              runPostTasks = true;
              earlyResultTable = SnapshotUtil.constructPartitionResultsTable();
              earlyResultTable.addRow(
                  context.getHostId(),
                  hostname,
                  CoreUtils.getSiteIdFromHSId(context.getSiteId()),
                  "SUCCESS",
                  "");
            } else {
              earlyResultTable = SnapshotUtil.constructNodeResultsTable();
            }
          } else {
            context
                .getSiteSnapshotConnection()
                .initiateSnapshots(format, taskList, multiPartTxnId, exportSequenceNumbers);
          }

          if (m_deferredSetupFuture != null) {
            // Add a listener to the deferred setup so that it can kick off the snapshot
            // task once the setup is done.
            m_deferredSetupFuture.addListener(
                new Runnable() {
                  @Override
                  public void run() {
                    DeferredSnapshotSetup deferredSnapshotSetup = null;
                    try {
                      deferredSnapshotSetup = m_deferredSetupFuture.get();
                    } catch (Exception e) {
                      // Original author's note: "it doesn't throw". The assert below is the only
                      // guard if that assumption is ever wrong.
                    }

                    assert deferredSnapshotSetup != null;
                    context
                        .getSiteSnapshotConnection()
                        .startSnapshotWithTargets(
                            deferredSnapshotSetup.getPlan().getSnapshotDataTargets());
                  }
                },
                CoreUtils.SAMETHREADEXECUTOR);
          }
        }
      } finally {
        SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.await(120, TimeUnit.SECONDS);
      }
    } catch (TimeoutException e) {
      VoltDB.crashLocalVoltDB(
          "Timed out waiting 120 seconds for all threads to arrive and start snapshot", true, null);
    } catch (InterruptedException e) {
      interrupted = true;
      result.addRow(context.getHostId(), hostname, "", "FAILURE", CoreUtils.throwableToString(e));
      earlyResultTable = result;
    } catch (BrokenBarrierException e) {
      result.addRow(context.getHostId(), hostname, "", "FAILURE", CoreUtils.throwableToString(e));
      earlyResultTable = result;
    }

    // If earlyResultTable is set, return here
    if (earlyResultTable != null) {
      if (runPostTasks) {
        // Need to run post-snapshot tasks before finishing
        SnapshotSiteProcessor.runPostSnapshotTasks(context);
      }
      if (interrupted) {
        // Restore the interrupt status only after the post-snapshot tasks have run, so they see
        // the same thread state as before, while the caller can still observe the interruption.
        Thread.currentThread().interrupt();
      }
      return earlyResultTable;
    }

    if (block != 0) {
      HashSet<Exception> failures = Sets.newHashSet();
      String status = "SUCCESS";
      String err = "";
      try {
        // For blocking snapshot, propagate the error from deferred setup back to the client
        final DeferredSnapshotSetup deferredSnapshotSetup = m_deferredSetupFuture.get();
        if (deferredSnapshotSetup != null && deferredSnapshotSetup.getError() != null) {
          status = "FAILURE";
          err = deferredSnapshotSetup.getError().toString();
          failures.add(deferredSnapshotSetup.getError());
        }

        failures.addAll(context.getSiteSnapshotConnection().completeSnapshotWork());
        SnapshotSiteProcessor.runPostSnapshotTasks(context);
      } catch (Exception e) {
        status = "FAILURE";
        err = e.toString();
        failures.add(e);
      }
      final VoltTable blockingResult = SnapshotUtil.constructPartitionResultsTable();

      if (!failures.isEmpty()) {
        // Report every collected failure rather than only whichever one was iterated last.
        status = "FAILURE";
        final StringBuilder allErrors = new StringBuilder();
        for (Exception e : failures) {
          if (allErrors.length() > 0) {
            allErrors.append("; ");
          }
          allErrors.append(e.toString());
        }
        err = allErrors.toString();
      }
      blockingResult.addRow(
          context.getHostId(),
          hostname,
          CoreUtils.getSiteIdFromHSId(context.getSiteId()),
          status,
          err);
      return blockingResult;
    }

    return result;
  }
  /**
   * Opens the snapshot file for one table, writes its header, waits for that write to finish, and
   * schedules the periodic sync task.
   *
   * <p>The header consists of a CRC, the header length, a one-byte completion flag, four version
   * ints, and a length-prefixed JSON digest, followed by the table schema bytes. The CRC is
   * computed over everything after the CRC field with the completion flag set to 1; the flag is
   * then written as 0 because the file is not yet complete.
   *
   * <p>If construction fails at any point after the file has been opened, the file output stream
   * is closed before the exception propagates.
   *
   * @param file file to write the snapshot data to
   * @param hostId host id recorded in the JSON digest
   * @param clusterName cluster name recorded in the JSON digest
   * @param databaseName database name recorded in the JSON digest
   * @param tableName table name; recorded upper-cased in the JSON digest
   * @param numPartitions partition count, recorded only for non-replicated tables
   * @param isReplicated whether the table is replicated
   * @param partitionIds partition ids, recorded only for non-replicated tables
   * @param schemaTable table whose schema bytes are appended to the header
   * @param txnId transaction id recorded in the JSON digest
   * @param timestamp timestamp recorded in the JSON digest, raw and human readable
   * @param version version numbers; the first four entries are written to the header
   * @throws IOException if the header cannot be built or written, or if the thread is
   *     interrupted while waiting for the header write
   */
  public DefaultSnapshotDataTarget(
      final File file,
      final int hostId,
      final String clusterName,
      final String databaseName,
      final String tableName,
      final int numPartitions,
      final boolean isReplicated,
      final List<Integer> partitionIds,
      final VoltTable schemaTable,
      final long txnId,
      final long timestamp,
      int version[])
      throws IOException {
    String hostname = CoreUtils.getHostnameOrAddress();
    m_file = file;
    m_tableName = tableName;
    m_fos = new FileOutputStream(file);
    m_channel = m_fos.getChannel();
    m_needsFinalClose = !isReplicated;

    // Flipped to true only once everything below has succeeded; any other exit closes the stream.
    boolean constructed = false;
    try {
      final FastSerializer fs = new FastSerializer();
      fs.writeInt(0); // CRC
      fs.writeInt(0); // Header length placeholder
      // Indicate the snapshot was not completed, set to true for the CRC calculation, false later
      fs.writeByte(1);
      for (int ii = 0; ii < 4; ii++) {
        fs.writeInt(version[ii]); // version
      }
      JSONStringer stringer = new JSONStringer();
      byte jsonBytes[] = null;
      try {
        stringer.object();
        stringer.key("txnId").value(txnId);
        stringer.key("hostId").value(hostId);
        stringer.key("hostname").value(hostname);
        stringer.key("clusterName").value(clusterName);
        stringer.key("databaseName").value(databaseName);
        stringer.key("tableName").value(tableName.toUpperCase());
        stringer.key("isReplicated").value(isReplicated);
        stringer.key("isCompressed").value(true);
        stringer.key("checksumType").value("CRC32C");
        stringer.key("timestamp").value(timestamp);
        /*
         * The timestamp string is for human consumption, automated stuff should use
         * the actual timestamp
         */
        stringer.key("timestampString").value(SnapshotUtil.formatHumanReadableDate(timestamp));
        if (!isReplicated) {
          stringer.key("partitionIds").array();
          for (int partitionId : partitionIds) {
            stringer.value(partitionId);
          }
          stringer.endArray();

          stringer.key("numPartitions").value(numPartitions);
        }
        stringer.endObject();
        String jsonString = stringer.toString();
        JSONObject jsonObj = new JSONObject(jsonString);
        jsonString = jsonObj.toString(4);
        jsonBytes = jsonString.getBytes("UTF-8");
      } catch (Exception e) {
        throw new IOException(e);
      }
      fs.writeInt(jsonBytes.length);
      fs.write(jsonBytes);

      // Back-fill the header length placeholder that follows the 4-byte CRC field.
      final BBContainer container = fs.getBBContainer();
      container.b.position(4);
      container.b.putInt(container.b.remaining() - 4);
      container.b.position(0);

      final byte schemaBytes[] = PrivateVoltTableFactory.getSchemaBytes(schemaTable);

      final PureJavaCrc32 crc = new PureJavaCrc32();
      ByteBuffer aggregateBuffer =
          ByteBuffer.allocate(container.b.remaining() + schemaBytes.length);
      aggregateBuffer.put(container.b);
      aggregateBuffer.put(schemaBytes);
      aggregateBuffer.flip();
      // The CRC covers everything except the CRC field itself.
      crc.update(aggregateBuffer.array(), 4, aggregateBuffer.capacity() - 4);

      final int crcValue = (int) crc.getValue();
      aggregateBuffer.putInt(crcValue).position(8);
      aggregateBuffer.put((byte) 0).position(0); // Haven't actually finished writing file

      if (m_simulateFullDiskWritingHeader) {
        m_writeException = new IOException("Disk full");
        m_writeFailed = true;
        throw m_writeException;
      }

      /*
       * Be completely sure the write succeeded. If it didn't
       * the disk is probably full or the path is bunk etc.
       */
      m_acceptOneWrite = true;
      ListenableFuture<?> writeFuture =
          write(Callables.returning((BBContainer) DBBPool.wrapBB(aggregateBuffer)), false);
      try {
        writeFuture.get();
      } catch (InterruptedException e) {
        // Keep the interruption visible to callers that only see the IOException.
        Thread.currentThread().interrupt();
        throw new java.io.InterruptedIOException();
      } catch (ExecutionException e) {
        if (m_writeException != null) {
          throw m_writeException;
        }
        // No write exception was recorded; surface the real cause instead of throwing null.
        throw new IOException(e.getCause() != null ? e.getCause() : e);
      }
      if (m_writeFailed) {
        throw m_writeException;
      }

      m_syncTask =
          m_syncService.scheduleAtFixedRate(
              new Runnable() {
                @Override
                public void run() {
                  // Only sync for at least 4 megabyte of data, enough to amortize the cost of
                  // seeking on ye olden platters. Since we are appending to a file it's actually
                  // 2 seeks.
                  while (m_bytesWrittenSinceLastSync.get() > (1024 * 1024 * 4)) {
                    final int bytesSinceLastSync = m_bytesWrittenSinceLastSync.getAndSet(0);
                    try {
                      m_channel.force(false);
                    } catch (IOException e) {
                      if (!(e instanceof java.nio.channels.AsynchronousCloseException)) {
                        SNAP_LOG.error("Error syncing snapshot", e);
                      } else {
                        SNAP_LOG.debug(
                            "Asynchronous close syncing snasphot data, presumably graceful", e);
                      }
                    }
                    m_bytesAllowedBeforeSync.release(bytesSinceLastSync);
                  }
                }
              },
              SNAPSHOT_SYNC_FREQUENCY,
              SNAPSHOT_SYNC_FREQUENCY,
              TimeUnit.MILLISECONDS);
      constructed = true;
    } finally {
      if (!constructed) {
        try {
          m_fos.close();
        } catch (IOException ignored) {
          // The failure that aborted construction is more useful than a secondary close failure.
        }
      }
    }
  }