Example #1
 public static void logFragmentTaskMessage(
     FragmentTaskMessage ftask, long localHSId, long spHandle, boolean borrow) {
   if (iv2log.isTraceEnabled()) {
     String label = "rxFragMsg";
     if (borrow) {
       label = "rxBrrwMsg";
     }
     if (ftask.getSpHandle() != Long.MIN_VALUE && ftask.getSpHandle() != spHandle) {
       iv2log.error(
           "FragmentTaskMessage SP HANDLE conflict.  Message: "
               + ftask.getSpHandle()
               + ", locally held: "
               + spHandle);
     }
     String logmsg = "%s %s from %s txnId %s spHandle %s trunc %s";
     iv2log.trace(
         String.format(
             logmsg,
             label,
             CoreUtils.hsIdToString(localHSId),
             CoreUtils.hsIdToString(ftask.m_sourceHSId),
             txnIdToString(ftask.getTxnId()),
             txnIdToString(spHandle),
             txnIdToString(ftask.getTruncationHandle())));
   }
 }
Example #2
  @Override
  public void run() {
    initialize();

    try {
      while (m_shouldContinue) {
        // Normal operation blocks the site thread on the sitetasker queue.
        SiteTasker task = m_scheduler.take();
        task.run(getSiteProcedureConnection());
      }
    } catch (OutOfMemoryError e) {
      // Even though OOM should be caught by the Throwable section below,
      // it sadly needs to be handled separately. The goal here is to make
      // sure VoltDB crashes.
      String errmsg =
          "Site: "
              + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId)
              + " ran out of Java memory. "
              + "This node will shut down.";
      VoltDB.crashLocalVoltDB(errmsg, true, e);
    } catch (Throwable t) {
      String errmsg =
          "Site: "
              + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId)
              + " encountered an "
              + "unexpected error and will die, taking this VoltDB node down.";
      VoltDB.crashLocalVoltDB(errmsg, true, t);
    }
    shutdown();
  }
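The loop above is a standard take-and-run worker pattern over a blocking task queue. Below is a minimal, self-contained sketch of that pattern; Task and the queue are illustrative stand-ins, not the real SiteTasker/scheduler types.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Minimal sketch of the take-and-run site loop. Task and the queue are
// stand-ins for SiteTasker and the scheduler queue used above.
class SiteLoopSketch implements Runnable {
  interface Task {
    void run();
  }

  private final BlockingQueue<Task> queue = new LinkedBlockingQueue<>();
  private volatile boolean shouldContinue = true;

  void offer(Task t) {
    queue.offer(t);
  }

  // Note: in this simplified sketch, stop() only takes effect after the
  // next task is processed, since take() blocks until a task arrives.
  void stop() {
    shouldContinue = false;
  }

  @Override
  public void run() {
    try {
      while (shouldContinue) {
        // Normal operation blocks the worker thread on the task queue.
        Task task = queue.take();
        task.run();
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    }
  }
}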
Example #3
 public void handleDumpMessage() {
   String who = CoreUtils.hsIdToString(m_mailbox.getHSId());
   hostLog.warn("State dump for site: " + who);
   hostLog.warn("" + who + ": partition: " + m_partitionId + ", isLeader: " + m_isLeader);
   if (m_isLeader) {
     hostLog.warn("" + who + ": replicas: " + CoreUtils.hsIdCollectionToString(m_replicaHSIds));
     if (m_sendToHSIds.length > 0) {
       m_mailbox.send(m_sendToHSIds, new DumpMessage());
     }
   }
   hostLog.warn(
       ""
           + who
           + ": most recent SP handle: "
           + getCurrentTxnId()
           + " "
           + TxnEgo.txnIdToString(getCurrentTxnId()));
   hostLog.warn(
       ""
           + who
           + ": outstanding txns: "
           + m_outstandingTxns.keySet()
           + " "
           + TxnEgo.txnIdCollectionToString(m_outstandingTxns.keySet()));
   hostLog.warn("" + who + ": TransactionTaskQueue: " + m_pendingTasks.toString());
   if (m_duplicateCounters.size() > 0) {
     hostLog.warn("" + who + ": duplicate counters: ");
     for (Entry<DuplicateCounterKey, DuplicateCounter> e : m_duplicateCounters.entrySet()) {
       hostLog.warn("\t" + who + ": " + e.getKey().toString() + ": " + e.getValue().toString());
     }
   }
 }
Example #4
 public static void logIv2InitiateTaskMessage(
     Iv2InitiateTaskMessage itask, long localHSId, long txnid, long spHandle) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "rxInitMsg %s from %s ciHandle %s txnId %s spHandle %s trunc %s";
     if (itask.getTxnId() != Long.MIN_VALUE && itask.getTxnId() != txnid) {
       iv2log.error(
           "Iv2InitiateTaskMessage TXN ID conflict.  Message: "
               + itask.getTxnId()
               + ", locally held: "
               + txnid);
     }
     if (itask.getSpHandle() != Long.MIN_VALUE && itask.getSpHandle() != spHandle) {
       iv2log.error(
           "Iv2InitiateTaskMessage SP HANDLE conflict.  Message: "
               + itask.getSpHandle()
               + ", locally held: "
               + spHandle);
     }
     iv2log.trace(
         String.format(
             logmsg,
             CoreUtils.hsIdToString(localHSId),
             CoreUtils.hsIdToString(itask.m_sourceHSId),
             ClientInterfaceHandleManager.handleToString(itask.getClientInterfaceHandle()),
             txnIdToString(txnid),
             txnIdToString(spHandle),
             txnIdToString(itask.getTruncationHandle())));
   }
 }
Example #5
 public static void logInitiatorRxMsg(VoltMessage msg, long localHSId) {
   if (iv2log.isTraceEnabled()) {
     if (msg instanceof InitiateResponseMessage) {
       InitiateResponseMessage iresp = (InitiateResponseMessage) msg;
       String logmsg = "rxInitRsp %s from %s ciHandle %s txnId %s spHandle %s status %s";
       iv2log.trace(
           String.format(
               logmsg,
               CoreUtils.hsIdToString(localHSId),
               CoreUtils.hsIdToString(iresp.m_sourceHSId),
               ClientInterfaceHandleManager.handleToString(iresp.getClientInterfaceHandle()),
               txnIdToString(iresp.getTxnId()),
               txnIdToString(iresp.getSpHandle()),
               respStatusToString(iresp.getClientResponseData().getStatus())));
     } else if (msg instanceof FragmentResponseMessage) {
       FragmentResponseMessage fresp = (FragmentResponseMessage) msg;
       String logmsg = "rxFragRsp %s from %s txnId %s spHandle %s status %s";
       iv2log.trace(
           String.format(
               logmsg,
               CoreUtils.hsIdToString(localHSId),
               CoreUtils.hsIdToString(fresp.m_sourceHSId),
               txnIdToString(fresp.getTxnId()),
               txnIdToString(fresp.getSpHandle()),
               fragStatusToString(fresp.getStatusCode())));
     }
   }
 }
Example #6
  /** Runs when the RejoinCoordinator decides this site should start rejoin. */
  void doInitiation(RejoinMessage message) {
    m_coordinatorHsId = message.m_sourceHSId;
    m_streamSnapshotMb = VoltDB.instance().getHostMessenger().createMailbox();
    m_rejoinSiteProcessor = new StreamSnapshotSink(m_streamSnapshotMb);

    // MUST choose the leader as the source.
    long sourceSite = m_mailbox.getMasterHsId(m_partitionId);
    long hsId =
        m_rejoinSiteProcessor.initialize(
            message.getSnapshotSourceCount(), message.getSnapshotBufferPool());

    REJOINLOG.debug(
        m_whoami
            + "received INITIATION message. Doing rejoin"
            + ". Source site is: "
            + CoreUtils.hsIdToString(sourceSite)
            + " and destination rejoin processor is: "
            + CoreUtils.hsIdToString(hsId)
            + " and snapshot nonce is: "
            + message.getSnapshotNonce());

    registerSnapshotMonitor(message.getSnapshotNonce());
    // Tell the RejoinCoordinator everything it will need to know to get us our snapshot stream.
    RejoinMessage initResp = new RejoinMessage(m_mailbox.getHSId(), sourceSite, hsId);
    m_mailbox.send(m_coordinatorHsId, initResp);

    // Start waiting for snapshot data
    m_taskQueue.offer(this);
  }
Example #7
 public static void logTopology(long leaderHSId, List<Long> replicas, int partitionId) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "topology partition %d leader %s replicas (%s)";
     iv2log.trace(
         String.format(
             logmsg,
             partitionId,
             CoreUtils.hsIdToString(leaderHSId),
             CoreUtils.hsIdCollectionToString(replicas)));
   }
 }
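For illustration, this is roughly what the trace line produced above looks like; the partition number and the host:site strings are made-up values standing in for CoreUtils.hsIdToString / hsIdCollectionToString output.

// Illustrative only: approximates the shape of the "topology" trace line.
class TopologyTraceExample {
  public static void main(String[] args) {
    String line =
        String.format(
            "topology partition %d leader %s replicas (%s)", 7, "0:3", "0:3, 1:3, 2:3");
    System.out.println(line);
    // prints: topology partition 7 leader 0:3 replicas (0:3, 1:3, 2:3)
  }
}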
Example #8
 /** Create a native VoltDB execution engine */
 ExecutionEngine initializeEE(String serializedCatalog, final long timestamp) {
   String hostname = CoreUtils.getHostnameOrAddress();
   ExecutionEngine eeTemp = null;
   try {
     if (m_backend == BackendTarget.NATIVE_EE_JNI) {
       eeTemp =
           new ExecutionEngineJNI(
               m_context.cluster.getRelativeIndex(),
               m_siteId,
               m_partitionId,
               CoreUtils.getHostIdFromHSId(m_siteId),
               hostname,
               m_context
                   .cluster
                   .getDeployment()
                   .get("deployment")
                   .getSystemsettings()
                   .get("systemsettings")
                   .getMaxtemptablesize(),
               m_numberOfPartitions);
       eeTemp.loadCatalog(timestamp, serializedCatalog);
     } else {
       // set up the EE over IPC
       eeTemp =
           new ExecutionEngineIPC(
               m_context.cluster.getRelativeIndex(),
               m_siteId,
               m_partitionId,
               CoreUtils.getHostIdFromHSId(m_siteId),
               hostname,
               m_context
                   .cluster
                   .getDeployment()
                   .get("deployment")
                   .getSystemsettings()
                   .get("systemsettings")
                   .getMaxtemptablesize(),
               m_backend,
               VoltDB.instance().getConfig().m_ipcPorts.remove(0),
               m_numberOfPartitions);
       eeTemp.loadCatalog(timestamp, serializedCatalog);
     }
   }
    // just print error info and bail if we run into an error here
   catch (final Exception ex) {
     hostLog.l7dlog(
         Level.FATAL,
         LogKeys.host_ExecutionSite_FailedConstruction.name(),
         new Object[] {m_siteId, m_siteIndex},
         ex);
     VoltDB.crashLocalVoltDB(ex.getMessage(), true, ex);
   }
   return eeTemp;
 }
Example #9
 public static void logFinishTransaction(InitiateResponseMessage msg, long localHSId) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "finishTxn %s ciHandle %s initHSId %s status %s";
     iv2log.trace(
         String.format(
             logmsg,
             CoreUtils.hsIdToString(localHSId),
             ClientInterfaceHandleManager.handleToString(msg.getClientInterfaceHandle()),
             CoreUtils.hsIdToString(msg.getCoordinatorHSId()),
             respStatusToString(msg.getClientResponseData().getStatus())));
   }
 }
 /** Process a new repair log response */
 @Override
 public void deliver(VoltMessage message) {
   if (message instanceof Iv2RepairLogResponseMessage) {
     Iv2RepairLogResponseMessage response = (Iv2RepairLogResponseMessage) message;
     if (response.getRequestId() != m_requestId) {
       tmLog.debug(
           m_whoami
               + "rejecting stale repair response."
               + " Current request id is: "
               + m_requestId
               + " Received response for request id: "
               + response.getRequestId());
       return;
     }
     ReplicaRepairStruct rrs = m_replicaRepairStructs.get(response.m_sourceHSId);
     if (rrs.m_expectedResponses < 0) {
       tmLog.debug(
           m_whoami
               + "collecting "
               + response.getOfTotal()
               + " repair log entries from "
               + CoreUtils.hsIdToString(response.m_sourceHSId));
     }
     // Long.MAX_VALUE has rejoin semantics
     if (response.getHandle() != Long.MAX_VALUE) {
       m_maxSeenTxnId = Math.max(m_maxSeenTxnId, response.getHandle());
     }
     if (response.getPayload() != null) {
       m_repairLogUnion.add(response);
       if (tmLog.isTraceEnabled()) {
         tmLog.trace(
             m_whoami
                 + " collected from "
                 + CoreUtils.hsIdToString(response.m_sourceHSId)
                 + ", message: "
                 + response.getPayload());
       }
     }
     if (rrs.update(response)) {
       tmLog.debug(
           m_whoami
               + "collected "
               + rrs.m_receivedResponses
               + " responses for "
               + rrs.m_expectedResponses
               + " repair log entries from "
               + CoreUtils.hsIdToString(response.m_sourceHSId));
       if (areRepairLogsComplete()) {
         repairSurvivors();
       }
     }
   }
 }
Example #11
 public static void logIv2MultipartSentinel(
     MultiPartitionParticipantMessage message, long localHSId, long txnId) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "rxSntlMsg %s from %s txnId %s";
     iv2log.trace(
         String.format(
             logmsg,
             CoreUtils.hsIdToString(localHSId),
             CoreUtils.hsIdToString(message.m_sourceHSId),
             txnIdToString(txnId)));
   }
 }
Example #12
 public static void logCreateTransaction(Iv2InitiateTaskMessage msg) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "createTxn %s ciHandle %s initHSId %s proc %s";
     iv2log.trace(
         String.format(
             logmsg,
             CoreUtils.hsIdToString(msg.getInitiatorHSId()),
             ClientInterfaceHandleManager.handleToString(msg.getClientInterfaceHandle()),
             CoreUtils.hsIdToString(msg.getCoordinatorHSId()),
             msg.getStoredProcedureInvocation().getProcName()));
   }
 }
Example #13
 public long getBuddySiteForMPI(long hsid) {
   int host = CoreUtils.getHostIdFromHSId(hsid);
   // We'll be lazy and get the map we'd feed to SiteTracker's
   // constructor, then go looking for a matching host ID.
   List<MailboxNodeContent> sitesList = getMailboxNodeContentList();
   for (MailboxNodeContent site : sitesList) {
     if (site.partitionId != MpInitiator.MP_INIT_PID
         && host == CoreUtils.getHostIdFromHSId(site.HSId)) {
       return site.HSId;
     }
   }
   throw new RuntimeException(
       "Unable to find a buddy initiator for MPI with HSID: " + CoreUtils.hsIdToString(hsid));
 }
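The buddy lookup above depends on an HSId encoding both a host ID and a site ID in a single long (see CoreUtils.getHostIdFromHSId and getHSIdFromHostAndSite elsewhere in these examples). The sketch below shows that kind of packing; the exact bit layout is an assumption made for illustration, not necessarily VoltDB's actual encoding.

// Sketch of packing a host ID and a site ID into one long "HSId".
// The layout (site in the high 32 bits, host in the low 32 bits) is an
// assumption made for illustration only.
final class HsIdSketch {
  static long toHsId(int hostId, int siteId) {
    return (((long) siteId) << 32) | (hostId & 0xFFFFFFFFL);
  }

  static int hostIdOf(long hsId) {
    return (int) (hsId & 0xFFFFFFFFL);
  }

  static int siteIdOf(long hsId) {
    return (int) (hsId >>> 32);
  }

  public static void main(String[] args) {
    long hsId = toHsId(2, 5);
    // Two sites on the same host share the same hostIdOf() value, which is
    // exactly the property getBuddySiteForMPI relies on.
    System.out.println(hostIdOf(hsId) + ":" + siteIdOf(hsId)); // prints 2:5
  }
}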
Example #14
 public static void logCompleteTransactionMessage(
     CompleteTransactionMessage ctask, long localHSId) {
   if (iv2log.isTraceEnabled()) {
     String logmsg = "rxCompMsg %s from %s txnId %s %s %s";
     iv2log.trace(
         String.format(
             logmsg,
             CoreUtils.hsIdToString(localHSId),
             CoreUtils.hsIdToString(ctask.m_sourceHSId),
             txnIdToString(ctask.getTxnId()),
             ctask.isRollback() ? "ROLLBACK" : "COMMIT",
             ctask.isRestart() ? "RESTART" : ""));
   }
 }
Example #15
  private long assignLeader(int partitionId, List<Long> children) {
    // We used masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
      try {
        // find master in topo
        JSONArray parts = m_topo.getJSONArray("partitions");
        for (int p = 0; p < parts.length(); p++) {
          JSONObject aPartition = parts.getJSONObject(p);
          int pid = aPartition.getInt("partition_id");
          if (pid == partitionId) {
            masterHostId = aPartition.getInt("master");
          }
        }
      } catch (JSONException jse) {
        tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0");
        jse.printStackTrace();
        masterHostId = -1; // stupid default
      }
    } else {
      // For now, if we're appointing a new leader as a result of a
      // failure, just pick the first replica in the children list.
      // Could eventually do something more complex here to try to keep a
      // semi-balance, but it's unclear that this has much utility until
      // we add rebalancing on rejoin as well.
      masterHostId = -1;
    }

    long masterHSId = children.get(0);
    for (Long child : children) {
      if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
        masterHSId = child;
        break;
      }
    }
    tmLog.info(
        "Appointing HSId "
            + CoreUtils.hsIdToString(masterHSId)
            + " as leader for partition "
            + partitionId);
    try {
      m_iv2appointees.put(partitionId, masterHSId);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
  }
Example #16
  @Override
  protected void updateStatsRow(Object rowKey, Object[] rowValues) {
    long leader;
    List<Long> sites = new ArrayList<Long>();
    if (rowKey.equals(MpInitiator.MP_INIT_PID)) {
      leader = getHSIdForMultiPartitionInitiator();
      sites.add(leader);
    } else {
      leader = m_iv2Masters.pointInTimeCache().get((Integer) rowKey);
      sites.addAll(getReplicasForPartition((Integer) rowKey));
    }

    rowValues[columnNameToIndex.get("Partition")] = rowKey;
    rowValues[columnNameToIndex.get("Sites")] = CoreUtils.hsIdCollectionToString(sites);
    rowValues[columnNameToIndex.get("Leader")] = CoreUtils.hsIdToString(leader);
  }
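updateStatsRow fills the row array positionally through a column-name-to-index map. Below is a minimal sketch of that convention using the same column names; the map contents and values are hypothetical.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

// Minimal sketch of the name-to-index row-filling convention used above.
class StatsRowSketch {
  public static void main(String[] args) {
    Map<String, Integer> columnNameToIndex = new HashMap<>();
    columnNameToIndex.put("Partition", 0);
    columnNameToIndex.put("Sites", 1);
    columnNameToIndex.put("Leader", 2);

    Object[] rowValues = new Object[columnNameToIndex.size()];
    rowValues[columnNameToIndex.get("Partition")] = 7;
    rowValues[columnNameToIndex.get("Sites")] = "0:3, 1:3";
    rowValues[columnNameToIndex.get("Leader")] = "0:3";
    System.out.println(Arrays.toString(rowValues)); // [7, 0:3, 1:3, 0:3]
  }
}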
Example #17
  @Override
  public void run() {
    Thread.currentThread().setName("Iv2ExecutionSite: " + CoreUtils.hsIdToString(m_siteId));
    initialize(m_startupConfig.m_serializedCatalog, m_startupConfig.m_timestamp);
    m_startupConfig = null; // release the serializedCatalog bytes.

    try {
      while (m_shouldContinue) {
        if (m_rejoinState == kStateRunning) {
          // Normal operation blocks the site thread on the sitetasker queue.
          SiteTasker task = m_scheduler.take();
          if (task instanceof TransactionTask) {
            m_currentTxnId = ((TransactionTask) task).getTxnId();
            m_lastTxnTime = EstTime.currentTimeMillis();
          }
          task.run(getSiteProcedureConnection());
        } else {
          // Rejoin operation poll and try to do some catchup work. Tasks
          // are responsible for logging any rejoin work they might have.
          SiteTasker task = m_scheduler.poll();
          if (task != null) {
            task.runForRejoin(getSiteProcedureConnection(), m_rejoinTaskLog);
          }
          replayFromTaskLog();
        }
      }
    } catch (OutOfMemoryError e) {
      // Even though OOM should be caught by the Throwable section below,
      // it sadly needs to be handled separately. The goal here is to make
      // sure VoltDB crashes.
      String errmsg =
          "Site: "
              + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId)
              + " ran out of Java memory. "
              + "This node will shut down.";
      VoltDB.crashLocalVoltDB(errmsg, true, e);
    } catch (Throwable t) {
      String errmsg =
          "Site: "
              + org.voltcore.utils.CoreUtils.hsIdToString(m_siteId)
              + " encountered an "
              + "unexpected error and will die, taking this VoltDB node down.";
      VoltDB.crashLocalVoltDB(errmsg, true, t);
    }
    shutdown();
  }
Example #18
 @Override
 public String toString() {
   StringBuilder sb = new StringBuilder();
   sb.append("MpProcedureTask:");
   sb.append("  TXN ID: ").append(TxnEgo.txnIdToString(getTxnId()));
   sb.append("  SP HANDLE ID: ").append(TxnEgo.txnIdToString(getSpHandle()));
   sb.append("  ON HSID: ").append(CoreUtils.hsIdToString(m_initiator.getHSId()));
   return sb.toString();
 }
  /** Send missed-messages to survivors. */
  public void repairSurvivors() {
    // cancel() and repair() must be synchronized by the caller (the deliver lock,
    // currently). If cancelled and the last repair message arrives, don't send
    // out corrections!
    if (this.m_promotionResult.isCancelled()) {
      tmLog.debug(m_whoami + "Skipping repair message creation for cancelled Term.");
      return;
    }

    int queued = 0;
    tmLog.debug(m_whoami + "received all repair logs and is repairing surviving replicas.");
    for (Iv2RepairLogResponseMessage li : m_repairLogUnion) {
      List<Long> needsRepair = new ArrayList<Long>(5);
      for (Entry<Long, ReplicaRepairStruct> entry : m_replicaRepairStructs.entrySet()) {
        if (entry.getValue().needs(li.getHandle())) {
          ++queued;
          tmLog.debug(
              m_whoami
                  + "repairing "
                  + CoreUtils.hsIdToString(entry.getKey())
                  + ". Max seen "
                  + entry.getValue().m_maxSpHandleSeen
                  + ". Repairing with "
                  + li.getHandle());
          needsRepair.add(entry.getKey());
        }
      }
      if (!needsRepair.isEmpty()) {
        if (tmLog.isTraceEnabled()) {
          tmLog.trace(
              m_whoami
                  + "repairing: "
                  + CoreUtils.hsIdCollectionToString(needsRepair)
                  + " with message: "
                  + li.getPayload());
        }
        m_mailbox.repairReplicasWith(needsRepair, li.getPayload());
      }
    }
    tmLog.debug(m_whoami + "finished queuing " + queued + " replica repair messages.");

    m_promotionResult.done(m_maxSeenTxnId);
  }
 /** Notify the coordinator that this site has received the first fragment message */
 private void sendFirstFragResponse() {
   if (JOINLOG.isDebugEnabled()) {
     JOINLOG.debug(
         "P"
             + m_partitionId
             + " sending first fragment response to coordinator "
             + CoreUtils.hsIdToString(m_coordinatorHsId));
   }
   RejoinMessage msg =
       new RejoinMessage(m_mailbox.getHSId(), RejoinMessage.Type.FIRST_FRAGMENT_RECEIVED);
   m_mailbox.send(m_coordinatorHsId, msg);
   m_firstFragResponseSent = true;
 }
Example #21
    @Override
    public void run() {
      REJOINLOG.debug(
          m_whoami
              + "informing rejoinCoordinator "
              + CoreUtils.hsIdToString(m_coordinatorHsId)
              + " of REPLAY_FINISHED");
      RejoinMessage replay_complete =
          new RejoinMessage(m_mailbox.getHSId(), RejoinMessage.Type.REPLAY_FINISHED);
      m_mailbox.send(m_coordinatorHsId, replay_complete);
      m_currentlyRejoining.set(false);

      SnapshotSaveAPI.recoveringSiteCount.decrementAndGet();
    }
Example #22
 // This message used to be sent by the SP or MP initiator when they accepted a promotion.
 // For dev speed, we'll detect mastership changes here and construct and send this message to the
  // local client interface so we can keep the CI's implementation unchanged
 private void sendLeaderChangeNotify(long hsId, int partitionId) {
   try {
     JSONStringer stringer = new JSONStringer();
     stringer.object();
     stringer.key(JSON_PARTITION_ID).value(partitionId);
     stringer.key(JSON_INITIATOR_HSID).value(hsId);
     stringer.endObject();
     BinaryPayloadMessage bpm =
         new BinaryPayloadMessage(new byte[0], stringer.toString().getBytes("UTF-8"));
     int hostId = m_hostMessenger.getHostId();
     m_hostMessenger.send(
         CoreUtils.getHSIdFromHostAndSite(hostId, HostMessenger.CLIENT_INTERFACE_SITE_ID), bpm);
   } catch (Exception e) {
     VoltDB.crashLocalVoltDB("Unable to propogate leader promotion to client interface.", true, e);
   }
 }
  /** Start fixing survivors: setup scoreboard and request repair logs. */
  void prepareForFaultRecovery() {
    for (Long hsid : m_survivors) {
      m_replicaRepairStructs.put(hsid, new ReplicaRepairStruct());
    }

    tmLog.info(
        m_whoami
            + "found (including self) "
            + m_survivors.size()
            + " surviving replicas to repair. "
            + " Survivors: "
            + CoreUtils.hsIdCollectionToString(m_survivors));
    VoltMessage logRequest =
        new Iv2RepairLogRequestMessage(m_requestId, Iv2RepairLogRequestMessage.SPREQUEST);
    m_mailbox.send(com.google.common.primitives.Longs.toArray(m_survivors), logRequest);
  }
Example #24
 public void dump(long hsId) {
   final String who = CoreUtils.hsIdToString(hsId);
   tmLog.info(
       String.format(
           "%s: REPLAY SEQUENCER DUMP, LAST POLLED FRAGMENT %d (%s), LAST SEEN TXNID %d (%s), %s%s",
           who,
           m_lastPolledFragmentTxnId,
           TxnEgo.txnIdToString(m_lastPolledFragmentTxnId),
           m_lastSeenTxnId,
           TxnEgo.txnIdToString(m_lastSeenTxnId),
           m_mpiEOLReached ? "MPI EOL, " : "",
           m_mustDrain ? "MUST DRAIN" : ""));
   for (Entry<Long, ReplayEntry> e : m_replayEntries.entrySet()) {
     tmLog.info(String.format("%s: REPLAY ENTRY %s: %s", who, e.getKey(), e.getValue()));
   }
 }
  public SimpleFileSnapshotDataTarget(File file, boolean needsFinalClose) throws IOException {
    m_file = file;
    m_tempFile = new File(m_file.getParentFile(), m_file.getName() + ".incomplete");
    m_ras = new RandomAccessFile(m_tempFile, "rw");
    m_fc = m_ras.getChannel();
    m_needsFinalClose = needsFinalClose;

    m_es = CoreUtils.getListeningSingleThreadExecutor("Snapshot write thread for " + m_file);
    ScheduledFuture<?> syncTask = null;
    syncTask =
        DefaultSnapshotDataTarget.m_syncService.scheduleAtFixedRate(
            new Runnable() {
              private long syncedBytes = 0;

              @Override
              public void run() {
                // Only sync once at least 4 megabytes of data have accumulated, enough to
                // amortize the cost of seeking on ye olden platters. Since we are appending to a
                // file it's actually 2 seeks.
                while (m_bytesSinceLastSync.get() > 1024 * 1024 * 4) {
                  try {
                    final long syncStart = syncedBytes;
                    syncedBytes =
                        Bits.sync_file_range(
                            SNAP_LOG, m_ras.getFD(), m_fc, syncStart, m_fc.position());
                  } catch (IOException e) {
                    if (!(e instanceof java.nio.channels.AsynchronousCloseException)) {
                      SNAP_LOG.error("Error syncing snapshot", e);
                    } else {
                      SNAP_LOG.debug(
                          "Asynchronous close syncing snapshot data, presumably graceful", e);
                    }
                  }
                  // Blind setting to 0 means we could technically write more than
                  // 256 megabytes at a time but 512 is the worst case and that is fine
                  m_bytesSinceLastSync.set(0);
                }
              }
            },
            DefaultSnapshotDataTarget.SNAPSHOT_SYNC_FREQUENCY,
            DefaultSnapshotDataTarget.SNAPSHOT_SYNC_FREQUENCY,
            TimeUnit.MILLISECONDS);
    m_syncTask = syncTask;
  }
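The scheduled task above amortizes sync cost by flushing only once enough bytes have accumulated. Below is a simplified, self-contained version of the same idea using FileChannel.force; the 4 MB threshold mirrors the comment above, everything else is illustrative.

import java.io.IOException;
import java.nio.channels.FileChannel;
import java.util.concurrent.atomic.AtomicLong;

// Simplified sketch of threshold-based sync amortization: writers add to
// bytesSinceLastSync after each write, and a periodic task forces the channel
// to disk only once at least SYNC_THRESHOLD bytes have accumulated.
class ThresholdSyncSketch {
  private static final long SYNC_THRESHOLD = 4L * 1024 * 1024; // 4 MB, as above

  private final AtomicLong bytesSinceLastSync = new AtomicLong(0);
  private final FileChannel channel;

  ThresholdSyncSketch(FileChannel channel) {
    this.channel = channel;
  }

  void noteWrite(int bytes) {
    bytesSinceLastSync.addAndGet(bytes);
  }

  // Intended to run on a scheduled executor, like syncTask above.
  void maybeSync() {
    if (bytesSinceLastSync.get() >= SYNC_THRESHOLD) {
      try {
        channel.force(false); // flush file data; skip metadata
      } catch (IOException e) {
        // The real target logs this; a sketch just ignores it.
      }
      bytesSinceLastSync.set(0);
    }
  }
}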
  @Override
  public Callable<Boolean> createSetup(
      String file_path,
      String file_nonce,
      long txnId,
      Map<Integer, Long> partitionTransactionIds,
      JSONObject jsData,
      SystemProcedureExecutionContext context,
      VoltTable result,
      Map<String, Map<Integer, Pair<Long, Long>>> exportSequenceNumbers,
      SiteTracker tracker,
      HashinatorSnapshotData hashinatorData,
      long timestamp) {
    assert SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.isEmpty();

    final IndexSnapshotRequestConfig config =
        new IndexSnapshotRequestConfig(jsData, context.getDatabase());
    final Map<Integer, Long> pidToLocalHSIds = findLocalSources(config.partitionRanges, tracker);

    // mark snapshot start in registry
    final AtomicInteger numTables = new AtomicInteger(config.tables.length);
    m_snapshotRecord =
        SnapshotRegistry.startSnapshot(
            txnId, context.getHostId(), file_path, file_nonce, SnapshotFormat.INDEX, config.tables);

    // create table tasks
    for (Table table : config.tables) {
      createTasksForTable(
          table, config.partitionRanges, pidToLocalHSIds, numTables, m_snapshotRecord);
      result.addRow(
          context.getHostId(),
          CoreUtils.getHostnameOrAddress(),
          table.getTypeName(),
          "SUCCESS",
          "");
    }

    return null;
  }
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();

    sb.append("COMPLETE_TRANSACTION (FROM COORD: ");
    sb.append(CoreUtils.hsIdToString(m_coordinatorHSId));
    sb.append(") FOR TXN ");
    sb.append(m_txnId);
    sb.append("\n  FLAGS: ").append(m_flags);

    sb.append("\n  HASH: " + String.valueOf(m_hash));

    if (isRollback()) sb.append("\n  THIS IS A ROLLBACK REQUEST");

    if (requiresAck()) sb.append("\n  THIS MESSAGE REQUIRES AN ACK");

    if (isRestart()) {
      sb.append("\n  THIS IS A TRANSACTION RESTART");
    }

    return sb.toString();
  }
Example #28
/**
 * LeaderAppointer handles centralized appointment of partition leaders across the partition. This
 * is primarily so that the leaders can be evenly distributed throughout the cluster, reducing
 * bottlenecks (at least at startup). As a side-effect, this service also controls the initial
 * startup of the cluster, blocking operation until each partition has a k-safe set of replicas,
 * each partition has a leader, and the MPI has started.
 */
public class LeaderAppointer implements Promotable {
  private static final VoltLogger tmLog = new VoltLogger("TM");

  private enum AppointerState {
    INIT, // Initial start state, used to inhibit ZK callback actions
    CLUSTER_START, // indicates that we're doing the initial cluster startup
    DONE // indicates normal running conditions, including repair
  }

  private final HostMessenger m_hostMessenger;
  private final ZooKeeper m_zk;
  private final int m_partitionCount;
  private final BabySitter[] m_partitionWatchers;
  private final LeaderCache m_iv2appointees;
  private final LeaderCache m_iv2masters;
  private final PartitionCallback[] m_callbacks;
  private final int m_kfactor;
  private final JSONObject m_topo;
  private final MpInitiator m_MPI;
  private final AtomicReference<AppointerState> m_state =
      new AtomicReference<AppointerState>(AppointerState.INIT);
  private CountDownLatch m_startupLatch = null;
  private final boolean m_partitionDetectionEnabled;
  private boolean m_partitionDetected = false;
  private boolean m_usingCommandLog = false;
  private final AtomicBoolean m_replayComplete = new AtomicBoolean(false);

  // Provide a single single-threaded executor service to all the BabySitters for each partition.
  // This will guarantee that the ordering of events generated by ZooKeeper is preserved in the
  // handling of callbacks in LeaderAppointer.
  private final ExecutorService m_es =
      CoreUtils.getCachedSingleThreadExecutor("LeaderAppointer-Babysitters", 15000);
  private final SnapshotSchedule m_partSnapshotSchedule;

  private final SnapshotResponseHandler m_snapshotHandler =
      new SnapshotResponseHandler() {
        @Override
        public void handleResponse(ClientResponse resp) {
          if (resp == null) {
            VoltDB.crashLocalVoltDB(
                "Received a null response to a snapshot initiation request.  "
                    + "This should be impossible.",
                true,
                null);
          } else if (resp.getStatus() != ClientResponse.SUCCESS) {
            tmLog.info(
                "Failed to complete partition detection snapshot, status: "
                    + resp.getStatus()
                    + ", reason: "
                    + resp.getStatusString());
            tmLog.info("Retrying partition detection snapshot...");
            SnapshotUtil.requestSnapshot(
                0L,
                m_partSnapshotSchedule.getPath(),
                m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
                true,
                SnapshotFormat.NATIVE,
                null,
                m_snapshotHandler,
                true);
          } else if (!SnapshotUtil.didSnapshotRequestSucceed(resp.getResults())) {
            VoltDB.crashGlobalVoltDB(
                "Unable to complete partition detection snapshot: " + resp.getResults()[0],
                false,
                null);
          } else {
            VoltDB.crashGlobalVoltDB(
                "Partition detection snapshot completed. Shutting down.", false, null);
          }
        }
      };

  private class PartitionCallback extends BabySitter.Callback {
    final int m_partitionId;
    final Set<Long> m_replicas;
    long m_currentLeader;

    /** Constructor used when we know (or think we know) who the leader for this partition is */
    PartitionCallback(int partitionId, long currentLeader) {
      this(partitionId);
      // Try to be clever for repair.  Create ourselves with the current leader set to
      // whatever is in the LeaderCache, and claim that replica exists, then let the
      // first run() call fix the world.
      m_currentLeader = currentLeader;
      m_replicas.add(currentLeader);
    }

    /** Constructor used at startup when there is no leader */
    PartitionCallback(int partitionId) {
      m_partitionId = partitionId;
      // A bit of a hack, but we should never end up with an HSID as Long.MAX_VALUE
      m_currentLeader = Long.MAX_VALUE;
      m_replicas = new HashSet<Long>();
    }

    @Override
    public void run(List<String> children) {
      List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
      // compute previously unseen HSId set in the callback list
      Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
      newHSIds.removeAll(m_replicas);
      tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
      // compute previously seen but now vanished from the callback list HSId set
      Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
      missingHSIds.removeAll(updatedHSIds);
      tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

      tmLog.debug(
          "Handling babysitter callback for partition "
              + m_partitionId
              + ": children: "
              + CoreUtils.hsIdCollectionToString(updatedHSIds));
      if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (missingHSIds.size() > 0) {
          VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
          parts = m_topo.getJSONArray("partitions");
          for (int p = 0; p < parts.length(); p++) {
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            if (pid == m_partitionId) {
              replicaCount = aPartition.getJSONArray("replicas").length();
            }
          }
        } catch (JSONException e) {
          // Ignore and just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
          tmLog.info(
              "Waiting on "
                  + ((m_kfactor + 1) - children.size())
                  + " more nodes "
                  + "for k-safety before startup");
        }
      } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
          VoltDB.crashGlobalVoltDB(
              "Some partitions have no replicas.  Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (m_replayComplete.get() == false) {
          VoltDB.crashGlobalVoltDB(
              "Detected node failure during command log replay. Cluster will shut down.",
              false,
              null);
        }
        // Check to see if there's been a possible network partition and we're not
        // already handling it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
          doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
      }
      m_replicas.clear();
      m_replicas.addAll(updatedHSIds);
    }
  }

  /* We'll use this callback purely for startup so we can discover when all
   * the leaders we have appointed have completed their promotions and
   * published themselves to Zookeeper */
  LeaderCache.Callback m_masterCallback =
      new LeaderCache.Callback() {
        @Override
        public void run(ImmutableMap<Integer, Long> cache) {
          Set<Long> currentLeaders = new HashSet<Long>(cache.values());
          tmLog.debug("Updated leaders: " + currentLeaders);
          if (m_state.get() == AppointerState.CLUSTER_START) {
            if (currentLeaders.size() == m_partitionCount) {
              tmLog.debug("Leader appointment complete, promoting MPI and unblocking.");
              m_state.set(AppointerState.DONE);
              m_MPI.acceptPromotion();
              m_startupLatch.countDown();
            }
          }
        }
      };

  public LeaderAppointer(
      HostMessenger hm,
      int numberOfPartitions,
      int kfactor,
      boolean partitionDetectionEnabled,
      SnapshotSchedule partitionSnapshotSchedule,
      boolean usingCommandLog,
      JSONObject topology,
      MpInitiator mpi) {
    m_hostMessenger = hm;
    m_zk = hm.getZK();
    m_kfactor = kfactor;
    m_topo = topology;
    m_MPI = mpi;
    m_partitionCount = numberOfPartitions;
    m_callbacks = new PartitionCallback[m_partitionCount];
    m_partitionWatchers = new BabySitter[m_partitionCount];
    m_iv2appointees = new LeaderCache(m_zk, VoltZK.iv2appointees);
    m_iv2masters = new LeaderCache(m_zk, VoltZK.iv2masters, m_masterCallback);
    m_partitionDetectionEnabled = partitionDetectionEnabled;
    m_partSnapshotSchedule = partitionSnapshotSchedule;
    m_usingCommandLog = usingCommandLog;
  }

  @Override
  public void acceptPromotion() throws InterruptedException, ExecutionException, KeeperException {
    // Crank up the leader caches.  Use blocking startup so that we'll have valid point-in-time
    // caches later.
    m_iv2appointees.start(true);
    m_iv2masters.start(true);
    // Figure out what conditions we assumed leadership under.
    if (m_iv2appointees.pointInTimeCache().size() == 0) {
      tmLog.debug("LeaderAppointer in startup");
      m_state.set(AppointerState.CLUSTER_START);
    } else if ((m_iv2appointees.pointInTimeCache().size() != m_partitionCount)
        || (m_iv2masters.pointInTimeCache().size() != m_partitionCount)) {
      // If we are promoted and the appointees or masters set is partial, the previous appointer
      // failed during startup (at least for now, until we support adding/removing partitions on
      // the fly).
      VoltDB.crashGlobalVoltDB("Detected failure during startup, unable to start", false, null);
    } else {
      tmLog.debug("LeaderAppointer in repair");
      m_state.set(AppointerState.DONE);
    }

    if (m_state.get() == AppointerState.CLUSTER_START) {
      // Need to block the return of acceptPromotion until after the MPI is promoted. Wait for
      // this latch to count down after appointing all the partition leaders.  The
      // LeaderCache callback will count it down once it has seen all the
      // appointed leaders publish themselves as the actual leaders.
      m_startupLatch = new CountDownLatch(1);
      writeKnownLiveNodes(m_hostMessenger.getLiveHostIds());
      for (int i = 0; i < m_partitionCount; i++) {
        String dir = LeaderElector.electionDirForPartition(i);
        // Race along with all of the replicas for this partition to create the ZK parent node
        try {
          m_zk.create(dir, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        } catch (KeeperException.NodeExistsException e) {
          // expected on all nodes that don't start() first.
        }
        m_callbacks[i] = new PartitionCallback(i);
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[i], m_es);
        m_partitionWatchers[i] = sitterstuff.getFirst();
      }
      m_startupLatch.await();
    } else {
      // If we're taking over for a failed LeaderAppointer, we know when
      // we get here that every partition had a leader at some point in
      // time.  We'll seed each of the PartitionCallbacks for each
      // partition with the HSID of the last published leader.  The
      // blocking startup of the BabySitter watching that partition will
      // call our callback, get the current full set of replicas, and
      // appoint a new leader if the seeded one has actually failed
      Map<Integer, Long> masters = m_iv2masters.pointInTimeCache();
      tmLog.info("LeaderAppointer repairing with master set: " + masters);
      for (Entry<Integer, Long> master : masters.entrySet()) {
        int partId = master.getKey();
        String dir = LeaderElector.electionDirForPartition(partId);
        m_callbacks[partId] = new PartitionCallback(partId, master.getValue());
        Pair<BabySitter, List<String>> sitterstuff =
            BabySitter.blockingFactory(m_zk, dir, m_callbacks[partId], m_es);
        m_partitionWatchers[partId] = sitterstuff.getFirst();
      }
      // just go ahead and promote our MPI
      m_MPI.acceptPromotion();
    }
  }

  private long assignLeader(int partitionId, List<Long> children) {
    // We used masterHostId = -1 as a way to force the leader choice to be
    // the first replica in the list, if we don't have some other mechanism
    // which has successfully overridden it.
    int masterHostId = -1;
    if (m_state.get() == AppointerState.CLUSTER_START) {
      try {
        // find master in topo
        JSONArray parts = m_topo.getJSONArray("partitions");
        for (int p = 0; p < parts.length(); p++) {
          JSONObject aPartition = parts.getJSONObject(p);
          int pid = aPartition.getInt("partition_id");
          if (pid == partitionId) {
            masterHostId = aPartition.getInt("master");
          }
        }
      } catch (JSONException jse) {
        tmLog.error("Failed to find master for partition " + partitionId + ", defaulting to 0");
        jse.printStackTrace();
        masterHostId = -1; // stupid default
      }
    } else {
      // For now, if we're appointing a new leader as a result of a
      // failure, just pick the first replica in the children list.
      // Could eventually do something more complex here to try to keep a
      // semi-balance, but it's unclear that this has much utility until
      // we add rebalancing on rejoin as well.
      masterHostId = -1;
    }

    long masterHSId = children.get(0);
    for (Long child : children) {
      if (CoreUtils.getHostIdFromHSId(child) == masterHostId) {
        masterHSId = child;
        break;
      }
    }
    tmLog.info(
        "Appointing HSId "
            + CoreUtils.hsIdToString(masterHSId)
            + " as leader for partition "
            + partitionId);
    try {
      m_iv2appointees.put(partitionId, masterHSId);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB("Unable to appoint new master for partition " + partitionId, true, e);
    }
    return masterHSId;
  }

  private void writeKnownLiveNodes(List<Integer> liveNodes) {
    try {
      if (m_zk.exists(VoltZK.lastKnownLiveNodes, null) == null) {
        // VoltZK.createPersistentZKNodes should have done this
        m_zk.create(VoltZK.lastKnownLiveNodes, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      }
      JSONStringer stringer = new JSONStringer();
      stringer.object();
      stringer.key("liveNodes").array();
      for (Integer node : liveNodes) {
        stringer.value(node);
      }
      stringer.endArray();
      stringer.endObject();
      JSONObject obj = new JSONObject(stringer.toString());
      tmLog.debug("Writing live nodes to ZK: " + obj.toString(4));
      m_zk.setData(VoltZK.lastKnownLiveNodes, obj.toString(4).getBytes("UTF-8"), -1);
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB(
          "Unable to update known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes, true, e);
    }
  }

  private Set<Integer> readPriorKnownLiveNodes() {
    Set<Integer> nodes = new HashSet<Integer>();
    try {
      byte[] data = m_zk.getData(VoltZK.lastKnownLiveNodes, false, null);
      String jsonString = new String(data, "UTF-8");
      tmLog.debug("Read prior known live nodes: " + jsonString);
      JSONObject jsObj = new JSONObject(jsonString);
      JSONArray jsonNodes = jsObj.getJSONArray("liveNodes");
      for (int ii = 0; ii < jsonNodes.length(); ii++) {
        nodes.add(jsonNodes.getInt(ii));
      }
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB(
          "Unable to read prior known live nodes at ZK path: " + VoltZK.lastKnownLiveNodes,
          true,
          e);
    }
    return nodes;
  }

  /**
   * Given a set of the known host IDs before a fault, and the known host IDs in the post-fault
   * cluster, determine whether or not we think a network partition may have happened. NOTE: this
   * assumes that we have already done the k-safety validation for every partition and already
   * failed if we weren't a viable cluster. ALSO NOTE: not private so it may be unit-tested.
   */
  static boolean makePPDDecision(Set<Integer> previousHosts, Set<Integer> currentHosts) {
    // Real partition detection stuff would go here
    // find the lowest hostId between the still-alive hosts and the
    // failed hosts. Which set contains the lowest hostId?
    int blessedHostId = Integer.MAX_VALUE;
    boolean blessedHostIdInFailedSet = true;

    // This should be all the pre-partition host IDs.  Any new host IDs
    // (say, if this was triggered by rejoin), will be greater than any surviving
    // host ID, so don't worry about including it in this search.
    for (Integer hostId : previousHosts) {
      if (hostId < blessedHostId) {
        blessedHostId = hostId;
      }
    }

    for (Integer hostId : currentHosts) {
      if (hostId.equals(blessedHostId)) {
        blessedHostId = hostId;
        blessedHostIdInFailedSet = false;
      }
    }

    // Evaluate PPD triggers.
    boolean partitionDetectionTriggered = false;
    // Exact 50-50 splits. The set with the lowest survivor host doesn't trigger PPD
    // If the blessed host is in the failure set, this set is not blessed.
    if (currentHosts.size() * 2 == previousHosts.size()) {
      if (blessedHostIdInFailedSet) {
        tmLog.info(
            "Partition detection triggered for 50/50 cluster failure. "
                + "This survivor set is shutting down.");
        partitionDetectionTriggered = true;
      } else {
        tmLog.info(
            "Partition detected for 50/50 failure. "
                + "This survivor set is continuing execution.");
      }
    }

    // A strict, viable minority is always a partition.
    if (currentHosts.size() * 2 < previousHosts.size()) {
      tmLog.info(
          "Partition detection triggered. " + "This minority survivor set is shutting down.");
      partitionDetectionTriggered = true;
    }

    return partitionDetectionTriggered;
  }

  private void doPartitionDetectionActivities() {
    // We should never re-enter here once we've decided we're partitioned and doomed
    assert (!m_partitionDetected);
    // After everything is resolved, write the new surviving set to ZK
    List<Integer> currentNodes = null;
    try {
      currentNodes = m_hostMessenger.getLiveHostIds();
    } catch (Exception e) {

    }
    Set<Integer> currentHosts = new HashSet<Integer>(currentNodes);
    Set<Integer> previousHosts = readPriorKnownLiveNodes();

    boolean partitionDetectionTriggered = makePPDDecision(previousHosts, currentHosts);

    if (partitionDetectionTriggered) {
      m_partitionDetected = true;
      if (m_usingCommandLog) {
        // Just shut down immediately
        VoltDB.crashGlobalVoltDB(
            "Use of command logging detected, no additional database snapshot will "
                + "be generated.  Please use the 'recover' action to restore the database if necessary.",
            false,
            null);
      } else {
        SnapshotUtil.requestSnapshot(
            0L,
            m_partSnapshotSchedule.getPath(),
            m_partSnapshotSchedule.getPrefix() + System.currentTimeMillis(),
            true,
            SnapshotFormat.NATIVE,
            null,
            m_snapshotHandler,
            true);
      }
    }
    // If the cluster host set has changed, then write the new set to ZK
    // NOTE: we don't want to update the known live nodes if we've decided that our subcluster is
    // dying, otherwise a poorly timed subsequent failure might reverse this decision. Any future
    // promoted LeaderAppointer should make its partition detection decision based on the
    // pre-partition cluster state.
    else if (!currentHosts.equals(previousHosts)) {
      writeKnownLiveNodes(currentNodes);
    }
  }

  private boolean isClusterKSafe() {
    boolean retval = true;
    for (int i = 0; i < m_partitionCount; i++) {
      String dir = LeaderElector.electionDirForPartition(i);
      try {
        List<String> replicas = m_zk.getChildren(dir, null, null);
        if (replicas.isEmpty()) {
          tmLog.fatal("K-Safety violation: No replicas found for partition: " + i);
          retval = false;
        }
      } catch (Exception e) {
        VoltDB.crashLocalVoltDB("Unable to read replicas in ZK dir: " + dir, true, e);
      }
    }
    return retval;
  }

  public void onReplayCompletion() {
    m_replayComplete.set(true);
  }

  public void shutdown() {
    try {
      m_iv2appointees.shutdown();
      m_iv2masters.shutdown();
      for (BabySitter watcher : m_partitionWatchers) {
        watcher.shutdown();
      }
    } catch (Exception e) {
      // don't care, we're going down
    }
  }
}
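Because makePPDDecision is static and depends only on the two host-ID sets, its rules are easy to spot-check. Here is a small usage sketch, assuming it runs in the same package as LeaderAppointer (the method is left package-visible for unit testing); the host IDs are made up.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Spot-check of the partition-detection rules: an exact 50/50 split only
// survives on the half holding the lowest ("blessed") pre-fault host ID, and
// a strict minority always triggers partition detection.
class PpdDecisionExample {
  public static void main(String[] args) {
    Set<Integer> previous = new HashSet<>(Arrays.asList(0, 1, 2, 3));

    // 50/50 split, survivors include host 0 (the blessed host): no trigger.
    System.out.println(
        LeaderAppointer.makePPDDecision(previous, new HashSet<>(Arrays.asList(0, 1)))); // false

    // 50/50 split, survivors lost host 0: triggered, this half shuts down.
    System.out.println(
        LeaderAppointer.makePPDDecision(previous, new HashSet<>(Arrays.asList(2, 3)))); // true

    // Strict minority (1 of 4 hosts): always triggered.
    System.out.println(
        LeaderAppointer.makePPDDecision(previous, new HashSet<>(Arrays.asList(3)))); // true
  }
}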
Example #29
    @Override
    public void run(List<String> children) {
      List<Long> updatedHSIds = VoltZK.childrenToReplicaHSIds(children);
      // compute previously unseen HSId set in the callback list
      Set<Long> newHSIds = new HashSet<Long>(updatedHSIds);
      newHSIds.removeAll(m_replicas);
      tmLog.debug("Newly seen replicas: " + CoreUtils.hsIdCollectionToString(newHSIds));
      // compute previously seen but now vanished from the callback list HSId set
      Set<Long> missingHSIds = new HashSet<Long>(m_replicas);
      missingHSIds.removeAll(updatedHSIds);
      tmLog.debug("Newly dead replicas: " + CoreUtils.hsIdCollectionToString(missingHSIds));

      tmLog.debug(
          "Handling babysitter callback for partition "
              + m_partitionId
              + ": children: "
              + CoreUtils.hsIdCollectionToString(updatedHSIds));
      if (m_state.get() == AppointerState.CLUSTER_START) {
        // We can't yet tolerate a host failure during startup.  Crash it all
        if (missingHSIds.size() > 0) {
          VoltDB.crashGlobalVoltDB("Node failure detected during startup.", false, null);
        }
        // ENG-3166: Eventually we would like to get rid of the extra replicas beyond k_factor,
        // but for now we just look to see how many replicas of this partition we actually expect
        // and gate leader assignment on that many copies showing up.
        int replicaCount = m_kfactor + 1;
        JSONArray parts;
        try {
          parts = m_topo.getJSONArray("partitions");
          for (int p = 0; p < parts.length(); p++) {
            JSONObject aPartition = parts.getJSONObject(p);
            int pid = aPartition.getInt("partition_id");
            if (pid == m_partitionId) {
              replicaCount = aPartition.getJSONArray("replicas").length();
            }
          }
        } catch (JSONException e) {
          // Ignore and just assume the normal number of replicas
        }
        if (children.size() == replicaCount) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        } else {
          tmLog.info(
              "Waiting on "
                  + ((m_kfactor + 1) - children.size())
                  + " more nodes "
                  + "for k-safety before startup");
        }
      } else {
        // Check for k-safety
        if (!isClusterKSafe()) {
          VoltDB.crashGlobalVoltDB(
              "Some partitions have no replicas.  Cluster has become unviable.", false, null);
        }
        // Check if replay has completed
        if (m_replayComplete.get() == false) {
          VoltDB.crashGlobalVoltDB(
              "Detected node failure during command log replay. Cluster will shut down.",
              false,
              null);
        }
        // Check to see if there's been a possible network partition and we're not
        // already handling it
        if (m_partitionDetectionEnabled && !m_partitionDetected) {
          doPartitionDetectionActivities();
        }
        // If we survived the above gauntlet of fail, appoint a new leader for this partition.
        if (missingHSIds.contains(m_currentLeader)) {
          m_currentLeader = assignLeader(m_partitionId, updatedHSIds);
        }
      }
      m_replicas.clear();
      m_replicas.addAll(updatedHSIds);
    }
/** Export data from a single catalog version and database instance. */
public class ExportGeneration {
  /** Processors also log using this facility. */
  private static final VoltLogger exportLog = new VoltLogger("EXPORT");

  public Long m_timestamp;
  public final File m_directory;

  private String m_leadersZKPath;
  private String m_mailboxesZKPath;

  /**
   * Data sources, one per table per site, provide the interface to poll() and ack() Export data
   * from the execution engines. Data sources are configured by the Export manager at initialization
   * time. partitionid : <tableid : datasource>.
   */
  public final HashMap<Integer, HashMap<String, ExportDataSource>> m_dataSourcesByPartition =
      new HashMap<Integer, HashMap<String, ExportDataSource>>();

  private int m_numSources = 0;
  private final AtomicInteger m_drainedSources = new AtomicInteger(0);

  private final Runnable m_onAllSourcesDrained;

  private final Runnable m_onSourceDrained =
      new Runnable() {
        @Override
        public void run() {
          int numSourcesDrained = m_drainedSources.incrementAndGet();
          exportLog.info(
              "Drained source in generation "
                  + m_timestamp
                  + " with "
                  + numSourcesDrained
                  + " of "
                  + m_numSources
                  + " drained");
          if (numSourcesDrained == m_numSources) {
            if (m_partitionLeaderZKName.isEmpty()) {
              m_onAllSourcesDrained.run();
            } else {
              ListenableFuture<?> removeLeadership =
                  m_childUpdatingThread.submit(
                      new Runnable() {
                        @Override
                        public void run() {
                          for (Map.Entry<Integer, String> entry :
                              m_partitionLeaderZKName.entrySet()) {
                            m_zk.delete(
                                m_leadersZKPath + "/" + entry.getKey() + "/" + entry.getValue(),
                                -1,
                                new AsyncCallback.VoidCallback() {

                                  @Override
                                  public void processResult(int rc, String path, Object ctx) {
                                    KeeperException.Code code = KeeperException.Code.get(rc);
                                    if (code != KeeperException.Code.OK) {
                                      VoltDB.crashLocalVoltDB(
                                          "Error in export leader election giving up leadership of "
                                              + path,
                                          true,
                                          KeeperException.create(code));
                                    }
                                  }
                                },
                                null);
                          }
                        }
                      },
                      null);
              removeLeadership.addListener(
                  m_onAllSourcesDrained, MoreExecutors.sameThreadExecutor());
            }
          }
        }
      };

  private Mailbox m_mbox;

  private ZooKeeper m_zk;
  private volatile boolean shutdown = false;

  private static final ListeningExecutorService m_childUpdatingThread =
      CoreUtils.getListeningExecutorService("Export ZK Watcher", 1);

  private final Map<Integer, String> m_partitionLeaderZKName = new HashMap<Integer, String>();
  private final Set<Integer> m_partitionsIKnowIAmTheLeader = new HashSet<Integer>();

  /*
   * Set to true if this export generation was initialized from disk
   * instead of being fed data from the current live system
   */
  private boolean m_diskBased = false;

  /**
   * Constructor to create a new generation of export data.
   *
   * @param txnId transaction id used as this generation's timestamp and directory name
   * @param onAllSourcesDrained callback run once every data source in this generation has drained
   * @param exportOverflowDirectory parent directory in which the generation's directory is created
   * @throws IOException if the generation directory cannot be created
   */
  public ExportGeneration(long txnId, Runnable onAllSourcesDrained, File exportOverflowDirectory)
      throws IOException {
    m_onAllSourcesDrained = onAllSourcesDrained;
    m_timestamp = txnId;
    m_directory = new File(exportOverflowDirectory, Long.toString(txnId));
    if (!m_directory.mkdirs()) {
      throw new IOException("Could not create " + m_directory);
    }
    exportLog.info("Creating new export generation " + m_timestamp);
  }

  /**
   * Constructor to create a generation based on one that has been persisted to disk.
   *
   * @param onAllSourcesDrained callback run once every data source in this generation has drained
   * @param generationDirectory the on-disk directory containing the persisted generation
   * @throws IOException
   */
  public ExportGeneration(Runnable onAllSourcesDrained, File generationDirectory)
      throws IOException {
    m_onAllSourcesDrained = onAllSourcesDrained;
    m_directory = generationDirectory;
  }

  public boolean isDiskBased() {
    return m_diskBased;
  }

  boolean initializeGenerationFromDisk(final Connector conn, HostMessenger messenger) {
    m_diskBased = true;
    Set<Integer> partitions = new HashSet<Integer>();

    /*
     * Find all the advertisements. Once one is found, extract the nonce
     * and check for any data files related to the advertisement. If no data files
     * exist, ignore the advertisement.
     */
    boolean hadValidAd = false;
    for (File f : m_directory.listFiles()) {
      if (f.getName().endsWith(".ad")) {
        boolean haveDataFiles = false;
        String nonce = f.getName().substring(0, f.getName().length() - 3);
        for (File dataFile : m_directory.listFiles()) {
          if (dataFile.getName().startsWith(nonce) && !dataFile.getName().equals(f.getName())) {
            haveDataFiles = true;
            break;
          }
        }

        if (haveDataFiles) {
          try {
            addDataSource(f, partitions);
            hadValidAd = true;
          } catch (IOException e) {
            VoltDB.crashLocalVoltDB("Error intializing export datasource " + f, true, e);
          }
        } else {
          // Delete ads that have no data
          f.delete();
        }
      }
    }
    createAndRegisterAckMailboxes(partitions, messenger);
    exportLog.info("Restoring export generation " + m_timestamp);
    return hadValidAd;
  }
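
  /*
   * Illustrative sketch, not part of the original class: how the disk scan above
   * derives a nonce from an advertisement file name. For an ad named "foo.ad" the
   * nonce is "foo", and any sibling file whose name starts with that nonce (other
   * than the ad itself) counts as export data for the advertisement.
   */
  private static String adFileNonceSketch(File adFile) {
    final String name = adFile.getName(); // e.g. "foo.ad"
    return name.substring(0, name.length() - 3); // strip the ".ad" suffix -> "foo"
  }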

  /*
   * Run a leader election for every partition to determine who will
   * start consuming the export data.
   */
  public void kickOffLeaderElection() {
    m_childUpdatingThread.submit(
        new Runnable() {
          @Override
          public void run() {
            try {
              /*
               * The path where leaders will register for this generation
               */
              m_leadersZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "leaders";

              /*
               * Create a directory for each partition
               */
              for (Integer partition : m_dataSourcesByPartition.keySet()) {
                ZKUtil.asyncMkdirs(m_zk, m_leadersZKPath + "/" + partition);
              }

              /*
               * Queue the creation of our ephemeral sequential and then queue
               * a task to retrieve the children to find the result of the election
               */
              List<ZKUtil.ChildrenCallback> callbacks = new ArrayList<ZKUtil.ChildrenCallback>();
              for (final Integer partition : m_dataSourcesByPartition.keySet()) {
                m_zk.create(
                    m_leadersZKPath + "/" + partition + "/leader",
                    null,
                    Ids.OPEN_ACL_UNSAFE,
                    CreateMode.EPHEMERAL_SEQUENTIAL,
                    new org.apache.zookeeper_voltpatches.AsyncCallback.StringCallback() {
                      @Override
                      public void processResult(int rc, String path, Object ctx, String name) {
                        KeeperException.Code code = KeeperException.Code.get(rc);
                        if (code != KeeperException.Code.OK) {
                          VoltDB.crashLocalVoltDB(
                              "Error in export leader election",
                              true,
                              KeeperException.create(code));
                        }
                        String splitName[] = name.split("/");
                        m_partitionLeaderZKName.put(partition, splitName[splitName.length - 1]);
                      }
                    },
                    null);
                ZKUtil.ChildrenCallback cb = new ZKUtil.ChildrenCallback();
                callbacks.add(cb);
                m_zk.getChildren(
                    m_leadersZKPath + "/" + partition,
                    constructLeaderChildWatcher(partition),
                    cb,
                    null);
              }

              /*
               * Process the result of the per partition elections.
               * No worries about ordering with the watcher because the watcher tasks
               * all get funneled through this thread
               */
              Iterator<ZKUtil.ChildrenCallback> iter = callbacks.iterator();
              for (Integer partition : m_dataSourcesByPartition.keySet()) {
                ZKUtil.ChildrenCallback cb = iter.next();
                handleLeaderChildrenUpdate(partition, cb.getChildren());
              }
            } catch (Throwable t) {
              VoltDB.crashLocalVoltDB("Error in export leader election", true, t);
            }
          }
        });
  }

  private Watcher constructLeaderChildWatcher(final Integer partition) {
    return new Watcher() {
      @Override
      public void process(final WatchedEvent event) {
        final Runnable processRunnable =
            new Runnable() {
              @Override
              public void run() {
                if (m_drainedSources.get() == m_numSources) {
                  return;
                }
                final AsyncCallback.ChildrenCallback childrenCallback =
                    new org.apache.zookeeper_voltpatches.AsyncCallback.ChildrenCallback() {
                      @Override
                      public void processResult(
                          final int rc,
                          final String path,
                          Object ctx,
                          final List<String> children) {
                        KeeperException.Code code = KeeperException.Code.get(rc);
                        if (code != KeeperException.Code.OK) {
                          VoltDB.crashLocalVoltDB(
                              "Error in export leader election",
                              true,
                              KeeperException.create(code));
                        }
                        m_childUpdatingThread.execute(
                            new Runnable() {
                              @Override
                              public void run() {
                                try {
                                  handleLeaderChildrenUpdate(partition, children);
                                } catch (Throwable t) {
                                  VoltDB.crashLocalVoltDB(
                                      "Error in export leader election", true, t);
                                }
                              }
                            });
                      }
                    };
                m_zk.getChildren(
                    m_leadersZKPath + "/" + partition,
                    constructLeaderChildWatcher(partition),
                    childrenCallback,
                    null);
              }
            };
        m_childUpdatingThread.execute(processRunnable);
      }
    };
  }

  private void handleLeaderChildrenUpdate(Integer partition, List<String> children) {
    if (m_drainedSources.get() == m_numSources || children.isEmpty()) {
      return;
    }

    String leader = Collections.min(children);
    if (m_partitionLeaderZKName.get(partition).equals(leader)) {
      if (m_partitionsIKnowIAmTheLeader.add(partition)) {
        for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
          try {
            eds.acceptMastership();
          } catch (Exception e) {
            exportLog.error("Unable to start exporting", e);
          }
        }
      }
    }
  }
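
  /*
   * Illustrative sketch, not part of the original class: ZooKeeper appends a
   * monotonically increasing sequence number to each EPHEMERAL_SEQUENTIAL "leader"
   * node, so the lexicographically smallest child is the earliest registrant.
   * handleLeaderChildrenUpdate() above relies on exactly this property via
   * Collections.min().
   */
  private static String electedLeaderSketch(List<String> leaderChildren) {
    // e.g. ["leader0000000002", "leader0000000000"] -> "leader0000000000"
    return Collections.min(leaderChildren);
  }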

  void initializeGenerationFromCatalog(
      final Connector conn,
      int hostId,
      HostMessenger messenger,
      List<Pair<Integer, Long>> partitions) {
    /*
     * Now create datasources based on the catalog
     */
    Iterator<ConnectorTableInfo> tableInfoIt = conn.getTableinfo().iterator();
    // Only populate partitions in use if export is actually happening
    Set<Integer> partitionsInUse = new HashSet<Integer>();
    while (tableInfoIt.hasNext()) {
      ConnectorTableInfo next = tableInfoIt.next();
      Table table = next.getTable();
      addDataSources(table, hostId, partitions);

      for (Pair<Integer, Long> p : partitions) {
        partitionsInUse.add(p.getFirst());
      }
    }

    createAndRegisterAckMailboxes(partitionsInUse, messenger);
  }

  private void createAndRegisterAckMailboxes(
      final Set<Integer> localPartitions, HostMessenger messenger) {
    m_zk = messenger.getZK();
    m_mailboxesZKPath = VoltZK.exportGenerations + "/" + m_timestamp + "/" + "mailboxes";

    m_mbox =
        new LocalMailbox(messenger) {
          @Override
          public void deliver(VoltMessage message) {
            if (message instanceof BinaryPayloadMessage) {
              BinaryPayloadMessage bpm = (BinaryPayloadMessage) message;
              ByteBuffer buf = ByteBuffer.wrap(bpm.m_payload);
              final int partition = buf.getInt();
              final int length = buf.getInt();
              byte stringBytes[] = new byte[length];
              buf.get(stringBytes);
              String signature = new String(stringBytes, Constants.UTF8ENCODING);
              final long ackUSO = buf.getLong();

              final HashMap<String, ExportDataSource> partitionSources =
                  m_dataSourcesByPartition.get(partition);
              if (partitionSources == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " which does not exist on this node");
                return;
              }

              final ExportDataSource eds = partitionSources.get(signature);
              if (eds == null) {
                exportLog.error(
                    "Received an export ack for partition "
                        + partition
                        + " source signature "
                        + signature
                        + " which does not exist on this node");
                return;
              }

              try {
                eds.ack(ackUSO);
              } catch (RejectedExecutionException ignoreIt) {
                // ignore it: as it is already shutdown
              }
            } else {
              exportLog.error("Receive unexpected message " + message + " in export subsystem");
            }
          }
        };
    messenger.createMailbox(null, m_mbox);

    for (Integer partition : localPartitions) {
      final String partitionDN = m_mailboxesZKPath + "/" + partition;
      ZKUtil.asyncMkdirs(m_zk, partitionDN);

      ZKUtil.StringCallback cb = new ZKUtil.StringCallback();
      m_zk.create(
          partitionDN + "/" + m_mbox.getHSId(),
          null,
          Ids.OPEN_ACL_UNSAFE,
          CreateMode.EPHEMERAL,
          cb,
          null);
    }

    ListenableFuture<?> fut =
        m_childUpdatingThread.submit(
            new Runnable() {
              @Override
              public void run() {
                List<Pair<Integer, ZKUtil.ChildrenCallback>> callbacks =
                    new ArrayList<Pair<Integer, ZKUtil.ChildrenCallback>>();
                for (Integer partition : localPartitions) {
                  ZKUtil.ChildrenCallback callback = new ZKUtil.ChildrenCallback();
                  m_zk.getChildren(
                      m_mailboxesZKPath + "/" + partition,
                      constructMailboxChildWatcher(),
                      callback,
                      null);
                  callbacks.add(Pair.of(partition, callback));
                }
                for (Pair<Integer, ZKUtil.ChildrenCallback> p : callbacks) {
                  final Integer partition = p.getFirst();
                  List<String> children = null;
                  try {
                    children = p.getSecond().getChildren();
                  } catch (InterruptedException e) {
                    Throwables.propagate(e);
                  } catch (KeeperException e) {
                    Throwables.propagate(e);
                  }
                  ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();

                  for (String child : children) {
                    if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                    mailboxes.add(Long.valueOf(child));
                  }
                  ImmutableList<Long> mailboxHsids = mailboxes.build();

                  for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                    eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                  }
                }
              }
            });
    try {
      fut.get();
    } catch (Throwable t) {
      Throwables.propagate(t);
    }
  }
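
  /*
   * Illustrative sketch, not part of the original class: the layout a peer would use
   * to build the ack payload that the mailbox above decodes in deliver(). The field
   * order mirrors the parsing code: partition id, signature length, UTF-8 signature
   * bytes, then the acked USO.
   */
  private static byte[] encodeAckPayloadSketch(int partition, String signature, long ackUSO) {
    final byte[] signatureBytes = signature.getBytes(Constants.UTF8ENCODING);
    final ByteBuffer buf = ByteBuffer.allocate(4 + 4 + signatureBytes.length + 8);
    buf.putInt(partition); // read back with buf.getInt()
    buf.putInt(signatureBytes.length); // read back with buf.getInt()
    buf.put(signatureBytes); // read back with buf.get(stringBytes)
    buf.putLong(ackUSO); // read back with buf.getLong()
    return buf.array();
  }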

  private Watcher constructMailboxChildWatcher() {
    return new Watcher() {

      @Override
      public void process(final WatchedEvent event) {
        m_childUpdatingThread.submit(
            new Runnable() {
              @Override
              public void run() {
                try {
                  handleChildUpdate(event);
                } catch (Throwable t) {
                  VoltDB.crashLocalVoltDB("Error in export ack handling", true, t);
                }
              }
            });
      }
    };
  }

  private void handleChildUpdate(final WatchedEvent event) {
    m_zk.getChildren(
        event.getPath(), constructMailboxChildWatcher(), constructChildRetrievalCallback(), null);
  }

  private AsyncCallback.ChildrenCallback constructChildRetrievalCallback() {
    return new AsyncCallback.ChildrenCallback() {
      @Override
      public void processResult(
          final int rc, final String path, Object ctx, final List<String> children) {
        m_childUpdatingThread.submit(
            new Runnable() {
              @Override
              public void run() {
                try {
                  if (shutdown) return;
                  KeeperException.Code code = KeeperException.Code.get(rc);
                  if (code != KeeperException.Code.OK) {
                    throw KeeperException.create(code);
                  }

                  final String split[] = path.split("/");
                  final int partition = Integer.valueOf(split[split.length - 1]);
                  ImmutableList.Builder<Long> mailboxes = ImmutableList.builder();
                  for (String child : children) {
                    if (child.equals(Long.toString(m_mbox.getHSId()))) continue;
                    mailboxes.add(Long.valueOf(child));
                  }
                  ImmutableList<Long> mailboxHsids = mailboxes.build();
                  for (ExportDataSource eds : m_dataSourcesByPartition.get(partition).values()) {
                    eds.updateAckMailboxes(Pair.of(m_mbox, mailboxHsids));
                  }
                } catch (Throwable t) {
                  VoltDB.crashLocalVoltDB("Error in export ack handling", true, t);
                }
              }
            });
      }
    };
  }

  public long getQueuedExportBytes(int partitionId, String signature) {
    // assert(m_dataSourcesByPartition.containsKey(partitionId));
    // assert(m_dataSourcesByPartition.get(partitionId).containsKey(delegateId));
    HashMap<String, ExportDataSource> sources = m_dataSourcesByPartition.get(partitionId);

    if (sources == null) {
      /*
       * This is fine. If the table is dropped it won't have an entry in the generation created
       * after the table was dropped.
       */
      // exportLog.error("Could not find export data sources for generation " + m_timestamp
      //     + " partition " + partitionId);
      return 0;
    }

    ExportDataSource source = sources.get(signature);
    if (source == null) {
      /*
       * This is fine. If the table is dropped it won't have an entry in the generation created
       * after the table was dropped.
       */
      // exportLog.error("Could not find export data source for generation " + m_timestamp + "
      // partition " + partitionId +
      //        " signature " + signature);
      return 0;
    }
    return source.sizeInBytes();
  }

  /*
   * Create a datasource based on an ad file
   */
  private void addDataSource(File adFile, Set<Integer> partitions) throws IOException {
    m_numSources++;
    ExportDataSource source = new ExportDataSource(m_onSourceDrained, adFile);
    partitions.add(source.getPartitionId());
    m_timestamp = source.getGeneration();
    exportLog.info(
        "Creating ExportDataSource for "
            + adFile
            + " table "
            + source.getTableName()
            + " signature "
            + source.getSignature()
            + " partition id "
            + source.getPartitionId()
            + " bytes "
            + source.sizeInBytes());
    HashMap<String, ExportDataSource> dataSourcesForPartition =
        m_dataSourcesByPartition.get(source.getPartitionId());
    if (dataSourcesForPartition == null) {
      dataSourcesForPartition = new HashMap<String, ExportDataSource>();
      m_dataSourcesByPartition.put(source.getPartitionId(), dataSourcesForPartition);
    }
    dataSourcesForPartition.put(source.getSignature(), source);
  }

  /*
   * An unfortunate test-only method for supplying a mock source.
   */
  public void addDataSource(ExportDataSource source) {
    HashMap<String, ExportDataSource> dataSourcesForPartition =
        m_dataSourcesByPartition.get(source.getPartitionId());
    if (dataSourcesForPartition == null) {
      dataSourcesForPartition = new HashMap<String, ExportDataSource>();
      m_dataSourcesByPartition.put(source.getPartitionId(), dataSourcesForPartition);
    }
    dataSourcesForPartition.put(source.getSignature(), source);
  }

  // silly helper to add datasources for a table catalog object
  private void addDataSources(Table table, int hostId, List<Pair<Integer, Long>> partitions) {
    for (Pair<Integer, Long> p : partitions) {
      Integer partition = p.getFirst();
      Long site = p.getSecond();

      /*
       * IOException can occur if there is a problem
       * with the persistent aspects of the datasource storage
       */
      try {
        HashMap<String, ExportDataSource> dataSourcesForPartition =
            m_dataSourcesByPartition.get(partition);
        if (dataSourcesForPartition == null) {
          dataSourcesForPartition = new HashMap<String, ExportDataSource>();
          m_dataSourcesByPartition.put(partition, dataSourcesForPartition);
        }
        ExportDataSource exportDataSource =
            new ExportDataSource(
                m_onSourceDrained,
                "database",
                table.getTypeName(),
                partition,
                site,
                table.getSignature(),
                m_timestamp,
                table.getColumns(),
                m_directory.getPath());
        m_numSources++;
        exportLog.info(
            "Creating ExportDataSource for table "
                + table.getTypeName()
                + " signature "
                + table.getSignature()
                + " partition id "
                + partition);
        dataSourcesForPartition.put(table.getSignature(), exportDataSource);
      } catch (IOException e) {
        VoltDB.crashLocalVoltDB(
            "Error creating datasources for table " + table.getTypeName() + " host id " + hostId,
            true,
            e);
      }
    }
  }

  public void pushExportBuffer(
      int partitionId,
      String signature,
      long uso,
      long bufferPtr,
      ByteBuffer buffer,
      boolean sync,
      boolean endOfStream) {
    // System.out.println("In generation " + m_timestamp + " partition " + partitionId
    //     + " signature " + signature
    //     + (buffer == null ? " null buffer " : (" buffer length " + buffer.remaining())));
    // for (Integer i : m_dataSourcesByPartition.keySet()) {
    //     System.out.println("Have partition " + i);
    // }
    assert (m_dataSourcesByPartition.containsKey(partitionId));
    assert (m_dataSourcesByPartition.get(partitionId).containsKey(signature));
    HashMap<String, ExportDataSource> sources = m_dataSourcesByPartition.get(partitionId);

    if (sources == null) {
      exportLog.error(
          "Could not find export data sources for partition "
              + partitionId
              + " generation "
              + m_timestamp
              + " the export data is being discarded");
      DBBPool.deleteCharArrayMemory(bufferPtr);
      return;
    }

    ExportDataSource source = sources.get(signature);
    if (source == null) {
      exportLog.error(
          "Could not find export data source for partition "
              + partitionId
              + " signature "
              + signature
              + " generation "
              + m_timestamp
              + " the export data is being discarded");
      DBBPool.deleteCharArrayMemory(bufferPtr);
      return;
    }

    source.pushExportBuffer(uso, bufferPtr, buffer, sync, endOfStream);
  }

  public void closeAndDelete() throws IOException {
    List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>();
    for (HashMap<String, ExportDataSource> map : m_dataSourcesByPartition.values()) {
      for (ExportDataSource source : map.values()) {
        tasks.add(source.closeAndDelete());
      }
    }
    try {
      Futures.allAsList(tasks).get();
    } catch (Exception e) {
      Throwables.propagateIfPossible(e, IOException.class);
    }
    shutdown = true;
    VoltFile.recursivelyDelete(m_directory);
  }

  /*
   * Returns true if the generation was completely truncated away.
   */
  public boolean truncateExportToTxnId(long txnId, long[] perPartitionTxnIds) {
    // create an easy partitionId:txnId lookup.
    HashMap<Integer, Long> partitionToTxnId = new HashMap<Integer, Long>();
    for (long tid : perPartitionTxnIds) {
      partitionToTxnId.put(TxnEgo.getPartitionId(tid), tid);
    }

    List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>();

    // Pre-iv2, the truncation point is the snapshot transaction id.
    // In iv2, truncate at the per-partition txn id recorded in the snapshot.
    for (HashMap<String, ExportDataSource> dataSources : m_dataSourcesByPartition.values()) {
      for (ExportDataSource source : dataSources.values()) {
        if (VoltDB.instance().isIV2Enabled()) {
          Long truncationPoint = partitionToTxnId.get(source.getPartitionId());
          if (truncationPoint == null) {
            exportLog.error(
                "Snapshot "
                    + txnId
                    + " does not include truncation point for partition "
                    + source.getPartitionId());
          } else {
            tasks.add(source.truncateExportToTxnId(truncationPoint));
          }
        } else {
          tasks.add(source.truncateExportToTxnId(txnId));
        }
      }
    }

    try {
      Futures.allAsList(tasks).get();
    } catch (Exception e) {
      VoltDB.crashLocalVoltDB(
          "Unexpected exception truncating export data during snapshot restore. "
              + "You can back up export overflow data and start the "
              + "DB without it to get past this error",
          true,
          e);
    }

    return m_drainedSources.get() == m_numSources;
  }

  public void close() {
    List<ListenableFuture<?>> tasks = new ArrayList<ListenableFuture<?>>();
    for (HashMap<String, ExportDataSource> sources : m_dataSourcesByPartition.values()) {
      for (ExportDataSource source : sources.values()) {
        tasks.add(source.close());
      }
    }
    try {
      Futures.allAsList(tasks).get();
    } catch (Exception e) {
      // Logging of errors is done inside the tasks; intentionally not failing
      // if there is an issue with close.
      exportLog.error("Error closing export data sources", e);
    }
    shutdown = true;
  }

  /**
   * Indicate to all associated {@link ExportDataSource}s that they should assume the mastership
   * role for the given partition id.
   *
   * @param partitionId the partition to accept mastership for
   */
  public void acceptMastershipTask(int partitionId) {
    HashMap<String, ExportDataSource> partitionDataSourceMap =
        m_dataSourcesByPartition.get(partitionId);
    exportLog.info(
        "Export generation " + m_timestamp + " accepting mastership for partition " + partitionId);
    for (ExportDataSource eds : partitionDataSourceMap.values()) {
      try {
        eds.acceptMastership();
      } catch (Exception e) {
        exportLog.error("Unable to start exporting", e);
      }
    }
  }

  @Override
  public String toString() {
    return "Export Generation - " + m_timestamp.toString();
  }
}