@Override
  public void scheduleStop(VMInstanceVO vm, long hostId, WorkType type) {
    assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop);

    if (_haDao.hasBeenScheduled(vm.getId(), type)) {
      s_logger.info("There's already a job scheduled to stop " + vm);
      return;
    }

    HaWorkVO work =
        new HaWorkVO(
            vm.getId(),
            vm.getType(),
            type,
            Step.Scheduled,
            hostId,
            vm.getState(),
            0,
            vm.getUpdated());
    _haDao.persist(work);
    if (s_logger.isDebugEnabled()) {
      s_logger.debug("Scheduled " + work);
    }
    wakeupWorkers();
  }
  public Long migrate(final HaWorkVO work) {
    long vmId = work.getInstanceId();

    long srcHostId = work.getHostId();
    try {
      work.setStep(Step.Migrating);
      _haDao.update(work.getId(), work);

      VMInstanceVO vm = _instanceDao.findById(vmId);
      // First try starting the vm with its original planner, if it doesn't succeed send HAPlanner
      // as its an emergency.
      boolean result = false;
      try {
        _itMgr.migrateAway(vm.getUuid(), srcHostId, null);
      } catch (InsufficientServerCapacityException e) {
        s_logger.warn("Failed to deploy vm " + vmId + " with original planner, sending HAPlanner");
        _itMgr.migrateAway(vm.getUuid(), srcHostId, _haPlanners.get(0));
      }
      return null;
    } catch (InsufficientServerCapacityException e) {
      s_logger.warn("Insufficient capacity for migrating a VM.");
      _resourceMgr.maintenanceFailed(srcHostId);
      return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
    }
  }
 @Override
 public List<VMInstanceVO> findTakenMigrationWork() {
   List<HaWorkVO> works = _haDao.findTakenWorkItems(WorkType.Migration);
   List<VMInstanceVO> vms = new ArrayList<VMInstanceVO>(works.size());
   for (HaWorkVO work : works) {
     vms.add(_instanceDao.findById(work.getInstanceId()));
   }
   return vms;
 }
  @Override
  public boolean configure(final String name, final Map<String, Object> xmlParams)
      throws ConfigurationException {
    _serverId = _msServer.getId();

    Map<String, String> params = new HashMap<String, String>();
    params = _configDao.getConfiguration(Long.toHexString(_serverId), xmlParams);

    String value = params.get(Config.HAWorkers.key());
    final int count = NumbersUtil.parseInt(value, 1);
    _workers = new WorkerThread[count];
    for (int i = 0; i < _workers.length; i++) {
      _workers[i] = new WorkerThread("HA-Worker-" + i);
    }

    value = params.get("force.ha");
    _forceHA = Boolean.parseBoolean(value);

    value = params.get("time.to.sleep");
    _timeToSleep = NumbersUtil.parseInt(value, 60) * 1000;

    value = params.get("max.retries");
    _maxRetries = NumbersUtil.parseInt(value, 5);

    value = params.get("time.between.failures");
    _timeBetweenFailures = NumbersUtil.parseLong(value, 3600) * 1000;

    value = params.get("time.between.cleanup");
    _timeBetweenCleanups = NumbersUtil.parseLong(value, 3600 * 24);

    value = params.get("stop.retry.interval");
    _stopRetryInterval = NumbersUtil.parseInt(value, 10 * 60);

    value = params.get("restart.retry.interval");
    _restartRetryInterval = NumbersUtil.parseInt(value, 10 * 60);

    value = params.get("investigate.retry.interval");
    _investigateRetryInterval = NumbersUtil.parseInt(value, 1 * 60);

    value = params.get("migrate.retry.interval");
    _migrateRetryInterval = NumbersUtil.parseInt(value, 2 * 60);

    _instance = params.get("instance");
    if (_instance == null) {
      _instance = "VMOPS";
    }

    _haTag = params.get("ha.tag");

    _haDao.releaseWorkItems(_serverId);

    _stopped = true;

    _executor = Executors.newScheduledThreadPool(count, new NamedThreadFactory("HA"));

    return true;
  }
  @Override
  public void scheduleStop(final VMInstanceVO vm, long hostId, boolean verifyHost) {

    if (_haDao.hasBeenScheduled(vm.getId(), verifyHost ? WorkType.CheckStop : WorkType.Stop)) {
      s_logger.info("There's already a job scheduled to stop " + vm.toString());
      return;
    }

    final HaWorkVO work =
        new HaWorkVO(
            vm.getId(),
            vm.getType(),
            verifyHost ? WorkType.CheckStop : WorkType.Stop,
            Step.Scheduled,
            hostId,
            vm.getState(),
            0,
            vm.getUpdated());
    _haDao.persist(work);
    if (s_logger.isDebugEnabled()) {
      s_logger.debug("Scheduled " + work.toString() + " verifyHost = " + verifyHost);
    }
    wakeupWorkers();
  }
 @Override
 public boolean scheduleMigration(final VMInstanceVO vm) {
   final HaWorkVO work =
       new HaWorkVO(
           vm.getId(),
           vm.getType(),
           WorkType.Migration,
           Step.Scheduled,
           vm.getHostId(),
           vm.getState(),
           0,
           vm.getUpdated());
   _haDao.persist(work);
   wakeupWorkers();
   return true;
 }
 @Override
 public void scheduleDestroy(VMInstanceVO vm, long hostId) {
   final HaWorkVO work =
       new HaWorkVO(
           vm.getId(),
           vm.getType(),
           WorkType.Destroy,
           Step.Scheduled,
           hostId,
           vm.getState(),
           0,
           vm.getUpdated());
   _haDao.persist(work);
   if (s_logger.isDebugEnabled()) {
     s_logger.debug("Scheduled " + work.toString());
   }
   wakeupWorkers();
 }
  public Long migrate(final HaWorkVO work) {
    long vmId = work.getInstanceId();

    long srcHostId = work.getHostId();
    try {
      work.setStep(Step.Migrating);
      _haDao.update(work.getId(), work);

      if (!_itMgr.migrateAway(work.getType(), vmId, srcHostId)) {
        s_logger.warn("Unable to migrate vm from " + srcHostId);
        _resourceMgr.maintenanceFailed(srcHostId);
      }
      return null;
    } catch (InsufficientServerCapacityException e) {
      s_logger.warn("Insufficient capacity for migrating a VM.");
      _resourceMgr.maintenanceFailed(srcHostId);
      return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
    } catch (VirtualMachineMigrationException e) {
      s_logger.warn("Looks like VM is still starting, we need to retry migrating the VM later.");
      _resourceMgr.maintenanceFailed(srcHostId);
      return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
    }
  }
  @Override
  public void cancelScheduledMigrations(final HostVO host) {
    WorkType type = host.getType() == HostVO.Type.Storage ? WorkType.Stop : WorkType.Migration;

    _haDao.deleteMigrationWorkItems(host.getId(), type, _serverId);
  }
 @Override
 public void cancelDestroy(VMInstanceVO vm, Long hostId) {
   _haDao.delete(vm.getId(), WorkType.Destroy);
 }
  protected Long restart(HaWorkVO work) {
    List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
    if (items.size() > 0) {
      StringBuilder str =
          new StringBuilder(
              "Cancelling this work item because newer ones have been scheduled.  Work Ids = [");
      for (HaWorkVO item : items) {
        str.append(item.getId()).append(", ");
      }
      str.delete(str.length() - 2, str.length()).append("]");
      s_logger.info(str.toString());
      return null;
    }

    items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
    if (items.size() > 0) {
      StringBuilder str =
          new StringBuilder(
              "Waiting because there's HA work being executed on an item currently.  Work Ids =[");
      for (HaWorkVO item : items) {
        str.append(item.getId()).append(", ");
      }
      str.delete(str.length() - 2, str.length()).append("]");
      s_logger.info(str.toString());
      return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
    }

    long vmId = work.getInstanceId();

    VMInstanceVO vm = _itMgr.findByIdAndType(work.getType(), work.getInstanceId());
    if (vm == null) {
      s_logger.info("Unable to find vm: " + vmId);
      return null;
    }

    s_logger.info("HA on " + vm);
    if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
      s_logger.info(
          "VM "
              + vm
              + " has been changed.  Current State = "
              + vm.getState()
              + " Previous State = "
              + work.getPreviousState()
              + " last updated = "
              + vm.getUpdated()
              + " previous updated = "
              + work.getUpdateTime());
      return null;
    }

    short alertType = AlertManager.ALERT_TYPE_USERVM;
    if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER;
    } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY;
    } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_SSVM;
    }

    HostVO host = _hostDao.findById(work.getHostId());
    boolean isHostRemoved = false;
    if (host == null) {
      host = _hostDao.findByIdIncludingRemoved(work.getHostId());
      if (host != null) {
        s_logger.debug(
            "VM "
                + vm.toString()
                + " is now no longer on host "
                + work.getHostId()
                + " as the host is removed");
        isHostRemoved = true;
      }
    }

    DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
    HostPodVO podVO = _podDao.findById(host.getPodId());
    String hostDesc =
        "name: "
            + host.getName()
            + "(id:"
            + host.getId()
            + "), availability zone: "
            + dcVO.getName()
            + ", pod: "
            + podVO.getName();

    Boolean alive = null;
    if (work.getStep() == Step.Investigating) {
      if (!isHostRemoved) {
        if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) {
          s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId());
          return null;
        }

        Enumeration<Investigator> en = _investigators.enumeration();
        Investigator investigator = null;
        while (en.hasMoreElements()) {
          investigator = en.nextElement();
          alive = investigator.isVmAlive(vm, host);
          s_logger.info(investigator.getName() + " found " + vm + "to be alive? " + alive);
          if (alive != null) {
            break;
          }
        }
        boolean fenced = false;
        if (alive == null) {
          s_logger.debug("Fencing off VM that we don't know the state of");
          Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration();
          while (enfb.hasMoreElements()) {
            FenceBuilder fb = enfb.nextElement();
            Boolean result = fb.fenceOff(vm, host);
            s_logger.info("Fencer " + fb.getName() + " returned " + result);
            if (result != null && result) {
              fenced = true;
              break;
            }
          }
        } else if (!alive) {
          fenced = true;
        } else {
          s_logger.debug(
              "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName());
          if (host.getStatus() == Status.Up) {
            s_logger.info(vm + " is alive and host is up. No need to restart it.");
            return null;
          } else {
            s_logger.debug("Rescheduling because the host is not up but the vm is alive");
            return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
          }
        }

        if (!fenced) {
          s_logger.debug("We were unable to fence off the VM " + vm);
          _alertMgr.sendAlert(
              alertType,
              vm.getDataCenterIdToDeployIn(),
              vm.getPodIdToDeployIn(),
              "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
              "Insufficient capacity to restart VM, name: "
                  + vm.getHostName()
                  + ", id: "
                  + vmId
                  + " which was running on host "
                  + hostDesc);
          return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
        }

        try {
          _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
        } catch (ResourceUnavailableException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (OperationTimedoutException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (ConcurrentOperationException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        }

        work.setStep(Step.Scheduled);
        _haDao.update(work.getId(), work);
      } else {
        s_logger.debug(
            "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways");
        try {
          _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
        } catch (ResourceUnavailableException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (OperationTimedoutException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        } catch (ConcurrentOperationException e) {
          assert false : "How do we hit this when force is true?";
          throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
        }
      }
    }

    vm = _itMgr.findByIdAndType(vm.getType(), vm.getId());

    if (!_forceHA && !vm.isHaEnabled()) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM is not HA enabled so we're done.");
      }
      return null; // VM doesn't require HA
    }

    if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM can not restart on another server.");
      }
      return null;
    }

    if (work.getTimesTried() > _maxRetries) {
      s_logger.warn("Retried to max times so deleting: " + vmId);
      return null;
    }

    try {
      VMInstanceVO started =
          _itMgr.advanceStart(
              vm,
              new HashMap<VirtualMachineProfile.Param, Object>(),
              _accountMgr.getSystemUser(),
              _accountMgr.getSystemAccount());
      if (started != null) {
        s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId());
        return null;
      }

      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
      }
    } catch (final InsufficientCapacityException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterIdToDeployIn(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "Insufficient capacity to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (final ResourceUnavailableException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterIdToDeployIn(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (ConcurrentOperationException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterIdToDeployIn(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (OperationTimedoutException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterIdToDeployIn(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    }
    vm = _itMgr.findByIdAndType(vm.getType(), vm.getId());
    work.setUpdateTime(vm.getUpdated());
    work.setPreviousState(vm.getState());
    return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
  }
  @Override
  public void scheduleRestart(VMInstanceVO vm, boolean investigate) {
    Long hostId = vm.getHostId();
    if (hostId == null) {
      try {
        s_logger.debug("Found a vm that is scheduled to be restarted but has no host id: " + vm);
        _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
      } catch (ResourceUnavailableException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (OperationTimedoutException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (ConcurrentOperationException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      }
      return;
    }

    if (vm.getHypervisorType() == HypervisorType.VMware) {
      s_logger.info("Skip HA for VMware VM " + vm.getInstanceName());
      return;
    }

    if (!investigate) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "VM does not require investigation so I'm marking it as Stopped: " + vm.toString());
      }

      short alertType = AlertManager.ALERT_TYPE_USERVM;
      if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
        alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER;
      } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
        alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY;
      } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
        alertType = AlertManager.ALERT_TYPE_SSVM;
      }

      if (!(_forceHA || vm.isHaEnabled())) {
        String hostDesc =
            "id:"
                + vm.getHostId()
                + ", availability zone id:"
                + vm.getDataCenterIdToDeployIn()
                + ", pod id:"
                + vm.getPodIdToDeployIn();
        _alertMgr.sendAlert(
            alertType,
            vm.getDataCenterIdToDeployIn(),
            vm.getPodIdToDeployIn(),
            "VM (name: "
                + vm.getHostName()
                + ", id: "
                + vm.getId()
                + ") stopped unexpectedly on host "
                + hostDesc,
            "Virtual Machine "
                + vm.getHostName()
                + " (id: "
                + vm.getId()
                + ") running on host ["
                + vm.getHostId()
                + "] stopped unexpectedly.");

        if (s_logger.isDebugEnabled()) {
          s_logger.debug("VM is not HA enabled so we're done.");
        }
      }

      try {
        _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount());
      } catch (ResourceUnavailableException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (OperationTimedoutException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (ConcurrentOperationException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      }
    }

    List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
    int maxRetries = 0;
    for (HaWorkVO item : items) {
      if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
        maxRetries = item.getTimesTried();
        break;
      }
    }

    HaWorkVO work =
        new HaWorkVO(
            vm.getId(),
            vm.getType(),
            WorkType.HA,
            investigate ? Step.Investigating : Step.Scheduled,
            hostId,
            vm.getState(),
            maxRetries + 1,
            vm.getUpdated());
    _haDao.persist(work);

    if (s_logger.isInfoEnabled()) {
      s_logger.info("Schedule vm for HA:  " + vm);
    }

    wakeupWorkers();
  }
 @Override
 public void onManagementNodeLeft(List<ManagementServerHostVO> nodeList, long selfNodeId) {
   for (ManagementServerHostVO node : nodeList) {
     _haDao.releaseWorkItems(node.getMsid());
   }
 }
  public Long migrate(final HaWorkVO work) {
    final long vmId = work.getInstanceId();

    final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType());

    VMInstanceVO vm = mgr.get(vmId);
    if (vm == null || vm.getRemoved() != null) {
      s_logger.debug("Unable to find the vm " + vmId);
      return null;
    }

    s_logger.info("Migrating vm: " + vm.toString());
    if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) {
      s_logger.info("VM is not longer running on the current hostId");
      return null;
    }

    short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE;
    if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER_MIGRATE;
    } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY_MIGRATE;
    }

    HostVO fromHost = _hostDao.findById(vm.getHostId());
    String fromHostName = ((fromHost == null) ? "unknown" : fromHost.getName());
    HostVO toHost = null;
    if (work.getStep() == Step.Scheduled) {
      if (vm.getState() != State.Running) {
        s_logger.info(
            "VM's state is not ready for migration. "
                + vm.toString()
                + " State is "
                + vm.getState().toString());
        return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
      }

      DataCenterVO dcVO = _dcDao.findById(fromHost.getDataCenterId());
      HostPodVO podVO = _podDao.findById(fromHost.getPodId());

      try {
        toHost = mgr.prepareForMigration(vm);
        if (toHost == null) {
          if (s_logger.isDebugEnabled()) {
            s_logger.debug("Unable to find a host for migrating vm " + vmId);
          }
          _alertMgr.sendAlert(
              alertType,
              vm.getDataCenterId(),
              vm.getPodId(),
              "Unable to migrate vm "
                  + vm.getName()
                  + " from host "
                  + fromHostName
                  + " in zone "
                  + dcVO.getName()
                  + " and pod "
                  + podVO.getName(),
              "Unable to find a suitable host");
        }
      } catch (final InsufficientCapacityException e) {
        s_logger.warn("Unable to mgirate due to insufficient capacity " + vm.toString());
        _alertMgr.sendAlert(
            alertType,
            vm.getDataCenterId(),
            vm.getPodId(),
            "Unable to migrate vm "
                + vm.getName()
                + " from host "
                + fromHostName
                + " in zone "
                + dcVO.getName()
                + " and pod "
                + podVO.getName(),
            "Insufficient capacity");
      } catch (final StorageUnavailableException e) {
        s_logger.warn("Storage is unavailable: " + vm.toString());
        _alertMgr.sendAlert(
            alertType,
            vm.getDataCenterId(),
            vm.getPodId(),
            "Unable to migrate vm "
                + vm.getName()
                + " from host "
                + fromHostName
                + " in zone "
                + dcVO.getName()
                + " and pod "
                + podVO.getName(),
            "Storage is gone.");
      }

      if (toHost == null) {
        _agentMgr.maintenanceFailed(vm.getHostId());
        return null;
      }

      if (s_logger.isDebugEnabled()) {
        s_logger.debug("Migrating from " + work.getHostId() + " to " + toHost.getId());
      }
      work.setStep(Step.Migrating);
      work.setHostId(toHost.getId());
      _haDao.update(work.getId(), work);
    }

    if (work.getStep() == Step.Migrating) {
      vm = mgr.get(vmId); // let's see if anything has changed.
      boolean migrated = false;
      if (vm == null
          || vm.getRemoved() != null
          || vm.getHostId() == null
          || !_itMgr.stateTransitTo(vm, Event.MigrationRequested, vm.getHostId())) {
        s_logger.info("Migration cancelled because state has changed: " + vm.toString());
      } else {
        try {
          boolean isWindows =
              _guestOSCategoryDao
                  .findById(_guestOSDao.findById(vm.getGuestOSId()).getCategoryId())
                  .getName()
                  .equalsIgnoreCase("Windows");
          MigrateCommand cmd =
              new MigrateCommand(vm.getInstanceName(), toHost.getPrivateIpAddress(), isWindows);
          Answer answer = _agentMgr.send(fromHost.getId(), cmd);
          if (answer != null && answer.getResult()) {
            migrated = true;
            _storageMgr.unshare(vm, fromHost);
            work.setStep(Step.Investigating);
            _haDao.update(work.getId(), work);
          }
        } catch (final AgentUnavailableException e) {
          s_logger.debug("host became unavailable");
        } catch (final OperationTimedoutException e) {
          s_logger.debug("operation timed out");
          if (e.isActive()) {
            scheduleRestart(vm, true);
          }
        }
      }

      if (!migrated) {
        s_logger.info("Migration was unsuccessful.  Cleaning up: " + vm.toString());

        DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId());
        HostPodVO podVO = _podDao.findById(vm.getPodId());
        _alertMgr.sendAlert(
            alertType,
            fromHost.getDataCenterId(),
            fromHost.getPodId(),
            "Unable to migrate vm "
                + vm.getName()
                + " from host "
                + fromHost.getName()
                + " in zone "
                + dcVO.getName()
                + " and pod "
                + podVO.getName(),
            "Migrate Command failed.  Please check logs.");

        _itMgr.stateTransitTo(vm, Event.MigrationFailedOnSource, toHost.getId());
        _agentMgr.maintenanceFailed(vm.getHostId());

        Command cleanup = mgr.cleanup(vm, null);
        _agentMgr.easySend(toHost.getId(), cleanup);
        _storageMgr.unshare(vm, toHost);

        return null;
      }
    }

    if (toHost == null) {
      toHost = _hostDao.findById(work.getHostId());
    }
    DataCenterVO dcVO = _dcDao.findById(toHost.getDataCenterId());
    HostPodVO podVO = _podDao.findById(toHost.getPodId());

    try {
      if (!mgr.completeMigration(vm, toHost)) {
        _alertMgr.sendAlert(
            alertType,
            toHost.getDataCenterId(),
            toHost.getPodId(),
            "Unable to migrate "
                + vmId
                + " to host "
                + toHost.getName()
                + " in zone "
                + dcVO.getName()
                + " and pod "
                + podVO.getName(),
            "Migration not completed");
        s_logger.warn("Unable to complete migration: " + vm.toString());
      } else {
        s_logger.info("Migration is complete: " + vm.toString());
      }
      return null;
    } catch (final AgentUnavailableException e) {
      s_logger.warn("Agent is unavailable for " + vm.toString());
    } catch (final OperationTimedoutException e) {
      s_logger.warn("Operation timed outfor " + vm.toString());
    }
    _itMgr.stateTransitTo(vm, Event.MigrationFailedOnDest, toHost.getId());
    return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
  }
  protected Long restart(final HaWorkVO work) {
    final long vmId = work.getInstanceId();

    final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType());
    if (mgr == null) {
      s_logger.warn(
          "Unable to find a handler for " + work.getType().toString() + ", throwing out " + vmId);
      return null;
    }

    VMInstanceVO vm = mgr.get(vmId);
    if (vm == null) {
      s_logger.info("Unable to find vm: " + vmId);
      return null;
    }

    s_logger.info("HA on " + vm.toString());
    if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
      s_logger.info(
          "VM "
              + vm.toString()
              + " has been changed.  Current State = "
              + vm.getState()
              + " Previous State = "
              + work.getPreviousState()
              + " last updated = "
              + vm.getUpdated()
              + " previous updated = "
              + work.getUpdateTime());
      return null;
    }

    final HostVO host = _hostDao.findById(work.getHostId());

    DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
    HostPodVO podVO = _podDao.findById(host.getPodId());
    String hostDesc =
        "name: "
            + host.getName()
            + "(id:"
            + host.getId()
            + "), availability zone: "
            + dcVO.getName()
            + ", pod: "
            + podVO.getName();

    short alertType = AlertManager.ALERT_TYPE_USERVM;
    if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER;
    } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
      alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY;
    }

    Boolean alive = null;
    if (work.getStep() == Step.Investigating) {
      if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) {
        s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId());
        if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) {
          _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null);
        }
        return null;
      }

      Enumeration<Investigator> en = _investigators.enumeration();
      Investigator investigator = null;
      while (en.hasMoreElements()) {
        investigator = en.nextElement();
        alive = investigator.isVmAlive(vm, host);
        if (alive != null) {
          s_logger.debug(
              investigator.getName() + " found VM " + vm.getName() + "to be alive? " + alive);
          break;
        }
      }
      if (alive != null && alive) {
        s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName());
        if (host.getStatus() == Status.Up) {
          compareState(vm, new AgentVmInfo(vm.getInstanceName(), mgr, State.Running), false);
          return null;
        } else {
          s_logger.debug("Rescheduling because the host is not up but the vm is alive");
          return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
        }
      }

      boolean fenced = false;
      if (alive == null || !alive) {
        fenced = true;
        s_logger.debug("Fencing off VM that we don't know the state of");
        Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration();
        while (enfb.hasMoreElements()) {
          final FenceBuilder fb = enfb.nextElement();
          Boolean result = fb.fenceOff(vm, host);
          if (result != null && !result) {
            fenced = false;
          }
        }
      }

      if (alive == null && !fenced) {
        s_logger.debug("We were unable to fence off the VM " + vm.toString());
        _alertMgr.sendAlert(
            alertType,
            vm.getDataCenterId(),
            vm.getPodId(),
            "Unable to restart " + vm.getName() + " which was running on host " + hostDesc,
            "Insufficient capacity to restart VM, name: "
                + vm.getName()
                + ", id: "
                + vmId
                + " which was running on host "
                + hostDesc);
        return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
      }

      mgr.completeStopCommand(vm);

      work.setStep(Step.Scheduled);
      _haDao.update(work.getId(), work);
    }

    // send an alert for VMs that stop unexpectedly
    _alertMgr.sendAlert(
        alertType,
        vm.getDataCenterId(),
        vm.getPodId(),
        "VM (name: "
            + vm.getName()
            + ", id: "
            + vmId
            + ") stopped unexpectedly on host "
            + hostDesc,
        "Virtual Machine "
            + vm.getName()
            + " (id: "
            + vm.getId()
            + ") running on host ["
            + hostDesc
            + "] stopped unexpectedly.");

    vm = mgr.get(vm.getId());

    if (!_forceHA && !vm.isHaEnabled()) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM is not HA enabled so we're done.");
      }
      return null; // VM doesn't require HA
    }

    if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM can not restart on another server.");
      }
      return null;
    }

    if (work.getTimesTried() > _maxRetries) {
      s_logger.warn("Retried to max times so deleting: " + vmId);
      return null;
    }

    try {
      VMInstanceVO started = mgr.start(vm.getId(), 0);
      if (started != null) {
        s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId());
        return null;
      }

      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
      }
      vm = mgr.get(vm.getId());
      work.setUpdateTime(vm.getUpdated());
      work.setPreviousState(vm.getState());
      return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
    } catch (final InsufficientCapacityException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodId(),
          "Unable to restart " + vm.getName() + " which was running on host " + hostDesc,
          "Insufficient capacity to restart VM, name: "
              + vm.getName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
      return null;
    } catch (final StorageUnavailableException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodId(),
          "Unable to restart " + vm.getName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
      return null;
    } catch (ConcurrentOperationException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodId(),
          "Unable to restart " + vm.getName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
      return null;
    } catch (ExecutionException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodId(),
          "Unable to restart " + vm.getName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
      return null;
    }
  }
  @Override
  public void scheduleRestart(VMInstanceVO vm, final boolean investigate) {
    Long hostId = vm.getHostId();
    VirtualMachineGuru<VMInstanceVO> mgr = findManager(vm.getType());
    vm = mgr.get(vm.getId());
    if (!investigate) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "VM does not require investigation so I'm marking it as Stopped: " + vm.toString());
      }

      short alertType = AlertManager.ALERT_TYPE_USERVM;
      if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
        alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER;
      } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
        alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY;
      }

      if (!(_forceHA || vm.isHaEnabled())) {

        _alertMgr.sendAlert(
            alertType,
            vm.getDataCenterId(),
            vm.getPodId(),
            "VM (name: "
                + vm.getName()
                + ", id: "
                + vm.getId()
                + ") stopped unexpectedly on host "
                + vm.getHostId(),
            "Virtual Machine "
                + vm.getName()
                + " (id: "
                + vm.getId()
                + ") running on host ["
                + vm.getHostId()
                + "] stopped unexpectedly.");

        if (s_logger.isDebugEnabled()) {
          s_logger.debug("VM is not HA enabled so we're done.");
        }
      }

      mgr.completeStopCommand(vm);
    }

    final List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
    int maxRetries = 0;
    for (final HaWorkVO item : items) {
      if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
        maxRetries = item.getTimesTried();
        break;
      }
    }

    final HaWorkVO work =
        new HaWorkVO(
            vm.getId(),
            vm.getType(),
            WorkType.HA,
            investigate ? Step.Investigating : Step.Scheduled,
            hostId,
            vm.getState(),
            maxRetries + 1,
            vm.getUpdated());
    _haDao.persist(work);

    if (s_logger.isInfoEnabled()) {
      s_logger.info("Schedule vm for HA:  " + vm.toString());
    }

    wakeupWorkers();
  }