@Override public void scheduleRestartForVmsOnHost(final HostVO host) { if (host.getType() != Host.Type.Routing) { return; } s_logger.warn("Scheduling restart for VMs on host " + host.getId()); final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId()); final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); // send an email alert that the host is down StringBuilder sb = null; if ((vms != null) && !vms.isEmpty()) { sb = new StringBuilder(); sb.append(" Starting HA on the following VMs: "); // collect list of vm names for the alert email VMInstanceVO vm = vms.get(0); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } for (int i = 1; i < vms.size(); i++) { vm = vms.get(i); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } } } // send an email alert that the host is down, include VMs HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( AlertManager.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc, "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : "")); for (final VMInstanceVO vm : vms) { if (s_logger.isDebugEnabled()) { s_logger.debug("Notifying HA Mgr of to investigate vm " + vm.getId() + "-" + vm.getName()); } scheduleRestart(vm, true); } }
public Long migrate(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); VMInstanceVO vm = mgr.get(vmId); if (vm == null || vm.getRemoved() != null) { s_logger.debug("Unable to find the vm " + vmId); return null; } s_logger.info("Migrating vm: " + vm.toString()); if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM is not longer running on the current hostId"); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER_MIGRATE; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY_MIGRATE; } HostVO fromHost = _hostDao.findById(vm.getHostId()); String fromHostName = ((fromHost == null) ? "unknown" : fromHost.getName()); HostVO toHost = null; if (work.getStep() == Step.Scheduled) { if (vm.getState() != State.Running) { s_logger.info( "VM's state is not ready for migration. " + vm.toString() + " State is " + vm.getState().toString()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; } DataCenterVO dcVO = _dcDao.findById(fromHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(fromHost.getPodId()); try { toHost = mgr.prepareForMigration(vm); if (toHost == null) { if (s_logger.isDebugEnabled()) { s_logger.debug("Unable to find a host for migrating vm " + vmId); } _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Unable to find a suitable host"); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to mgirate due to insufficient capacity " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Insufficient capacity"); } catch (final StorageUnavailableException e) { s_logger.warn("Storage is unavailable: " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Storage is gone."); } if (toHost == null) { _agentMgr.maintenanceFailed(vm.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug("Migrating from " + work.getHostId() + " to " + toHost.getId()); } work.setStep(Step.Migrating); work.setHostId(toHost.getId()); _haDao.update(work.getId(), work); } if (work.getStep() == Step.Migrating) { vm = mgr.get(vmId); // let's see if anything has changed. boolean migrated = false; if (vm == null || vm.getRemoved() != null || vm.getHostId() == null || !_itMgr.stateTransitTo(vm, Event.MigrationRequested, vm.getHostId())) { s_logger.info("Migration cancelled because state has changed: " + vm.toString()); } else { try { boolean isWindows = _guestOSCategoryDao .findById(_guestOSDao.findById(vm.getGuestOSId()).getCategoryId()) .getName() .equalsIgnoreCase("Windows"); MigrateCommand cmd = new MigrateCommand(vm.getInstanceName(), toHost.getPrivateIpAddress(), isWindows); Answer answer = _agentMgr.send(fromHost.getId(), cmd); if (answer != null && answer.getResult()) { migrated = true; _storageMgr.unshare(vm, fromHost); work.setStep(Step.Investigating); _haDao.update(work.getId(), work); } } catch (final AgentUnavailableException e) { s_logger.debug("host became unavailable"); } catch (final OperationTimedoutException e) { s_logger.debug("operation timed out"); if (e.isActive()) { scheduleRestart(vm, true); } } } if (!migrated) { s_logger.info("Migration was unsuccessful. Cleaning up: " + vm.toString()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostPodVO podVO = _podDao.findById(vm.getPodId()); _alertMgr.sendAlert( alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migrate Command failed. Please check logs."); _itMgr.stateTransitTo(vm, Event.MigrationFailedOnSource, toHost.getId()); _agentMgr.maintenanceFailed(vm.getHostId()); Command cleanup = mgr.cleanup(vm, null); _agentMgr.easySend(toHost.getId(), cleanup); _storageMgr.unshare(vm, toHost); return null; } } if (toHost == null) { toHost = _hostDao.findById(work.getHostId()); } DataCenterVO dcVO = _dcDao.findById(toHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(toHost.getPodId()); try { if (!mgr.completeMigration(vm, toHost)) { _alertMgr.sendAlert( alertType, toHost.getDataCenterId(), toHost.getPodId(), "Unable to migrate " + vmId + " to host " + toHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migration not completed"); s_logger.warn("Unable to complete migration: " + vm.toString()); } else { s_logger.info("Migration is complete: " + vm.toString()); } return null; } catch (final AgentUnavailableException e) { s_logger.warn("Agent is unavailable for " + vm.toString()); } catch (final OperationTimedoutException e) { s_logger.warn("Operation timed outfor " + vm.toString()); } _itMgr.stateTransitTo(vm, Event.MigrationFailedOnDest, toHost.getId()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; }
/** * compareState does as its name suggests and compares the states between management server and * agent. It returns whether something should be cleaned up */ protected Command compareState(VMInstanceVO vm, final AgentVmInfo info, final boolean fullSync) { State agentState = info.state; final String agentName = info.name; final State serverState = vm.getState(); final String serverName = vm.getName(); Command command = null; if (s_logger.isDebugEnabled()) { s_logger.debug( "VM " + serverName + ": server state = " + serverState.toString() + " and agent state = " + agentState.toString()); } if (agentState == State.Error) { agentState = State.Stopped; short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } HostPodVO podVO = _podDao.findById(vm.getPodId()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostVO hostVO = _hostDao.findById(vm.getHostId()); String hostDesc = "name: " + hostVO.getName() + " (id:" + hostVO.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vm.getId() + ") stopped on host " + hostDesc + " due to storage failure", "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() + "] stopped due to storage failure."); } if (serverState == State.Migrating) { s_logger.debug("Skipping vm in migrating state: " + vm.toString()); return null; } if (agentState == serverState) { if (s_logger.isDebugEnabled()) { s_logger.debug("Both states are " + agentState.toString() + " for " + serverName); } assert (agentState == State.Stopped || agentState == State.Running) : "If the states we send up is changed, this must be changed."; _itMgr.stateTransitTo( vm, agentState == State.Stopped ? VirtualMachine.Event.AgentReportStopped : VirtualMachine.Event.AgentReportRunning, vm.getHostId()); if (agentState == State.Stopped) { s_logger.debug("State matches but the agent said stopped so let's send a cleanup anyways."); return info.mgr.cleanup(vm, agentName); } return null; } if (agentState == State.Stopped) { // This state means the VM on the agent was detected previously // and now is gone. This is slightly different than if the VM // was never completed but we still send down a Stop Command // to ensure there's cleanup. if (serverState == State.Running) { // Our records showed that it should be running so let's restart it. vm = info.mgr.get(vm.getId()); scheduleRestart(vm, false); command = info.mgr.cleanup(vm, agentName); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM is in stopping state on full sync. Updating the status to stopped"); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("Ignoring VM in stopping mode: " + vm.getName()); } } else if (serverState == State.Starting) { s_logger.debug("Ignoring VM in starting mode: " + vm.getName()); } else { s_logger.debug("Sending cleanup to a stopped vm: " + agentName); _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportStopped, null); command = info.mgr.cleanup(vm, agentName); } } else if (agentState == State.Running) { if (serverState == State.Starting) { if (fullSync) { s_logger.debug("VM state is starting on full sync so updating it to running"); vm = info.mgr.get(vm.getId()); info.mgr.completeStartCommand(vm); } } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is in stopping on fullsync so resend stop."); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("VM is in stopping state so no action."); } } else if (serverState == State.Destroyed || serverState == State.Stopped || serverState == State.Expunging) { s_logger.debug("VM state is in stopped so stopping it on the agent"); vm = info.mgr.get(vm.getId()); command = info.mgr.cleanup(vm, agentName); } else { _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportRunning, vm.getHostId()); } } /*else if (agentState == State.Unknown) { if (serverState == State.Running) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Starting) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is stopping in full sync. Resending stop"); command = info.handler.cleanup(vm, agentName); } } }*/ return command; }