@Override public TemplateProfile prepare(RegisterTemplateCmd cmd) throws ResourceAllocationException { TemplateProfile profile = super.prepare(cmd); if (profile.getZoneId() == null || profile.getZoneId() == -1) { List<DataCenterVO> dcs = _dcDao.listAllIncludingRemoved(); for (DataCenterVO dc : dcs) { List<HostVO> pxeServers = _resourceMgr.listAllHostsInOneZoneByType(Host.Type.BaremetalPxe, dc.getId()); if (pxeServers.size() == 0) { throw new CloudRuntimeException( "Please add PXE server before adding baremetal template in zone " + dc.getName()); } } } else { List<HostVO> pxeServers = _resourceMgr.listAllHostsInOneZoneByType(Host.Type.BaremetalPxe, profile.getZoneId()); if (pxeServers.size() == 0) { throw new CloudRuntimeException( "Please add PXE server before adding baremetal template in zone " + profile.getZoneId()); } } return profile; }
@Override public void scheduleRestartForVmsOnHost(final HostVO host) { if (host.getType() != Host.Type.Routing) { return; } s_logger.warn("Scheduling restart for VMs on host " + host.getId()); final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId()); final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); // send an email alert that the host is down StringBuilder sb = null; if ((vms != null) && !vms.isEmpty()) { sb = new StringBuilder(); sb.append(" Starting HA on the following VMs: "); // collect list of vm names for the alert email VMInstanceVO vm = vms.get(0); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } for (int i = 1; i < vms.size(); i++) { vm = vms.get(i); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } } } // send an email alert that the host is down, include VMs HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( AlertManager.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc, "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : "")); for (final VMInstanceVO vm : vms) { if (s_logger.isDebugEnabled()) { s_logger.debug("Notifying HA Mgr of to investigate vm " + vm.getId() + "-" + vm.getName()); } scheduleRestart(vm, true); } }
@Override @DB public boolean delete(TemplateProfile profile) { boolean success = true; VMTemplateVO template = profile.getTemplate(); Long zoneId = profile.getZoneId(); Long templateId = template.getId(); String zoneName; List<HostVO> secondaryStorageHosts; if (!template.isCrossZones() && zoneId != null) { DataCenterVO zone = _dcDao.findById(zoneId); zoneName = zone.getName(); secondaryStorageHosts = _ssvmMgr.listSecondaryStorageHostsInOneZone(zoneId); } else { zoneName = "(all zones)"; secondaryStorageHosts = _ssvmMgr.listSecondaryStorageHostsInAllZones(); } s_logger.debug( "Attempting to mark template host refs for template: " + template.getName() + " as destroyed in zone: " + zoneName); // Make sure the template is downloaded to all the necessary secondary storage hosts for (HostVO secondaryStorageHost : secondaryStorageHosts) { long hostId = secondaryStorageHost.getId(); List<VMTemplateHostVO> templateHostVOs = _tmpltHostDao.listByHostTemplate(hostId, templateId); for (VMTemplateHostVO templateHostVO : templateHostVOs) { if (templateHostVO.getDownloadState() == Status.DOWNLOAD_IN_PROGRESS) { String errorMsg = "Please specify a template that is not currently being downloaded."; s_logger.debug( "Template: " + template.getName() + " is currently being downloaded to secondary storage host: " + secondaryStorageHost.getName() + "; cant' delete it."); throw new CloudRuntimeException(errorMsg); } } } Account account = _accountDao.findByIdIncludingRemoved(template.getAccountId()); String eventType = ""; if (template.getFormat().equals(ImageFormat.ISO)) { eventType = EventTypes.EVENT_ISO_DELETE; } else { eventType = EventTypes.EVENT_TEMPLATE_DELETE; } // Iterate through all necessary secondary storage hosts and mark the template on each host as // destroyed for (HostVO secondaryStorageHost : secondaryStorageHosts) { long hostId = secondaryStorageHost.getId(); long sZoneId = secondaryStorageHost.getDataCenterId(); List<VMTemplateHostVO> templateHostVOs = _tmpltHostDao.listByHostTemplate(hostId, templateId); for (VMTemplateHostVO templateHostVO : templateHostVOs) { VMTemplateHostVO lock = _tmpltHostDao.acquireInLockTable(templateHostVO.getId()); try { if (lock == null) { s_logger.debug( "Failed to acquire lock when deleting templateHostVO with ID: " + templateHostVO.getId()); success = false; break; } UsageEventVO usageEvent = new UsageEventVO(eventType, account.getId(), sZoneId, templateId, null); _usageEventDao.persist(usageEvent); templateHostVO.setDestroyed(true); _tmpltHostDao.update(templateHostVO.getId(), templateHostVO); String installPath = templateHostVO.getInstallPath(); if (installPath != null) { Answer answer = _agentMgr.sendToSecStorage( secondaryStorageHost, new DeleteTemplateCommand(secondaryStorageHost.getStorageUrl(), installPath)); if (answer == null || !answer.getResult()) { s_logger.debug( "Failed to delete " + templateHostVO + " due to " + ((answer == null) ? "answer is null" : answer.getDetails())); } else { _tmpltHostDao.remove(templateHostVO.getId()); s_logger.debug("Deleted template at: " + installPath); } } else { _tmpltHostDao.remove(templateHostVO.getId()); } VMTemplateZoneVO templateZone = _tmpltZoneDao.findByZoneTemplate(sZoneId, templateId); if (templateZone != null) { _tmpltZoneDao.remove(templateZone.getId()); } } finally { if (lock != null) { _tmpltHostDao.releaseFromLockTable(lock.getId()); } } } if (!success) { break; } } s_logger.debug( "Successfully marked template host refs for template: " + template.getName() + " as destroyed in zone: " + zoneName); // If there are no more non-destroyed template host entries for this template, delete it if (success && (_tmpltHostDao.listByTemplateId(templateId).size() == 0)) { long accountId = template.getAccountId(); VMTemplateVO lock = _tmpltDao.acquireInLockTable(templateId); try { if (lock == null) { s_logger.debug("Failed to acquire lock when deleting template with ID: " + templateId); success = false; } else if (_tmpltDao.remove(templateId)) { // Decrement the number of templates _resourceLimitMgr.decrementResourceCount(accountId, ResourceType.template); } } finally { if (lock != null) { _tmpltDao.releaseFromLockTable(lock.getId()); } } s_logger.debug( "Removed template: " + template.getName() + " because all of its template host refs were marked as destroyed."); } return success; }
protected Long restart(HaWorkVO work) { List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Cancelling this work item because newer ones have been scheduled. Work Ids = ["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return null; } items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Waiting because there's HA work being executed on an item currently. Work Ids =["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } long vmId = work.getInstanceId(); VMInstanceVO vm = _itMgr.findByIdAndType(work.getType(), work.getInstanceId()); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_SSVM; } HostVO host = _hostDao.findById(work.getHostId()); boolean isHostRemoved = false; if (host == null) { host = _hostDao.findByIdIncludingRemoved(work.getHostId()); if (host != null) { s_logger.debug( "VM " + vm.toString() + " is now no longer on host " + work.getHostId() + " as the host is removed"); isHostRemoved = true; } } DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); Boolean alive = null; if (work.getStep() == Step.Investigating) { if (!isHostRemoved) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); s_logger.info(investigator.getName() + " found " + vm + "to be alive? " + alive); if (alive != null) { break; } } boolean fenced = false; if (alive == null) { s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); s_logger.info("Fencer " + fb.getName() + " returned " + result); if (result != null && result) { fenced = true; break; } } } else if (!alive) { fenced = true; } else { s_logger.debug( "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { s_logger.info(vm + " is alive and host is up. No need to restart it."); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } if (!fenced) { s_logger.debug("We were unable to fence off the VM " + vm); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } else { s_logger.debug( "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways"); try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } } } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = _itMgr.advanceStart( vm, new HashMap<VirtualMachineProfile.Param, Object>(), _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (final ResourceUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (OperationTimedoutException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; }
@Override public Host addTrafficMonitor(AddTrafficMonitorCmd cmd) { long zoneId = cmd.getZoneId(); DataCenterVO zone = _dcDao.findById(zoneId); String zoneName; if (zone == null) { throw new InvalidParameterValueException("Could not find zone with ID: " + zoneId); } else { zoneName = zone.getName(); } List<HostVO> trafficMonitorsInZone = _resourceMgr.listAllHostsInOneZoneByType(Host.Type.TrafficMonitor, zoneId); if (trafficMonitorsInZone.size() != 0) { throw new InvalidParameterValueException( "Already added an traffic monitor in zone: " + zoneName); } URI uri; try { uri = new URI(cmd.getUrl()); } catch (Exception e) { s_logger.debug(e); throw new InvalidParameterValueException(e.getMessage()); } String ipAddress = uri.getHost(); // String numRetries = params.get("numretries"); // String timeout = params.get("timeout"); TrafficSentinelResource resource = new TrafficSentinelResource(); String guid = getTrafficMonitorGuid(zoneId, NetworkUsageResourceName.TrafficSentinel, ipAddress); Map<String, Object> hostParams = new HashMap<String, Object>(); hostParams.put("zone", String.valueOf(zoneId)); hostParams.put("ipaddress", ipAddress); hostParams.put("url", cmd.getUrl()); hostParams.put("inclZones", (cmd.getInclZones() != null) ? cmd.getInclZones() : _TSinclZones); hostParams.put("exclZones", (cmd.getExclZones() != null) ? cmd.getExclZones() : _TSexclZones); hostParams.put("guid", guid); hostParams.put("name", guid); try { resource.configure(guid, hostParams); } catch (ConfigurationException e) { throw new CloudRuntimeException(e.getMessage()); } Map<String, String> hostDetails = new HashMap<String, String>(); hostDetails.put("url", cmd.getUrl()); hostDetails.put("last_collection", "" + System.currentTimeMillis()); if (cmd.getInclZones() != null) { hostDetails.put("inclZones", cmd.getInclZones()); } if (cmd.getExclZones() != null) { hostDetails.put("exclZones", cmd.getExclZones()); } Host trafficMonitor = _resourceMgr.addHost(zoneId, resource, Host.Type.TrafficMonitor, hostDetails); return trafficMonitor; }
public Long migrate(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); VMInstanceVO vm = mgr.get(vmId); if (vm == null || vm.getRemoved() != null) { s_logger.debug("Unable to find the vm " + vmId); return null; } s_logger.info("Migrating vm: " + vm.toString()); if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM is not longer running on the current hostId"); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER_MIGRATE; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY_MIGRATE; } HostVO fromHost = _hostDao.findById(vm.getHostId()); String fromHostName = ((fromHost == null) ? "unknown" : fromHost.getName()); HostVO toHost = null; if (work.getStep() == Step.Scheduled) { if (vm.getState() != State.Running) { s_logger.info( "VM's state is not ready for migration. " + vm.toString() + " State is " + vm.getState().toString()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; } DataCenterVO dcVO = _dcDao.findById(fromHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(fromHost.getPodId()); try { toHost = mgr.prepareForMigration(vm); if (toHost == null) { if (s_logger.isDebugEnabled()) { s_logger.debug("Unable to find a host for migrating vm " + vmId); } _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Unable to find a suitable host"); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to mgirate due to insufficient capacity " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Insufficient capacity"); } catch (final StorageUnavailableException e) { s_logger.warn("Storage is unavailable: " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Storage is gone."); } if (toHost == null) { _agentMgr.maintenanceFailed(vm.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug("Migrating from " + work.getHostId() + " to " + toHost.getId()); } work.setStep(Step.Migrating); work.setHostId(toHost.getId()); _haDao.update(work.getId(), work); } if (work.getStep() == Step.Migrating) { vm = mgr.get(vmId); // let's see if anything has changed. boolean migrated = false; if (vm == null || vm.getRemoved() != null || vm.getHostId() == null || !_itMgr.stateTransitTo(vm, Event.MigrationRequested, vm.getHostId())) { s_logger.info("Migration cancelled because state has changed: " + vm.toString()); } else { try { boolean isWindows = _guestOSCategoryDao .findById(_guestOSDao.findById(vm.getGuestOSId()).getCategoryId()) .getName() .equalsIgnoreCase("Windows"); MigrateCommand cmd = new MigrateCommand(vm.getInstanceName(), toHost.getPrivateIpAddress(), isWindows); Answer answer = _agentMgr.send(fromHost.getId(), cmd); if (answer != null && answer.getResult()) { migrated = true; _storageMgr.unshare(vm, fromHost); work.setStep(Step.Investigating); _haDao.update(work.getId(), work); } } catch (final AgentUnavailableException e) { s_logger.debug("host became unavailable"); } catch (final OperationTimedoutException e) { s_logger.debug("operation timed out"); if (e.isActive()) { scheduleRestart(vm, true); } } } if (!migrated) { s_logger.info("Migration was unsuccessful. Cleaning up: " + vm.toString()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostPodVO podVO = _podDao.findById(vm.getPodId()); _alertMgr.sendAlert( alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migrate Command failed. Please check logs."); _itMgr.stateTransitTo(vm, Event.MigrationFailedOnSource, toHost.getId()); _agentMgr.maintenanceFailed(vm.getHostId()); Command cleanup = mgr.cleanup(vm, null); _agentMgr.easySend(toHost.getId(), cleanup); _storageMgr.unshare(vm, toHost); return null; } } if (toHost == null) { toHost = _hostDao.findById(work.getHostId()); } DataCenterVO dcVO = _dcDao.findById(toHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(toHost.getPodId()); try { if (!mgr.completeMigration(vm, toHost)) { _alertMgr.sendAlert( alertType, toHost.getDataCenterId(), toHost.getPodId(), "Unable to migrate " + vmId + " to host " + toHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migration not completed"); s_logger.warn("Unable to complete migration: " + vm.toString()); } else { s_logger.info("Migration is complete: " + vm.toString()); } return null; } catch (final AgentUnavailableException e) { s_logger.warn("Agent is unavailable for " + vm.toString()); } catch (final OperationTimedoutException e) { s_logger.warn("Operation timed outfor " + vm.toString()); } _itMgr.stateTransitTo(vm, Event.MigrationFailedOnDest, toHost.getId()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; }
/** * compareState does as its name suggests and compares the states between management server and * agent. It returns whether something should be cleaned up */ protected Command compareState(VMInstanceVO vm, final AgentVmInfo info, final boolean fullSync) { State agentState = info.state; final String agentName = info.name; final State serverState = vm.getState(); final String serverName = vm.getName(); Command command = null; if (s_logger.isDebugEnabled()) { s_logger.debug( "VM " + serverName + ": server state = " + serverState.toString() + " and agent state = " + agentState.toString()); } if (agentState == State.Error) { agentState = State.Stopped; short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } HostPodVO podVO = _podDao.findById(vm.getPodId()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostVO hostVO = _hostDao.findById(vm.getHostId()); String hostDesc = "name: " + hostVO.getName() + " (id:" + hostVO.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vm.getId() + ") stopped on host " + hostDesc + " due to storage failure", "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() + "] stopped due to storage failure."); } if (serverState == State.Migrating) { s_logger.debug("Skipping vm in migrating state: " + vm.toString()); return null; } if (agentState == serverState) { if (s_logger.isDebugEnabled()) { s_logger.debug("Both states are " + agentState.toString() + " for " + serverName); } assert (agentState == State.Stopped || agentState == State.Running) : "If the states we send up is changed, this must be changed."; _itMgr.stateTransitTo( vm, agentState == State.Stopped ? VirtualMachine.Event.AgentReportStopped : VirtualMachine.Event.AgentReportRunning, vm.getHostId()); if (agentState == State.Stopped) { s_logger.debug("State matches but the agent said stopped so let's send a cleanup anyways."); return info.mgr.cleanup(vm, agentName); } return null; } if (agentState == State.Stopped) { // This state means the VM on the agent was detected previously // and now is gone. This is slightly different than if the VM // was never completed but we still send down a Stop Command // to ensure there's cleanup. if (serverState == State.Running) { // Our records showed that it should be running so let's restart it. vm = info.mgr.get(vm.getId()); scheduleRestart(vm, false); command = info.mgr.cleanup(vm, agentName); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM is in stopping state on full sync. Updating the status to stopped"); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("Ignoring VM in stopping mode: " + vm.getName()); } } else if (serverState == State.Starting) { s_logger.debug("Ignoring VM in starting mode: " + vm.getName()); } else { s_logger.debug("Sending cleanup to a stopped vm: " + agentName); _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportStopped, null); command = info.mgr.cleanup(vm, agentName); } } else if (agentState == State.Running) { if (serverState == State.Starting) { if (fullSync) { s_logger.debug("VM state is starting on full sync so updating it to running"); vm = info.mgr.get(vm.getId()); info.mgr.completeStartCommand(vm); } } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is in stopping on fullsync so resend stop."); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("VM is in stopping state so no action."); } } else if (serverState == State.Destroyed || serverState == State.Stopped || serverState == State.Expunging) { s_logger.debug("VM state is in stopped so stopping it on the agent"); vm = info.mgr.get(vm.getId()); command = info.mgr.cleanup(vm, agentName); } else { _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportRunning, vm.getHostId()); } } /*else if (agentState == State.Unknown) { if (serverState == State.Running) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Starting) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is stopping in full sync. Resending stop"); command = info.handler.cleanup(vm, agentName); } } }*/ return command; }
protected Long restart(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); if (mgr == null) { s_logger.warn( "Unable to find a handler for " + work.getType().toString() + ", throwing out " + vmId); return null; } VMInstanceVO vm = mgr.get(vmId); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm.toString()); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm.toString() + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } final HostVO host = _hostDao.findById(work.getHostId()); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } Boolean alive = null; if (work.getStep() == Step.Investigating) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) { _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); } return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); if (alive != null) { s_logger.debug( investigator.getName() + " found VM " + vm.getName() + "to be alive? " + alive); break; } } if (alive != null && alive) { s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { compareState(vm, new AgentVmInfo(vm.getInstanceName(), mgr, State.Running), false); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } boolean fenced = false; if (alive == null || !alive) { fenced = true; s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { final FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); if (result != null && !result) { fenced = false; } } } if (alive == null && !fenced) { s_logger.debug("We were unable to fence off the VM " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } mgr.completeStopCommand(vm); work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } // send an alert for VMs that stop unexpectedly _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vmId + ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + hostDesc + "] stopped unexpectedly."); vm = mgr.get(vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = mgr.start(vm.getId(), 0); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } vm = mgr.get(vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (final StorageUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ExecutionException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } }
private void generateEmailAlert( DataCenterVO dc, HostPodVO pod, ClusterVO cluster, double totalCapacity, double usedCapacity, short capacityType) { String msgSubject = null; String msgContent = null; String totalStr; String usedStr; String pctStr = formatPercent(usedCapacity / totalCapacity); short alertType = -1; Long podId = pod == null ? null : pod.getId(); Long clusterId = cluster == null ? null : cluster.getId(); switch (capacityType) { // Cluster Level case CapacityVO.CAPACITY_TYPE_MEMORY: msgSubject = "System Alert: Low Available Memory in cluster " + cluster.getName() + " pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = formatBytesToMegabytes(totalCapacity); usedStr = formatBytesToMegabytes(usedCapacity); msgContent = "System memory is low, total: " + totalStr + " MB, used: " + usedStr + " MB (" + pctStr + "%)"; alertType = ALERT_TYPE_MEMORY; break; case CapacityVO.CAPACITY_TYPE_CPU: msgSubject = "System Alert: Low Unallocated CPU in cluster " + cluster.getName() + " pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = _dfWhole.format(totalCapacity); usedStr = _dfWhole.format(usedCapacity); msgContent = "Unallocated CPU is low, total: " + totalStr + " Mhz, used: " + usedStr + " Mhz (" + pctStr + "%)"; alertType = ALERT_TYPE_CPU; break; case CapacityVO.CAPACITY_TYPE_STORAGE: msgSubject = "System Alert: Low Available Storage in cluster " + cluster.getName() + " pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = formatBytesToMegabytes(totalCapacity); usedStr = formatBytesToMegabytes(usedCapacity); msgContent = "Available storage space is low, total: " + totalStr + " MB, used: " + usedStr + " MB (" + pctStr + "%)"; alertType = ALERT_TYPE_STORAGE; break; case CapacityVO.CAPACITY_TYPE_STORAGE_ALLOCATED: msgSubject = "System Alert: Remaining unallocated Storage is low in cluster " + cluster.getName() + " pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = formatBytesToMegabytes(totalCapacity); usedStr = formatBytesToMegabytes(usedCapacity); msgContent = "Unallocated storage space is low, total: " + totalStr + " MB, allocated: " + usedStr + " MB (" + pctStr + "%)"; alertType = ALERT_TYPE_STORAGE_ALLOCATED; break; case CapacityVO.CAPACITY_TYPE_LOCAL_STORAGE: msgSubject = "System Alert: Remaining unallocated Local Storage is low in cluster " + cluster.getName() + " pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = formatBytesToMegabytes(totalCapacity); usedStr = formatBytesToMegabytes(usedCapacity); msgContent = "Unallocated storage space is low, total: " + totalStr + " MB, allocated: " + usedStr + " MB (" + pctStr + "%)"; alertType = ALERT_TYPE_LOCAL_STORAGE; break; // Pod Level case CapacityVO.CAPACITY_TYPE_PRIVATE_IP: msgSubject = "System Alert: Number of unallocated private IPs is low in pod " + pod.getName() + " of availablity zone " + dc.getName(); totalStr = Double.toString(totalCapacity); usedStr = Double.toString(usedCapacity); msgContent = "Number of unallocated private IPs is low, total: " + totalStr + ", allocated: " + usedStr + " (" + pctStr + "%)"; alertType = ALERT_TYPE_PRIVATE_IP; break; // Zone Level case CapacityVO.CAPACITY_TYPE_SECONDARY_STORAGE: msgSubject = "System Alert: Low Available Secondary Storage in availablity zone " + dc.getName(); totalStr = formatBytesToMegabytes(totalCapacity); usedStr = formatBytesToMegabytes(usedCapacity); msgContent = "Available secondary storage space is low, total: " + totalStr + " MB, used: " + usedStr + " MB (" + pctStr + "%)"; alertType = ALERT_TYPE_SECONDARY_STORAGE; break; case CapacityVO.CAPACITY_TYPE_VIRTUAL_NETWORK_PUBLIC_IP: msgSubject = "System Alert: Number of unallocated virtual network public IPs is low in availablity zone " + dc.getName(); totalStr = Double.toString(totalCapacity); usedStr = Double.toString(usedCapacity); msgContent = "Number of unallocated public IPs is low, total: " + totalStr + ", allocated: " + usedStr + " (" + pctStr + "%)"; alertType = ALERT_TYPE_VIRTUAL_NETWORK_PUBLIC_IP; break; case CapacityVO.CAPACITY_TYPE_DIRECT_ATTACHED_PUBLIC_IP: msgSubject = "System Alert: Number of unallocated direct attached public IPs is low in availablity zone " + dc.getName(); totalStr = Double.toString(totalCapacity); usedStr = Double.toString(usedCapacity); msgContent = "Number of unallocated direct attached public IPs is low, total: " + totalStr + ", allocated: " + usedStr + " (" + pctStr + "%)"; alertType = ALERT_TYPE_DIRECT_ATTACHED_PUBLIC_IP; break; case CapacityVO.CAPACITY_TYPE_VLAN: msgSubject = "System Alert: Number of unallocated VLANs is low in availablity zone " + dc.getName(); totalStr = Double.toString(totalCapacity); usedStr = Double.toString(usedCapacity); msgContent = "Number of unallocated VLANs is low, total: " + totalStr + ", allocated: " + usedStr + " (" + pctStr + "%)"; alertType = ALERT_TYPE_VLAN; break; } try { if (s_logger.isDebugEnabled()) { s_logger.debug(msgSubject); s_logger.debug(msgContent); } _emailAlert.sendAlert(alertType, dc.getId(), podId, clusterId, msgSubject, msgContent); } catch (Exception ex) { s_logger.error("Exception in CapacityChecker", ex); } }