// TODO: add test for method @Override public final HostVO createHostVOForDirectConnectAgent( final HostVO host, final StartupCommand[] startup, final ServerResource resource, final Map<String, String> details, final List<String> hostTags) { StartupCommand firstCmd = startup[0]; if (!(firstCmd instanceof StartupRoutingCommand)) { return null; } StartupRoutingCommand ssCmd = ((StartupRoutingCommand) firstCmd); if (ssCmd.getHypervisorType() != HypervisorType.Hyperv) { return null; } s_logger.info( "Host: " + host.getName() + " connected with hypervisor type: " + HypervisorType.Hyperv + ". Checking CIDR..."); HostPodVO pod = _podDao.findById(host.getPodId()); DataCenterVO dc = _dcDao.findById(host.getDataCenterId()); _resourceMgr.checkCIDR(pod, dc, ssCmd.getPrivateIpAddress(), ssCmd.getPrivateNetmask()); return _resourceMgr.fillRoutingHostVO(host, ssCmd, HypervisorType.Hyperv, details, hostTags); }
public DomainRouterVO deployLoadBalancerVM(Long networkId, IPAddressVO ipAddr, Long accountId) { NetworkVO network = _networkDao.findById(networkId); DataCenter dc = _dcDao.findById(network.getDataCenterId()); Long podId = getPodIdForDirectIp(ipAddr); Pod pod = podId == null ? null : _podDao.findById(podId); Map<VirtualMachineProfile.Param, Object> params = new HashMap<VirtualMachineProfile.Param, Object>(1); params.put(VirtualMachineProfile.Param.ReProgramGuestNetworks, true); Account owner = _accountService.getActiveAccountByName("system", new Long(1)); DeployDestination dest = new DeployDestination(dc, pod, null, null); s_logger.debug("About to deploy ELB vm "); try { DomainRouterVO elbVm = deployELBVm(network, dest, owner, params); if (elbVm == null) { throw new InvalidParameterValueException("Could not deploy or find existing ELB VM"); } s_logger.debug("Deployed ELB vm = " + elbVm); return elbVm; } catch (Throwable t) { s_logger.warn("Error while deploying ELB VM: ", t); return null; } }
protected void cleanDb() { if (volumeId != null) { volumeDao.remove(volumeId); volumeId = null; } if (diskOfferingId != null) { diskOfferingDao.remove(diskOfferingId); diskOfferingId = null; } if (storagePoolId != null) { storagePoolDao.remove(storagePoolId); storagePoolId = null; } if (clusterId != null) { clusterDao.remove(clusterId); clusterId = null; } if (podId != null) { podDao.remove(podId); podId = null; } if (dcId != null) { dcDao.remove(dcId); dcId = null; } }
@Test public void scheduleRestartForVmsOnHost() { Mockito.when(hostVO.getType()).thenReturn(Host.Type.Routing); Mockito.when(hostVO.getHypervisorType()).thenReturn(HypervisorType.KVM); Mockito.when(_instanceDao.listByHostId(42l)) .thenReturn(Arrays.asList(Mockito.mock(VMInstanceVO.class))); Mockito.when(_podDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(HostPodVO.class)); Mockito.when(_dcDao.findById(Mockito.anyLong())).thenReturn(Mockito.mock(DataCenterVO.class)); highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true); }
@Override public void scheduleRestartForVmsOnHost(final HostVO host) { if (host.getType() != Host.Type.Routing) { return; } s_logger.warn("Scheduling restart for VMs on host " + host.getId()); final List<VMInstanceVO> vms = _instanceDao.listByHostId(host.getId()); final DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); // send an email alert that the host is down StringBuilder sb = null; if ((vms != null) && !vms.isEmpty()) { sb = new StringBuilder(); sb.append(" Starting HA on the following VMs: "); // collect list of vm names for the alert email VMInstanceVO vm = vms.get(0); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } for (int i = 1; i < vms.size(); i++) { vm = vms.get(i); if (vm.isHaEnabled()) { sb.append(" " + vm.getName()); } } } // send an email alert that the host is down, include VMs HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( AlertManager.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host is down, " + hostDesc, "Host [" + hostDesc + "] is down." + ((sb != null) ? sb.toString() : "")); for (final VMInstanceVO vm : vms) { if (s_logger.isDebugEnabled()) { s_logger.debug("Notifying HA Mgr of to investigate vm " + vm.getId() + "-" + vm.getName()); } scheduleRestart(vm, true); } }
protected Long restart(HaWorkVO work) { List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Cancelling this work item because newer ones have been scheduled. Work Ids = ["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return null; } items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Waiting because there's HA work being executed on an item currently. Work Ids =["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } long vmId = work.getInstanceId(); VMInstanceVO vm = _itMgr.findByIdAndType(work.getType(), work.getInstanceId()); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_SSVM; } HostVO host = _hostDao.findById(work.getHostId()); boolean isHostRemoved = false; if (host == null) { host = _hostDao.findByIdIncludingRemoved(work.getHostId()); if (host != null) { s_logger.debug( "VM " + vm.toString() + " is now no longer on host " + work.getHostId() + " as the host is removed"); isHostRemoved = true; } } DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); Boolean alive = null; if (work.getStep() == Step.Investigating) { if (!isHostRemoved) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); s_logger.info(investigator.getName() + " found " + vm + "to be alive? " + alive); if (alive != null) { break; } } boolean fenced = false; if (alive == null) { s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); s_logger.info("Fencer " + fb.getName() + " returned " + result); if (result != null && result) { fenced = true; break; } } } else if (!alive) { fenced = true; } else { s_logger.debug( "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { s_logger.info(vm + " is alive and host is up. No need to restart it."); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } if (!fenced) { s_logger.debug("We were unable to fence off the VM " + vm); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } else { s_logger.debug( "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways"); try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } } } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = _itMgr.advanceStart( vm, new HashMap<VirtualMachineProfile.Param, Object>(), _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (final ResourceUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (OperationTimedoutException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; }
protected void createDb() { DataCenterVO dc = new DataCenterVO( UUID.randomUUID().toString(), "test", "8.8.8.8", null, "10.0.0.1", null, "10.0.0.1/24", null, null, NetworkType.Basic, null, null, true, true, null, null); dc = dcDao.persist(dc); dcId = dc.getId(); HostPodVO pod = new HostPodVO(UUID.randomUUID().toString(), dc.getId(), "255.255.255.255", "", 8, "test"); pod = podDao.persist(pod); podId = pod.getId(); ClusterVO cluster = new ClusterVO(dc.getId(), pod.getId(), "devcloud cluster"); cluster.setHypervisorType(HypervisorType.XenServer.toString()); cluster.setClusterType(ClusterType.CloudManaged); cluster.setManagedState(ManagedState.Managed); cluster = clusterDao.persist(cluster); clusterId = cluster.getId(); DataStoreProvider provider = providerMgr.getDataStoreProvider("ancient primary data store provider"); storage = new StoragePoolVO(); storage.setDataCenterId(dcId); storage.setPodId(podId); storage.setPoolType(StoragePoolType.NetworkFilesystem); storage.setClusterId(clusterId); storage.setStatus(StoragePoolStatus.Up); storage.setScope(ScopeType.CLUSTER); storage.setAvailableBytes(1000); storage.setCapacityBytes(20000); storage.setHostAddress(UUID.randomUUID().toString()); storage.setPath(UUID.randomUUID().toString()); storage.setStorageProviderName(provider.getName()); storage = storagePoolDao.persist(storage); storagePoolId = storage.getId(); storageMgr.createCapacityEntry(storage.getId()); diskOffering = new DiskOfferingVO(); diskOffering.setDiskSize(500); diskOffering.setName("test-disk"); diskOffering.setSystemUse(false); diskOffering.setUseLocalStorage(false); diskOffering.setCustomized(false); diskOffering.setRecreatable(false); diskOffering = diskOfferingDao.persist(diskOffering); diskOfferingId = diskOffering.getId(); volume = new VolumeVO( Volume.Type.ROOT, "volume", dcId, 1, 1, diskOffering.getId(), diskOffering.getDiskSize()); volume = volumeDao.persist(volume); volumeId = volume.getId(); }
public Long migrate(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); VMInstanceVO vm = mgr.get(vmId); if (vm == null || vm.getRemoved() != null) { s_logger.debug("Unable to find the vm " + vmId); return null; } s_logger.info("Migrating vm: " + vm.toString()); if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM is not longer running on the current hostId"); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER_MIGRATE; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY_MIGRATE; } HostVO fromHost = _hostDao.findById(vm.getHostId()); String fromHostName = ((fromHost == null) ? "unknown" : fromHost.getName()); HostVO toHost = null; if (work.getStep() == Step.Scheduled) { if (vm.getState() != State.Running) { s_logger.info( "VM's state is not ready for migration. " + vm.toString() + " State is " + vm.getState().toString()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; } DataCenterVO dcVO = _dcDao.findById(fromHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(fromHost.getPodId()); try { toHost = mgr.prepareForMigration(vm); if (toHost == null) { if (s_logger.isDebugEnabled()) { s_logger.debug("Unable to find a host for migrating vm " + vmId); } _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Unable to find a suitable host"); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to mgirate due to insufficient capacity " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Insufficient capacity"); } catch (final StorageUnavailableException e) { s_logger.warn("Storage is unavailable: " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Storage is gone."); } if (toHost == null) { _agentMgr.maintenanceFailed(vm.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug("Migrating from " + work.getHostId() + " to " + toHost.getId()); } work.setStep(Step.Migrating); work.setHostId(toHost.getId()); _haDao.update(work.getId(), work); } if (work.getStep() == Step.Migrating) { vm = mgr.get(vmId); // let's see if anything has changed. boolean migrated = false; if (vm == null || vm.getRemoved() != null || vm.getHostId() == null || !_itMgr.stateTransitTo(vm, Event.MigrationRequested, vm.getHostId())) { s_logger.info("Migration cancelled because state has changed: " + vm.toString()); } else { try { boolean isWindows = _guestOSCategoryDao .findById(_guestOSDao.findById(vm.getGuestOSId()).getCategoryId()) .getName() .equalsIgnoreCase("Windows"); MigrateCommand cmd = new MigrateCommand(vm.getInstanceName(), toHost.getPrivateIpAddress(), isWindows); Answer answer = _agentMgr.send(fromHost.getId(), cmd); if (answer != null && answer.getResult()) { migrated = true; _storageMgr.unshare(vm, fromHost); work.setStep(Step.Investigating); _haDao.update(work.getId(), work); } } catch (final AgentUnavailableException e) { s_logger.debug("host became unavailable"); } catch (final OperationTimedoutException e) { s_logger.debug("operation timed out"); if (e.isActive()) { scheduleRestart(vm, true); } } } if (!migrated) { s_logger.info("Migration was unsuccessful. Cleaning up: " + vm.toString()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostPodVO podVO = _podDao.findById(vm.getPodId()); _alertMgr.sendAlert( alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migrate Command failed. Please check logs."); _itMgr.stateTransitTo(vm, Event.MigrationFailedOnSource, toHost.getId()); _agentMgr.maintenanceFailed(vm.getHostId()); Command cleanup = mgr.cleanup(vm, null); _agentMgr.easySend(toHost.getId(), cleanup); _storageMgr.unshare(vm, toHost); return null; } } if (toHost == null) { toHost = _hostDao.findById(work.getHostId()); } DataCenterVO dcVO = _dcDao.findById(toHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(toHost.getPodId()); try { if (!mgr.completeMigration(vm, toHost)) { _alertMgr.sendAlert( alertType, toHost.getDataCenterId(), toHost.getPodId(), "Unable to migrate " + vmId + " to host " + toHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migration not completed"); s_logger.warn("Unable to complete migration: " + vm.toString()); } else { s_logger.info("Migration is complete: " + vm.toString()); } return null; } catch (final AgentUnavailableException e) { s_logger.warn("Agent is unavailable for " + vm.toString()); } catch (final OperationTimedoutException e) { s_logger.warn("Operation timed outfor " + vm.toString()); } _itMgr.stateTransitTo(vm, Event.MigrationFailedOnDest, toHost.getId()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; }
/** * compareState does as its name suggests and compares the states between management server and * agent. It returns whether something should be cleaned up */ protected Command compareState(VMInstanceVO vm, final AgentVmInfo info, final boolean fullSync) { State agentState = info.state; final String agentName = info.name; final State serverState = vm.getState(); final String serverName = vm.getName(); Command command = null; if (s_logger.isDebugEnabled()) { s_logger.debug( "VM " + serverName + ": server state = " + serverState.toString() + " and agent state = " + agentState.toString()); } if (agentState == State.Error) { agentState = State.Stopped; short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } HostPodVO podVO = _podDao.findById(vm.getPodId()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostVO hostVO = _hostDao.findById(vm.getHostId()); String hostDesc = "name: " + hostVO.getName() + " (id:" + hostVO.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vm.getId() + ") stopped on host " + hostDesc + " due to storage failure", "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() + "] stopped due to storage failure."); } if (serverState == State.Migrating) { s_logger.debug("Skipping vm in migrating state: " + vm.toString()); return null; } if (agentState == serverState) { if (s_logger.isDebugEnabled()) { s_logger.debug("Both states are " + agentState.toString() + " for " + serverName); } assert (agentState == State.Stopped || agentState == State.Running) : "If the states we send up is changed, this must be changed."; _itMgr.stateTransitTo( vm, agentState == State.Stopped ? VirtualMachine.Event.AgentReportStopped : VirtualMachine.Event.AgentReportRunning, vm.getHostId()); if (agentState == State.Stopped) { s_logger.debug("State matches but the agent said stopped so let's send a cleanup anyways."); return info.mgr.cleanup(vm, agentName); } return null; } if (agentState == State.Stopped) { // This state means the VM on the agent was detected previously // and now is gone. This is slightly different than if the VM // was never completed but we still send down a Stop Command // to ensure there's cleanup. if (serverState == State.Running) { // Our records showed that it should be running so let's restart it. vm = info.mgr.get(vm.getId()); scheduleRestart(vm, false); command = info.mgr.cleanup(vm, agentName); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM is in stopping state on full sync. Updating the status to stopped"); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("Ignoring VM in stopping mode: " + vm.getName()); } } else if (serverState == State.Starting) { s_logger.debug("Ignoring VM in starting mode: " + vm.getName()); } else { s_logger.debug("Sending cleanup to a stopped vm: " + agentName); _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportStopped, null); command = info.mgr.cleanup(vm, agentName); } } else if (agentState == State.Running) { if (serverState == State.Starting) { if (fullSync) { s_logger.debug("VM state is starting on full sync so updating it to running"); vm = info.mgr.get(vm.getId()); info.mgr.completeStartCommand(vm); } } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is in stopping on fullsync so resend stop."); vm = info.mgr.get(vm.getId()); info.mgr.completeStopCommand(vm); command = info.mgr.cleanup(vm, agentName); } else { s_logger.debug("VM is in stopping state so no action."); } } else if (serverState == State.Destroyed || serverState == State.Stopped || serverState == State.Expunging) { s_logger.debug("VM state is in stopped so stopping it on the agent"); vm = info.mgr.get(vm.getId()); command = info.mgr.cleanup(vm, agentName); } else { _itMgr.stateTransitTo(vm, VirtualMachine.Event.AgentReportRunning, vm.getHostId()); } } /*else if (agentState == State.Unknown) { if (serverState == State.Running) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Starting) { if (fullSync) { vm = info.handler.get(vm.getId()); } scheduleRestart(vm, false); } else if (serverState == State.Stopping) { if (fullSync) { s_logger.debug("VM state is stopping in full sync. Resending stop"); command = info.handler.cleanup(vm, agentName); } } }*/ return command; }
protected Long restart(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); if (mgr == null) { s_logger.warn( "Unable to find a handler for " + work.getType().toString() + ", throwing out " + vmId); return null; } VMInstanceVO vm = mgr.get(vmId); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm.toString()); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm.toString() + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } final HostVO host = _hostDao.findById(work.getHostId()); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } Boolean alive = null; if (work.getStep() == Step.Investigating) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) { _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); } return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); if (alive != null) { s_logger.debug( investigator.getName() + " found VM " + vm.getName() + "to be alive? " + alive); break; } } if (alive != null && alive) { s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { compareState(vm, new AgentVmInfo(vm.getInstanceName(), mgr, State.Running), false); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } boolean fenced = false; if (alive == null || !alive) { fenced = true; s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { final FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); if (result != null && !result) { fenced = false; } } } if (alive == null && !fenced) { s_logger.debug("We were unable to fence off the VM " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } mgr.completeStopCommand(vm); work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } // send an alert for VMs that stop unexpectedly _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vmId + ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + hostDesc + "] stopped unexpectedly."); vm = mgr.get(vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = mgr.start(vm.getId(), 0); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } vm = mgr.get(vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (final StorageUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ExecutionException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } }
@Test(priority = -1) public void setUp() { ComponentContext.initComponentsLifeCycle(); host = hostDao.findByGuid(this.getHostGuid()); if (host != null) { dcId = host.getDataCenterId(); clusterId = host.getClusterId(); podId = host.getPodId(); return; } // create data center DataCenterVO dc = new DataCenterVO( UUID.randomUUID().toString(), "test", "8.8.8.8", null, "10.0.0.1", null, "10.0.0.1/24", null, null, NetworkType.Basic, null, null, true, true, null, null); dc = dcDao.persist(dc); dcId = dc.getId(); // create pod HostPodVO pod = new HostPodVO( UUID.randomUUID().toString(), dc.getId(), this.getHostGateway(), this.getHostCidr(), 8, "test"); pod = podDao.persist(pod); podId = pod.getId(); // create xen cluster ClusterVO cluster = new ClusterVO(dc.getId(), pod.getId(), "devcloud cluster"); cluster.setHypervisorType(HypervisorType.XenServer.toString()); cluster.setClusterType(ClusterType.CloudManaged); cluster.setManagedState(ManagedState.Managed); cluster = clusterDao.persist(cluster); clusterId = cluster.getId(); // create xen host host = new HostVO(this.getHostGuid()); host.setName("devcloud xen host"); host.setType(Host.Type.Routing); host.setPrivateIpAddress(this.getHostIp()); host.setDataCenterId(dc.getId()); host.setVersion("6.0.1"); host.setAvailable(true); host.setSetup(true); host.setPodId(podId); host.setLastPinged(0); host.setResourceState(ResourceState.Enabled); host.setHypervisorType(HypervisorType.XenServer); host.setClusterId(cluster.getId()); host = hostDao.persist(host); imageStore = new ImageStoreVO(); imageStore.setName("test"); imageStore.setDataCenterId(dcId); imageStore.setProviderName("CloudStack ImageStore Provider"); imageStore.setRole(DataStoreRole.Image); imageStore.setUrl(this.getSecondaryStorage()); imageStore.setUuid(UUID.randomUUID().toString()); imageStore = imageStoreDao.persist(imageStore); }
@Test(priority = -1) public void setUp() { ComponentContext.initComponentsLifeCycle(); host = hostDao.findByGuid(this.getHostGuid()); if (host != null) { dcId = host.getDataCenterId(); clusterId = host.getClusterId(); podId = host.getPodId(); imageStore = this.imageStoreDao.findByName(imageStoreName); } else { // create data center DataCenterVO dc = new DataCenterVO( UUID.randomUUID().toString(), "test", "8.8.8.8", null, "10.0.0.1", null, "10.0.0.1/24", null, null, NetworkType.Basic, null, null, true, true, null, null); dc = dcDao.persist(dc); dcId = dc.getId(); // create pod HostPodVO pod = new HostPodVO( UUID.randomUUID().toString(), dc.getId(), this.getHostGateway(), this.getHostCidr(), 8, "test"); pod = podDao.persist(pod); podId = pod.getId(); // create xen cluster ClusterVO cluster = new ClusterVO(dc.getId(), pod.getId(), "devcloud cluster"); cluster.setHypervisorType(HypervisorType.VMware.toString()); cluster.setClusterType(ClusterType.ExternalManaged); cluster.setManagedState(ManagedState.Managed); cluster = clusterDao.persist(cluster); clusterId = cluster.getId(); // setup vcenter ClusterDetailsVO clusterDetailVO = new ClusterDetailsVO(cluster.getId(), "url", null); this.clusterDetailsDao.persist(clusterDetailVO); clusterDetailVO = new ClusterDetailsVO(cluster.getId(), "username", null); this.clusterDetailsDao.persist(clusterDetailVO); clusterDetailVO = new ClusterDetailsVO(cluster.getId(), "password", null); this.clusterDetailsDao.persist(clusterDetailVO); // create xen host host = new HostVO(this.getHostGuid()); host.setName("devcloud vmware host"); host.setType(Host.Type.Routing); host.setPrivateIpAddress(this.getHostIp()); host.setDataCenterId(dc.getId()); host.setVersion("6.0.1"); host.setAvailable(true); host.setSetup(true); host.setPodId(podId); host.setLastPinged(0); host.setResourceState(ResourceState.Enabled); host.setHypervisorType(HypervisorType.VMware); host.setClusterId(cluster.getId()); host = hostDao.persist(host); imageStore = new ImageStoreVO(); imageStore.setName(imageStoreName); imageStore.setDataCenterId(dcId); imageStore.setProviderName("CloudStack ImageStore Provider"); imageStore.setRole(DataStoreRole.Image); imageStore.setUrl(this.getSecondaryStorage()); imageStore.setUuid(UUID.randomUUID().toString()); imageStore.setProtocol("nfs"); imageStore = imageStoreDao.persist(imageStore); } image = new VMTemplateVO(); image.setTemplateType(TemplateType.USER); image.setUrl(this.getTemplateUrl()); image.setUniqueName(UUID.randomUUID().toString()); image.setName(UUID.randomUUID().toString()); image.setPublicTemplate(true); image.setFeatured(true); image.setRequiresHvm(true); image.setBits(64); image.setFormat(Storage.ImageFormat.VHD); image.setEnablePassword(true); image.setEnableSshKey(true); image.setGuestOSId(1); image.setBootable(true); image.setPrepopulate(true); image.setCrossZones(true); image.setExtractable(true); image = imageDataDao.persist(image); /* * TemplateDataStoreVO templateStore = new TemplateDataStoreVO(); * * templateStore.setDataStoreId(imageStore.getId()); * templateStore.setDownloadPercent(100); * templateStore.setDownloadState(Status.DOWNLOADED); * templateStore.setDownloadUrl(imageStore.getUrl()); * templateStore.setInstallPath(this.getImageInstallPath()); * templateStore.setTemplateId(image.getId()); * templateStoreDao.persist(templateStore); */ DataStore store = this.dataStoreMgr.getDataStore(imageStore.getId(), DataStoreRole.Image); TemplateInfo template = templateFactory.getTemplate(image.getId(), DataStoreRole.Image); DataObject templateOnStore = store.create(template); TemplateObjectTO to = new TemplateObjectTO(); to.setPath(this.getImageInstallPath()); CopyCmdAnswer answer = new CopyCmdAnswer(to); templateOnStore.processEvent(Event.CreateOnlyRequested); templateOnStore.processEvent(Event.OperationSuccessed, answer); }
public void checkForAlerts() { recalculateCapacity(); // abort if we can't possibly send an alert... if (_emailAlert == null) { return; } // Get all datacenters, pods and clusters in the system. List<DataCenterVO> dataCenterList = _dcDao.listAll(); List<ClusterVO> clusterList = _clusterDao.listAll(); List<HostPodVO> podList = _podDao.listAll(); // Get capacity types at different levels List<Short> dataCenterCapacityTypes = getCapacityTypesAtZoneLevel(); List<Short> podCapacityTypes = getCapacityTypesAtPodLevel(); List<Short> clusterCapacityTypes = getCapacityTypesAtClusterLevel(); // Generate Alerts for Zone Level capacities for (DataCenterVO dc : dataCenterList) { for (Short capacityType : dataCenterCapacityTypes) { List<SummedCapacity> capacity = new ArrayList<SummedCapacity>(); capacity = _capacityDao.findCapacityBy(capacityType.intValue(), dc.getId(), null, null); if (capacityType == Capacity.CAPACITY_TYPE_SECONDARY_STORAGE) { capacity.add(getUsedStats(capacityType, dc.getId(), null, null)); } if (capacity == null || capacity.size() == 0) { continue; } double totalCapacity = capacity.get(0).getTotalCapacity(); double usedCapacity = capacity.get(0).getUsedCapacity(); if (totalCapacity != 0 && usedCapacity / totalCapacity > _capacityTypeThresholdMap.get(capacityType)) { generateEmailAlert(dc, null, null, totalCapacity, usedCapacity, capacityType); } } } // Generate Alerts for Pod Level capacities for (HostPodVO pod : podList) { for (Short capacityType : podCapacityTypes) { List<SummedCapacity> capacity = _capacityDao.findCapacityBy( capacityType.intValue(), pod.getDataCenterId(), pod.getId(), null); if (capacity == null || capacity.size() == 0) { continue; } double totalCapacity = capacity.get(0).getTotalCapacity(); double usedCapacity = capacity.get(0).getUsedCapacity(); if (totalCapacity != 0 && usedCapacity / totalCapacity > _capacityTypeThresholdMap.get(capacityType)) { generateEmailAlert( ApiDBUtils.findZoneById(pod.getDataCenterId()), pod, null, totalCapacity, usedCapacity, capacityType); } } } // Generate Alerts for Cluster Level capacities for (ClusterVO cluster : clusterList) { for (Short capacityType : clusterCapacityTypes) { List<SummedCapacity> capacity = new ArrayList<SummedCapacity>(); float overProvFactor = 1f; capacity = _capacityDao.findCapacityBy( capacityType.intValue(), cluster.getDataCenterId(), null, cluster.getId()); if (capacityType == Capacity.CAPACITY_TYPE_STORAGE) { capacity.add( getUsedStats( capacityType, cluster.getDataCenterId(), cluster.getPodId(), cluster.getId())); } if (capacity == null || capacity.size() == 0) { continue; } if (capacityType == Capacity.CAPACITY_TYPE_CPU) { overProvFactor = ApiDBUtils.getCpuOverprovisioningFactor(); } double totalCapacity = capacity.get(0).getTotalCapacity() * overProvFactor; double usedCapacity = capacity.get(0).getUsedCapacity() + capacity.get(0).getReservedCapacity(); if (totalCapacity != 0 && usedCapacity / totalCapacity > _capacityTypeThresholdMap.get(capacityType)) { generateEmailAlert( ApiDBUtils.findZoneById(cluster.getDataCenterId()), ApiDBUtils.findPodById(cluster.getPodId()), cluster, totalCapacity, usedCapacity, capacityType); } } } }
@Override @DB public void recalculateCapacity() { // FIXME: the right way to do this is to register a listener (see RouterStatsListener, // VMSyncListener) // for the vm sync state. The listener model has connects/disconnects to keep things in // sync much better // than this model right now, so when a VM is started, we update the amount allocated, // and when a VM // is stopped we updated the amount allocated, and when VM sync reports a changed state, // we update // the amount allocated. Hopefully it's limited to 3 entry points and will keep the // amount allocated // per host accurate. try { if (s_logger.isDebugEnabled()) { s_logger.debug("recalculating system capacity"); s_logger.debug("Executing cpu/ram capacity update"); } // Calculate CPU and RAM capacities // get all hosts...even if they are not in 'UP' state List<HostVO> hosts = _resourceMgr.listAllHostsInAllZonesByType(Host.Type.Routing); for (HostVO host : hosts) { _capacityMgr.updateCapacityForHost(host); } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing cpu/ram capacity update"); s_logger.debug("Executing storage capacity update"); } // Calculate storage pool capacity List<StoragePoolVO> storagePools = _storagePoolDao.listAll(); for (StoragePoolVO pool : storagePools) { long disk = 0l; Pair<Long, Long> sizes = _volumeDao.getCountAndTotalByPool(pool.getId()); disk = sizes.second(); if (pool.isShared()) { _storageMgr.createCapacityEntry(pool, Capacity.CAPACITY_TYPE_STORAGE_ALLOCATED, disk); } else { _storageMgr.createCapacityEntry(pool, Capacity.CAPACITY_TYPE_LOCAL_STORAGE, disk); } } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing storage capacity update"); s_logger.debug("Executing capacity updates public ip and Vlans"); } List<DataCenterVO> datacenters = _dcDao.listAll(); for (DataCenterVO datacenter : datacenters) { long dcId = datacenter.getId(); // NOTE // What happens if we have multiple vlans? Dashboard currently shows stats // with no filter based on a vlan // ideal way would be to remove out the vlan param, and filter only on dcId // implementing the same // Calculate new Public IP capacity for Virtual Network if (datacenter.getNetworkType() == NetworkType.Advanced) { createOrUpdateIpCapacity(dcId, null, CapacityVO.CAPACITY_TYPE_VIRTUAL_NETWORK_PUBLIC_IP); } // Calculate new Public IP capacity for Direct Attached Network createOrUpdateIpCapacity(dcId, null, CapacityVO.CAPACITY_TYPE_DIRECT_ATTACHED_PUBLIC_IP); if (datacenter.getNetworkType() == NetworkType.Advanced) { // Calculate VLAN's capacity createOrUpdateVlanCapacity(dcId); } } if (s_logger.isDebugEnabled()) { s_logger.debug("Done capacity updates for public ip and Vlans"); s_logger.debug("Executing capacity updates for private ip"); } // Calculate new Private IP capacity List<HostPodVO> pods = _podDao.listAll(); for (HostPodVO pod : pods) { long podId = pod.getId(); long dcId = pod.getDataCenterId(); createOrUpdateIpCapacity(dcId, podId, CapacityVO.CAPACITY_TYPE_PRIVATE_IP); } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing capacity updates for private ip"); s_logger.debug("Done recalculating system capacity"); } } catch (Throwable t) { s_logger.error("Caught exception in recalculating capacity", t); } }