/**
 * Exercises the allocators against a pool that is not in the Up state.
 *
 * <p>The fixture pool created by {@code createDb()} is switched to zone scope and put into
 * {@code Maintenance}; the storage manager mock is told that any pool has enough space. Every
 * configured allocator is then asked for one pool. The test fails when exactly one allocator
 * hands back a pool, or when any unexpected exception is thrown (in which case the database is
 * cleaned first and the exception is reported in the failure message).
 */
@Test
public void testPoolStateIsNotUp() {
    try {
        createDb();

        // Take the fixture pool out of service.
        StoragePoolVO pool = storagePoolDao.findById(storagePoolId);
        pool.setScope(ScopeType.ZONE);
        pool.setStatus(StoragePoolStatus.Maintenance);
        storagePoolDao.update(pool.getId(), pool);

        DiskProfile profile = new DiskProfile(volume, diskOffering, HypervisorType.XenServer);
        VirtualMachineProfile vmProfile = Mockito.mock(VirtualMachineProfile.class);
        // Space is never the limiting factor in this test.
        Mockito.when(
                        storageMgr.storagePoolHasEnoughSpace(
                                Mockito.anyListOf(Volume.class), Mockito.any(StoragePool.class)))
                .thenReturn(true);
        DeploymentPlan plan = new DataCenterDeployment(dcId, podId, clusterId, null, null, null);

        int foundAcct = 0;
        for (StoragePoolAllocator allocator : allocators) {
            List<StoragePool> pools =
                    allocator.allocateToPool(profile, vmProfile, plan, new ExcludeList(), 1);
            if (!pools.isEmpty()) {
                // JUnit convention: expected value first, actual value second.
                Assert.assertEquals(storage.getId(), pools.get(0).getId());
                foundAcct++;
            }
        }

        if (foundAcct == 1) {
            Assert.fail("An allocator returned a storage pool that is in Maintenance state");
        }
    } catch (Exception e) {
        cleanDb();
        // Keep the root cause visible instead of failing with an empty message.
        Assert.fail("Unexpected exception: " + e);
    }
}
@Override public boolean attachCluster(DataStore store, ClusterScope scope) { PrimaryDataStoreInfo primarystore = (PrimaryDataStoreInfo) store; // Check if there is host up in this cluster List<HostVO> allHosts = _resourceMgr.listAllUpAndEnabledHosts( Host.Type.Routing, primarystore.getClusterId(), primarystore.getPodId(), primarystore.getDataCenterId()); if (allHosts.isEmpty()) { primaryDataStoreDao.expunge(primarystore.getId()); throw new CloudRuntimeException( "No host up to associate a storage pool with in cluster " + primarystore.getClusterId()); } if (primarystore.getPoolType() == StoragePoolType.OCFS2 && !_ocfs2Mgr.prepareNodes(allHosts, primarystore)) { s_logger.warn( "Can not create storage pool " + primarystore + " on cluster " + primarystore.getClusterId()); primaryDataStoreDao.expunge(primarystore.getId()); return false; } boolean success = false; for (HostVO h : allHosts) { success = createStoragePool(h.getId(), primarystore); if (success) { break; } } s_logger.debug("In createPool Adding the pool to each of the hosts"); List<HostVO> poolHosts = new ArrayList<HostVO>(); for (HostVO h : allHosts) { try { storageMgr.connectHostToSharedPool(h.getId(), primarystore.getId()); poolHosts.add(h); } catch (Exception e) { s_logger.warn("Unable to establish a connection between " + h + " and " + primarystore, e); } } if (poolHosts.isEmpty()) { s_logger.warn( "No host can access storage pool " + primarystore + " on cluster " + primarystore.getClusterId()); primaryDataStoreDao.expunge(primarystore.getId()); throw new CloudRuntimeException("Failed to access storage pool"); } dataStoreHelper.attachCluster(store); return true; }
/**
 * Fetches the used-capacity statistics for one capacity type and wraps them in a
 * {@code SummedCapacity}.
 *
 * <p>Secondary storage statistics are queried per zone; every other capacity type is answered
 * from the storage pool statistics for the given cluster, pod and zone.
 *
 * @param capacityType the capacity type to report
 * @param zoneId zone to query
 * @param podId pod to query; passed through to the result
 * @param clusterId cluster to query; passed through to the result
 * @return the summed capacity (reserved amount fixed at 0), or {@code null} when the storage
 *     manager returned no statistics
 */
private SummedCapacity getUsedStats(short capacityType, long zoneId, Long podId, Long clusterId) {
    final CapacityVO stats =
            (capacityType == Capacity.CAPACITY_TYPE_SECONDARY_STORAGE)
                    ? _storageMgr.getSecondaryStorageUsedStats(null, zoneId)
                    : _storageMgr.getStoragePoolUsedStats(null, clusterId, podId, zoneId);

    if (stats == null) {
        return null;
    }
    return new SummedCapacity(
            stats.getUsedCapacity(), 0, stats.getTotalCapacity(), capacityType, clusterId, podId);
}
/**
 * Exercises the allocators when the cluster holds two usable pools.
 *
 * <p>On top of the fixture pool from {@code createDb()}, a second cluster-scoped NFS pool in the
 * Up state is persisted. Every configured allocator is asked for one pool; each non-empty answer
 * must contain exactly one pool, and exactly one allocator must produce an answer. On an
 * unexpected exception the database is cleaned, the extra pool is removed if it was created, and
 * the test fails with the exception in the message.
 */
@Test
public void testClusterAllocatorMultiplePools() {
    Long newStorageId = null;
    try {
        createDb();

        // Second pool in the same cluster as the fixture pool.
        DataStoreProvider provider =
                providerMgr.getDataStoreProvider("ancient primary data store provider");
        storage = new StoragePoolVO();
        storage.setDataCenterId(dcId);
        storage.setPodId(podId);
        storage.setPoolType(StoragePoolType.NetworkFilesystem);
        storage.setClusterId(clusterId);
        storage.setStatus(StoragePoolStatus.Up);
        storage.setScope(ScopeType.CLUSTER);
        storage.setAvailableBytes(1000);
        storage.setCapacityBytes(20000);
        storage.setHostAddress(UUID.randomUUID().toString());
        storage.setPath(UUID.randomUUID().toString());
        storage.setStorageProviderName(provider.getName());
        StoragePoolVO newStorage = storagePoolDao.persist(storage);
        newStorageId = newStorage.getId();

        DiskProfile profile = new DiskProfile(volume, diskOffering, HypervisorType.XenServer);
        VirtualMachineProfile vmProfile = Mockito.mock(VirtualMachineProfile.class);
        // Space is never the limiting factor in this test.
        Mockito.when(
                        storageMgr.storagePoolHasEnoughSpace(
                                Mockito.anyListOf(Volume.class), Mockito.any(StoragePool.class)))
                .thenReturn(true);
        DeploymentPlan plan = new DataCenterDeployment(dcId, podId, clusterId, null, null, null);

        int foundAcct = 0;
        for (StoragePoolAllocator allocator : allocators) {
            List<StoragePool> pools =
                    allocator.allocateToPool(profile, vmProfile, plan, new ExcludeList(), 1);
            if (!pools.isEmpty()) {
                // JUnit convention: expected value first, actual value second.
                Assert.assertEquals(1, pools.size());
                foundAcct++;
            }
        }

        if (foundAcct > 1 || foundAcct == 0) {
            Assert.fail("Expected exactly one allocator to return a pool, but " + foundAcct + " did");
        }
    } catch (Exception e) {
        cleanDb();
        if (newStorageId != null) {
            storagePoolDao.remove(newStorageId);
        }
        // Keep the root cause visible instead of failing with an empty message.
        Assert.fail("Unexpected exception: " + e);
    }
}
/**
 * Exercises the allocators when the disk offering carries a storage tag.
 *
 * <p>The fixture pool gets a {@code "high"} detail entry and the disk offering is tagged
 * {@code "high"}. Every configured allocator is asked for one pool; each non-empty answer must
 * start with the fixture pool, and exactly one allocator must produce an answer. On an
 * unexpected exception the database is cleaned and the test fails with the exception in the
 * message.
 */
@Test
public void testClusterAllocatorWithTags() {
    try {
        createDb();

        // Tag the pool ...
        StoragePoolDetailVO detailVO = new StoragePoolDetailVO(this.storagePoolId, "high", "true");
        poolDetailsDao.persist(detailVO);

        // ... and require the same tag on the disk offering.
        DiskOfferingVO diskOff = this.diskOfferingDao.findById(diskOffering.getId());
        List<String> tags = new ArrayList<String>();
        tags.add("high");
        diskOff.setTagsArray(tags);
        diskOfferingDao.update(diskOff.getId(), diskOff);

        DiskProfile profile = new DiskProfile(volume, diskOff, HypervisorType.XenServer);
        VirtualMachineProfile vmProfile = Mockito.mock(VirtualMachineProfile.class);
        // Space is never the limiting factor in this test.
        Mockito.when(
                        storageMgr.storagePoolHasEnoughSpace(
                                Mockito.anyListOf(Volume.class), Mockito.any(StoragePool.class)))
                .thenReturn(true);
        DeploymentPlan plan = new DataCenterDeployment(dcId, podId, clusterId, null, null, null);

        int foundAcct = 0;
        for (StoragePoolAllocator allocator : allocators) {
            List<StoragePool> pools =
                    allocator.allocateToPool(profile, vmProfile, plan, new ExcludeList(), 1);
            if (!pools.isEmpty()) {
                // JUnit convention: expected value first, actual value second.
                Assert.assertEquals(storage.getId(), pools.get(0).getId());
                foundAcct++;
            }
        }

        if (foundAcct > 1 || foundAcct == 0) {
            Assert.fail("Expected exactly one allocator to return a pool, but " + foundAcct + " did");
        }
    } catch (Exception e) {
        cleanDb();
        // Keep the root cause visible instead of failing with an empty message.
        Assert.fail("Unexpected exception: " + e);
    }
}
protected Long restart(HaWorkVO work) { List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Cancelling this work item because newer ones have been scheduled. Work Ids = ["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return null; } items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Waiting because there's HA work being executed on an item currently. Work Ids =["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } long vmId = work.getInstanceId(); VMInstanceVO vm = _itMgr.findByIdAndType(work.getType(), work.getInstanceId()); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm + " has been changed. 
Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_SSVM; } HostVO host = _hostDao.findById(work.getHostId()); boolean isHostRemoved = false; if (host == null) { host = _hostDao.findByIdIncludingRemoved(work.getHostId()); if (host != null) { s_logger.debug( "VM " + vm.toString() + " is now no longer on host " + work.getHostId() + " as the host is removed"); isHostRemoved = true; } } DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); Boolean alive = null; if (work.getStep() == Step.Investigating) { if (!isHostRemoved) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); s_logger.info(investigator.getName() + " found " + vm + "to be alive? 
" + alive); if (alive != null) { break; } } boolean fenced = false; if (alive == null) { s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); s_logger.info("Fencer " + fb.getName() + " returned " + result); if (result != null && result) { fenced = true; break; } } } else if (!alive) { fenced = true; } else { s_logger.debug( "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { s_logger.info(vm + " is alive and host is up. No need to restart it."); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } if (!fenced) { s_logger.debug("We were unable to fence off the VM " + vm); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } 
work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } else { s_logger.debug( "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways"); try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } } } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = _itMgr.advanceStart( vm, new HashMap<VirtualMachineProfile.Param, Object>(), _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), 
vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (final ResourceUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (OperationTimedoutException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; }
/**
 * Persists the fixture rows shared by the tests and records their ids in the test's fields.
 *
 * <p>Created in order: a Basic-network zone, a pod, a managed XenServer cluster, a
 * cluster-scoped NFS primary storage pool in the Up state (plus its capacity entry), a 500-unit
 * disk offering and a ROOT volume that uses that offering.
 */
protected void createDb() {
    // Zone
    DataCenterVO zone =
            dcDao.persist(
                    new DataCenterVO(
                            UUID.randomUUID().toString(),
                            "test",
                            "8.8.8.8",
                            null,
                            "10.0.0.1",
                            null,
                            "10.0.0.1/24",
                            null,
                            null,
                            NetworkType.Basic,
                            null,
                            null,
                            true,
                            true,
                            null,
                            null));
    dcId = zone.getId();

    // Pod
    HostPodVO hostPod =
            podDao.persist(
                    new HostPodVO(
                            UUID.randomUUID().toString(), zone.getId(), "255.255.255.255", "", 8, "test"));
    podId = hostPod.getId();

    // Cluster
    ClusterVO managedCluster = new ClusterVO(zone.getId(), hostPod.getId(), "devcloud cluster");
    managedCluster.setHypervisorType(HypervisorType.XenServer.toString());
    managedCluster.setClusterType(ClusterType.CloudManaged);
    managedCluster.setManagedState(ManagedState.Managed);
    managedCluster = clusterDao.persist(managedCluster);
    clusterId = managedCluster.getId();

    // Primary storage pool
    DataStoreProvider primaryProvider =
            providerMgr.getDataStoreProvider("ancient primary data store provider");
    storage = new StoragePoolVO();
    storage.setDataCenterId(dcId);
    storage.setPodId(podId);
    storage.setPoolType(StoragePoolType.NetworkFilesystem);
    storage.setClusterId(clusterId);
    storage.setStatus(StoragePoolStatus.Up);
    storage.setScope(ScopeType.CLUSTER);
    storage.setAvailableBytes(1000);
    storage.setCapacityBytes(20000);
    storage.setHostAddress(UUID.randomUUID().toString());
    storage.setPath(UUID.randomUUID().toString());
    storage.setStorageProviderName(primaryProvider.getName());
    storage = storagePoolDao.persist(storage);
    storagePoolId = storage.getId();
    storageMgr.createCapacityEntry(storage.getId());

    // Disk offering
    diskOffering = new DiskOfferingVO();
    diskOffering.setDiskSize(500);
    diskOffering.setName("test-disk");
    diskOffering.setSystemUse(false);
    diskOffering.setUseLocalStorage(false);
    diskOffering.setCustomized(false);
    diskOffering.setRecreatable(false);
    diskOffering = diskOfferingDao.persist(diskOffering);
    diskOfferingId = diskOffering.getId();

    // Root volume backed by the offering above
    volume =
            volumeDao.persist(
                    new VolumeVO(
                            Volume.Type.ROOT,
                            "volume",
                            dcId,
                            1,
                            1,
                            diskOffering.getId(),
                            diskOffering.getDiskSize()));
    volumeId = volume.getId();
}
public Long migrate(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); VMInstanceVO vm = mgr.get(vmId); if (vm == null || vm.getRemoved() != null) { s_logger.debug("Unable to find the vm " + vmId); return null; } s_logger.info("Migrating vm: " + vm.toString()); if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM is not longer running on the current hostId"); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM_MIGRATE; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER_MIGRATE; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY_MIGRATE; } HostVO fromHost = _hostDao.findById(vm.getHostId()); String fromHostName = ((fromHost == null) ? "unknown" : fromHost.getName()); HostVO toHost = null; if (work.getStep() == Step.Scheduled) { if (vm.getState() != State.Running) { s_logger.info( "VM's state is not ready for migration. 
" + vm.toString() + " State is " + vm.getState().toString()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; } DataCenterVO dcVO = _dcDao.findById(fromHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(fromHost.getPodId()); try { toHost = mgr.prepareForMigration(vm); if (toHost == null) { if (s_logger.isDebugEnabled()) { s_logger.debug("Unable to find a host for migrating vm " + vmId); } _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Unable to find a suitable host"); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to mgirate due to insufficient capacity " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Insufficient capacity"); } catch (final StorageUnavailableException e) { s_logger.warn("Storage is unavailable: " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHostName + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Storage is gone."); } if (toHost == null) { _agentMgr.maintenanceFailed(vm.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug("Migrating from " + work.getHostId() + " to " + toHost.getId()); } work.setStep(Step.Migrating); work.setHostId(toHost.getId()); _haDao.update(work.getId(), work); } if (work.getStep() == Step.Migrating) { vm = mgr.get(vmId); // let's see if anything has changed. 
boolean migrated = false; if (vm == null || vm.getRemoved() != null || vm.getHostId() == null || !_itMgr.stateTransitTo(vm, Event.MigrationRequested, vm.getHostId())) { s_logger.info("Migration cancelled because state has changed: " + vm.toString()); } else { try { boolean isWindows = _guestOSCategoryDao .findById(_guestOSDao.findById(vm.getGuestOSId()).getCategoryId()) .getName() .equalsIgnoreCase("Windows"); MigrateCommand cmd = new MigrateCommand(vm.getInstanceName(), toHost.getPrivateIpAddress(), isWindows); Answer answer = _agentMgr.send(fromHost.getId(), cmd); if (answer != null && answer.getResult()) { migrated = true; _storageMgr.unshare(vm, fromHost); work.setStep(Step.Investigating); _haDao.update(work.getId(), work); } } catch (final AgentUnavailableException e) { s_logger.debug("host became unavailable"); } catch (final OperationTimedoutException e) { s_logger.debug("operation timed out"); if (e.isActive()) { scheduleRestart(vm, true); } } } if (!migrated) { s_logger.info("Migration was unsuccessful. Cleaning up: " + vm.toString()); DataCenterVO dcVO = _dcDao.findById(vm.getDataCenterId()); HostPodVO podVO = _podDao.findById(vm.getPodId()); _alertMgr.sendAlert( alertType, fromHost.getDataCenterId(), fromHost.getPodId(), "Unable to migrate vm " + vm.getName() + " from host " + fromHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migrate Command failed. 
Please check logs."); _itMgr.stateTransitTo(vm, Event.MigrationFailedOnSource, toHost.getId()); _agentMgr.maintenanceFailed(vm.getHostId()); Command cleanup = mgr.cleanup(vm, null); _agentMgr.easySend(toHost.getId(), cleanup); _storageMgr.unshare(vm, toHost); return null; } } if (toHost == null) { toHost = _hostDao.findById(work.getHostId()); } DataCenterVO dcVO = _dcDao.findById(toHost.getDataCenterId()); HostPodVO podVO = _podDao.findById(toHost.getPodId()); try { if (!mgr.completeMigration(vm, toHost)) { _alertMgr.sendAlert( alertType, toHost.getDataCenterId(), toHost.getPodId(), "Unable to migrate " + vmId + " to host " + toHost.getName() + " in zone " + dcVO.getName() + " and pod " + podVO.getName(), "Migration not completed"); s_logger.warn("Unable to complete migration: " + vm.toString()); } else { s_logger.info("Migration is complete: " + vm.toString()); } return null; } catch (final AgentUnavailableException e) { s_logger.warn("Agent is unavailable for " + vm.toString()); } catch (final OperationTimedoutException e) { s_logger.warn("Operation timed outfor " + vm.toString()); } _itMgr.stateTransitTo(vm, Event.MigrationFailedOnDest, toHost.getId()); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; }
protected Long restart(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); if (mgr == null) { s_logger.warn( "Unable to find a handler for " + work.getType().toString() + ", throwing out " + vmId); return null; } VMInstanceVO vm = mgr.get(vmId); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm.toString()); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm.toString() + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } final HostVO host = _hostDao.findById(work.getHostId()); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } Boolean alive = null; if (work.getStep() == Step.Investigating) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) { _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); } return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); if (alive != null) { 
s_logger.debug( investigator.getName() + " found VM " + vm.getName() + "to be alive? " + alive); break; } } if (alive != null && alive) { s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { compareState(vm, new AgentVmInfo(vm.getInstanceName(), mgr, State.Running), false); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } boolean fenced = false; if (alive == null || !alive) { fenced = true; s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { final FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); if (result != null && !result) { fenced = false; } } } if (alive == null && !fenced) { s_logger.debug("We were unable to fence off the VM " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } mgr.completeStopCommand(vm); work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } // send an alert for VMs that stop unexpectedly _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vmId + ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + hostDesc + "] stopped unexpectedly."); vm = mgr.get(vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if 
(!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = mgr.start(vm.getId(), 0); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } vm = mgr.get(vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (final StorageUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running 
on host " + hostDesc); return null; } catch (ExecutionException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } }
@Override @DB public void recalculateCapacity() { // FIXME: the right way to do this is to register a listener (see RouterStatsListener, // VMSyncListener) // for the vm sync state. The listener model has connects/disconnects to keep things in // sync much better // than this model right now, so when a VM is started, we update the amount allocated, // and when a VM // is stopped we updated the amount allocated, and when VM sync reports a changed state, // we update // the amount allocated. Hopefully it's limited to 3 entry points and will keep the // amount allocated // per host accurate. try { if (s_logger.isDebugEnabled()) { s_logger.debug("recalculating system capacity"); s_logger.debug("Executing cpu/ram capacity update"); } // Calculate CPU and RAM capacities // get all hosts...even if they are not in 'UP' state List<HostVO> hosts = _resourceMgr.listAllHostsInAllZonesByType(Host.Type.Routing); for (HostVO host : hosts) { _capacityMgr.updateCapacityForHost(host); } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing cpu/ram capacity update"); s_logger.debug("Executing storage capacity update"); } // Calculate storage pool capacity List<StoragePoolVO> storagePools = _storagePoolDao.listAll(); for (StoragePoolVO pool : storagePools) { long disk = 0l; Pair<Long, Long> sizes = _volumeDao.getCountAndTotalByPool(pool.getId()); disk = sizes.second(); if (pool.isShared()) { _storageMgr.createCapacityEntry(pool, Capacity.CAPACITY_TYPE_STORAGE_ALLOCATED, disk); } else { _storageMgr.createCapacityEntry(pool, Capacity.CAPACITY_TYPE_LOCAL_STORAGE, disk); } } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing storage capacity update"); s_logger.debug("Executing capacity updates public ip and Vlans"); } List<DataCenterVO> datacenters = _dcDao.listAll(); for (DataCenterVO datacenter : datacenters) { long dcId = datacenter.getId(); // NOTE // What happens if we have multiple vlans? 
Dashboard currently shows stats // with no filter based on a vlan // ideal way would be to remove out the vlan param, and filter only on dcId // implementing the same // Calculate new Public IP capacity for Virtual Network if (datacenter.getNetworkType() == NetworkType.Advanced) { createOrUpdateIpCapacity(dcId, null, CapacityVO.CAPACITY_TYPE_VIRTUAL_NETWORK_PUBLIC_IP); } // Calculate new Public IP capacity for Direct Attached Network createOrUpdateIpCapacity(dcId, null, CapacityVO.CAPACITY_TYPE_DIRECT_ATTACHED_PUBLIC_IP); if (datacenter.getNetworkType() == NetworkType.Advanced) { // Calculate VLAN's capacity createOrUpdateVlanCapacity(dcId); } } if (s_logger.isDebugEnabled()) { s_logger.debug("Done capacity updates for public ip and Vlans"); s_logger.debug("Executing capacity updates for private ip"); } // Calculate new Private IP capacity List<HostPodVO> pods = _podDao.listAll(); for (HostPodVO pod : pods) { long podId = pod.getId(); long dcId = pod.getDataCenterId(); createOrUpdateIpCapacity(dcId, podId, CapacityVO.CAPACITY_TYPE_PRIVATE_IP); } if (s_logger.isDebugEnabled()) { s_logger.debug("Done executing capacity updates for private ip"); s_logger.debug("Done recalculating system capacity"); } } catch (Throwable t) { s_logger.error("Caught exception in recalculating capacity", t); } }