@Override public Status investigate(final long hostId) { final HostVO host = _hostDao.findById(hostId); if (host == null) { return null; } final Enumeration<Investigator> en = _investigators.enumeration(); Status hostState = null; Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); hostState = investigator.isAgentAlive(host); if (hostState != null) { if (s_logger.isDebugEnabled()) { s_logger.debug( investigator.getName() + " was able to determine host " + hostId + " is in " + hostState.toString()); } return hostState; } if (s_logger.isDebugEnabled()) { s_logger.debug( investigator.getName() + " unable to determine the state of the host. Moving on."); } } return null; }
private SecondaryStorageVmAllocator getCurrentAllocator() { // for now, only one adapter is supported Enumeration<SecondaryStorageVmAllocator> it = _ssVmAllocators.enumeration(); if (it.hasMoreElements()) { return it.nextElement(); } return null; }
protected Long restart(HaWorkVO work) { List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Cancelling this work item because newer ones have been scheduled. Work Ids = ["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return null; } items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); if (items.size() > 0) { StringBuilder str = new StringBuilder( "Waiting because there's HA work being executed on an item currently. Work Ids =["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); } str.delete(str.length() - 2, str.length()).append("]"); s_logger.info(str.toString()); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } long vmId = work.getInstanceId(); VMInstanceVO vm = _itMgr.findByIdAndType(work.getType(), work.getInstanceId()); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_SSVM; } HostVO host = _hostDao.findById(work.getHostId()); boolean isHostRemoved = false; if (host == null) { host = _hostDao.findByIdIncludingRemoved(work.getHostId()); if (host != null) { s_logger.debug( "VM " + vm.toString() + " is now no longer on host " + work.getHostId() + " as the host is removed"); isHostRemoved = true; } } DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); Boolean alive = null; if (work.getStep() == Step.Investigating) { if (!isHostRemoved) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); s_logger.info(investigator.getName() + " found " + vm + "to be alive? " + alive); if (alive != null) { break; } } boolean fenced = false; if (alive == null) { s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); s_logger.info("Fencer " + fb.getName() + " returned " + result); if (result != null && result) { fenced = true; break; } } } else if (!alive) { fenced = true; } else { s_logger.debug( "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { s_logger.info(vm + " is alive and host is up. No need to restart it."); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } if (!fenced) { s_logger.debug("We were unable to fence off the VM " + vm); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } else { s_logger.debug( "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways"); try { _itMgr.advanceStop(vm, true, _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); } catch (ResourceUnavailableException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (OperationTimedoutException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } catch (ConcurrentOperationException e) { assert false : "How do we hit this when force is true?"; throw new CloudRuntimeException("Caught exception even though it should be handled.", e); } } } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = _itMgr.advanceStart( vm, new HashMap<VirtualMachineProfile.Param, Object>(), _accountMgr.getSystemUser(), _accountMgr.getSystemAccount()); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (final ResourceUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } catch (OperationTimedoutException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterIdToDeployIn(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); } vm = _itMgr.findByIdAndType(vm.getType(), vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; }
protected Long restart(final HaWorkVO work) { final long vmId = work.getInstanceId(); final VirtualMachineGuru<VMInstanceVO> mgr = findManager(work.getType()); if (mgr == null) { s_logger.warn( "Unable to find a handler for " + work.getType().toString() + ", throwing out " + vmId); return null; } VMInstanceVO vm = mgr.get(vmId); if (vm == null) { s_logger.info("Unable to find vm: " + vmId); return null; } s_logger.info("HA on " + vm.toString()); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { s_logger.info( "VM " + vm.toString() + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + vm.getUpdated() + " previous updated = " + work.getUpdateTime()); return null; } final HostVO host = _hostDao.findById(work.getHostId()); DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId()); HostPodVO podVO = _podDao.findById(host.getPodId()); String hostDesc = "name: " + host.getName() + "(id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName(); short alertType = AlertManager.ALERT_TYPE_USERVM; if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_DOMAIN_ROUTER; } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { alertType = AlertManager.ALERT_TYPE_CONSOLE_PROXY; } Boolean alive = null; if (work.getStep() == Step.Investigating) { if (vm.getHostId() == null || vm.getHostId() != work.getHostId()) { s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId()); if (vm.getState() == State.Starting && vm.getUpdated() == work.getUpdateTime()) { _itMgr.stateTransitTo(vm, Event.AgentReportStopped, null); } return null; } Enumeration<Investigator> en = _investigators.enumeration(); Investigator investigator = null; while (en.hasMoreElements()) { investigator = en.nextElement(); alive = investigator.isVmAlive(vm, host); if (alive != null) { s_logger.debug( investigator.getName() + " found VM " + vm.getName() + "to be alive? " + alive); break; } } if (alive != null && alive) { s_logger.debug("VM " + vm.getName() + " is found to be alive by " + investigator.getName()); if (host.getStatus() == Status.Up) { compareState(vm, new AgentVmInfo(vm.getInstanceName(), mgr, State.Running), false); return null; } else { s_logger.debug("Rescheduling because the host is not up but the vm is alive"); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } boolean fenced = false; if (alive == null || !alive) { fenced = true; s_logger.debug("Fencing off VM that we don't know the state of"); Enumeration<FenceBuilder> enfb = _fenceBuilders.enumeration(); while (enfb.hasMoreElements()) { final FenceBuilder fb = enfb.nextElement(); Boolean result = fb.fenceOff(vm, host); if (result != null && !result) { fenced = false; } } } if (alive == null && !fenced) { s_logger.debug("We were unable to fence off the VM " + vm.toString()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } mgr.completeStopCommand(vm); work.setStep(Step.Scheduled); _haDao.update(work.getId(), work); } // send an alert for VMs that stop unexpectedly _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "VM (name: " + vm.getName() + ", id: " + vmId + ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getName() + " (id: " + vm.getId() + ") running on host [" + hostDesc + "] stopped unexpectedly."); vm = mgr.get(vm.getId()); if (!_forceHA && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } return null; // VM doesn't require HA } if (!_storageMgr.canVmRestartOnAnotherServer(vm.getId())) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM can not restart on another server."); } return null; } if (work.getTimesTried() > _maxRetries) { s_logger.warn("Retried to max times so deleting: " + vmId); return null; } try { VMInstanceVO started = mgr.start(vm.getId(), 0); if (started != null) { s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId()); return null; } if (s_logger.isDebugEnabled()) { s_logger.debug( "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); } vm = mgr.get(vm.getId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } catch (final InsufficientCapacityException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (final StorageUnavailableException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ConcurrentOperationException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } catch (ExecutionException e) { s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); _alertMgr.sendAlert( alertType, vm.getDataCenterId(), vm.getPodId(), "Unable to restart " + vm.getName() + " which was running on host " + hostDesc, "The Storage is unavailable for trying to restart VM, name: " + vm.getName() + ", id: " + vmId + " which was running on host " + hostDesc); return null; } }