Exemplo n.º 1
0
  /**
   * Destroys the OVS tunnels of the VM's account on the VM's host when the VM is the last
   * tunnel user of that account on that host, then asks peer hosts to destroy their tunnels
   * towards this host.
   *
   * <p>Nothing happens when the feature is disabled. For a user VM the teardown is skipped while
   * more than one user VM of the account is listed on the host, or while one of the account's
   * routers is on the same host. For a domain router it is skipped while any user VM of the
   * account is listed on the host. Failures are logged as warnings and never propagated.
   *
   * <p>NOTE(review): VM types other than User and DomainRouter fall straight through to the
   * teardown, whereas tunnel creation ignores those types — confirm this asymmetry is intended.
   *
   * @param vm the virtual machine whose account id and host id select the tunnels to destroy
   */
  @Override
  public void CheckAndDestroyTunnel(VirtualMachine vm) {
    if (!_isEnabled) {
      return;
    }

    List<UserVmVO> userVms = _userVmDao.listByAccountIdAndHostId(vm.getAccountId(), vm.getHostId());
    if (vm.getType() == VirtualMachine.Type.User) {
      if (userVms.size() > 1) {
        return;
      }

      Long vmHostId = vm.getHostId();
      List<DomainRouterVO> routers =
          _routerDao.findBy(vm.getAccountId(), vm.getDataCenterIdToDeployIn());
      for (DomainRouterVO router : routers) {
        // Host ids are boxed Longs: '==' would compare references and miss equal ids outside
        // the small-value box cache, so compare by value (two nulls still count as equal).
        Long routerHostId = router.getHostId();
        boolean sameHost =
            routerHostId == null ? vmHostId == null : routerHostId.equals(vmHostId);
        if (sameHost) {
          return;
        }
      }
    } else if (vm.getType() == VirtualMachine.Type.DomainRouter && userVms.size() != 0) {
      return;
    }

    try {
      /* Now we are last one on host, destroy all tunnels of my account */
      Command cmd = new OvsDestroyTunnelCommand(vm.getAccountId(), "[]");
      Answer ans = _agentMgr.send(vm.getHostId(), cmd);
      handleDestroyTunnelAnswer(ans, vm.getHostId(), 0, vm.getAccountId());

      /* Then ask hosts have peer tunnel with me to destroy them */
      List<OvsTunnelAccountVO> peers =
          _tunnelAccountDao.listByToAccount(vm.getHostId(), vm.getAccountId());
      for (OvsTunnelAccountVO p : peers) {
        cmd = new OvsDestroyTunnelCommand(p.getAccount(), p.getPortName());
        ans = _agentMgr.send(p.getFrom(), cmd);
        handleDestroyTunnelAnswer(ans, p.getFrom(), p.getTo(), p.getAccount());
      }
    } catch (Exception e) {
      // Best effort: a failed teardown must not break the caller, so only log it.
      s_logger.warn(
          String.format(
              "Destroy tunnel(account:%1$s, hostId:%2$s) failed",
              vm.getAccountId(), vm.getHostId()),
          e);
    }
  }
  /**
   * Tries to determine whether a user VM is alive by testing it through the virtual routers of
   * the networks its guest NICs belong to.
   *
   * <p>Guest NICs without an IPv4 address and networks without routers are skipped. The first
   * non-null result produced by {@code testUserVM} for any NIC/router pair is returned.
   *
   * @param vm the virtual machine to investigate; only {@code VirtualMachine.Type.User} is handled
   * @param host not used by this investigator
   * @return the first conclusive result from {@code testUserVM}, or {@code null} when the VM is
   *     not a user VM, its record cannot be found, or no router gave a conclusive answer
   */
  @Override
  public Boolean isVmAlive(VirtualMachine vm, Host host) {
    if (vm.getType() != VirtualMachine.Type.User) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("Not a User Vm, unable to determine state of " + vm + " returning null");
      }
      return null;
    }

    if (s_logger.isDebugEnabled()) {
      s_logger.debug("testing if " + vm + " is alive");
    }
    // to verify that the VM is alive, we ask the domR (router) to ping the VM (private IP)
    UserVmVO userVm = _userVmDao.findById(vm.getId());
    if (userVm == null) {
      // The record can be gone by the time we look it up; report "unknown" instead of failing
      // with a NullPointerException so the caller can fall back to other means.
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("Unable to find user vm record for " + vm + ", returning null");
      }
      return null;
    }

    List<? extends Nic> nics = _networkMgr.getNicsForTraffic(userVm.getId(), TrafficType.Guest);

    for (Nic nic : nics) {
      if (nic.getIp4Address() == null) {
        continue;
      }

      List<VirtualRouter> routers = _vnaMgr.getRoutersForNetwork(nic.getNetworkId());
      if (routers == null || routers.isEmpty()) {
        if (s_logger.isDebugEnabled()) {
          s_logger.debug(
              "Unable to find a router in network " + nic.getNetworkId() + " to ping " + vm);
        }
        continue;
      }

      // Ask each router in turn; stop at the first one that gives a definite answer.
      Boolean result = null;
      for (VirtualRouter router : routers) {
        result = testUserVM(vm, nic, router);
        if (result != null) {
          break;
        }
      }

      if (result == null) {
        // No router on this network could tell; try the next guest NIC.
        continue;
      }

      return result;
    }

    if (s_logger.isDebugEnabled()) {
      s_logger.debug("Returning null since we're unable to determine state of " + vm);
    }
    return null;
  }
  /**
   * Tries to determine whether a system VM is alive by testing the IPv4 address of its
   * management NICs from the hosts returned by {@code findHostByPod} for the VM host's pod.
   *
   * <p>For each candidate host, a {@code Status.Up} result for the NIC address means the VM is
   * alive. A {@code Status.Down} result is only trusted when the same candidate reports the
   * VM's own host as {@code Status.Up}; otherwise the next candidate is tried.
   *
   * @param vm the virtual machine to investigate; only system-used VM types are handled
   * @param host not used; the VM's host is looked up through {@code vm.getHostId()}
   * @return {@code Boolean.TRUE} if the VM answered, {@code Boolean.FALSE} if the VM did not
   *     answer but its host did, or {@code null} when the state cannot be determined
   */
  @Override
  public Boolean isVmAlive(VirtualMachine vm, Host host) {
    if (!vm.getType().isUsedBySystem()) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("Not a System Vm, unable to determine state of " + vm + " returning null");
      }
      // The message above promises a null result; previously execution fell through and the
      // investigation ran anyway for non-system VMs.
      return null;
    }

    if (s_logger.isDebugEnabled()) {
      s_logger.debug("Testing if " + vm + " is alive");
    }

    if (vm.getHostId() == null) {
      s_logger.debug("There's no host id for " + vm);
      return null;
    }

    HostVO vmHost = _hostDao.findById(vm.getHostId());
    if (vmHost == null) {
      s_logger.debug("Unable to retrieve the host by using id " + vm.getHostId());
      return null;
    }

    List<? extends Nic> nics = _networkMgr.getNicsForTraffic(vm.getId(), TrafficType.Management);
    if (nics.size() == 0) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "Unable to find a management nic, cannot ping this system VM, unable to determine state of "
                + vm
                + " returning null");
      }
      return null;
    }

    for (Nic nic : nics) {
      if (nic.getIp4Address() == null) {
        continue;
      }
      // get the data center IP address, find a host on the pod, use that host to ping the data
      // center IP address
      List<Long> otherHosts = findHostByPod(vmHost.getPodId(), vm.getHostId());
      for (Long otherHost : otherHosts) {
        Status vmState = testIpAddress(otherHost, nic.getIp4Address());
        if (vmState == null) {
          // can't get information from that host, try the next one
          continue;
        }
        if (vmState == Status.Up) {
          if (s_logger.isDebugEnabled()) {
            s_logger.debug(
                "successfully pinged vm's private IP ("
                    + vm.getPrivateIpAddress()
                    + "), returning that the VM is up");
          }
          return Boolean.TRUE;
        } else if (vmState == Status.Down) {
          // We can't ping the VM directly...if we can ping the host, then report the VM down.
          // If we can't ping the host, then we don't have enough information.
          Status vmHostState = testIpAddress(otherHost, vmHost.getPrivateIpAddress());
          if (vmHostState == Status.Up) {
            if (s_logger.isDebugEnabled()) {
              s_logger.debug(
                  "successfully pinged vm's host IP ("
                      + vmHost.getPrivateIpAddress()
                      + "), but could not ping VM, returning that the VM is down");
            }
            return Boolean.FALSE;
          }
        }
      }
    }

    if (s_logger.isDebugEnabled()) {
      s_logger.debug("unable to determine state of " + vm + " returning null");
    }
    return null;
  }
Exemplo n.º 4
0
  /**
   * Requests creation of the GRE tunnels that are still missing between the deployment host and
   * every other host that runs a user VM or domain router of the instance's account.
   *
   * <p>Does nothing when the feature is disabled or when the instance is neither a user VM nor a
   * domain router. For each remote host, a tunnel is requested in a direction only when no
   * tunnel record exists for that direction and a GRE key could be obtained. Outgoing tunnels
   * are requested from the deployment host first, then incoming tunnels from each remote host.
   * Any exception raised while sending the commands is logged at debug level and swallowed.
   *
   * @param instance the virtual machine being deployed
   * @param dest the deployment destination; its host is the local tunnel endpoint
   */
  @DB
  protected void CheckAndCreateTunnel(VirtualMachine instance, DeployDestination dest) {
    if (!_isEnabled) {
      return;
    }

    VirtualMachine.Type instanceType = instance.getType();
    boolean tunnelled =
        instanceType == VirtualMachine.Type.User
            || instanceType == VirtualMachine.Type.DomainRouter;
    if (!tunnelled) {
      return;
    }

    long hostId = dest.getHost().getId();
    long accountId = instance.getAccountId();

    // Collect every VM and router of the account; each one may live on a remote host.
    List<UserVmVO> accountVms = _userVmDao.listByAccountId(accountId);
    List<DomainRouterVO> accountRouters =
        _routerDao.findBy(accountId, instance.getDataCenterIdToDeployIn());
    List<VMInstanceVO> candidates = new ArrayList<VMInstanceVO>();
    if (accountVms != null) {
      candidates.addAll(accountVms);
    }
    if (!accountRouters.isEmpty()) {
      candidates.addAll(accountRouters);
    }

    List<Pair<Long, Integer>> outgoing = new ArrayList<Pair<Long, Integer>>();
    List<Pair<Long, Integer>> incoming = new ArrayList<Pair<Long, Integer>>();

    for (VMInstanceVO candidate : candidates) {
      Long remoteId = candidate.getHostId();
      if (remoteId == null || remoteId.longValue() == hostId) {
        continue;
      }
      long remoteHostId = remoteId.longValue();

      // Direction: this host -> remote host.
      if (_tunnelAccountDao.getByFromToAccount(hostId, remoteHostId, accountId) == null) {
        int greKey = getGreKey(hostId, remoteHostId, accountId);
        if (greKey == -1) {
          s_logger.warn(
              String.format(
                  "Cannot get GRE key for from=%1$s to=%2$s accountId=%3$s, tunnel create failed",
                  hostId, remoteHostId, accountId));
          continue;
        }

        Pair<Long, Integer> target = new Pair<Long, Integer>(remoteId, Integer.valueOf(greKey));
        if (!outgoing.contains(target)) {
          outgoing.add(target);
        }
      }

      // Direction: remote host -> this host.
      if (_tunnelAccountDao.getByFromToAccount(remoteHostId, hostId, accountId) == null) {
        int greKey = getGreKey(remoteHostId, hostId, accountId);
        if (greKey == -1) {
          s_logger.warn(
              String.format(
                  "Cannot get GRE key for from=%1$s to=%2$s accountId=%3$s, tunnel create failed",
                  remoteHostId, hostId, accountId));
          continue;
        }

        Pair<Long, Integer> source = new Pair<Long, Integer>(remoteId, Integer.valueOf(greKey));
        if (!incoming.contains(source)) {
          incoming.add(source);
        }
      }
    }

    try {
      String localIp = dest.getHost().getPrivateIpAddress();

      for (Pair<Long, Integer> target : outgoing) {
        Long remoteId = target.first();
        HostVO remoteHost = _hostDao.findById(remoteId);
        OvsCreateTunnelCommand create =
            new OvsCreateTunnelCommand(
                remoteHost.getPrivateIpAddress(),
                target.second().toString(),
                Long.valueOf(hostId),
                remoteId,
                accountId,
                localIp);
        Commands batch = new Commands(create);
        s_logger.debug("Ask host " + hostId + " to create gre tunnel to " + remoteId);
        handleCreateTunnelAnswer(_agentMgr.send(hostId, batch));
      }

      for (Pair<Long, Integer> source : incoming) {
        Long remoteId = source.first();
        HostVO remoteHost = _hostDao.findById(remoteId);
        OvsCreateTunnelCommand create =
            new OvsCreateTunnelCommand(
                localIp,
                source.second().toString(),
                remoteId,
                Long.valueOf(hostId),
                accountId,
                remoteHost.getPrivateIpAddress());
        Commands batch = new Commands(create);
        s_logger.debug("Ask host " + remoteId + " to create gre tunnel to " + hostId);
        handleCreateTunnelAnswer(_agentMgr.send(remoteId, batch));
      }
    } catch (Exception e) {
      s_logger.debug("Ovs Tunnel network created tunnel failed", e);
    }
  }
  /**
   * Processes one HA work item for a VM: checks that the item is still relevant, investigates
   * and fences the VM when the item is in the {@code Step.Investigating} step, force-stops it,
   * and then tries to start it again.
   *
   * @param work the HA work item to process
   * @return {@code null} when no further processing of this item is needed (superseded by newer
   *     work, VM missing or changed, VM alive on an up host, HA not applicable, VM cannot
   *     restart elsewhere, retries exhausted, or VM running again); otherwise the time of the
   *     next attempt, computed as {@code System.currentTimeMillis() >> 10} plus the applicable
   *     retry interval
   * @throws CloudRuntimeException if no host record (current or removed) exists for the work
   *     item's host, or if the forced stop fails
   */
  protected Long restart(HaWorkVO work) {
    List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
    if (items.size() > 0) {
      StringBuilder str =
          new StringBuilder(
              "Cancelling this work item because newer ones have been scheduled.  Work Ids = [");
      for (HaWorkVO item : items) {
        str.append(item.getId()).append(", ");
      }
      str.delete(str.length() - 2, str.length()).append("]");
      s_logger.info(str.toString());
      return null;
    }

    items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
    if (items.size() > 0) {
      StringBuilder str =
          new StringBuilder(
              "Waiting because there's HA work being executed on an item currently.  Work Ids =[");
      for (HaWorkVO item : items) {
        str.append(item.getId()).append(", ");
      }
      str.delete(str.length() - 2, str.length()).append("]");
      s_logger.info(str.toString());
      return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
    }

    long vmId = work.getInstanceId();

    VirtualMachine vm = _itMgr.findById(work.getInstanceId());
    if (vm == null) {
      s_logger.info("Unable to find vm: " + vmId);
      return null;
    }

    s_logger.info("HA on " + vm);
    // The work item records the VM state it was created for; if the VM moved on, drop the item.
    if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) {
      s_logger.info(
          "VM "
              + vm
              + " has been changed.  Current State = "
              + vm.getState()
              + " Previous State = "
              + work.getPreviousState()
              + " last updated = "
              + vm.getUpdated()
              + " previous updated = "
              + work.getUpdateTime());
      return null;
    }

    AlertManager.AlertType alertType = AlertManager.AlertType.ALERT_TYPE_USERVM;
    if (VirtualMachine.Type.DomainRouter.equals(vm.getType())) {
      alertType = AlertManager.AlertType.ALERT_TYPE_DOMAIN_ROUTER;
    } else if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) {
      alertType = AlertManager.AlertType.ALERT_TYPE_CONSOLE_PROXY;
    } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) {
      alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
    }

    HostVO host = _hostDao.findById(work.getHostId());
    boolean isHostRemoved = false;
    if (host == null) {
      host = _hostDao.findByIdIncludingRemoved(work.getHostId());
      if (host == null) {
        // Everything below (description, investigation, fencing) needs a host record; fail
        // with a clear message instead of a NullPointerException a few lines further down.
        throw new CloudRuntimeException(
            "Unable to find host " + work.getHostId() + " for HA work on " + vm);
      }
      s_logger.debug(
          "VM "
              + vm.toString()
              + " is now no longer on host "
              + work.getHostId()
              + " as the host is removed");
      isHostRemoved = true;
    }

    // The zone or pod lookup may come back empty; the description is only used in alert
    // texts, so fall back to the id rather than failing.
    DataCenterVO dcVO = _dcDao.findById(host.getDataCenterId());
    HostPodVO podVO = _podDao.findById(host.getPodId());
    String hostDesc =
        "name: "
            + host.getName()
            + "(id:"
            + host.getId()
            + "), availability zone: "
            + (dcVO != null ? dcVO.getName() : "unknown (id:" + host.getDataCenterId() + ")")
            + ", pod: "
            + (podVO != null ? podVO.getName() : "unknown (id:" + host.getPodId() + ")");

    Boolean alive = null;
    if (work.getStep() == Step.Investigating) {
      if (!isHostRemoved) {
        // Compare host ids by value so the check is correct whether or not the work item's
        // host id is a boxed Long.
        Long currentHostId = vm.getHostId();
        if (currentHostId == null || !currentHostId.equals(work.getHostId())) {
          s_logger.info("VM " + vm.toString() + " is now no longer on host " + work.getHostId());
          return null;
        }

        // Ask each investigator in turn until one gives a definite answer.
        Investigator investigator = null;
        for (Investigator it : investigators) {
          investigator = it;
          alive = investigator.isVmAlive(vm, host);
          s_logger.info(investigator.getName() + " found " + vm + "to be alive? " + alive);
          if (alive != null) {
            break;
          }
        }

        boolean fenced = false;
        if (alive == null) {
          s_logger.debug("Fencing off VM that we don't know the state of");
          for (FenceBuilder fb : fenceBuilders) {
            Boolean result = fb.fenceOff(vm, host);
            s_logger.info("Fencer " + fb.getName() + " returned " + result);
            if (result != null && result) {
              fenced = true;
              break;
            }
          }

        } else if (!alive) {
          // Known to be down: no fencing needed before restarting.
          fenced = true;
        } else {
          // alive is non-null here, so at least one investigator ran and the variable is set.
          s_logger.debug(
              "VM " + vm.getHostName() + " is found to be alive by " + investigator.getName());
          if (host.getStatus() == Status.Up) {
            s_logger.info(vm + " is alive and host is up. No need to restart it.");
            return null;
          } else {
            s_logger.debug("Rescheduling because the host is not up but the vm is alive");
            return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
          }
        }

        if (!fenced) {
          s_logger.debug("We were unable to fence off the VM " + vm);
          _alertMgr.sendAlert(
              alertType,
              vm.getDataCenterId(),
              vm.getPodIdToDeployIn(),
              "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
              "Insufficient capacity to restart VM, name: "
                  + vm.getHostName()
                  + ", id: "
                  + vmId
                  + " which was running on host "
                  + hostDesc);
          return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
        }
      } else {
        s_logger.debug(
            "How come that HA step is Investigating and the host is removed? Calling forced Stop on Vm anyways");
      }

      // Both paths end with the same forced stop; only the investigated path then advances
      // the work item to the Scheduled step.
      try {
        _itMgr.advanceStop(vm.getUuid(), true);
      } catch (ResourceUnavailableException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (OperationTimedoutException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      } catch (ConcurrentOperationException e) {
        assert false : "How do we hit this when force is true?";
        throw new CloudRuntimeException("Caught exception even though it should be handled.", e);
      }

      if (!isHostRemoved) {
        work.setStep(Step.Scheduled);
        _haDao.update(work.getId(), work);
      }
    }

    // Re-read the VM: the stop above may have changed it, or it may be gone altogether.
    vm = _itMgr.findById(vm.getId());
    if (vm == null) {
      s_logger.info("Unable to find vm: " + vmId);
      return null;
    }

    if (!_forceHA && !vm.isHaEnabled()) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM is not HA enabled so we're done.");
      }
      return null; // VM doesn't require HA
    }

    if (!volumeMgr.canVmRestartOnAnotherServer(vm.getId())) {
      if (s_logger.isDebugEnabled()) {
        s_logger.debug("VM can not restart on another server.");
      }
      return null;
    }

    if (work.getTimesTried() > _maxRetries) {
      s_logger.warn("Retried to max times so deleting: " + vmId);
      return null;
    }

    try {
      HashMap<VirtualMachineProfile.Param, Object> params =
          new HashMap<VirtualMachineProfile.Param, Object>();
      if (_haTag != null) {
        params.put(VirtualMachineProfile.Param.HaTag, _haTag);
      }

      try {
        // First try starting the vm with its original planner, if it doesn't succeed send HAPlanner
        // as its an emergency.
        _itMgr.advanceStart(vm.getUuid(), params, null);
      } catch (InsufficientCapacityException e) {
        s_logger.warn("Failed to deploy vm " + vmId + " with original planner, sending HAPlanner");
        _itMgr.advanceStart(vm.getUuid(), params, _haPlanners.get(0));
      }

      VMInstanceVO started = _instanceDao.findById(vm.getId());
      if (started != null && started.getState() == VirtualMachine.State.Running) {
        s_logger.info("VM is now restarted: " + vmId + " on " + started.getHostId());
        return null;
      }

      if (s_logger.isDebugEnabled()) {
        s_logger.debug(
            "Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval);
      }
    } catch (final InsufficientCapacityException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "Insufficient capacity to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (final ResourceUnavailableException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (ConcurrentOperationException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    } catch (OperationTimedoutException e) {
      s_logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage());
      _alertMgr.sendAlert(
          alertType,
          vm.getDataCenterId(),
          vm.getPodIdToDeployIn(),
          "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc,
          "The Storage is unavailable for trying to restart VM, name: "
              + vm.getHostName()
              + ", id: "
              + vmId
              + " which was running on host "
              + hostDesc);
    }

    // The start attempt did not leave the VM running: record its current state on the work
    // item so the next attempt's "has the VM changed" check compares against it.
    vm = _itMgr.findById(vm.getId());
    if (vm == null) {
      s_logger.info("Unable to find vm: " + vmId);
      return null;
    }
    work.setUpdateTime(vm.getUpdated());
    work.setPreviousState(vm.getState());
    return (System.currentTimeMillis() >> 10) + _restartRetryInterval;
  }