You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cloudstack.apache.org by ro...@apache.org on 2021/05/14 17:44:57 UTC

[cloudstack] branch 4.14 updated: forceha: fix two issues when (1)stop vm from inside (2) force remove host (#4647)

This is an automated email from the ASF dual-hosted git repository.

rohit pushed a commit to branch 4.14
in repository https://gitbox.apache.org/repos/asf/cloudstack.git


The following commit(s) were added to refs/heads/4.14 by this push:
     new e2183ed  forceha: fix two issues when (1)stop vm from inside (2) force remove host (#4647)
e2183ed is described below

commit e2183ed666d202cb5c83b3a640160bea52ab18fa
Author: Wei Zhou <w....@global.leaseweb.com>
AuthorDate: Fri May 14 19:44:39 2021 +0200

    forceha: fix two issues when (1)stop vm from inside (2) force remove host (#4647)
    
    * forceha: fix vm not being started again if it is powered off from inside
    
    steps to reproduce the issue:
    (1) make sure force.ha is true in global settings. If not, change it to true and restart the management server
    (2) create a service offering with HA not enabled
    (3) create a vm
    (4) log into the vm and power it off via the CLI.
    
    expected result: vm is started again by cloudstack
    actual result: vm is not started.
    
    * forceha: fix vms still running after host is force-removed
    
    the host can be force-removed; however, its vms are marked as stopped in cloudstack but are not actually stopped on the host
    ```
    (localcloud) 🐱 > delete host id="a5625393-444d-4d0a-b31d-62baf88a8be1" forced=true
    {
      "success": true
    }
    ```
    
    after some minutes, vms are still running on the host
    ```
    root@mgt01:~# ssh node63 virsh list
     Id   Name        State
    ---------------------------
     1    i-2-19-VM   running
     2    i-2-11-VM   running
    ```
    
    the error messages are:
    ```
    Cannot transmit host 2 to Enabled state
    com.cloud.utils.fsm.NoTransitionException: No next resource state found for current state = Enabled event = DeleteHost
            at com.cloud.resource.ResourceManagerImpl.resourceStateTransitTo(ResourceManagerImpl.java:1216)
            at com.cloud.resource.ResourceManagerImpl$1.doInTransactionWithoutResult(ResourceManagerImpl.java:907)
    ```
    
    * forceha: Make ForceHA dynamic
---
 .../java/com/cloud/resource/ResourceState.java     |  1 +
 .../java/com/cloud/ha/HighAvailabilityManager.java |  2 +-
 .../com/cloud/vm/VirtualMachineManagerImpl.java    |  2 +-
 .../com/cloud/ha/HighAvailabilityManagerImpl.java  |  6 +--
 .../com/cloud/resource/ResourceManagerImpl.java    | 53 ++++++++++++----------
 5 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/api/src/main/java/com/cloud/resource/ResourceState.java b/api/src/main/java/com/cloud/resource/ResourceState.java
index 9b3bafe..6e0fa90 100644
--- a/api/src/main/java/com/cloud/resource/ResourceState.java
+++ b/api/src/main/java/com/cloud/resource/ResourceState.java
@@ -114,6 +114,7 @@ public enum ResourceState {
         s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled);
         s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
         s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance);
+        s_fsm.addTransition(ResourceState.Enabled, Event.DeleteHost, ResourceState.Disabled);
         s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled);
         s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled);
         s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled);
diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
index 18ceddb..1dd999d 100644
--- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
+++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java
@@ -32,7 +32,7 @@ import java.util.List;
  */
 public interface HighAvailabilityManager extends Manager {
 
-    ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
+    public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
         "Force High-Availability to happen even if the VM says no.", true, Cluster);
 
     ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",
diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java
index 2fb0a62..830e8a1 100755
--- a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java
+++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java
@@ -4498,7 +4498,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
                         String.format("VM %s is at %s and we received a %s report while there is no pending jobs on it"
                                 , vm.getInstanceName(), vm.getState(), vm.getPowerState()));
             }
-            if(vm.isHaEnabled() && vm.getState() == State.Running
+            if((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && vm.getState() == State.Running
                     && HaVmRestartHostUp.value()
                     && vm.getHypervisorType() != HypervisorType.VMware
                     && vm.getHypervisorType() != HypervisorType.Hyperv) {
diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
index b05e008..cde5594 100644
--- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
+++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java
@@ -197,7 +197,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
     int _maxRetries;
     long _timeBetweenFailures;
     long _timeBetweenCleanups;
-    boolean _forceHA;
     String _haTag = null;
 
     protected HighAvailabilityManagerImpl() {
@@ -364,7 +363,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
                 alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
             }
 
-            if (!(_forceHA || vm.isHaEnabled())) {
+            if (!(ForceHA.value() || vm.isHaEnabled())) {
                 String hostDesc = "id:" + vm.getHostId() + ", availability zone id:" + vm.getDataCenterId() + ", pod id:" + vm.getPodIdToDeployIn();
                 _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "VM (name: " + vm.getHostName() + ", id: " + vm.getId() +
                     ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getHostName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() +
@@ -569,7 +568,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
 
         vm = _itMgr.findById(vm.getId());
 
-        if (!_forceHA && !vm.isHaEnabled()) {
+        if (!ForceHA.value() && !vm.isHaEnabled()) {
             if (s_logger.isDebugEnabled()) {
                 s_logger.debug("VM is not HA enabled so we're done.");
             }
@@ -861,7 +860,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
             _workers[i] = new WorkerThread("HA-Worker-" + i);
         }
 
-        _forceHA = ForceHA.value();
         _timeToSleep = TimeToSleep.value() * SECONDS_TO_MILLISECONDS_FACTOR;
         _maxRetries = MigrationMaxRetries.value();
         _timeBetweenFailures = TimeBetweenFailures.value() * SECONDS_TO_MILLISECONDS_FACTOR;
diff --git a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
index 6945d6f..c8fe578 100755
--- a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
+++ b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java
@@ -2325,34 +2325,32 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
             s_logger.debug("Deleting Host: " + host.getId() + " Guid:" + host.getGuid());
         }
 
-        if (forceDestroyStorage) {
+        final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId());
+        if (forceDestroyStorage && storagePool != null) {
             // put local storage into mainenance mode, will set all the VMs on
             // this local storage into stopped state
-            final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId());
-            if (storagePool != null) {
-                if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.ErrorInMaintenance) {
-                    try {
-                        final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId());
-                        if (pool == null) {
-                            s_logger.debug("Failed to set primary storage into maintenance mode");
+            if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.ErrorInMaintenance) {
+                try {
+                    final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId());
+                    if (pool == null) {
+                        s_logger.debug("Failed to set primary storage into maintenance mode");
 
-                            throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode");
-                        }
-                    } catch (final Exception e) {
-                        s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString());
-                        throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString());
+                        throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode");
                     }
+                } catch (final Exception e) {
+                    s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString());
+                    throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString());
                 }
+            }
 
-                final List<VMInstanceVO> vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId());
-                for (final VMInstanceVO vm : vmsOnLocalStorage) {
-                    try {
-                        _vmMgr.destroy(vm.getUuid(), false);
-                    } catch (final Exception e) {
-                        final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId();
-                        s_logger.debug(errorMsg, e);
-                        throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
-                    }
+            final List<VMInstanceVO> vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId());
+            for (final VMInstanceVO vm : vmsOnLocalStorage) {
+                try {
+                    _vmMgr.destroy(vm.getUuid(), false);
+                } catch (final Exception e) {
+                    final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId();
+                    s_logger.debug(errorMsg, e);
+                    throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
                 }
             }
         } else {
@@ -2362,17 +2360,22 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
                 if (isForced) {
                     // Stop HA disabled vms and HA enabled vms in Stopping state
                     // Restart HA enabled vms
+                    try {
+                        resourceStateTransitTo(host, ResourceState.Event.DeleteHost, host.getId());
+                    } catch (final NoTransitionException e) {
+                        s_logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e);
+                    }
                     for (final VMInstanceVO vm : vms) {
-                        if (!vm.isHaEnabled() || vm.getState() == State.Stopping) {
+                        if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
                             s_logger.debug("Stopping vm: " + vm + " as a part of deleteHost id=" + host.getId());
                             try {
-                                _vmMgr.advanceStop(vm.getUuid(), false);
+                                _haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);
                             } catch (final Exception e) {
                                 final String errorMsg = "There was an error stopping the vm: " + vm + " as a part of hostDelete id=" + host.getId();
                                 s_logger.debug(errorMsg, e);
                                 throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
                             }
-                        } else if (vm.isHaEnabled() && (vm.getState() == State.Running || vm.getState() == State.Starting)) {
+                        } else if ((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && (vm.getState() == State.Running || vm.getState() == State.Starting)) {
                             s_logger.debug("Scheduling restart for vm: " + vm + " " + vm.getState() + " on the host id=" + host.getId());
                             _haMgr.scheduleRestart(vm, false);
                         }