You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cloudstack.apache.org by ke...@apache.org on 2013/06/17 04:39:48 UTC

git commit: updated refs/heads/vmsync to 40dbc54

Updated Branches:
  refs/heads/vmsync 39337adcf -> 40dbc5499


Hook up VM transitional state handling into VirtualMachineManager host ping and periodic maintenance tasks


Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/40dbc549
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/40dbc549
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/40dbc549

Branch: refs/heads/vmsync
Commit: 40dbc549989bbcd114fe6600deda6e29a6016e51
Parents: 39337ad
Author: Kelven Yang <ke...@gmail.com>
Authored: Sun Jun 16 19:39:32 2013 -0700
Committer: Kelven Yang <ke...@gmail.com>
Committed: Sun Jun 16 19:39:32 2013 -0700

----------------------------------------------------------------------
 .../src/com/cloud/alert/AlertManager.java       |  2 +-
 .../cloudstack/engine/config/Configs.java       |  2 +
 .../com/cloud/vm/VirtualMachineManagerImpl.java | 66 +++++++++++++-------
 3 files changed, 48 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/components-api/src/com/cloud/alert/AlertManager.java
----------------------------------------------------------------------
diff --git a/engine/components-api/src/com/cloud/alert/AlertManager.java b/engine/components-api/src/com/cloud/alert/AlertManager.java
index b6d005a..7533d4d 100755
--- a/engine/components-api/src/com/cloud/alert/AlertManager.java
+++ b/engine/components-api/src/com/cloud/alert/AlertManager.java
@@ -48,7 +48,7 @@ public interface AlertManager extends Manager {
     public static final short ALERT_TYPE_DIRECT_ATTACHED_PUBLIC_IP = 24;
     public static final short ALERT_TYPE_LOCAL_STORAGE = 25;
     public static final short ALERT_TYPE_RESOURCE_LIMIT_EXCEEDED = 26; // Generated when the resource limit exceeds the limit. Currently used for recurring snapshots only
-
+    public static final short ALERT_TYPE_SYNC = 27;
 
     void clearAlert(short alertType, long dataCenterId, long podId);
 

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
----------------------------------------------------------------------
diff --git a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
index aa5ca70..9dcb86d 100644
--- a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
+++ b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
@@ -41,4 +41,6 @@ public interface Configs {
     public static final ConfigKey<Boolean> VmDestroyForcestop = new ConfigKey<Boolean>(
             Boolean.class, "vm.destroy.forcestop", "Advanced", OrchestrationService.class, "false", "On destroy, force-stop takes this value ", null);
 
+    public static final ConfigKey<Long> PingInterval = new ConfigKey<Long>(
+            Long.class, "ping.interval", "Advanced", OrchestrationService.class, "60", "Ping interval in seconds", null);
 }

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
----------------------------------------------------------------------
diff --git a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
index 2d82814..74219bd 100755
--- a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
+++ b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
@@ -170,6 +170,8 @@ import com.cloud.vm.snapshot.VMSnapshotManager;
 public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMachineManager, Listener {
     private static final Logger s_logger = Logger.getLogger(VirtualMachineManagerImpl.class);
 
+    private static final String VM_SYNC_ALERT_SUBJECT = "VM state sync alert";
+    
     @Inject
     protected EntityManager _entityMgr;
     @Inject
@@ -257,6 +259,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
     protected ConfigValue<Integer> _lockStateRetry;
     protected ConfigValue<Integer> _operationTimeout;
     protected ConfigValue<Boolean> _forceStop;
+    protected ConfigValue<Long> _pingInterval;
     protected long _nodeId;
 
     SearchBuilder<VolumeVO> RootVolumeSearch;
@@ -425,7 +428,8 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
 
     @Override
     public boolean start() {
-        _executor.scheduleAtFixedRate(new CleanupTask(), _cleanupInterval.value(), _cleanupInterval.value(), TimeUnit.SECONDS);
+        _executor.scheduleAtFixedRate(new TransitionTask(), _pingInterval.value(), _pingInterval.value(), TimeUnit.SECONDS);
+        _executor.scheduleAtFixedRate(new CleanupTask(), _pingInterval.value()*2, _pingInterval.value()*2, TimeUnit.SECONDS);
         cancelWorkItems(_nodeId);
         
         return true;
@@ -447,6 +451,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
         _lockStateRetry = _configRepo.get(Configs.VmOpLockStateRetry);
         _operationTimeout = _configRepo.get(Configs.Wait).setMultiplier(2);
         _forceStop = _configRepo.get(Configs.VmDestroyForcestop);
+        _pingInterval = _configRepo.get(Configs.PingInterval).setMultiplier(1000);
 
         ReservationContextImpl.setComponents(_entityMgr);
         VirtualMachineProfileImpl.setComponents(_entityMgr);
@@ -2616,22 +2621,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
                 PingRoutingCommand ping = (PingRoutingCommand) cmd;
                 if (ping.getNewStates() != null && ping.getNewStates().size() > 0) {
                 	_syncMgr.processHostVmStatePingReport(agentId, ping.getNewStates());
-
-/* TODO
-                    Commands commands = deltaHostSync(agentId, ping.getNewStates());
-                    if (commands.size() > 0) {
-                        try {
-                            _agentMgr.send(agentId, commands, this);
-                        } catch (final AgentUnavailableException e) {
-                            s_logger.warn("Agent is now unavailable", e);
-                        }
-                    }
-*/
-                    
                 }
                 
-                // take the chance to scan stalled VM
-                scanStalledVMInTransitionState(agentId);
+                // take the chance to scan VMs that are stuck in transitional states and are missing from the report
+                scanStalledVMInTransitionStateOnUpHost(agentId);
                 processed = true;
             }
         }
@@ -2740,7 +2733,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
                 s_logger.debug("Couldn't lock the db");
                 return;
             }
+            
             try {
+/*            	
                 lock.addRef();
                 List<VMInstanceVO> instances = _vmDao.findVMInTransition(new Date(new Date().getTime() - (_operationTimeout.value() * 1000)), State.Starting, State.Stopping);
                 for (VMInstanceVO instance : instances) {
@@ -2751,6 +2746,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
                         _haMgr.scheduleRestart(instance, true);
                     }
                 }
+*/               
+            	scanStalledVMInTransitionStateOnDisconnectedHosts();
+                
             } catch (Exception e) {
                 s_logger.warn("Caught the following exception on transition checking", e);
             } finally {
@@ -3452,7 +3450,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
     		} catch(NoTransitionException e) {
     			s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
     		}
-    		// TODO we need to alert admin or user about this risky state transition
+    		
+    		// we need to alert admin or user about this risky state transition
+    		_alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), 
+    			VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (Starting -> Running) from out-of-context transition. VM network environment may need to be reset");
     		break;
     		
     	case Running :
@@ -3472,8 +3473,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
     		} catch(NoTransitionException e) {
     			s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
     		}
-    		// TODO we need to alert admin or user about this risky state transition
-    		break;
+      		_alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), 
+        			VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (" + vm.getState() + " -> Running) from out-of-context transition. VM network environment may need to be reset");
+          		break;
     		
     	case Destroyed :
     	case Expunging :
@@ -3529,7 +3531,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
     	}
     }
     
-    private void scanStalledVMInTransitionState(long hostId) {
+    private void scanStalledVMInTransitionStateOnUpHost(long hostId) {
     	//
     	// Check VM that is stuck in Starting, Stopping, Migrating states, we won't check
     	// VMs in expunging state (this need to be handled specially)
@@ -3540,10 +3542,32 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
     	//
     	// When host is UP, soon or later we will get a report from the host about the VM, 
     	// however, if VM is missing from the host report (it may happen in out of band changes
-    	// or from designed behave of XS/KVM)
+    	// or from designed behave of XS/KVM), the VM may not get a chance to run the state-sync logic
     	//
+    	// Therefore, we will scan those VMs on the UP host based on their last update timestamp; if the host is UP
+    	// and a VM's status update has stalled, we will consider the VM to be powered off
+    	// (which is relatively safe to do).
     	
-    	
+    	long stallThresholdInMs = _pingInterval.value() + (_pingInterval.value() >> 1);
+    	Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - stallThresholdInMs);
+    	List<Long> mostlikelyStoppedVMs = listStalledVMInTransitionStateOnUpHost(hostId, cutTime);
+    	for(Long vmId : mostlikelyStoppedVMs) {
+    		VMInstanceVO vm = _vmDao.findById(vmId);
+    		assert(vm != null);
+    		handlePowerOffReportWithNoPendingJobsOnVM(vm);
+    	}
+    }
+    
+    private void scanStalledVMInTransitionStateOnDisconnectedHosts() {
+    	Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - this._operationTimeout.value()*1000);
+    	List<Long> stuckAndUncontrollableVMs = listStalledVMInTransitionStateOnDisconnectedHosts(cutTime);
+    	for(Long vmId : stuckAndUncontrollableVMs) {
+    		VMInstanceVO vm = _vmDao.findById(vmId);
+    		
+    		// We now only alert administrator about this situation
+      		_alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(), 
+        		VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") is stuck in " + vm.getState() + " state and its host is unreachable for too long");
+    	}
     }
     
     // TODO, use sql query directly for quick prototype, need to refactor to use joins and search builders