You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cloudstack.apache.org by ke...@apache.org on 2013/06/17 04:39:48 UTC
git commit: updated refs/heads/vmsync to 40dbc54
Updated Branches:
refs/heads/vmsync 39337adcf -> 40dbc5499
Hook up VM transitional state handling into VirtualMachineManager host ping and periodic maintenance tasks
Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/40dbc549
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/40dbc549
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/40dbc549
Branch: refs/heads/vmsync
Commit: 40dbc549989bbcd114fe6600deda6e29a6016e51
Parents: 39337ad
Author: Kelven Yang <ke...@gmail.com>
Authored: Sun Jun 16 19:39:32 2013 -0700
Committer: Kelven Yang <ke...@gmail.com>
Committed: Sun Jun 16 19:39:32 2013 -0700
----------------------------------------------------------------------
.../src/com/cloud/alert/AlertManager.java | 2 +-
.../cloudstack/engine/config/Configs.java | 2 +
.../com/cloud/vm/VirtualMachineManagerImpl.java | 66 +++++++++++++-------
3 files changed, 48 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/components-api/src/com/cloud/alert/AlertManager.java
----------------------------------------------------------------------
diff --git a/engine/components-api/src/com/cloud/alert/AlertManager.java b/engine/components-api/src/com/cloud/alert/AlertManager.java
index b6d005a..7533d4d 100755
--- a/engine/components-api/src/com/cloud/alert/AlertManager.java
+++ b/engine/components-api/src/com/cloud/alert/AlertManager.java
@@ -48,7 +48,7 @@ public interface AlertManager extends Manager {
public static final short ALERT_TYPE_DIRECT_ATTACHED_PUBLIC_IP = 24;
public static final short ALERT_TYPE_LOCAL_STORAGE = 25;
public static final short ALERT_TYPE_RESOURCE_LIMIT_EXCEEDED = 26; // Generated when the resource limit exceeds the limit. Currently used for recurring snapshots only
-
+ public static final short ALERT_TYPE_SYNC = 27;
void clearAlert(short alertType, long dataCenterId, long podId);
http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
----------------------------------------------------------------------
diff --git a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
index aa5ca70..9dcb86d 100644
--- a/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
+++ b/engine/components-api/src/org/apache/cloudstack/engine/config/Configs.java
@@ -41,4 +41,6 @@ public interface Configs {
public static final ConfigKey<Boolean> VmDestroyForcestop = new ConfigKey<Boolean>(
Boolean.class, "vm.destroy.forcestop", "Advanced", OrchestrationService.class, "false", "On destroy, force-stop takes this value ", null);
+ public static final ConfigKey<Long> PingInterval = new ConfigKey<Long>(
+ Long.class, "ping.interval", "Advanced", OrchestrationService.class, "60", "Ping interval in seconds", null);
}
http://git-wip-us.apache.org/repos/asf/cloudstack/blob/40dbc549/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
----------------------------------------------------------------------
diff --git a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
index 2d82814..74219bd 100755
--- a/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
+++ b/engine/orchestration/src/com/cloud/vm/VirtualMachineManagerImpl.java
@@ -170,6 +170,8 @@ import com.cloud.vm.snapshot.VMSnapshotManager;
public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMachineManager, Listener {
private static final Logger s_logger = Logger.getLogger(VirtualMachineManagerImpl.class);
+ private static final String VM_SYNC_ALERT_SUBJECT = "VM state sync alert";
+
@Inject
protected EntityManager _entityMgr;
@Inject
@@ -257,6 +259,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
protected ConfigValue<Integer> _lockStateRetry;
protected ConfigValue<Integer> _operationTimeout;
protected ConfigValue<Boolean> _forceStop;
+ protected ConfigValue<Long> _pingInterval;
protected long _nodeId;
SearchBuilder<VolumeVO> RootVolumeSearch;
@@ -425,7 +428,8 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
@Override
public boolean start() {
- _executor.scheduleAtFixedRate(new CleanupTask(), _cleanupInterval.value(), _cleanupInterval.value(), TimeUnit.SECONDS);
+ _executor.scheduleAtFixedRate(new TransitionTask(), _pingInterval.value(), _pingInterval.value(), TimeUnit.SECONDS);
+ _executor.scheduleAtFixedRate(new CleanupTask(), _pingInterval.value()*2, _pingInterval.value()*2, TimeUnit.SECONDS);
cancelWorkItems(_nodeId);
return true;
@@ -447,6 +451,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
_lockStateRetry = _configRepo.get(Configs.VmOpLockStateRetry);
_operationTimeout = _configRepo.get(Configs.Wait).setMultiplier(2);
_forceStop = _configRepo.get(Configs.VmDestroyForcestop);
+ _pingInterval = _configRepo.get(Configs.PingInterval).setMultiplier(1000);
ReservationContextImpl.setComponents(_entityMgr);
VirtualMachineProfileImpl.setComponents(_entityMgr);
@@ -2616,22 +2621,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
PingRoutingCommand ping = (PingRoutingCommand) cmd;
if (ping.getNewStates() != null && ping.getNewStates().size() > 0) {
_syncMgr.processHostVmStatePingReport(agentId, ping.getNewStates());
-
-/* TODO
- Commands commands = deltaHostSync(agentId, ping.getNewStates());
- if (commands.size() > 0) {
- try {
- _agentMgr.send(agentId, commands, this);
- } catch (final AgentUnavailableException e) {
- s_logger.warn("Agent is now unavailable", e);
- }
- }
-*/
-
}
- // take the chance to scan stalled VM
- scanStalledVMInTransitionState(agentId);
+ // take the chance to scan VMs that are stuck in transitional states and are missing from the report
+ scanStalledVMInTransitionStateOnUpHost(agentId);
processed = true;
}
}
@@ -2740,7 +2733,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
s_logger.debug("Couldn't lock the db");
return;
}
+
try {
+/*
lock.addRef();
List<VMInstanceVO> instances = _vmDao.findVMInTransition(new Date(new Date().getTime() - (_operationTimeout.value() * 1000)), State.Starting, State.Stopping);
for (VMInstanceVO instance : instances) {
@@ -2751,6 +2746,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
_haMgr.scheduleRestart(instance, true);
}
}
+*/
+ scanStalledVMInTransitionStateOnDisconnectedHosts();
+
} catch (Exception e) {
s_logger.warn("Caught the following exception on transition checking", e);
} finally {
@@ -3452,7 +3450,10 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
} catch(NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
- // TODO we need to alert admin or user about this risky state transition
+
+ // we need to alert admin or user about this risky state transition
+ _alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
+ VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (Starting -> Running) from out-of-context transition. VM network environment may need to be reset");
break;
case Running :
@@ -3472,8 +3473,9 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
} catch(NoTransitionException e) {
s_logger.warn("Unexpected VM state transition exception, race-condition?", e);
}
- // TODO we need to alert admin or user about this risky state transition
- break;
+ _alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
+ VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") state is sync-ed (" + vm.getState() + " -> Running) from out-of-context transition. VM network environment may need to be reset");
+ break;
case Destroyed :
case Expunging :
@@ -3529,7 +3531,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
}
}
- private void scanStalledVMInTransitionState(long hostId) {
+ private void scanStalledVMInTransitionStateOnUpHost(long hostId) {
//
// Check VM that is stuck in Starting, Stopping, Migrating states, we won't check
// VMs in expunging state (this need to be handled specially)
@@ -3540,10 +3542,32 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
//
// When host is UP, soon or later we will get a report from the host about the VM,
// however, if VM is missing from the host report (it may happen in out of band changes
- // or from designed behave of XS/KVM)
+ // or from designed behave of XS/KVM), the VM may not get a chance to run the state-sync logic
//
+ // Therefore, we will scan those VMs on the UP host based on last update timestamp; if the host is UP
+ // and a VM stalls for status update, we will consider it to be powered off
+ // (which is relatively safe to do)
-
+ long stallThresholdInMs = _pingInterval.value() + (_pingInterval.value() >> 1);
+ Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - stallThresholdInMs);
+ List<Long> mostlikelyStoppedVMs = listStalledVMInTransitionStateOnUpHost(hostId, cutTime);
+ for(Long vmId : mostlikelyStoppedVMs) {
+ VMInstanceVO vm = _vmDao.findById(vmId);
+ assert(vm != null);
+ handlePowerOffReportWithNoPendingJobsOnVM(vm);
+ }
+ }
+
+ private void scanStalledVMInTransitionStateOnDisconnectedHosts() {
+ Date cutTime = new Date(DateUtil.currentGMTTime().getTime() - this._operationTimeout.value()*1000);
+ List<Long> stuckAndUncontrollableVMs = listStalledVMInTransitionStateOnDisconnectedHosts(cutTime);
+ for(Long vmId : stuckAndUncontrollableVMs) {
+ VMInstanceVO vm = _vmDao.findById(vmId);
+
+ // We now only alert administrator about this situation
+ _alertMgr.sendAlert(AlertManager.ALERT_TYPE_SYNC, vm.getDataCenterId(), vm.getPodIdToDeployIn(),
+ VM_SYNC_ALERT_SUBJECT, "VM " + vm.getHostName() + "(" + vm.getInstanceName() + ") is stuck in " + vm.getState() + " state and its host is unreachable for too long");
+ }
}
// TODO, use sql query directly for quick prototype, need to refactor to use joins and search builders