You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@airavata.apache.org by sa...@apache.org on 2014/07/22 17:05:39 UTC
[14/50] [abbrv] git commit: fixing monitoring
fixing monitoring
Project: http://git-wip-us.apache.org/repos/asf/airavata/repo
Commit: http://git-wip-us.apache.org/repos/asf/airavata/commit/b0fde67c
Tree: http://git-wip-us.apache.org/repos/asf/airavata/tree/b0fde67c
Diff: http://git-wip-us.apache.org/repos/asf/airavata/diff/b0fde67c
Branch: refs/heads/workflow-support
Commit: b0fde67c03f2787c5f6d7ec576dc92ab41322c53
Parents: 779b618
Author: lahiru <la...@apache.org>
Authored: Sat Jul 12 12:58:38 2014 -0400
Committer: lahiru <la...@apache.org>
Committed: Sat Jul 12 12:58:38 2014 -0400
----------------------------------------------------------------------
.../airavata/gfac/core/monitor/MonitorID.java | 10 ++++++++--
.../monitor/impl/pull/qstat/HPCPullMonitor.java | 16 ++++++++--------
.../airavata/gfac/monitor/util/CommonUtils.java | 1 +
3 files changed, 17 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java
----------------------------------------------------------------------
diff --git a/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java b/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java
index 8456e35..8599a02 100644
--- a/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java
+++ b/modules/gfac/gfac-core/src/main/java/org/apache/airavata/gfac/core/monitor/MonitorID.java
@@ -177,12 +177,14 @@ public class MonitorID {
// because on some machines the job state vanishes quickly once the job is done;
// in that case the job state comes back as unknown, so we handle it here.
if (this.state != null && status.equals(JobState.UNKNOWN)) {
- if (getFailedCount() > 2) {
+ if (getFailedCount() >= 2) {
switch (this.state) {
case ACTIVE:
this.state = JobState.COMPLETE;
+ logger.info("Failed count is high and old status is ACTIVE so we mark this as COMPLETE");
break;
case QUEUED:
+ logger.info("Failed count is high and old status is QUEUED so we mark this as COMPLETE");
this.state = JobState.COMPLETE;
break;
}
@@ -193,10 +195,14 @@ public class MonitorID {
} catch (InterruptedException e) {
e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
}
+ int loginfo = getFailedCount()+1;
+ logger.info("Increasing the failed count to:"+loginfo);
setFailedCount(getFailedCount() + 1);
}
- } else {
+ } else {
// normal scenario
+ logger.info("Resetting failed count to 0 because correct state came in");
+ setFailedCount(0);
this.state = status;
}
}
http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
----------------------------------------------------------------------
diff --git a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
index 193f23f..1238bf6 100644
--- a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
+++ b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/impl/pull/qstat/HPCPullMonitor.java
@@ -149,20 +149,20 @@ public class HPCPullMonitor extends PullMonitor {
if (iHostMonitorData.getHost().getType() instanceof GsisshHostType
|| iHostMonitorData.getHost().getType() instanceof SSHHostType) {
currentHostDescription = iHostMonitorData.getHost();
- String hostName = iHostMonitorData.getHost().getType().getHostAddress();
+ String hostName = iHostMonitorData.getHost().getType().getHostAddress();
ResourceConnection connection = null;
if (connections.containsKey(hostName)) {
logger.debug("We already have this connection so not going to create one");
connection = connections.get(hostName);
} else {
- connection = new ResourceConnection(iHostMonitorData, getAuthenticationInfo());
+ connection = new ResourceConnection(iHostMonitorData,getAuthenticationInfo());
connections.put(hostName, connection);
}
List<MonitorID> monitorID = iHostMonitorData.getMonitorIDs();
Map<String, JobState> jobStatuses = connection.getJobStatuses(monitorID);
for (MonitorID iMonitorID : monitorID) {
currentMonitorID = iMonitorID;
- iMonitorID.setStatus(jobStatuses.get(iMonitorID.getJobID()));
+ iMonitorID.setStatus(jobStatuses.get(iMonitorID.getJobID())); //IMPORTANT this is not a simple setter we have a logic
jobStatus = new JobStatusChangeRequest(iMonitorID);
// we have this JobStatus class to handle amqp monitoring
@@ -176,13 +176,13 @@ public class HPCPullMonitor extends PullMonitor {
try {
gfac.invokeOutFlowHandlers(iMonitorID.getJobExecutionContext());
} catch (GFacException e) {
- publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(),
- iMonitorID.getTaskID()), TaskState.FAILED));
- publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()),
- ExperimentState.FAILED));
+ publisher.publish(new TaskStatusChangeRequest(new TaskIdentity(iMonitorID.getExperimentID(), iMonitorID.getWorkflowNodeID(),
+ iMonitorID.getTaskID()), TaskState.FAILED));
+ publisher.publish(new ExperimentStatusChangeRequest(new ExperimentIdentity(iMonitorID.getExperimentID()),
+ ExperimentState.FAILED));
logger.info(e.getLocalizedMessage(), e);
}
- } else if (iMonitorID.getFailedCount() > 2 && iMonitorID.getStatus().equals(JobState.UNKNOWN)) {
+ } else if (iMonitorID.getFailedCount() > 2) {
logger.error("Tried to monitor the job with ID " + iMonitorID.getJobID() + " But failed 3 times, so skip this Job from Monitor");
iMonitorID.setLastMonitored(new Timestamp((new Date()).getTime()));
completedJobs.add(iMonitorID);
http://git-wip-us.apache.org/repos/asf/airavata/blob/b0fde67c/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java
----------------------------------------------------------------------
diff --git a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java
index a9f1520..27b213f 100644
--- a/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java
+++ b/modules/gfac/gfac-monitor/src/main/java/org/apache/airavata/gfac/monitor/util/CommonUtils.java
@@ -138,6 +138,7 @@ public class CommonUtils {
if(iMonitorID.getJobID().equals(monitorID.getJobID())) {
// OK, we found the object. We cannot do list.remove(object) because the states of the two objects
// could be different; that's why we check the jobID
+ logger.info("Removing the job:"+ monitorID.getJobID()+" from monitoring last status:" + monitorID.getStatus().toString());
monitorIDs.remove(iMonitorID);
if(monitorIDs.size()==0) {
hostMonitorData.remove(iHostMonitorID);