You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ch...@apache.org on 2013/03/14 13:49:44 UTC

svn commit: r1456407 - in /uima/sandbox/uima-ducc/trunk: src/main/resources/ uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/

Author: challngr
Date: Thu Mar 14 12:49:44 2013
New Revision: 1456407

URL: http://svn.apache.org/r1456407
Log:
UIMA-2739
SM Service instance restart policy.  Controlled by ducc.properties
  ducc.sm.instance.failure.max
and Service registry's
  process_failures_limit

Applies to both failure during instance submission and run-failure after
OR has accepted it.

Also modifies "instance termination" to purge instance information as soon
as orchestrator indicates it is no longer running. Previously this did not 
happen until the instance was removed from the OR publications.

Modified:
    uima/sandbox/uima-ducc/trunk/src/main/resources/ducc.properties
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java

Modified: uima/sandbox/uima-ducc/trunk/src/main/resources/ducc.properties
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/src/main/resources/ducc.properties?rev=1456407&r1=1456406&r2=1456407&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/src/main/resources/ducc.properties (original)
+++ uima/sandbox/uima-ducc/trunk/src/main/resources/ducc.properties Thu Mar 14 12:49:44 2013
@@ -141,6 +141,7 @@ ducc.sm.state.update.endpoint.type=topic
 ducc.sm.meta.ping.rate = 60000
 ducc.sm.meta.ping.stability =  10
 ducc.sm.meta.ping.timeout =  500
+ducc.sm.instance.falure.max = 5
 ducc.sm.http.port=19989
 ducc.sm.http.node=${ducc.head}
 ducc.sm.default.linger=300000

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java?rev=1456407&r1=1456406&r2=1456407&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java Thu Mar 14 12:49:44 2013
@@ -36,6 +36,7 @@ import org.apache.uima.ducc.transport.ev
 import org.apache.uima.ducc.transport.event.ServiceStopEvent;
 import org.apache.uima.ducc.transport.event.ServiceUnregisterEvent;
 import org.apache.uima.ducc.transport.event.common.DuccWorkJob;
+import org.apache.uima.ducc.transport.event.common.IDuccCompletionType.JobCompletionType;
 import org.apache.uima.ducc.transport.event.common.IDuccState.JobState;
 import org.apache.uima.ducc.transport.event.common.IDuccWork;
 import org.apache.uima.ducc.transport.event.sm.IServiceDescription;
@@ -619,17 +620,17 @@ public class ServiceHandler
             if ( sset == null ) {
                 // may have already died and this is just leftover OR publications.
                 if ( w.isActive() ) {             // or maybe we just screwed up!
-                    try {
-                        throw new IllegalStateException("Got update for service " + id.toString() + " but no ServiceSet! Job state: " + w.getJobState());
-                    } catch ( Throwable t ) {
-                        // catch and log stack but don't crash SM
-                        logger.error(methodName, id, t);
-                        continue;
-                    }
+                    logger.info(methodName, id, "Got update for active service instance", id.toString(), "but no ServiceSet! Job state:", w.getJobState());
+                    continue;
                 }
                 continue;
             }
-
+            
+            if ( !sset.containsImplementor(id) ) {
+                logger.info(methodName, id, "Bypassing removed service instance for", endpoint);
+                continue;
+            }
+        
             ServiceDependency s = serviceMap.get(id);
             if ( w.isFinished() ) {              // nothing more, just dereference and maybe stop stuff I'm dependent upon
                 stopDependentServices(id);
@@ -641,6 +642,30 @@ public class ServiceHandler
             // now factor in cumulative state of the implementors and manage the ping thread as needed
             sset.establish(id, w.getJobState());
 
+            // State is established.  Now, if the instance died, remove it - OR will keep publishing it for a while and we want to ignore those
+            if ( ! w.isActive() ) {
+                sset.removeImplementor(id);
+
+                JobCompletionType jct = w.getCompletionType();
+                JobState          state = w.getJobState();
+                logger.info(methodName, id, "Removing stopped instance from maps: state[", state, "] completion[", jct, "]");
+                switch ( jct ) {
+                    case EndOfJob:
+                    case CanceledByUser:
+                    case CanceledByAdministrator:
+                    case Undefined:
+                        break;
+                    default:
+                        logger.debug(methodName, id, "RECORDING FAILURE");
+                        // all other cases are errors that contribute to the error count
+                        if ( sset.runFailures() ) {    // if true, the count is exceeeded, but reset
+                            logger.warn(methodName, null, "Process Failure: " + jct + " Maximum consecutive failures[" + sset.failure_run + "] max [" + sset.failure_max + "]");
+                        }
+                        break;
+                }
+                
+            }
+
             if ( (sset.getServiceState() == ServiceState.NotAvailable) && (sset.countReferences() == 0) ) {
                 // this service is now toast.  remove from our maps asap to avoid clashes if it gets
                 // resubmitted before the OR can purge it.

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java?rev=1456407&r1=1456406&r2=1456407&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java Thu Mar 14 12:49:44 2013
@@ -96,6 +96,8 @@ public class ServiceManagerComponent 
     static int meta_ping_timeout = 500;      // timeout on ping 
     static String default_ping_class;
 
+    static int failure_max = 5;
+
     private String state_dir = null;
     private String state_file = null;
     private String descriptor_dir = null;
@@ -262,6 +264,7 @@ public class ServiceManagerComponent 
 		super.start(service, args);
 		DuccDaemonRuntimeProperties.getInstance().boot(DaemonName.ServiceManager,getProcessJmxUrl());
 
+        failure_max = SystemPropertyResolver.getIntProperty("ducc.sm.instance.failure.max", failure_max);
         meta_ping_rate = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.rate", meta_ping_rate);
         meta_ping_timeout = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.timeout", meta_ping_timeout);
         meta_ping_stability = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.stability", meta_ping_stability);
@@ -293,6 +296,7 @@ public class ServiceManagerComponent 
         logger.info(methodName, null, "    Service ping rate       : ", meta_ping_rate);
         logger.info(methodName, null, "    Service ping timeout    : ", meta_ping_timeout);
         logger.info(methodName, null, "    Service ping stability  : ", meta_ping_stability);
+        logger.info(methodName, null, "    Instance Failure Max    : ", failure_max);
         logger.info(methodName, null, "    DUCC Version            : ", Version.version());
         logger.info(methodName, null, "------------------------------------------------------------------------------------");
 

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java?rev=1456407&r1=1456406&r2=1456407&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java Thu Mar 14 12:49:44 2013
@@ -38,6 +38,7 @@ import org.apache.uima.UIMAFramework;
 import org.apache.uima.aae.client.UimaAsynchronousEngine;
 import org.apache.uima.adapter.jms.client.BaseUIMAAsynchronousEngine_impl;
 import org.apache.uima.ducc.cli.IServiceApi.RegistrationOption;
+import org.apache.uima.ducc.cli.IUiOptions.UiOption;
 import org.apache.uima.ducc.common.ServiceStatistics;
 import org.apache.uima.ducc.common.TcpStreamHandler;
 import org.apache.uima.ducc.common.utils.DuccLogger;
@@ -103,6 +104,9 @@ public class ServiceSet
     // Registered services, the number of instances to maintain
     int instances = 1;
 
+    // max allowable failures before disabling autostart
+    int max_failures = 5;
+
     // UIMA-AS pinger
     IServiceMeta serviceMeta = null;
 
@@ -122,6 +126,11 @@ public class ServiceSet
     LingerTask linger = null;
     long linger_time = 5000;
 
+    // max allowed consecutive failures, current failure count
+    int failure_max = ServiceManagerComponent.failure_max;
+    int failure_start = 0;
+    int failure_run = 0;
+
     //JobState     job_state     = JobState.Undefined;
     //
     // This is the constructor for an implicit service
@@ -199,8 +208,8 @@ public class ServiceSet
         this.meta_filename = meta_filename;
         this.service_state = ServiceState.NotAvailable;
         this.linger_time = props.getLongProperty(RegistrationOption.ServiceLinger.decode(), 5000);
-
         this.key = meta.getProperty("endpoint");
+        this.failure_max = props.getIntProperty(UiOption.ProcessFailuresLimit.pname(), ServiceManagerComponent.failure_max);
 
         parseEndpoint(key);
 
@@ -660,6 +669,12 @@ public class ServiceSet
         return false;            
     }
 
+    boolean containsImplementor(DuccId id)
+    {
+        // must use friendly, in case the thing was just started and not into the implementors set yet
+        return friendly_ids.containsKey(id.getFriendly());
+    }
+
     public void removeImplementor(DuccId id)
     {
         if ( ! implementors.containsKey(id ) ) return;  // quick short circuit if it's already gone
@@ -807,6 +822,10 @@ public class ServiceSet
     {
         String methodName = "establish.1";
 
+        if ( job_state == JobState.Running ) {
+            failure_run = 0;
+        }
+
         if ( service_class == ServiceClass.Implicit ) {
             startPingThread(); 
             return;
@@ -960,6 +979,19 @@ public class ServiceSet
         return key;
     }
 
+    synchronized boolean runFailures()
+    {
+        String methodName = "runFailures";
+        if ( (++failure_run) > failure_max ) {
+            logger.debug(methodName, id, "RUN FAILURES EXCEEDED");
+            setAutostart(false);
+            failure_run = 0;
+            return true;
+        }
+        logger.debug(methodName, id, "RUN FAILURES NOT EXCEEDED YET", failure_run);
+        return false;
+    }
+
     private void startPingThread()
     {
     	String methodName = "startPingThread";
@@ -1140,32 +1172,27 @@ public class ServiceSet
             logger.error(methodName, null, t);
 		}
 
+        for ( String s : stderr_lines ) {
+            logger.info(methodName, id, "Start stderr:", s);
+        }
+
         // That was annoying.  Now search the lines for some hint of the id.
         boolean inhibit_cp = false;
+        boolean started = false;
         for ( String s : stdout_lines ) {
+
+            // simple logic to inhibit printing the danged classpath
             if ( inhibit_cp ) {
-                // The classpaths can be just awful filling the logs with junk.  It will end up in the agent log
-                // anyway so let's inhibit it here.
-                logger.debug(methodName, id, "<INHIBITED CP>");
                 inhibit_cp = false;
+                logger.info(methodName, id, "<INHIBITED CP>");
             } else {
-                logger.debug(methodName, id, "Start stdout:", s);
-                if ( s.indexOf("-cp") >= 0 ) {
-                    inhibit_cp = true;
-                }
+                logger.info(methodName, id, "Start stdout:", s);
             }
-        }
-
-        for ( String s : stderr_lines ) {
-            logger.debug(methodName, id, "Start stderr:", s);
-        }
 
-        // That was annoying.  Now search the lines for some hint of the id.
+            if ( s.indexOf("-cp") >= 0 ) {
+                inhibit_cp = true;
+            }
 
-        boolean started = false;
-        for ( String s : stdout_lines ) {
-            logger.debug(methodName, id, "Start stdout:", s);
-            
             if ( s.startsWith("Service") && s.endsWith("submitted") ) {
                 String[] toks = s.split("\\s");
                 long friendly = 0;
@@ -1183,9 +1210,16 @@ public class ServiceSet
         }
         
         if ( ! started ) {
-            logger.warn(methodName, null, "Request to start service " + id.toString() + " failed:", stdout_lines.toString() + " " + stderr_lines.toString());
+            logger.warn(methodName, null, "Request to start service " + id.toString() + " failed.");
+            if ( (++failure_start) >= (failure_max) ) {
+                logger.warn(methodName, null, "Start failure. Maximum consecutive failures[" + failure_max + "] exceeded{" + failure_start + "].  Autostart disabled.");
+                setAutostart(false);
+                failure_start = 0;
+            }
+
         } else {
             setServiceState(ServiceState.Initializing);
+            failure_start = 0;
         }
         saveMetaProperties();
         logger.debug(methodName, null, "ENDSTART ENDSTART ENDSTART ENDSTART ENDSTART ENDSTART");
@@ -1212,7 +1246,13 @@ public class ServiceSet
         };
         
         for ( int i = 0; i < args.length; i++ ) { 
-            logger.debug(methodName, null, "Args[", i, "]:", args[i]);
+            if ( i > 0 && (args[i-1].equals("-cp") ) ) {
+                // The classpaths can be just awful filling the logs with junk.  It will end up in the agent log
+                // anyway so let's inhibit it here.
+                logger.debug(methodName, null, "Args[", i, "]: <CLASSPATH>");
+            } else {
+                logger.debug(methodName, null, "Args[", i, "]:", args[i]);
+            }
         }
         
         ProcessBuilder pb = new ProcessBuilder(args);
@@ -1231,6 +1271,7 @@ public class ServiceSet
             InputStream stderr = p.getErrorStream();
             BufferedReader stdout_reader = new BufferedReader(new InputStreamReader(stdout));
             BufferedReader stderr_reader = new BufferedReader(new InputStreamReader(stderr));
+
             String line = null;
             while ( (line = stdout_reader.readLine()) != null ) {
                 stdout_lines.add(line);
@@ -1246,13 +1287,23 @@ public class ServiceSet
             logger.error(methodName, null, t);
         }
 
-        // That was annoying.  Now search the lines for some hint of the id.
+        boolean inhibit_cp = false;
         for ( String s : stdout_lines ) {
-            logger.debug(methodName, id, "Stop stdout:", s);
+            // simple logic to inhibit printing the danged classpath
+            if ( inhibit_cp ) {
+                inhibit_cp = false;
+                logger.info(methodName, id, "<INHIBITED CP>");
+            } else {
+                logger.info(methodName, id, "Stop stdout:", s);
+            }
+            
+            if ( s.indexOf("-cp") >= 0 ) {
+                inhibit_cp = true;
+            }
         }
             
         for ( String s : stderr_lines ) {
-            logger.debug(methodName, id, "Stop stderr:", s);
+            logger.info(methodName, id, "Stop stderr:", s);
         }
 
         // is this the last implementor?  if so the service is no longer available.