You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ch...@apache.org on 2014/02/07 14:48:18 UTC

svn commit: r1565655 - in /uima/sandbox/uima-ducc/trunk: uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/ uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/

Author: challngr
Date: Fri Feb  7 13:48:17 2014
New Revision: 1565655

URL: http://svn.apache.org/r1565655
Log:
UIMA-3405 Implement service instance failure/restart policy.

Modified:
    uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/AServicePing.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/UimaAsPing.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/PingDriver.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/AServicePing.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/AServicePing.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/AServicePing.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/AServicePing.java Fri Feb  7 13:48:17 2014
@@ -28,9 +28,21 @@ import org.apache.uima.ducc.common.IServ
 
 public abstract class AServicePing
 {
+    protected int[] failure_window = null;      // tracks consecutive failures within a window
+    protected int failure_cursor = 0;           // cursor to track failures within the current window
+    protected int total_failures = 0;           // current total run failures. usually monotonically increasing.
+    protected int failure_max = 3;              // max allowed failures within any given window
+
+    protected int failure_window_size = 30;     // 30 minutes. overridden at first ping
+    protected int monitor_rate = 1;             // ping rate, in minutes, min 1 used for calculations
+
+    protected boolean log_enabled = false;
+    protected long service_id = 0;    
 
     protected Properties smState;
 
+    private org.apache.uima.ducc.common.utils.DuccLogger duccLogger = null;
+
     /**
      * Called by the ping driver, to pass in useful things the pinger may want.
      * @param arguments This is passed in from the service specification's
@@ -42,6 +54,43 @@ public abstract class AServicePing
     public abstract void init(String arguments, String endpoint)  throws Exception;
 
     /**
+     * Called by the ping driver to initialize static information about the service and
+     * pinger.  This guy calls the public init() method and is not intended for public
+     * consumption.
+     *
+     * This guy initializes the default failure monitor, service id, whether the
+     * service log is enabled, and the monitor (ping) rate, all of which are
+     * free to be used by derived classes.
+     *
+     * @param arguments This is passed in from the service specification's
+     *                  service_ping_arguments string.
+     *
+     * @param endpoint This is the name of the service endpoint, as passed in
+     *                 at service registration.
+     *
+     * @param initProps Properties file with static data about the service and 
+     *                  pinger.
+     */
+    public void init(String arguments, String endpoint, Properties initProps)
+        throws Exception
+    {
+        failure_window_size = Integer.parseInt(initProps.getProperty("failure-window"));
+        failure_window = new int[failure_window_size];
+        failure_cursor = 0;
+        
+        monitor_rate = Integer.parseInt(initProps.getProperty("monitor-rate") ) / 60000;       // convert to minutes
+        if (monitor_rate <= 0 ) monitor_rate = 1;                                                // minimum 1 minute allowed
+        
+        service_id = Long.parseLong(initProps.getProperty("service-id") );
+        
+        log_enabled = Boolean.parseBoolean(initProps.getProperty("do-log"));
+        
+        failure_max = Integer.parseInt(initProps.getProperty("failure-max"));                                            
+
+        init(arguments, endpoint);
+    }
+
+    /**
      * Stop is called by the ping wrapper when it is being killed.  Implementors may optionally
      * override this method with conenction shutdown code.
      */
@@ -53,31 +102,135 @@ public abstract class AServicePing
      *     for correct management and display of the service.
      */
     public abstract IServiceStatistics getStatistics();
-    
 
+    /**
+     * Current state of the monitored service is passed in here.
+     */    
     public void setSmState(Properties props)
     {
         smState = props;
     }
 
+    /**
+     * Getter for the service state.  Implementors may just access it directly if they want.
+     */
     public Properties getSmState() 
     {
         return smState;
     }
 
+    /**
+     * Called by the service manager to query the number of additional needed service instances.
+     */
     public int getAdditions()
     {
         return 0;
     }
 
+    /**
+     * Called by the service manager to query the number of service instances to dump.
+     */
     public int getDeletions()
     {
         return 0;
     }
 
+    private String fmtArray(int[] array)
+    {
+        Object[] vals = new Object[array.length];
+        StringBuffer sb = new StringBuffer();
+        
+        for ( int i = 0; i < array.length; i++ ) {
+            sb.append("%3s ");
+            vals[i] = Integer.toString(array[i]);
+        }
+        return String.format(sb.toString(), vals);
+    }
+
+    /**
+     * This is intended for use by the SM when it drives a pinger in an internal thread.  External
+     * pingers won't have this set.  
+     *
+     * However, external pinger's stdout is picked up by DUCC so the logger will still print
+     * stuff to the service log without the use of the ducc logger.
+     */
+    public void setLogger(org.apache.uima.ducc.common.utils.DuccLogger logger)
+    {
+        this.duccLogger = logger;
+    }
+
+    private void doLog(String methodName, Object ... msg)
+    {        
+        if ( !log_enabled ) return;
+
+        StringBuffer buf = new StringBuffer(methodName);
+        buf.append(" ");
+        buf.append(Long.toString(service_id));
+        for ( Object o : msg ) {
+            buf.append(" ");
+            if ( o == null ) {
+                buf.append("<null>");
+            } else {
+                buf.append(o.toString());
+            }
+        }
+
+        if ( duccLogger != null ) {
+            duccLogger.info(methodName, null, buf);
+        } else {
+            System.out.println(buf);
+        }
+    }
+
+    /**
+     * Reports whether instance failures are excessive: the failures recorded over the last
+     * failure_window_size calls are summed and compared against failure_max.  A drop in the
+     * SM's "run-failures" count clears the window.  May be overridden by extending monitors.
+     */
     public boolean isExcessiveFailures()
     {
-        return false;
+        String methodName = "isExcessiveFailures";
+        boolean excessive_failures = false;
+
+        // Calculate total instance failures within some configured window.  If we get a cluster
+        // of failures, signal excessive failures so SM stops spawning new ones.
+        int failures = Integer.parseInt(smState.getProperty("run-failures"));
+        doLog(methodName, "failures:", failures, "total_failures", total_failures);
+        if ( failures > 0 ) {
+            int diff = failures - total_failures;  // nfailures since last update; < 0 if the SM count dropped
+            if ( diff < 0 ) {
+                // The SM count went backwards: an instance was restarted, which forces a cleaning
+                // of failure conditions.  The current count is picked up as new on the next call.
+                total_failures = 0;
+                failure_cursor = 0;
+                for ( int i = 0; i < failure_window_size; i++ ) {
+                    failure_window[i] = 0;
+                }
+            } else if ( diff > 0 ) {
+                total_failures += diff;
+            }
+
+            if ( diff >= 0 ) {
+                failure_window[failure_cursor++] = diff;
+            }
+
+            doLog(methodName, "failures", failures, "total_failures", total_failures, 
+                  "failure_window", fmtArray(failure_window), "failure_cursor", failure_cursor);
+
+            failure_cursor = failure_cursor % failure_window_size;
+
+            int windowed_failures = 0;
+            excessive_failures = false;
+            for ( int i = 0; i < failure_window_size; i++ ) {
+                windowed_failures += failure_window[i];                    
+            }
+            if ( windowed_failures >= failure_max ) {
+                excessive_failures = true;
+            }
+            doLog(methodName, "windowed_failures", windowed_failures, "excessive_failures", excessive_failures);
+        }
+        return excessive_failures;
     }
 
+
 }

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/UimaAsPing.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/UimaAsPing.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/UimaAsPing.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-cli/src/main/java/org/apache/uima/ducc/cli/UimaAsPing.java Fri Feb  7 13:48:17 2014
@@ -45,18 +45,6 @@ public class UimaAsPing
 {
     String ep;
 
-    int failure_max = 5;            // max consecutive run failures before reporting excessive failures
-                                    // which prevents restart of instances
-    int current_failures = 0;       // current consecutive run failures
-    int consecutive_failures = 0;   // n failures in consecutive pings
-    int failure_window_size = 15;   // 15 minutes
-    int monitor_rate = 1;           // ping rate, in minutes, min 1 used for calculations
-    int fail_index = 0;
-    int[] failure_window = null;    // tracks consecutive failures within a window
-    int failure_cursor = 0;
-    long service_id = 0;
-    
-    boolean excessive_failures = false;
     
     String endpoint;
     String broker;
@@ -73,7 +61,6 @@ public class UimaAsPing
     String nodeIp;
     String pid;
     boolean gmfail = false;
-    boolean enable_log = false;
     
     public UimaAsPing()
     {
@@ -128,15 +115,8 @@ public class UimaAsPing
             }
             meta_timeout         = props.getIntProperty    ("meta-timeout"   , 5000);
             broker_jmx_port      = props.getIntProperty    ("broker-jmx-port", 1099);
-            enable_log           = props.getBooleanProperty("enable-log"     , false);
-            failure_max          = props.getIntProperty    ("max-failures"   , failure_max);
-            failure_window_size  = props.getIntProperty    ("failure-window" , failure_window_size);
-            failure_window = new int[failure_window_size];
-            failure_cursor = 0;
         }
 
-        doLog("<ctr>", null, "INIT: meta_timeout", meta_timeout, "broker-jmx-port", broker_jmx_port);
-
         this.monitor = new UimaAsServiceMonitor(endpoint, broker_host, broker_jmx_port);
     }
 
@@ -146,8 +126,8 @@ public class UimaAsPing
     }
 
     private void doLog(String methodName, Object ... msg)
-    {
-        if ( ! enable_log ) return;
+    {        
+        if ( !log_enabled ) return;
 
         StringBuffer buf = new StringBuffer(methodName);
         for ( Object o : msg ) {
@@ -161,17 +141,6 @@ public class UimaAsPing
         System.out.println(buf);
     }
 
-    private String fmtArray(int[] array)
-    {
-        Object[] vals = new Object[array.length];
-        StringBuffer sb = new StringBuffer();
-        
-        for ( int i = 0; i < array.length; i++ ) {
-            sb.append("%3s ");
-            vals[i] = Integer.toString(array[i]);
-        }
-        return String.format(sb.toString(), vals);
-    }
 
     void evaluateService(IServiceStatistics stats)
     {
@@ -182,55 +151,12 @@ public class UimaAsPing
             monitor.collect();
             stats.setHealthy(true);       // this pinger defines 'healthy' as
                                           // 'service responds to get-meta and broker returns jmx stats'
-
-
-            monitor_rate = Integer.parseInt(smState.getProperty("monitor-rate") ) / 60000;       // convert to minutes
-            service_id   = Long.parseLong(smState.getProperty("service-id"));            
-            if (monitor_rate <= 0 ) monitor_rate = 1;                                            // minimum 1 minute allowed
-
-            // Calculate total instance failures within some configured window.  If we get a cluster
-            // of failures, signal excessive failures so SM stops spawning new ones.
-            int failures = Integer.parseInt(smState.getProperty("run-failures"));
-            doLog(methodName, "run-failures:", failures);
-            if ( (failure_window != null) && (failures > 0) ) {
-                int diff = failures - current_failures;  // nfailures since last update
-                current_failures = failures;
-
-                if ( diff > 0 ) {
-                    failure_window[failure_cursor++] = diff;
-                } else {
-                    failure_window[failure_cursor++] = 0;                    
-                }
-
-                doLog(methodName, "failures", failures, "current_failures", current_failures, 
-                      "failure_window", fmtArray(failure_window), "failure_cursor", failure_cursor);
-
-                failure_cursor = failure_cursor % failure_window_size;
-
-
-
-                int windowed_failures = 0;
-                excessive_failures = false;
-                for ( int i = 0; i < failure_window_size; i++ ) {
-                    windowed_failures += failure_window[i];                    
-                }
-                if ( windowed_failures >= failure_max ) {
-                    excessive_failures = true;
-                }
-                doLog(methodName, "windowed_failures", windowed_failures, "excessive_failures", excessive_failures);
-            }
-
         } catch ( Throwable t ) {
             stats.setHealthy(false);
             monitor.setJmxFailure(t.getMessage());
         }
     }
 
-    public boolean isExcessiveFailures()
-    {
-        return excessive_failures;
-    }
-
     public IServiceStatistics getStatistics()
     {
         String methodName = "getStatistics";

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/PingDriver.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/PingDriver.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/PingDriver.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/PingDriver.java Fri Feb  7 13:48:17 2014
@@ -145,6 +145,7 @@ class PingDriver
 
         this.log_directory     = resolveStringProperty ("log_directory", ping_props, job_props, null);     // cli always puts this int job props, no default 
 
+
         jvm_args_str = jvm_args_str.trim();
         if ( jvm_args_str.equals("") ) {
             jvm_args = null;
@@ -357,12 +358,30 @@ class PingDriver
         }
     }
 
+    void setCommonInitProperties(Properties props)
+    {
+        props.setProperty("monitor-rate"    , "" + meta_ping_rate);
+        props.setProperty("service-id"      , "" + sset.getId().getFriendly());
+        props.setProperty("failure-max"     , "" + ServiceManagerComponent.failure_max);
+        props.setProperty("failure-window"  , "" + ServiceManagerComponent.failure_window);
+        props.setProperty("do-log"          , "" + do_log);
+    }
+
+    void setCommonProperties(Properties props)
+    {
+        props.setProperty("total-instances" , "" + sset.countImplementors());
+        props.setProperty("active-instances", "" + sset.getActiveInstances());
+        props.setProperty("references"      , "" + sset.countReferences());
+        props.setProperty("run-failures"    , "" + sset.getRunFailures());
+    }
+
     void runAsThread()
     {
         long tid = Thread.currentThread().getId();
 
     	String methodName = "runAsThread[" + tid + "]";
         AServicePing pinger = null;
+        Properties initProps = new Properties();
         Properties props = new Properties();
 
 		try {
@@ -381,18 +400,14 @@ class PingDriver
             return;		
 		}
 
-        try {
-            props.setProperty("total-instances", "" + sset.countImplementors());
-            props.setProperty("active-instances", "" + sset.getActiveInstances());
-            props.setProperty("references", "" + sset.countReferences());
-            props.setProperty("run-failures", "" + sset.getRunFailures());
-            props.setProperty("monitor-rate", "" + meta_ping_rate);
-            props.setProperty("service-id", "" + sset.getId().getFriendly());
-
-            pinger.setSmState(props);
-            pinger.init(ping_arguments, endpoint);
-            while ( ! shutdown ) {
-                
+        try {            
+            setCommonInitProperties(initProps);
+            pinger.setLogger(logger);
+            pinger.init(ping_arguments, endpoint, initProps);
+            
+            while ( ! shutdown ) {                
+                setCommonProperties(props);
+                pinger.setSmState(props);
                 Pong pr = new Pong();
 
                 pr.setStatistics(pinger.getStatistics());
@@ -647,13 +662,7 @@ class PingDriver
                     // Ask for the ping
                     try {
                         logger.info(methodName, sset.getId(), "ExtrnPingDriver: ping OUT");
-                        props.setProperty("total-instances" , "" + sset.countImplementors());
-                        props.setProperty("active-instances", "" + sset.getActiveInstances());
-                        props.setProperty("references"      , "" + sset.countReferences());
-                        props.setProperty("run-failures"    , "" + sset.getRunFailures());
-                        props.setProperty("monitor-rate"    , "" + meta_ping_rate);
-                        props.setProperty("service-id"      , "" + sset.getId().getFriendly());
-
+                        setCommonProperties(props);
                         oos.writeObject(new Ping(false, props));
                         oos.flush();
                         oos.reset();

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceHandler.java Fri Feb  7 13:48:17 2014
@@ -688,8 +688,9 @@ public class ServiceHandler
             wanted = instances;
         }
 
+        sset.setNInstances(running + instances);
+        sset.resetRuntimeErrors();
         if ( update ) {
-            sset.setNInstances(running + instances);
             sset.saveMetaProperties();
         }
 
@@ -743,8 +744,8 @@ public class ServiceHandler
             tolose = Math.min(instances, running);
         }
 
+        sset.setNInstances(Math.max(0, running - instances)); // never persist < 0 registered instance
         if ( update ) {
-            sset.setNInstances(Math.max(0, running - instances)); // never persist < 0 registered instance
             sset.saveMetaProperties();
         }
         

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceManagerComponent.java Fri Feb  7 13:48:17 2014
@@ -98,6 +98,7 @@ public class ServiceManagerComponent 
     static String default_ping_class;
 
     static int failure_max = 5;
+    static int failure_window = 30;
 
     private String state_dir = null;
     private String state_file = null;
@@ -253,6 +254,7 @@ public class ServiceManagerComponent 
 		DuccDaemonRuntimeProperties.getInstance().boot(DaemonName.ServiceManager,getProcessJmxUrl());
 
         failure_max = SystemPropertyResolver.getIntProperty("ducc.sm.instance.failure.max", failure_max);
+        failure_window = SystemPropertyResolver.getIntProperty("ducc.sm.instance.failure.window", failure_window);
         meta_ping_rate = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.rate", meta_ping_rate);
         meta_ping_timeout = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.timeout", meta_ping_timeout);
         meta_ping_stability = SystemPropertyResolver.getIntProperty("ducc.sm.meta.ping.stability", meta_ping_stability);

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java?rev=1565655&r1=1565654&r2=1565655&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-sm/src/main/java/org/apache/uima/ducc/sm/ServiceSet.java Fri Feb  7 13:48:17 2014
@@ -470,6 +470,12 @@ public class ServiceSet
         return ping_only;
     }
 
+    synchronized void resetRuntimeErrors()
+    {
+        run_failures = 0;
+        excessiveRunFailures = false;
+    }
+
     synchronized void setAutostart(boolean auto)
     {
         cancelLinger();