You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@tomcat.apache.org by mt...@apache.org on 2008/04/11 11:13:14 UTC

svn commit: r647085 - in /tomcat/connectors/trunk/jk: native/common/jk_lb_worker.c native/common/jk_lb_worker.h native/common/jk_shm.h xdocs/miscellaneous/changelog.xml

Author: mturk
Date: Fri Apr 11 02:13:07 2008
New Revision: 647085

URL: http://svn.apache.org/viewvc?rev=647085&view=rev
Log:
Fix lb node in-error setting (see changelog)

Modified:
    tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c
    tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h
    tomcat/connectors/trunk/jk/native/common/jk_shm.h
    tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml

Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c
URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.c Fri Apr 11 02:13:07 2008
@@ -50,8 +50,8 @@
  * Note: activation == JK_LB_ACTIVATION_ACTIVE is equivalent to
  *       activation is none of JK_LB_ACTIVATION_STOPPED, JK_LB_ACTIVATION_DISABLED
  */
-#define JK_WORKER_USABLE(s, activation)   ((s)->state <= JK_LB_STATE_FORCE && activation == JK_LB_ACTIVATION_ACTIVE)
-#define JK_WORKER_USABLE_STICKY(s, activation)   ((s)->state <= JK_LB_STATE_BUSY && activation != JK_LB_ACTIVATION_STOPPED)
+#define JK_WORKER_USABLE(s, activation)   ((s) <= JK_LB_STATE_FORCE && activation == JK_LB_ACTIVATION_ACTIVE)
+#define JK_WORKER_USABLE_STICKY(s, activation)   ((s) <= JK_LB_STATE_BUSY && activation != JK_LB_ACTIVATION_STOPPED)
 
 static const char *lb_locking_type[] = {
     JK_LB_LOCK_TEXT_OPTIMISTIC,
@@ -657,7 +657,7 @@
 
 static int find_by_session(jk_ws_service_t *s,
                            lb_worker_t *p,
-                           const char *name,
+                           const char *name,                           
                            jk_logger_t *l)
 {
 
@@ -676,6 +676,7 @@
 static int find_best_bydomain(jk_ws_service_t *s,
                               lb_worker_t *p,
                               const char *domain,
+                              int *states,
                               jk_logger_t *l)
 {
     unsigned int i;
@@ -701,7 +702,7 @@
                      JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = wr.activation;
-        if (JK_WORKER_USABLE(wr.s, activation)) {
+        if (JK_WORKER_USABLE(states[wr.i], activation)) {
             if (candidate < 0 || wr.distance < d ||
                 (wr.s->lb_value < curmin &&
                 wr.distance == d)) {
@@ -718,6 +719,7 @@
 
 static int find_best_byvalue(jk_ws_service_t *s,
                              lb_worker_t *p,
+                             int *states,
                              jk_logger_t *l)
 {
     unsigned int i;
@@ -746,7 +748,7 @@
         /* Take into calculation only the workers that are
          * not in error state, stopped, disabled or busy.
          */
-        if (JK_WORKER_USABLE(wr.s, activation)) {
+        if (JK_WORKER_USABLE(states[wr.i], activation)) {
             if (candidate < 0 || wr.distance < d ||
                 (wr.s->lb_value < curmin &&
                 wr.distance == d)) {
@@ -763,6 +765,7 @@
 static int find_bysession_route(jk_ws_service_t *s,
                                 lb_worker_t *p,
                                 const char *name,
+                                int *states,
                                 jk_logger_t *l)
 {
     int uses_domain  = 0;
@@ -771,7 +774,7 @@
     candidate = find_by_session(s, p, name, l);
     if (candidate < 0) {
         uses_domain = 1;
-        candidate = find_best_bydomain(s, p, name, l);
+        candidate = find_best_bydomain(s, p, name, states, l);
     }
     if (candidate >= 0) {
         lb_sub_worker_t wr = p->lb_workers[candidate];
@@ -783,7 +786,7 @@
                      JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = wr.activation;
-        if (!JK_WORKER_USABLE_STICKY(wr.s, activation)) {
+        if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation)) {
             /* We have a worker that is error state or stopped.
              * If it has a redirection set use that redirection worker.
              * This enables to safely remove the member from the
@@ -797,7 +800,7 @@
                 s->route = NULL;
             }
             else if (*wr.domain && !uses_domain) {
-                candidate = find_best_bydomain(s, p, wr.domain, l);
+                candidate = find_best_bydomain(s, p, wr.domain, states, l);
                 s->route = wr.domain;
             }
             if (candidate >= 0) {
@@ -807,7 +810,7 @@
                              JK_LB_ACTIVATION_UNSET;
                 if (activation == JK_LB_ACTIVATION_UNSET)
                     activation = wr.activation;
-                if (!JK_WORKER_USABLE_STICKY(wr.s, activation))
+                if (!JK_WORKER_USABLE_STICKY(states[wr.i], activation))
                     candidate = -1;
             }
         }
@@ -817,6 +820,7 @@
 
 static int find_failover_worker(jk_ws_service_t *s,
                                 lb_worker_t * p,
+                                int *states,
                                 jk_logger_t *l)
 {
     int rc = -1;
@@ -830,26 +834,28 @@
         }
     }
     if (redirect)
-        rc = find_bysession_route(s, p, redirect, l);
+        rc = find_bysession_route(s, p, redirect, states, l);
     return rc;
 }
 
 static int find_best_worker(jk_ws_service_t *s,
                             lb_worker_t * p,
+                            int *states,
                             jk_logger_t *l)
 {
     int rc = -1;
 
-    rc = find_best_byvalue(s, p, l);
+    rc = find_best_byvalue(s, p, states, l);
     /* By default use worker route as session route */
     if (rc < 0)
-        rc = find_failover_worker(s, p, l);
+        rc = find_failover_worker(s, p, states, l);
     return rc;
 }
 
 static lb_sub_worker_t *get_most_suitable_worker(jk_ws_service_t *s,
                                                  lb_worker_t * p,
                                                  char *sessionid,
+                                                 int *states,
                                                  jk_logger_t *l)
 {
     int rc = -1;
@@ -865,7 +871,7 @@
                          JK_LB_ACTIVATION_UNSET;
         if (activation == JK_LB_ACTIVATION_UNSET)
             activation = p->lb_workers[0].activation;
-        if (JK_WORKER_USABLE_STICKY(p->lb_workers[0].s, activation)) {
+        if (JK_WORKER_USABLE_STICKY(states[0], activation)) {
             if (activation != JK_LB_ACTIVATION_DISABLED) {
                 JK_TRACE_EXIT(l);
                 return p->lb_workers;
@@ -907,7 +913,7 @@
                            session_route);
 
                 /* We have a session route. Whow! */
-                rc = find_bysession_route(s, p, session_route, l);
+                rc = find_bysession_route(s, p, session_route, states, l);
                 if (rc >= 0) {
                     lb_sub_worker_t *wr = &(p->lb_workers[rc]);
                     if (p->lblock == JK_LB_LOCK_PESSIMISTIC)
@@ -940,7 +946,7 @@
             return NULL;
         }
     }
-    rc = find_best_worker(s, p, l);
+    rc = find_best_worker(s, p, states, l);
     if (p->lblock == JK_LB_LOCK_PESSIMISTIC)
         jk_shm_unlock();
     else {
@@ -1014,6 +1020,8 @@
     int recoverable = JK_TRUE;
     int rc = JK_UNSET;
     char *sessionid = NULL;
+    int  *states = NULL;
+    int i;
 
     JK_TRACE_ENTER(l);
 
@@ -1030,11 +1038,22 @@
 
     /* Set returned error to OK */
     *is_error = JK_HTTP_OK;
+    if (!(states = (int *)malloc(num_of_workers * sizeof(int)))) {
+        *is_error = JK_HTTP_SERVER_ERROR;
+        jk_log(l, JK_LOG_ERROR,
+               "Failed allocating private worker state memory");
+        JK_TRACE_EXIT(l);
+        return JK_SERVER_ERROR;        
+    }
 
     jk_shm_lock();
     if (p->worker->sequence != p->worker->s->h.sequence)
         jk_lb_pull(p->worker, l);
     jk_shm_unlock();
+    for (i = 0; i < num_of_workers; i++) {
+        /* Copy the shared state info */
+        states[i] = p->worker->lb_workers[i].s->state;
+    }
 
     /* set the recovery post, for LB mode */
     s->reco_buf = jk_b_new(s->pool);
@@ -1068,7 +1087,7 @@
 
     while (attempt <= num_of_workers && recoverable == JK_TRUE) {
         lb_sub_worker_t *rec =
-            get_most_suitable_worker(s, p->worker, sessionid, l);
+            get_most_suitable_worker(s, p->worker, sessionid, states, l);
         rc = JK_FALSE;
         *is_error = JK_HTTP_SERVER_BUSY;
         /* Do not reuse previous worker, because
@@ -1093,8 +1112,10 @@
 
             if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                 jk_shm_lock();
-            if (rec->s->state == JK_LB_STATE_RECOVER)
-                rec->s->state = JK_LB_STATE_PROBE;
+            if (rec->s->state == JK_LB_STATE_RECOVER) {
+                rec->s->state  = JK_LB_STATE_PROBE;
+                states[rec->i] = JK_LB_STATE_PROBE;
+            }
             if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                 jk_shm_unlock();
                        
@@ -1124,8 +1145,10 @@
                  */
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_lock();
-                if (rec->s->state != JK_LB_STATE_ERROR)
-                    rec->s->state = JK_LB_STATE_BUSY;
+                if (rec->s->state != JK_LB_STATE_ERROR) {
+                    rec->s->state  = JK_LB_STATE_BUSY;
+                    states[rec->i] = JK_LB_STATE_BUSY;
+                }
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_unlock();
                 jk_log(l, JK_LOG_INFO,
@@ -1146,6 +1169,7 @@
 
                 /* Increment the number of workers serving request */
                 p->worker->s->busy++;
+                rec->s->busy++;
                 if (p->worker->s->busy > p->worker->s->max_busy)
                     p->worker->s->max_busy = p->worker->s->busy;
                 if ( (p->worker->lbmethod == JK_LB_METHOD_REQUESTS) ||
@@ -1194,14 +1218,18 @@
                 /* When returning the endpoint mark the worker as not busy.
                  * We have at least one endpoint free
                  */
-                if (rec->s->state == JK_LB_STATE_BUSY)
-                    rec->s->state = JK_LB_STATE_OK;
+                if (rec->s->state == JK_LB_STATE_BUSY) {
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_OK;
+                }
                 /* Decrement the busy worker count.
                  * Check if the busy was reset to zero by graceful
                  * restart of the server.
                  */
                 if (p->worker->s->busy)
                     p->worker->s->busy--;
+                if (rec->s->busy)
+                    rec->s->busy--;
                 if (service_stat == JK_TRUE) {
                     rec->s->state = JK_LB_STATE_OK;
                     rec->s->error_time = 0;
@@ -1213,7 +1241,8 @@
                     * Client error !!!
                     * Since this is bad request do not fail over.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_CLIENT_ERROR;
                     recoverable = JK_FALSE;
@@ -1224,7 +1253,8 @@
                     * Don't mark the node as bad.
                     * Failing over to another node could help.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_FALSE;
                 }
@@ -1234,7 +1264,8 @@
                     * Don't mark the node as bad.
                     * Failing over to another node could help.
                     */
-                    rec->s->state = JK_LB_STATE_OK;
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = 0;
                     rc = JK_FALSE;
                 }
@@ -1245,46 +1276,66 @@
                     * Failing over to another node could help.
                     */
                     rec->s->errors++;
-                    rec->s->state = JK_LB_STATE_ERROR;
+                    if (rec->s->busy) {
+                        rec->s->state = JK_LB_STATE_OK;
+                    }
+                    else {
+                        rec->s->state = JK_LB_STATE_ERROR;
+                    }
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = time(NULL);
                     rc = JK_FALSE;
                 }
                 else if (service_stat == JK_REPLY_TIMEOUT) {
                     if (aw->s->reply_timeouts > (unsigned)p->worker->max_reply_timeouts) {
                         /*
-                        * Service failed - to many reply timeouts
-                        * Take this node out of service.
-                        */
+                         * Service failed - to many reply timeouts
+                         * Take this node out of service.
+                         */
                         rec->s->errors++;
-                        rec->s->state = JK_LB_STATE_ERROR;
+                        if (rec->s->busy) {
+                            rec->s->state = JK_LB_STATE_OK;
+                        }
+                        else {
+                            rec->s->state = JK_LB_STATE_ERROR;
+                        }
+                        states[rec->i] = JK_LB_STATE_ERROR;
                         rec->s->error_time = time(NULL);
                     }
                     else {
-                    /*
-                    * XXX: if we want to be able to failover
-                    * to other nodes after a reply timeout,
-                    * but we do not put the timeout node into error,
-                    * how can we make sure, that we actually fail over
-                    * to other nodes?
-                    */
-                        rec->s->state = JK_LB_STATE_OK;
+                        /*
+                         * XXX: if we want to be able to failover
+                         * to other nodes after a reply timeout,
+                         * but we do not put the timeout node into error,
+                         * how can we make sure, that we actually fail over
+                         * to other nodes?
+                         */
+                        rec->s->state  = JK_LB_STATE_OK;
+                        states[rec->i] = JK_LB_STATE_ERROR;
                         rec->s->error_time = 0;
                     }
                     rc = JK_FALSE;
                 }
                 else {
                     /*
-                    * Service failed !!!
-                    * Time for fault tolerance (if possible)...
-                    */
+                     * Service failed !!!
+                     * Time for fault tolerance (if possible)...
+                     */
                     rec->s->errors++;
-                    rec->s->state = JK_LB_STATE_ERROR;
+                    if (rec->s->busy) {
+                        rec->s->state = JK_LB_STATE_OK;
+                    }
+                    else {
+                        rec->s->state = JK_LB_STATE_ERROR;
+                    }
+                    states[rec->i] = JK_LB_STATE_ERROR;
                     rec->s->error_time = time(NULL);
                     rc = JK_FALSE;
                 }
-                if (rec->s->state == JK_LB_STATE_ERROR)
+                if (states[rec->i] == JK_LB_STATE_ERROR)
                     jk_log(l, JK_LOG_INFO,
-                           "service failed, worker %s is in error state",
+                           "service failed, %sworker %s is in error state",
+                           rec->s->state == JK_LB_STATE_ERROR ? "entire " : "",
                            rec->name);
                 if (p->worker->lblock == JK_LB_LOCK_PESSIMISTIC)
                     jk_shm_unlock();
@@ -1368,6 +1419,8 @@
         lb_add_log_items(s, lb_last_log_names, prec, l);
     }
 
+    /* Free any private memory used */
+    free(states);
     JK_TRACE_EXIT(l);
     return rc;
 }
@@ -1434,6 +1487,8 @@
             for (i = 0; i < num_of_workers; i++) {
                 const char *s;
                 unsigned int ms;
+
+                p->lb_workers[i].i = i;
                 strncpy(p->lb_workers[i].name, worker_names[i],
                         JK_SHM_STR_SIZ);
                 strncpy(p->lb_workers[i].s->h.name, worker_names[i],

Modified: tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h
URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_lb_worker.h Fri Apr 11 02:13:07 2008
@@ -158,6 +158,8 @@
     int activation;
     /* Current lb factor */
     int lb_factor;
+    /* Current worker index */
+    int i;
     /* Current lb reciprocal factor */
     jk_uint64_t lb_mult;
 };

Modified: tomcat/connectors/trunk/jk/native/common/jk_shm.h
URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/native/common/jk_shm.h?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/native/common/jk_shm.h (original)
+++ tomcat/connectors/trunk/jk/native/common/jk_shm.h Fri Apr 11 02:13:07 2008
@@ -124,6 +124,8 @@
     char    domain[JK_SHM_STR_SIZ+1];
     /* worker redirect route */
     char    redirect[JK_SHM_STR_SIZ+1];
+    /* Number of currently busy channels */
+    volatile int busy;
     /* worker distance */
     volatile int distance;
     /* current activation state (config) of the worker */

Modified: tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml
URL: http://svn.apache.org/viewvc/tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml?rev=647085&r1=647084&r2=647085&view=diff
==============================================================================
--- tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml (original)
+++ tomcat/connectors/trunk/jk/xdocs/miscellaneous/changelog.xml Fri Apr 11 02:13:07 2008
@@ -48,6 +48,15 @@
         preprocessor defines. (rjung)
       </update>
       <fix>
+        Do not put loadbalancer node in error state if there is opened
+        channel. This fixes the bug when new new connection fails due to
+        bussines, causing opened connections fail stickyness.
+        This brings back per-node busy counter and private state array
+        for each request. We can mark the state as error for failover to
+        work while still operating and reporting node as OK if there are
+        opened working connections. (mturk)
+      </fix>
+      <fix>
         <bug>44738</bug>: Fix merging of JkOption ForwardURI* between virtual hosts.
         Patch contributed by Toshihiro Sasajima. (rjung)
       </fix>



---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org

Re: svn commit: r647085 - in /tomcat/connectors/trunk/jk: native/common/jk_lb_worker.c native/common/jk_lb_worker.h native/common/jk_shm.h xdocs/miscellaneous/changelog.xml

Posted by Rainer Jung <ra...@kippdata.de>.

Hi Mladen,

I went through the new states and busy lines. Comments see below.

mturk@apache.org schrieb:
> @@ -1030,11 +1038,22 @@
>  
>      /* Set returned error to OK */
>      *is_error = JK_HTTP_OK;
> +    if (!(states = (int *)malloc(num_of_workers * sizeof(int)))) {
> +        *is_error = JK_HTTP_SERVER_ERROR;
> +        jk_log(l, JK_LOG_ERROR,
> +               "Failed allocating private worker state memory");
> +        JK_TRACE_EXIT(l);
> +        return JK_SERVER_ERROR;        
> +    }

We don't free, if a few lines further down allocating the AJP 
message/message buffer fails, because then we use an early return.

Maybe we should also add

@@ -1231,7 +1231,11 @@
                  if (rec->s->busy)
                      rec->s->busy--;
                  if (service_stat == JK_TRUE) {
-                    rec->s->state = JK_LB_STATE_OK;
+                    /*
+                     * Successful request.
+                     */
+                    rec->s->state  = JK_LB_STATE_OK;
+                    states[rec->i] = JK_LB_STATE_OK;
                      rec->s->error_time = 0;
                      rc = JK_TRUE;
                      recoverable = JK_UNSET;

Since we will use the final value of states[rec->i] for logging. At the 
moment you only check against ERROR there, but one never knows for the 
future. It sounds safer to set state[] always when we set ->state.

> @@ -1146,6 +1169,7 @@
>  
>                  /* Increment the number of workers serving request */
>                  p->worker->s->busy++;
> +                rec->s->busy++;
>                  if (p->worker->s->busy > p->worker->s->max_busy)
>                      p->worker->s->max_busy = p->worker->s->busy;
>                  if ( (p->worker->lbmethod == JK_LB_METHOD_REQUESTS) ||

So we actually have two different uses for the local states copy:

A) in case of a problem with an lb member, which is not big enough to 
put it into global error, we can make sure, that we don't use the same 
node again during failover of the single request (by setting the local 
state to error). This behaviour is easy to understand.

B) we want to differentiate the decision of putting an lb member into 
global error depending on the observation, if there is still some 
request from the same process running on the "failed" node. In some 
error situations only the last returning request will trigger global 
error state. This could mean, that we might never trigger it, because we 
never stop sending new requests there and none of the returning requests 
might be the last ones, although all of them return constantly with the 
same problem. This case seems to make sense only for few situations.

So when do we use cases A) and B)?

A) service_stat is one of:

- JK_CLIENT_ERROR: The case of client read or write errors. We don't 
recover in this case, so if we set states to OK or to ERROR only changes 
the logging behaviour at the end of the main loop. Do you really want to 
log the additional message? Since we do no recovery, we could just as 
well stick to OK here (also for states).

- JK_SERVER_ERROR: the case, when we ran into memory allocation problems 
in jk_ajp_common. Good example for A).

- JK_STATUS_ERROR: Fail on status soft error. Good example for A)

- JK_REPLY_TIMEOUT (below max reply timeouts): Good example for A)

B) service_stat is one of:

- JK_STATUS_FATAL_ERROR: Fail on status hard error. Since the user 
configured, that he wants the node to go into error, I don't understand, 
why we should respect the busy counter.

- JK_REPLY_TIMEOUT (above max reply timeouts). Here I'm afraid, that 
this has a high potential for actually making the timeout useless. If a 
node throws a high number of reply timeouts, but we wait for the "last" 
request to return before putting it into error, I assume it will never 
be put into error, sonce there will be no "last" request. We simply 
proceed forwarding requests and busy will stay above 0 all the time. So 
I think, in this case we should keep it the way it was before, i.e. 
state=ERROR.

- all other error return codes (JK_FALSE, JK_FATAL_ERROR): Lots of 
different reasons, from invalid struct params in service call, to all 
sorts of low-level communication problems (timeouts, connection refused 
etc.), protocol errors (corrupt AJP packets), ...

Again here I'm not able to understand the behaviour after we switch to 
respecting busyness. I would assume, that again the node gets traffic 
all the time and in effect we no longer put it into error, but keep it 
OK all the time.

In summary, I see no clear situations, where B) really is a good idea. 
Would you mind to provide more insight?

I somehow think, that if you want to do more clever error reactions, we 
need to check conditions inside jk_ajp_common and then return the info, 
if we only want to have local or global error back to the lb (like the 
recoverable flag in the jk_endpoint; we tried to guess recoverability 
from the ajp service return code earlier, but the info wasn't rich 
enough, so the jk_ajp_common can now return recoverability info directly 
via endpoint.recoverable).

A couple of additional thoughts:

- error_time: in the cases, where you set the global state depending on 
the busy value, you always set error_time, even if the node stays OK.

- force_recovery inside service loop does not reset the states array and 
thus is broken

- in case B), if busy>=0, should we really set the global state to OK? 
We might override an ERROR state set by some other process. I've got no 
good solution for this.

- states inside the service loop do no longer learn from other 
processes, because they are initialized before the loop. Moving 
initialization to the beginning of the loop doesn't help either, because 
then we'll override possible local error states learned during the 
previous loop iterations. One could fix it with "used state in this 
iteration = worse of global state and local state".

Regards,

Rainer

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@tomcat.apache.org
For additional commands, e-mail: dev-help@tomcat.apache.org