You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@httpd.apache.org by Dean Gaudet <dg...@arctic.org> on 1997/09/11 22:03:33 UTC

Re: [PATCH] scoreboard fixes (take 2)

Alexei just committed a win32 fix that conflicts with this patch.

Here's an update.

Dean

Index: CHANGES
===================================================================
RCS file: /export/home/cvs/apachen/src/CHANGES,v
retrieving revision 1.432
diff -u -r1.432 CHANGES
--- CHANGES	1997/09/10 17:43:17	1.432
+++ CHANGES	1997/09/11 20:00:39
@@ -1,5 +1,12 @@
 Changes with Apache 1.3b1
 
+  *) API: the short_score record has been split into two pieces, one which
+     the parent writes on, and one which the child writes on.  As part of
+     this change the get_scoreboard_info() function was removed, and
+     scoreboard_image was exported.  This change fixes a race condition
+     in filebased scoreboard systems, and speeds up changes involving the
+     scoreboard in earlier 1.3 development.  [Dean Gaudet]
+
   *) API: New function child_terminate() triggers the child process to
      exit, while allowing the child finish what it needs to for the
      current request first.  
Index: main/http_main.c
===================================================================
RCS file: /export/home/cvs/apachen/src/main/http_main.c,v
retrieving revision 1.217
diff -u -r1.217 http_main.c
--- http_main.c	1997/09/11 19:32:43	1.217
+++ http_main.c	1997/09/11 20:00:43
@@ -242,7 +242,7 @@
 static int my_child_num;
 #endif
 
-static scoreboard *scoreboard_image = NULL;
+scoreboard *scoreboard_image = NULL;
 
 /* small utility macros to make things easier to read */
 
@@ -1424,7 +1424,7 @@
  * anyway.
  */
 
-API_EXPORT(void) sync_scoreboard_image (void)
+inline void sync_scoreboard_image (void)
 {
 #ifdef SCOREBOARD_FILE
     lseek (scoreboard_fd, 0L, 0);
@@ -1434,7 +1434,7 @@
 
 #endif /* MULTITHREAD */
 
-API_EXPORT(int) exists_scoreboard_image (void)
+int exists_scoreboard_image (void)
 {
     return (scoreboard_image ? 1 : 0);
 }
@@ -1459,17 +1459,14 @@
     sync_scoreboard_image();
     ss = &scoreboard_image->servers[child_num];
     old_status = ss->status;
-    ss->x.pid = my_pid;
     ss->status = status;
 #ifdef OPTIMIZE_TIMEOUTS
     ++ss->cur_vtime;
 #endif
 
 #if defined(STATUS)
-#ifdef OPTIMIZE_TIMEOUTS
-    ss->last_used = ss->last_rtime;	/* close enough */
-#else
-    ss->last_used=time(NULL);
+#ifndef OPTIMIZE_TIMEOUTS
+    ss->last_used = time(NULL);
 #endif
     if (status == SERVER_READY || status == SERVER_DEAD) {
 	/*
@@ -1514,11 +1511,6 @@
 #endif
 }
 
-API_EXPORT(short_score) get_scoreboard_info(int i)
-{
-    return (scoreboard_image->servers[i]);
-}
-
 #if defined(STATUS)
 void time_process_request (int child_num, int status)
 {
@@ -1595,7 +1587,7 @@
     int i;
 
     for (i = 0; i < max_daemons_limit; ++i)
-	if (scoreboard_image->servers[i].x.pid == pid)
+	if (scoreboard_image->parent[i].pid == pid)
 	    return i;
 
     return -1;
@@ -1608,7 +1600,7 @@
 
     sync_scoreboard_image();
     for (i = 0; i < max_daemons_limit; ++i) {
-	int pid = scoreboard_image->servers[i].x.pid;
+	int pid = scoreboard_image->parent[i].pid;
 
 	if (pid != my_pid && pid != 0) { 
 	    int waitret = 0,
@@ -1687,7 +1679,7 @@
 
     for (n = 0; n < max_daemons_limit; ++n) {
 	if (scoreboard_image->servers[n].status != SERVER_DEAD
-		&& waitpid (scoreboard_image->servers[n].x.pid, &status, WNOHANG)
+		&& waitpid (scoreboard_image->parent[n].pid, &status, WNOHANG)
 		    == -1
 		&& errno == ECHILD) {
 	    sync_scoreboard_image ();
@@ -1720,7 +1712,7 @@
             if(scoreboard_image->servers[pi].status != SERVER_DEAD)
             {
                 e[hi] = pi;
-                h[hi++] = (HANDLE)scoreboard_image->servers[pi].x.pid;
+                h[hi++] = (HANDLE)scoreboard_image->parent[pi].pid;
             }
 
         }
@@ -1730,9 +1722,9 @@
             if(rv == -1)
                 err = GetLastError();
             if((WAIT_OBJECT_0 <= (unsigned int)rv) && ((unsigned int)rv < (WAIT_OBJECT_0 + hi)))
-                return(scoreboard_image->servers[e[rv - WAIT_OBJECT_0]].x.pid);
+                return(scoreboard_image->parent[e[rv - WAIT_OBJECT_0]].pid);
             else if((WAIT_ABANDONED_0 <= (unsigned int)rv) && ((unsigned int)rv < (WAIT_ABANDONED_0 + hi)))
-                return(scoreboard_image->servers[e[rv - WAIT_ABANDONED_0]].x.pid);
+                return(scoreboard_image->parent[e[rv - WAIT_ABANDONED_0]].pid);
 
         }
     }
@@ -2897,7 +2889,7 @@
     }    
 }
 
-static int make_child(server_rec *s, int slot)
+static int make_child(server_rec *s, int slot, time_t now)
 {
     int pid;
 
@@ -2944,14 +2936,15 @@
 	child_main (slot);
     }
 
-    /* If the parent proceeds with a restart before the child has written
-     * their pid into the scoreboard we'll end up "forgetting" about the
-     * child.  So we write the child pid into the scoreboard now.  (This
-     * is safe, because the child is going to be writing the same value
-     * to the same word.)
-     * XXX: this needs to be sync'd to disk in the non shared memory stuff
-     */
-    scoreboard_image->servers[slot].x.pid = pid;
+#ifdef OPTIMIZE_TIMEOUTS
+    scoreboard_image->parent[slot].last_rtime = now;
+#endif
+    scoreboard_image->parent[slot].pid = pid;
+#ifdef SCOREBOARD_FILE
+    lseek(scoreboard_fd, XtOffsetOf(scoreboard, parent[slot]), 0);
+    force_write(scoreboard_fd, &scoreboard_image->parent[slot],
+	sizeof(parent_score));
+#endif
 
     return 0;
 }
@@ -2961,12 +2954,13 @@
 static void startup_children (int number_to_start)
 {
     int i;
+    time_t now = time(0);
 
     for (i = 0; number_to_start && i < daemons_limit; ++i ) {
 	if (scoreboard_image->servers[i].status != SERVER_DEAD) {
 	    continue;
 	}
-	if (make_child (server_conf, i) < 0) {
+	if (make_child (server_conf, i, now) < 0) {
 	    break;
 	}
 	--number_to_start;
@@ -2991,26 +2985,30 @@
     int i;
     int to_kill;
     int idle_count;
-    int free_head;
-    int *free_ptr;
-    int free_length;
     short_score *ss;
-#ifdef OPTIMIZE_TIMEOUTS
     time_t now = time(0);
-#endif
+    int free_length;
+    int free_slots[MAX_SPAWN_RATE];
+    int last_non_dead;
 
     /* initialize the free_list */
-    free_head = -1;
-    free_ptr = &free_head;
     free_length = 0;
 
     to_kill = -1;
     idle_count = 0;
+    last_non_dead = -1;
 
     sync_scoreboard_image ();
     for (i = 0; i < daemons_limit; ++i) {
+	if (i >= max_daemons_limit && free_length == idle_spawn_rate) break;
 	ss = &scoreboard_image->servers[i];
 	switch (ss->status) {
+	    /* We consider a starting server as idle because we started it
+	     * at least a cycle ago, and if it still hasn't finished starting
+	     * then we're just going to swamp things worse by forking more.
+	     * So we hopefully won't need to fork more if we count it.
+	     */
+	case SERVER_STARTING:
 	case SERVER_READY:
 	    ++idle_count;
 	    /* always kill the highest numbered child if we have to...
@@ -3024,39 +3022,43 @@
 	case SERVER_DEAD:
 	    /* try to keep children numbers as low as possible */
 	    if (free_length < idle_spawn_rate) {
-		*free_ptr = i;
-		free_ptr = &scoreboard_image->servers[i].x.free_list;
+		free_slots[free_length] = i;
 		++free_length;
 	    }
 	    break;
 	}
+	if (ss->status != SERVER_DEAD) {
+	    last_non_dead = i;
 #ifdef OPTIMIZE_TIMEOUTS
-	if (ss->status != SERVER_DEAD && ss->timeout_len) {
-	    /* if it's a live server, with a live timeout then start checking
-	     * its timeout */
-	    if (ss->cur_vtime != ss->last_vtime) {
-		/* it has made progress, so update its last_rtime, last_vtime */
-		ss->last_rtime = now;
-		ss->last_vtime = ss->cur_vtime;
-	    } else if (ss->last_rtime + ss->timeout_len < now) {
-		/* no progress, and the timeout length has been exceeded */
-		ss->timeout_len = 0;
-		kill (ss->x.pid, SIGALRM);
+	    if (ss->timeout_len) {
+		/* if it's a live server, with a live timeout then
+		 * start checking its timeout */
+		parent_score *ps = &scoreboard_image->parent[i];
+		if (ss->cur_vtime != ps->last_vtime) {
+		    /* it has made progress, so update its last_rtime,
+		     * last_vtime */
+		    ps->last_rtime = now;
+		    ps->last_vtime = ss->cur_vtime;
+		} else if (ps->last_rtime + ss->timeout_len < now) {
+		    /* no progress, and the timeout length has been exceeded */
+		    ss->timeout_len = 0;
+		    kill (ps->pid, SIGALRM);
+		}
 	    }
-	}
 #endif
+	}
     }
+    max_daemons_limit = last_non_dead + 1;
     if (idle_count > daemons_max_free) {
 	/* kill off one child... we use SIGUSR1 because that'll cause it to
 	 * shut down gracefully, in case it happened to pick up a request
 	 * while we were counting
 	 */
-	kill (scoreboard_image->servers[to_kill].x.pid, SIGUSR1);
+	kill (scoreboard_image->parent[to_kill].pid, SIGUSR1);
 	idle_spawn_rate = 1;
     } else if (idle_count < daemons_min_free) {
 	/* terminate the free list */
-	*free_ptr = -1;
-	if (free_head == -1) {
+	if (free_length == 0) {
 	    /* only report this condition once */
 	    static int reported = 0;
 
@@ -3066,6 +3068,7 @@
 			    " raising the MaxClients setting");
 		reported = 1;
 	    }
+	    idle_spawn_rate = 1;
 	} else {
 	    if (idle_spawn_rate >= 4) {
 		aplog_error(APLOG_MARK, APLOG_ERR, server_conf,
@@ -3073,12 +3076,8 @@
 			    "to increase StartServers, or Min/MaxSpareServers)",
 			    idle_spawn_rate);
 	    }
-	    i = 0;
-	    while (i < idle_spawn_rate && free_head != -1) {
-		int slot = free_head;
-		free_head = scoreboard_image->servers[free_head].x.free_list;
-		make_child (server_conf, slot);
-		++i;
+	    for (i = 0; i < free_length; ++i) {
+		make_child (server_conf, free_slots[i], now);
 	    }
 	    /* the next time around we want to spawn twice as many if this
 	     * wasn't good enough, but not if we've just done a graceful
@@ -3202,10 +3201,8 @@
 			/* we're still doing a 1-for-1 replacement of dead
 			 * children with new children
 			 */
-			make_child (server_conf, child_slot);
+			make_child (server_conf, child_slot, time(0));
 			--remaining_children_to_start;
-			/* don't perform idle maintenance yet */
-			continue;
 		    }
 		} else if (is_graceful) {
 		    /* Great, we've probably just lost a slot in the
@@ -3215,6 +3212,12 @@
 		    aplog_error(APLOG_MARK, APLOG_WARNING, server_conf,
 				"long lost child came home! (pid %d)", pid );
 		}
+		/* Don't perform idle maintenance when a child dies,
+		 * only do it when there's a timeout.  Remember only a
+		 * finite number of children can die, and it's pretty
+		 * pathological for a lot to die suddenly.
+		 */
+		continue;
 	    } else if (remaining_children_to_start) {
 		/* we hit a 1 second timeout in which none of the previous
 	 	 * generation of children needed to be reaped... so assume
Index: main/scoreboard.h
===================================================================
RCS file: /export/home/cvs/apachen/src/main/scoreboard.h,v
retrieving revision 1.29
diff -u -r1.29 scoreboard.h
--- scoreboard.h	1997/09/11 19:32:44	1.29
+++ scoreboard.h	1997/09/11 20:00:43
@@ -74,6 +74,7 @@
 #define SERVER_BUSY_LOG 6       /* Logging the request */
 #define SERVER_BUSY_DNS 7       /* Looking up a hostname */
 #define SERVER_GRACEFUL 8	/* server is gracefully finishing request */
+#define SERVER_NUM_STATUS 9	/* number of status settings */
 
 /* A "virtual time" is simply a counter that indicates that a child is
  * making progress.  The parent checks up on each child, and when they have
@@ -90,14 +91,9 @@
  */
 typedef unsigned vtime_t;
 
+/* stuff which the children generally write, and the parent mainly reads */
 typedef struct {
-    union {
-	pid_t pid;		/* if it's not DEAD then this is the pid */
-	int free_list;		/* otherwise this is scratch space */
-    } x;
 #ifdef OPTIMIZE_TIMEOUTS
-    vtime_t last_vtime;		/* the last vtime the parent has seen */
-    time_t last_rtime;		/* time(0) of the last change */
     vtime_t cur_vtime;		/* the child's current vtime */
     unsigned short timeout_len;	/* length of the timeout */
 #endif
@@ -119,7 +115,9 @@
 #ifndef NO_TIMES
     struct tms times;
 #endif
+#ifndef OPTIMIZE_TIMEOUTS
     time_t last_used;
+#endif
     char client[32];	/* Keep 'em small... */
     char request[64];	/* We just want an idea... */
     char vhost[32];     /* What virtual host is being accessed? */
@@ -132,17 +130,28 @@
 				   restart is required */
     } global_score;
 
+/* stuff which the parent generally writes and the children rarely read */
+typedef struct {
+    pid_t pid;
+#ifdef OPTIMIZE_TIMEOUTS
+    time_t last_rtime;		/* time(0) of the last change */
+    vtime_t last_vtime;		/* the last vtime the parent has seen */
+#endif
+} parent_score;
+
 typedef struct
     {
     short_score servers[HARD_SERVER_LIMIT];
+    parent_score parent[HARD_SERVER_LIMIT];
     global_score global;
     } scoreboard;
 
 #define SCOREBOARD_SIZE		sizeof(scoreboard)
 
-API_EXPORT(void) sync_scoreboard_image(void);
-API_EXPORT(short_score) get_scoreboard_info(int x);
-API_EXPORT(int) exists_scoreboard_image (void);
+void sync_scoreboard_image(void);
+int exists_scoreboard_image (void);
+
+extern scoreboard *scoreboard_image;
 
 /* for time_process_request() in http_main.c */
 #define START_PREQUEST 1
Index: modules/standard/mod_log_config.c
===================================================================
RCS file: /export/home/cvs/apachen/src/modules/standard/mod_log_config.c,v
retrieving revision 1.36
diff -u -r1.36 mod_log_config.c
--- mod_log_config.c	1997/08/23 02:59:45	1.36
+++ mod_log_config.c	1997/09/11 20:00:44
@@ -164,6 +164,7 @@
 #include "httpd.h"
 #include "http_config.h"
 #include "http_core.h" /* For REMOTE_NAME */
+#include <limits.h>
 
 module MODULE_VAR_EXPORT config_log_module;
 
@@ -175,6 +176,21 @@
 static mode_t xfer_mode = ( S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH );
 #endif
 
+/* POSIX.1 defines PIPE_BUF as the maximum number of bytes that is
+ * guaranteed to be atomic when writing a pipe.  And PIPE_BUF >= 512
+ * is guaranteed.  So we'll just guess 512 in the event the system
+ * doesn't have this.  Now, for file writes there is actually no limit,
+ * the entire write is atomic.  Whether all systems implement this
+ * correctly is another question entirely ... so we'll just use PIPE_BUF
+ * because it's probably a good guess as to what is implemented correctly
+ * everywhere.
+ */
+#ifdef PIPE_BUF
+#define LOG_BUFSIZE	PIPE_BUF
+#else
+#define LOG_BUFSIZE	(512)
+#endif
+
 /*
  * multi_log_state is our per-(virtual)-server configuration. We store
  * an array of the logs we are going to use, each of type config_log_state.
@@ -209,6 +225,10 @@
     char *fname;
     array_header *format;
     int log_fd;
+#ifdef BUFFERED_LOGS
+    int outcnt;
+    char outbuf[LOG_BUFSIZE];
+#endif
 } config_log_state;
 
 /*
@@ -525,6 +545,16 @@
     return cp ? cp : "-";
 }
 
+#ifdef BUFFERED_LOGS
+static void flush_log (config_log_state *cls)
+{
+    if (cls->outcnt && cls->log_fd != -1) {
+	write (cls->log_fd, cls->outbuf, cls->outcnt);
+	cls->outcnt = 0;
+    }
+}
+#endif
+
 static int config_log_transaction(request_rec *r, config_log_state *cls,
 			   array_header *default_format) {
     array_header *strsa;
@@ -562,8 +592,20 @@
         strcpy (s, strs[i]);
         s += strlen (strs[i]);
     }
-    
-    write(cls->log_fd, str, strlen(str));
+
+#ifdef BUFFERED_LOGS
+    if (len + cls->outcnt > LOG_BUFSIZE) {
+	flush_log (cls);
+    }
+    if (len >= LOG_BUFSIZE) {
+	write (cls->log_fd, str, len);
+    } else {
+	memcpy (&cls->outbuf[cls->outcnt], str, len);
+	cls->outcnt += len;
+    }
+#else
+    write (cls->log_fd, str, len);
+#endif
 
     return OK;
 }
@@ -606,7 +648,7 @@
 	(multi_log_state *)palloc(p, sizeof (multi_log_state));
     
     mls->config_logs = 
-	make_array(p, 5, sizeof (config_log_state));
+	make_array(p, 1, sizeof (config_log_state));
     mls->default_format = NULL;
     mls->server_config_logs = NULL;
     
@@ -736,6 +778,9 @@
             exit(1);
         }
     }
+#ifdef BUFFERED_LOGS
+    cls->outcnt = 0;
+#endif
 
     return cls;
 }
@@ -784,6 +829,32 @@
     for (s = s->next; s; s = s->next) open_multi_logs (s, p);
 }
 
+#ifdef BUFFERED_LOGS
+static void flush_all_logs (server_rec *s, pool *p)
+{
+    multi_log_state *mls;
+    array_header *log_list;
+    config_log_state *clsarray;
+    int i;
+    
+    for (; s; s = s->next) {
+	mls = get_module_config(s->module_config, &config_log_module);
+	log_list = NULL;
+	if (mls->config_logs->nelts) {
+	    log_list = mls->config_logs;
+	} else if (mls->server_config_logs) {
+	    log_list = mls->server_config_logs;
+	}
+	if (log_list) {
+	    clsarray = (config_log_state *)log_list->elts;
+	    for (i = 0; i < log_list->nelts; ++i) {
+		flush_log (&clsarray[i]);
+	    }
+	}
+    }
+}
+#endif
+
 module MODULE_VAR_EXPORT config_log_module = {
    STANDARD_MODULE_STUFF,
    init_config_log,		/* initializer */
@@ -802,6 +873,10 @@
    multi_log_transaction,	/* logger */
    NULL,			/* header parser */
    NULL,			/* child_init */
-   NULL,			/* child_exit */
+#ifdef BUFFERED_LOGS
+   flush_all_logs,		/* child_exit */
+#else
+   NULL,
+#endif
    NULL				/* post read-request */
 };
Index: modules/standard/mod_status.c
===================================================================
RCS file: /export/home/cvs/apachen/src/modules/standard/mod_status.c,v
retrieving revision 1.62
diff -u -r1.62 mod_status.c
--- mod_status.c	1997/09/11 19:32:49	1.62
+++ mod_status.c	1997/09/11 20:00:45
@@ -126,6 +126,7 @@
 
 module MODULE_VAR_EXPORT status_module;
 
+#ifdef STATUS
 /* Format the number of bytes nicely */
 
 static void format_byte_out(request_rec *r,unsigned long bytes)
@@ -149,6 +150,7 @@
     else
 	rprintf(r,"%.1f GB",(float)kbytes/MBYTE);
 }
+#endif
 
 static void show_time(request_rec *r,time_t tsecs)
 {
@@ -223,7 +225,8 @@
     int no_table_report=0;
     server_rec *server = r->server;
     short_score score_record;
-    char status[]="??????????";
+    parent_score ps_record;
+    char status[SERVER_NUM_STATUS];
     char stat_buffer[HARD_SERVER_LIMIT];
     clock_t tu,ts,tcu,tcs;
 
@@ -290,9 +293,10 @@
     sync_scoreboard_image();
     for (i = 0; i<HARD_SERVER_LIMIT; ++i)
     {
-        score_record = get_scoreboard_info(i);
+	score_record = scoreboard_image->servers[i];
+	ps_record = scoreboard_image->parent[i];
         res = score_record.status;
-	stat_buffer[i] = status[res];
+	stat_buffer[i] = (res == SERVER_UNKNOWN) ? '?' : status[res];
         if (res == SERVER_READY)
 	    ready++;
         else if (res != SERVER_DEAD && res != SERVER_UNKNOWN)
@@ -300,8 +304,7 @@
 #if defined(STATUS)
         lres = score_record.access_count;
 	bytes= score_record.bytes_served;
-        if (lres!=0 || (score_record.status != SERVER_READY
-	  && score_record.status != SERVER_DEAD))
+        if (lres!=0 || (res != SERVER_READY && res != SERVER_DEAD))
 	{
 #ifndef NO_TIMES
 	    tu+=score_record.times.tms_utime;
@@ -444,7 +447,8 @@
 
     for (i = 0; i<HARD_SERVER_LIMIT; ++i)
     {
-        score_record=get_scoreboard_info(i);
+	score_record = scoreboard_image->servers[i];
+	ps_record = scoreboard_image->parent[i];
 
 #if defined(NO_GETTIMEOFDAY)
 #ifndef NO_TIMES
@@ -487,7 +491,7 @@
 			 i,(int)conn_lres,my_lres,lres);
 		    else
 			rprintf(r,"<b>Server %d</b> (%d): %d|%lu|%lu [",
-			 i,(int)score_record.x.pid,(int)conn_lres,my_lres,lres);
+			 i,(int)ps_record.pid,(int)conn_lres,my_lres,lres);
 
 		    switch (score_record.status)
 		    {
@@ -533,7 +537,11 @@
 			    score_record.times.tms_cutime/tick,
 			    score_record.times.tms_cstime/tick,
 #endif
+#ifdef OPTIMIZE_TIMEOUTS
+			    difftime(nowtime, ps_record.last_rtime),
+#else
 			    difftime(nowtime, score_record.last_used),
+#endif
 			    (long)req_time);
 		    format_byte_out(r,conn_bytes);
 		    rputs("|",r);
@@ -552,7 +560,7 @@
 			 i,(int)conn_lres,my_lres,lres);
 		    else
 			rprintf(r,"<tr><td><b>%d</b><td>%d<td>%d/%lu/%lu",
-			 i,(int)score_record.x.pid,(int)conn_lres,my_lres,lres);
+			 i,(int)ps_record.pid,(int)conn_lres,my_lres,lres);
 
 		    switch (score_record.status)
 		    {
@@ -597,7 +605,11 @@
 			    score_record.times.tms_cutime +
 			    score_record.times.tms_cstime)/tick,
 #endif
+#ifdef OPTIMIZE_TIMEOUTS
+			    difftime(nowtime, ps_record.last_rtime),
+#else
 			    difftime(nowtime, score_record.last_used),
+#endif
 			    (long)req_time);
 		    rprintf(r,"<td>%-1.1f<td>%-2.2f<td>%-2.2f\n",
 			(float)conn_bytes/KBYTE, (float)my_bytes/MBYTE,