You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@httpd.apache.org by Rob Hartill <ro...@imdb.com> on 1996/06/25 07:55:56 UTC

Re: lost children


thanks for the info and patch. Both will be passed on to the
developers for consideration.


>Hi -
>
>I'm running Apache 1.1b4 on BSDI 2.0.1, on a very busy server that
>also has some local hacks.  On both this version of Apache and also
>the previous (1.0.2) I had problems with the server basically slowing
>down (taking new connections very slowly) after running a while (6-12
>hours), as if it had no more fork slots left (MAX set at 200).  Doing
>a HUP on the server would clear things up when this happened.
>
>Now that I got 1.1b4 running with the status display (thanks!) I found
>that when this happened I would get lots of children in a state
>(usually a Write) when the time of the last transaction was getting
>up into 10000-30000 seconds (way past the timeout time).  Further
>investigation showed that the child process was long gone, but
>Apache thought it was still there.  So somehow Apache is losing track
>of children, maybe losing the signal that the process has exited.
>
>I could not figure out the cause of the bug, so I put the following
>hack (inside the /*NW*/ comments) into http_main.c and ran this for a
>while.  The idea of the code was to slowly check child process
>existence and reclaim any slots that were lost.  I am getting about
>100-200 of these a day on a server doing about 1 million hits/day.  It
>appears to have solved the problems I was having.
>
>-Mark
>
>-----------------------------
>http_main.c:
>
>void standalone_main(int argc, char **argv)
>{
>    struct sockaddr_in sa_server;
>/*NW begin*/
>    int checkslot = 0;
>    char errstr[MAX_STRING_LEN];
>    int my_pid = getpid();
>/*NW end*/
> 
>    standalone = 1;
>    sd = listenmaxfd = -1;
> 
>    if (!one_process) detach();
> 
>#ifdef NEXT
>    setjmp(restart_buffer);
>#else
>    sigsetjmp(restart_buffer,1);
>#endif
> 
>    signal (SIGHUP, SIG_IGN);   /* Until we're done (re)reading config
>*/
> 
>    if(!one_process)
>    {
>#ifndef NO_KILLPG
>      if (killpg(pgrp,SIGHUP) < 0)    /* Kill 'em off */
>#else
>      if (kill(-pgrp,SIGHUP) < 0)
>#endif
>        log_unixerr ("killpg SIGHUP", NULL, NULL, server_conf);
>    }
> 
>    if (sd != -1 || listenmaxfd != -1) {
>        reclaim_child_processes(); /* Not when just starting up */
>        log_error ("SIGHUP received.  Attempting to restart",
>server_conf);
>    }
> 
>    restart_time = time(NULL);
>    clear_pool (pconf);
>    ptrans = make_sub_pool (pconf);
> 
>    server_conf = read_config(pconf, ptrans, server_confname);
>    open_logs(server_conf, pconf);
>    set_group_privs();
>    accept_mutex_init(pconf);
>    reinit_scoreboard(pconf);
> 
>    default_server_hostnames (server_conf);
> 
>    if (listeners == NULL)
>    {
>        memset((char *) &sa_server, 0, sizeof(sa_server));
>        sa_server.sin_family=AF_INET;
>        sa_server.sin_addr=bind_address;
>        sa_server.sin_port=htons(server_conf->port);
> 
>        sd = make_sock(pconf, &sa_server);
>    } else
>    {
>        listen_rec *lr;
>        int fd;
> 
>        listenmaxfd = -1;
>        FD_ZERO(&listenfds);
>        for (lr=listeners; lr != NULL; lr=lr->next)
>        {
>            fd = make_sock(pconf, &lr->local_addr);
>            FD_SET(fd, &listenfds);
>            if (fd > listenmaxfd) listenmaxfd = fd;
>        }
>        sd = -1;
>    }
> 
>    set_signals();
>    log_pid(pconf, pid_fname);
> 
>    num_children = 0;
> 
>    if (daemons_max_free < daemons_min_free + 1) /* Don't thrash... */
>        daemons_max_free = daemons_min_free + 1;
> 
>    while (num_children < daemons_to_start) {
>        make_child(server_conf, num_children++);
>    }
> 
>    log_error ("Server configured -- resuming normal operations",
>server_conf);
> 
>    while (1) {
>        int status, child_slot;
>        int pid = wait_or_timeout(&status);
> 
>        if (pid >= 0) {
>            /* Child died... note that it's gone in the scoreboard. */
>            sync_scoreboard_image();
>            child_slot = find_child_by_pid (pid);
>            if (child_slot >= 0)
>                (void)update_child_status (child_slot, SERVER_DEAD,
>                 (request_rec*)NULL);
>        }
> 
>/*NW begin */
>/* slowly check all slots for any lost children */
>        if (get_child_status(checkslot) != SERVER_DEAD)
>        {
>          pid = scoreboard_image[checkslot].pid;
>          if (pid != my_pid && pid != 0)
>          {
>            if (kill(pid,0) == -1)
>            {
>              (void)update_child_status (checkslot, SERVER_DEAD,
>                                         (request_rec*)NULL);
>              sprintf(errstr,"lost child slot %d (pid %d) reclaimed",
>                      checkslot,pid);
>              log_error(errstr, server_conf);
>            }
>          }
>        }
>        checkslot++;
>        if (checkslot >= HARD_SERVER_LIMIT) checkslot = 0;
>/*NW end */
> 
>        sync_scoreboard_image();
>        if ((count_idle_servers() < daemons_min_free)
>         && (child_slot = find_free_child_num()) >= 0
>         && child_slot <= daemons_limit) {
>            (void)update_child_status(child_slot,SERVER_STARTING,
>             (request_rec*)NULL);
>            make_child(server_conf, child_slot);
>        }
>    }
> 
>} /* standalone_main */


-- 
Rob Hartill (robh@imdb.com)
The Internet Movie Database (IMDb)  http://www.imdb.com/
           ...more movie info than you can poke a stick at.

Re: lost children

Posted by Brian Behlendorf <br...@organic.com>.
Folks, this does seem to be something serious.  I see a serious
disjunction between the information the scoreboard claims and reality,
when it comes to number of running children.  While this patch is a
band-aid and we really should look for the cause, I would support using
this band-aid until we determine what the real problem is.  Anyone?  I'll
be applying this to www.apache.org tonight.

	Brian


On Mon, 24 Jun 1996, Rob Hartill wrote:
> thanks for the info and patch. Both will be passed on to the
> developers for consideration.
> 
> 
> >Hi -
> >
> >I'm running Apache 1.1b4 on BSDI 2.0.1, on a very busy server that
> >also has some local hacks.  On both this version of Apache and also
> >the previous (1.0.2) I had problems with the server basically slowing
> >down (taking new connections very slowly) after running a while (6-12
> >hours), as if it had no more fork slots left (MAX set at 200).  Doing
> >a HUP on the server would clear things up when this happened.
> >
> >Now that I got 1.1b4 running with the status display (thanks!) I found
> >that when this happened I would get lots of children in a state
> >(usually a Write) when the time of the last transaction was getting
> >up into 10000-30000 seconds (way past the timeout time).  Further
> >investigation showed that the child process was long gone, but
> >Apache thought it was still there.  So somehow Apache is losing track
> >of children, maybe losing the signal that the process has exited.
> >
> >I could not figure out the cause of the bug, so I put the following
> >hack (inside the /*NW*/ comments) into http_main.c and ran this for a
> >while.  The idea of the code was to slowly check child process
> >existence and reclaim any slots that were lost.  I am getting about
> >100-200 of these a day on a server doing about 1 million hits/day.  It
> >appears to have solved the problems I was having.
> >
> >-Mark
> >
> >-----------------------------
> >http_main.c:
> >
> >void standalone_main(int argc, char **argv)
> >{
> >    struct sockaddr_in sa_server;
> >/*NW begin*/
> >    int checkslot = 0;
> >    char errstr[MAX_STRING_LEN];
> >    int my_pid = getpid();
> >/*NW end*/
> > 
> >    standalone = 1;
> >    sd = listenmaxfd = -1;
> > 
> >    if (!one_process) detach();
> > 
> >#ifdef NEXT
> >    setjmp(restart_buffer);
> >#else
> >    sigsetjmp(restart_buffer,1);
> >#endif
> > 
> >    signal (SIGHUP, SIG_IGN);   /* Until we're done (re)reading config
> >*/
> > 
> >    if(!one_process)
> >    {
> >#ifndef NO_KILLPG
> >      if (killpg(pgrp,SIGHUP) < 0)    /* Kill 'em off */
> >#else
> >      if (kill(-pgrp,SIGHUP) < 0)
> >#endif
> >        log_unixerr ("killpg SIGHUP", NULL, NULL, server_conf);
> >    }
> > 
> >    if (sd != -1 || listenmaxfd != -1) {
> >        reclaim_child_processes(); /* Not when just starting up */
> >        log_error ("SIGHUP received.  Attempting to restart",
> >server_conf);
> >    }
> > 
> >    restart_time = time(NULL);
> >    clear_pool (pconf);
> >    ptrans = make_sub_pool (pconf);
> > 
> >    server_conf = read_config(pconf, ptrans, server_confname);
> >    open_logs(server_conf, pconf);
> >    set_group_privs();
> >    accept_mutex_init(pconf);
> >    reinit_scoreboard(pconf);
> > 
> >    default_server_hostnames (server_conf);
> > 
> >    if (listeners == NULL)
> >    {
> >        memset((char *) &sa_server, 0, sizeof(sa_server));
> >        sa_server.sin_family=AF_INET;
> >        sa_server.sin_addr=bind_address;
> >        sa_server.sin_port=htons(server_conf->port);
> > 
> >        sd = make_sock(pconf, &sa_server);
> >    } else
> >    {
> >        listen_rec *lr;
> >        int fd;
> > 
> >        listenmaxfd = -1;
> >        FD_ZERO(&listenfds);
> >        for (lr=listeners; lr != NULL; lr=lr->next)
> >        {
> >            fd = make_sock(pconf, &lr->local_addr);
> >            FD_SET(fd, &listenfds);
> >            if (fd > listenmaxfd) listenmaxfd = fd;
> >        }
> >        sd = -1;
> >    }
> > 
> >    set_signals();
> >    log_pid(pconf, pid_fname);
> > 
> >    num_children = 0;
> > 
> >    if (daemons_max_free < daemons_min_free + 1) /* Don't thrash... */
> >        daemons_max_free = daemons_min_free + 1;
> > 
> >    while (num_children < daemons_to_start) {
> >        make_child(server_conf, num_children++);
> >    }
> > 
> >    log_error ("Server configured -- resuming normal operations",
> >server_conf);
> > 
> >    while (1) {
> >        int status, child_slot;
> >        int pid = wait_or_timeout(&status);
> > 
> >        if (pid >= 0) {
> >            /* Child died... note that it's gone in the scoreboard. */
> >            sync_scoreboard_image();
> >            child_slot = find_child_by_pid (pid);
> >            if (child_slot >= 0)
> >                (void)update_child_status (child_slot, SERVER_DEAD,
> >                 (request_rec*)NULL);
> >        }
> > 
> >/*NW begin */
> >/* slowly check all slots for any lost children */
> >        if (get_child_status(checkslot) != SERVER_DEAD)
> >        {
> >          pid = scoreboard_image[checkslot].pid;
> >          if (pid != my_pid && pid != 0)
> >          {
> >            if (kill(pid,0) == -1)
> >            {
> >              (void)update_child_status (checkslot, SERVER_DEAD,
> >                                         (request_rec*)NULL);
> >              sprintf(errstr,"lost child slot %d (pid %d) reclaimed",
> >                      checkslot,pid);
> >              log_error(errstr, server_conf);
> >            }
> >          }
> >        }
> >        checkslot++;
> >        if (checkslot >= HARD_SERVER_LIMIT) checkslot = 0;
> >/*NW end */
> > 
> >        sync_scoreboard_image();
> >        if ((count_idle_servers() < daemons_min_free)
> >         && (child_slot = find_free_child_num()) >= 0
> >         && child_slot <= daemons_limit) {
> >            (void)update_child_status(child_slot,SERVER_STARTING,
> >             (request_rec*)NULL);
> >            make_child(server_conf, child_slot);
> >        }
> >    }
> > 
> >} /* standalone_main */
> 
> 
> -- 
> Rob Hartill (robh@imdb.com)
> The Internet Movie Database (IMDb)  http://www.imdb.com/
>            ...more movie info than you can poke a stick at.
> 

--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=--
brian@organic.com  www.apache.org  hyperreal.com  http://www.organic.com/JOBS