Hi,
I am looking at a problem where the parent process doesn't seem
to clean up its exited children (seen with a web server). We narrowed it
down to a simple test case. It looks more like a lost SIGCHLD.
I can easily reproduce the problem on my AMD64 machine.
Any thoughts on why this is happening? Are there any known issues/fixes?
Thanks,
Badari
elm3b29:~ # ./proctest 10 30
Parent process id: 30007
Spawned 10 children
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Waiting for 10 children to exit
Child 0 exiting. Executed 3682 forks
Child 3 exiting. Executed 3677 forks
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
Waiting for 8 children to exit
...
#ps -aef
....
root 30007 20480 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30009 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30011 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30015 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30017 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30019 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30023 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30026 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 30031 30007 0 02:35 pts/1 00:00:00 ./proctest 10 30
root 698 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 704 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 724 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 730 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 738 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 754 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 761 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 766 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 781 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 786 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 792 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 814 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 820 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 833 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 840 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 859 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 868 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 876 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 890 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 900 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 903 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 911 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 924 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 930 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 935 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 939 30023 0 02:35 pts/1 00:00:00 [child] <defunct>
root 2822 30017 0 02:35 pts/1 00:00:00 [child] <defunct>
root 2826 30017 0 02:35 pts/1 00:00:00 [child] <defunct>
root 2834 30017 0 02:35 pts/1 00:00:00 [child] <defunct>
root 2842 30017 0 02:35 pts/1 00:00:00 [child] <defunct>
...
...
elm3b29:~ # strace -p 30023
Process 30023 attached - interrupt to quit
futex(0x2aaaaaddf118, FUTEX_WAIT, 2, NULL
/* proctest.c
* Test the destruction of zombie processes
*/
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/wait.h>
/* Delay bounds, in microseconds. */
#define MIN_DELAY 500000
#define MAX_DELAY 2000000
/*
 * Width of the [MIN_DELAY, MAX_DELAY) window. Parenthesized so the macro
 * expands as a single value inside larger expressions
 * (e.g. 2 * MAX_RAND_DELAY).
 */
#define MAX_RAND_DELAY (MAX_DELAY - MIN_DELAY)

/* Number of slots in the pids table (first command-line argument). */
static int child_count;
/*
 * Table of direct-child pids; a slot is set to 0 once that child has been
 * reaped. Written from the SIGCHLD handler and polled by main().
 */
static int *pids;
/* Write the one-line command synopsis to standard output. */
void usage()
{
    static const char synopsis[] =
        "USAGE: proctest [child count] [duration in sec]\n";

    fputs(synopsis, stdout);
}
/*
 * Return a pseudo-random delay in microseconds in the range [0, MAX_DELAY).
 *
 * rand() is first scaled to a double in [0, 1) and only then multiplied by
 * MAX_DELAY. Multiplying the two ints directly (MAX_DELAY * rand()) overflows
 * int for almost every value rand() can return, which is undefined behaviour
 * and in practice produced negative "delays".
 *
 * This function calls rand(), which is not async-signal-safe, so it must not
 * be called from a signal handler.
 */
int rand_delay()
{
    double unit = rand() / (RAND_MAX + 1.0);  /* uniform in [0, 1) */

    return (int)(MAX_DELAY * unit);
}
/*
 * Body of one first-level child: for roughly `duration` seconds, repeatedly
 * fork a grandchild that execs "./child", pausing a random interval between
 * forks.
 *
 * i        - index of this child, used only in the exit message.
 * duration - run time in whole seconds.
 *
 * Returns 0. Prints the number of successful forks to stderr before
 * returning.
 */
int child_process(int i, int duration)
{
    /*
     * execve() needs real, NULL-terminated vectors; passing NULL pointers is
     * non-portable. The environment is left empty, as it was before.
     */
    char *const child_argv[] = { "./child", NULL };
    char *const child_envp[] = { NULL };
    struct timeval now;
    time_t start;
    int forks = 0;

    /* The timezone argument is obsolete; pass NULL. */
    gettimeofday(&now, NULL);
    start = now.tv_sec;

    while (now.tv_sec - start < duration) {
        int pid = fork();

        if (pid == 0) {
            execve("./child", child_argv, child_envp);
            /* Only reached if exec failed: report it through the status. */
            _exit(127);
        }
        if (pid < 0) {
            /* Do not count a failed fork; pause and try again. */
            perror("fork");
        } else {
            forks++;
        }

        usleep(rand_delay());
        gettimeofday(&now, NULL);
    }

    fprintf(stderr,"Child %d exiting. Executed %d forks \n",i,forks);
    return 0;
}
/* Count the slots of the pids table that still hold a non-zero pid. */
int pids_remaining()
{
    int live = 0;
    int slot = 0;

    while (slot < child_count) {
        live += (pids[slot] != 0) ? 1 : 0;
        slot++;
    }
    return live;
}
/*
 * Mark `pid` as gone: zero the first slot of the pids table that holds it.
 * Does nothing when the pid is not in the table.
 */
void pid_exited(int pid)
{
    int slot = 0;

    /* Advance to the first matching slot, or to the end of the table. */
    while (slot < child_count && pids[slot] != pid)
        slot++;

    if (slot < child_count)
        pids[slot] = 0;
}
void sigchld_handler(int signo)
{
int status;
struct rusage ru;
int pid;
/* this delay forces some children into the zombie state temporarily */
usleep(rand_delay());
while(1) {
pid = wait4(-1, &status, WNOHANG, &ru);
if (pid <= 0) break;
// printf("SIGCHLD received for pid %d\n", pid);
pid_exited(pid);
}
}
/*
 * Parse `s` as a non-negative decimal integer that fits in an int.
 * Stores the value in *out and returns 0 on success; returns -1 (leaving
 * *out untouched) on empty input, trailing garbage, a negative value or
 * overflow.
 */
static int parse_nonneg_int(const char *s, int *out)
{
    char *end;
    long val;

    errno = 0;
    val = strtol(s, &end, 10);
    if (end == s || *end != '\0' || errno == ERANGE || val < 0 || val > INT_MAX)
        return -1;
    *out = (int)val;
    return 0;
}

/*
 * proctest <child count> <duration in sec>
 *
 * Forks <child count> children that each run child_process() for <duration>
 * seconds, then polls once a second until the SIGCHLD handler has cleared
 * every slot of the pids table.
 *
 * Exits with status 1 on bad arguments or allocation failure, 0 otherwise.
 */
int main(int argc, char* argv[])
{
    int i, pid, rem, duration;
    int spawned = 0;

    if (argc != 3 ||
        parse_nonneg_int(argv[1], &child_count) != 0 ||
        parse_nonneg_int(argv[2], &duration) != 0) {
        usage();
        exit(1);
    }

    printf("Parent process id: %d\n", getpid());
    signal(SIGCHLD, sigchld_handler);
    srand(time(NULL));

    /* storage for the children pids; calloc leaves every slot at 0 */
    pids = (int *)calloc(child_count, sizeof(int));
    if (pids == NULL && child_count > 0) {
        perror("calloc");
        exit(1);
    }

    /*
     * NOTE(review): SIGCHLD is not blocked around fork() and the store into
     * pids[i]. A child that exits and is reaped before the store would leave
     * a stale pid in the table. Children normally live for `duration`
     * seconds, so this is only a concern for very short durations.
     */
    for (i = 0; i < child_count; i++) {
        pid = fork();
        if (pid < 0) {
            /*
             * Leave the slot at 0: recording -1 here would make the wait
             * loop below spin forever on a child that never existed.
             */
            perror("fork");
            continue;
        }
        if (pid == 0) {
            child_process(i, duration);
            _exit(0);
        }
        pids[i] = pid;
        spawned++;
    }
    fprintf(stderr,"Spawned %d children\n", spawned);

    while ((rem = pids_remaining()) > 0) {
        fprintf(stderr,"Waiting for %d children to exit\n", rem);
        usleep(1000000); /* 1 HZ */
    }
    fprintf(stderr,"All children have exited and been cleaned up\n");
    return 0;
}
/*
 * Short-lived helper program: sleeps for 200 ms and exits successfully.
 * Presumably built as ./child, the binary proctest execs.
 */
#include <stdio.h>
#include <unistd.h>   /* usleep() */

int main(void)
{
    usleep(200000);
    return 0;
}
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
|
|