From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 3076 invoked by alias); 18 Mar 2013 05:39:05 -0000 Received: (qmail 3062 invoked by uid 22791); 18 Mar 2013 05:39:02 -0000 X-SWARE-Spam-Status: No, hits=-7.5 required=5.0 tests=AWL,BAYES_00,KHOP_RCVD_UNTRUST,RCVD_IN_DNSWL_HI,RCVD_IN_HOSTKARMA_W,RP_MATCHES_RCVD,SPF_HELO_PASS,TW_LR X-Spam-Check-By: sourceware.org Received: from mx1.redhat.com (HELO mx1.redhat.com) (209.132.183.28) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Mon, 18 Mar 2013 05:38:52 +0000 Received: from int-mx09.intmail.prod.int.phx2.redhat.com (int-mx09.intmail.prod.int.phx2.redhat.com [10.5.11.22]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id r2I5cpNv029796 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK) for ; Mon, 18 Mar 2013 01:38:51 -0400 Received: from host2.jankratochvil.net (ovpn-116-42.ams2.redhat.com [10.36.116.42]) by int-mx09.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id r2I5cjfl027198 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES128-SHA bits=128 verify=NO) for ; Mon, 18 Mar 2013 01:38:48 -0400 Date: Mon, 18 Mar 2013 09:21:00 -0000 From: Jan Kratochvil To: gdb-patches@sourceware.org Subject: Re: [commit+7.6] testsuite: Add more valgrind kills on cleanup Message-ID: <20130318053844.GA20285@host2.jankratochvil.net> References: <20130317204746.GA18446@host2.jankratochvil.net> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130317204746.GA18446@host2.jankratochvil.net> User-Agent: Mutt/1.5.21 (2010-09-15) X-IsSubscribed: yes Mailing-List: contact gdb-patches-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: gdb-patches-owner@sourceware.org X-SW-Source: 2013-03/txt/msg00709.txt.bz2 On Sun, 17 Mar 2013 21:47:46 +0100, Jan Kratochvil wrote: > As Tom asked off-list before this case was not caught by 'orphanripper' > 
http://pkgs.fedoraproject.org/cgit/gdb.git/tree/gdb-orphanripper.c > present in Fedora GDB as this tool got stale waiting on fd EOF with stuck > valgrind had this fd open for writing. This was exactly the reason why > I wrote 'orphanripper' and its SIGCHLD handling should have caught that, I do > not yet understand why 'orphanripper' failed in this case. I found a race where the kernel reports the Z (Zombie) state for a process but 'kill (child, 0)' still returns 0 at that moment. So the code parses /proc/CHILD/stat now. This is currently unrelated to FSF GDB, but it has been discussed that the 'orphanripper' testsuite wrapper could be upstreamed. Jan diff --git a/gdb-orphanripper.c b/gdb-orphanripper.c index f8e3f49..d79d93c 100644 --- a/gdb-orphanripper.c +++ b/gdb-orphanripper.c @@ -47,13 +47,10 @@ static const char *progname; -static volatile int signal_chld_hit = 0; static volatile pid_t child; static void signal_chld (int signo) { - if (child && kill (child, 0) != 0) - signal_chld_hit = 1; } static volatile int signal_alrm_hit = 0; @@ -104,6 +101,44 @@ static int read_out (int amaster) return 1; } +/* kill (child, 0) == 0 sometimes even when CHILD's state is already "Z". 
*/ + +static int child_exited (void) +{ + char buf[200]; + int fd, i, retval; + ssize_t got; + char *state; + + snprintf (buf, sizeof (buf), "/proc/%ld/stat", (long) child); + fd = open (buf, O_RDONLY); + if (fd == -1) + { + perror ("open (/proc/CHILD/stat)"); + exit (EXIT_FAILURE); + } + got = read (fd, buf, sizeof(buf)); + if (got <= 0) + { + perror ("read (/proc/CHILD/stat)"); + exit (EXIT_FAILURE); + } + if (close (fd) != 0) + { + perror ("close (/proc/CHILD/stat)"); + exit (EXIT_FAILURE); + } + i = sscanf (buf, "%*d%*s%ms", &state); + if (i != 1) + { + perror ("sscanf (/proc/CHILD/stat)"); + exit (EXIT_FAILURE); + } + retval = strcmp (state, "Z") == 0; + free (state); + return retval; +} + static int spawn (char **argv, int timeout) { pid_t child_got; @@ -157,6 +192,11 @@ static int spawn (char **argv, int timeout) assert (i == STDIN_FILENO); #endif + i = sigemptyset (&set); + assert (i == 0); + i = sigprocmask (SIG_SETMASK, &set, NULL); + assert (i == 0); + /* Do not setpgrp(2) in the parent process as the process-group is shared for the whole sh(1) pipeline we could be a part of. The process-group is set according to PID of the first @@ -206,7 +246,7 @@ static int spawn (char **argv, int timeout) i = ppoll (&pollfd, 1, NULL, &set); if (i == -1 && errno == EINTR) { - if (signal_chld_hit) + if (child_exited ()) break; /* Non-CHILD child may have exited. */ continue; @@ -230,7 +270,7 @@ static int spawn (char **argv, int timeout) exit (EXIT_FAILURE); } /* Child exited? */ - if (signal_chld_hit) + if (child_exited ()) break; } @@ -279,12 +319,10 @@ static int spawn (char **argv, int timeout) exit (EXIT_FAILURE); } - /* In the POLLHUP case we may not have seen SIGCHLD so far. */ + /* Not used in fact. */ i = sigprocmask (SIG_SETMASK, &set, NULL); assert (i == 0); - assert (signal_chld_hit != 0); - /* Do not unset O_NONBLOCK as a stale child (the whole purpose of this program) having open its output pty would block us in read_out. */ #if 0