linux-rt-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3] ssdd: mitigate tracee starvation
@ 2025-09-19 18:11 Derek Barbosa
  2025-09-19 18:58 ` John Kacur
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Derek Barbosa @ 2025-09-19 18:11 UTC (permalink / raw)
  To: williams, jkacur; +Cc: linux-rt-users, crwood, oleg, shichen

When ssdd is invoked with nforks > 100 && niters == 10000 on a tuned,
realtime kernel, the following error messages can be seen:

EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
EXITING, ERROR: wait on PTRACE_SINGLESTEP #398: no SIGCHLD seen (signal count == 0), signo 5
EXITING, ERROR: wait on PTRACE_SINGLESTEP #385: no SIGCHLD seen (signal count == 0), signo 5
...

This behavior is caused by ptrace_stop() being unable to sleep after
taking tasklist_lock().

As forktest() generates "niter" PTRACE_SINGLESTEP's for nforks, in the
rare event where nforks exceeds the defaults by a large order of
magnitude, the sporadic test failures caused by missing SIGCHLDs
indicates that the tracees are unable to effectively wait for their
asynchronous signals to arrive --as denoted in the previous sleeps for
check_sigchld().

Therefore, by performing an sigtimedwait() in check_sigchld(), we
give the tracee enough CPU time to call
do_notify_parent_cldstop()->send_signal_locked().

The observed behavior after appling this patch mitigates the
aforementioned issue in scenarios with a high number of nforks.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Crystal Wood <crwood@redhat.com>
Signed-off-by: Derek Barbosa <debarbos@redhat.com>
---
V1 -> V2: Addressed review comments, removed usleep() in favor of
sigtimedwait().
V2 -> V3: Addressed checkpatch.pl complaints.

 src/ssdd/ssdd.c | 65 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 17 deletions(-)

diff --git a/src/ssdd/ssdd.c b/src/ssdd/ssdd.c
index 50f7424..63130cd 100644
--- a/src/ssdd/ssdd.c
+++ b/src/ssdd/ssdd.c
@@ -30,6 +30,7 @@
 #include <getopt.h>
 #include <string.h>
 #include <signal.h>
+#include <time.h>
 #include <errno.h>
 
 #include <sys/types.h>
@@ -67,7 +68,7 @@ static const char *get_state_name(int state)
 static int quiet;
 static char jsonfile[MAX_PATH];
 
-static int got_sigchld;
+volatile int got_sigchld;
 
 enum option_value { OPT_NFORKS=1, OPT_NITERS, OPT_HELP, OPT_JSON, OPT_QUIET };
 
@@ -127,23 +128,33 @@ static int do_wait(pid_t *wait_pid, int *ret_sig)
 	return STATE_UNKNOWN;
 }
 
-static int check_sigchld(void)
+static int check_sigchld(sigset_t *set)
 {
-	int i;
+
+	struct timespec timeout;
+
+	timeout.tv_sec = 10;
+	timeout.tv_nsec = 0;
+	int recv_sig = 0;
+
 	/*
-	 * The signal is asynchronous so give it some
-	 * time to arrive.
+	 * Check the handler flag, then if need be, wait for the signal to
+	 * arrive
 	 */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(1000); /* 10 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(2000); /* 20 + 10 = 30 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(4000); /* 40 + 30 = 70 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(8000); /* 80 + 70 = 150 msecs */
-	for (i = 0; i < 10 && !got_sigchld; i++)
-		usleep(16000); /* 160 + 150 = 310 msecs */
+	if (!got_sigchld)
+		recv_sig = sigtimedwait(set, NULL, &timeout);
+
+	if (sigprocmask(SIG_UNBLOCK, set, NULL) == -1) {
+		printf("EXITING, ERROR: unable to mask signal set\n");
+		exit(1);
+	}
+
+	if (recv_sig == -1) {
+		printf("EXITING, ERROR: Timeout: no signal received in 10 seconds\n");
+		exit(1);
+	} else if (recv_sig == SIGCHLD) {
+		got_sigchld = 1;
+	}
 
 	return got_sigchld;
 }
@@ -195,6 +206,20 @@ static int forktests(int testid)
 		exit(1);
 	}
 
+	/*
+	 * Block the signal before it is generated
+	 * Ensures we can synchronously wait for it.
+	 */
+	sigset_t set;
+
+	sigemptyset(&set);
+	sigaddset(&set, SIGCHLD);
+
+	if (sigprocmask(SIG_BLOCK, &set, NULL) == -1) {
+		printf("EXITING, ERROR: unable to mask signal set\n");
+		exit(1);
+	}
+
 	/*
 	 * Attach to the child.
 	 */
@@ -224,7 +249,7 @@ static int forktests(int testid)
 		       ret_sig);
 		exit(1);
 	}
-	if (!check_sigchld()) {
+	if (!check_sigchld(&set)) {
 		printf("forktest#%d/%d: EXITING, ERROR: "
 		       "wait on PTRACE_ATTACH saw a SIGCHLD count of %d, should be 1\n",
 		       testid, getpid(), got_sigchld);
@@ -238,6 +263,12 @@ static int forktests(int testid)
 	 * step the tracee.
 	 */
 	for (i = 0; i < nsteps; i++) {
+
+		if (sigprocmask(SIG_BLOCK, &set, NULL) == -1) {
+			printf("EXITING, ERROR: unable to mask signal set\n");
+			exit(1);
+		}
+
 		pstatus = ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);
 
 		if (pstatus) {
@@ -271,7 +302,7 @@ static int forktests(int testid)
 			       testid, getpid(), i, ret_sig);
 			exit(1);
 		}
-		if (!check_sigchld()) {
+		if (!check_sigchld(&set)) {
 			printf("forktest#%d/%d: EXITING, ERROR: "
 			       "wait on PTRACE_SINGLESTEP #%d: no SIGCHLD seen "
 			       "(signal count == 0), signo %d\n",
-- 
2.50.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-09-26 17:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-19 18:11 [PATCH v3] ssdd: mitigate tracee starvation Derek Barbosa
2025-09-19 18:58 ` John Kacur
2025-09-19 19:14 ` Crystal Wood
2025-09-26 17:16 ` John Kacur

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).