From: Aniket Gattani <aniketgattani@google.com>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	 "Paul E . McKenney" <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,  Ben Segall <bsegall@google.com>,
	Josh Don <joshdon@google.com>,
	linux-kernel@vger.kernel.org,  linux-kselftest@vger.kernel.org,
	Aniket Gattani <aniketgattani@google.com>,
	 kernel test robot <lkp@intel.com>
Subject: [PATCH v2 2/2] selftests/membarrier: Add rseq stress test for CFS throttle interactions
Date: Wed, 15 Apr 2026 23:21:06 +0000	[thread overview]
Message-ID: <20260415232106.2803644-3-aniketgattani@google.com>
In-Reply-To: <20260415232106.2803644-1-aniketgattani@google.com>

Add a new stress test to exercise the interaction between targeted
expedited membarrier commands and CFS bandwidth throttling.

The test creates a wide, deeply nested cgroup hierarchy under a tight
CFS bandwidth limit and hammers the membarrier syscall with targeted
expedited commands to expose lock contention and latency issues. This
serves as a reliable reproducer for the membarrier_ipi_mutex cascade
lockup and ensures future changes to membarrier locking do not regress
targeted command latency.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/r/202604151516.Vc7Ro4LP-lkp@intel.com/
Signed-off-by: Aniket Gattani <aniketgattani@google.com>

---
Changes in v2:
- Drop the #warning on unsupported architectures, which caused build
  failures with W=1, as reported by the kernel test robot.
---
 tools/testing/selftests/membarrier/Makefile   |   5 +-
 .../membarrier/membarrier_rseq_stress.c       | 951 ++++++++++++++++++
 2 files changed, 954 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/membarrier/membarrier_rseq_stress.c

diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
index fc840e06ff56..829f95c83515 100644
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -g $(KHDR_INCLUDES)
+CFLAGS += -g $(KHDR_INCLUDES) -pthread -I../../../../tools/include
 LDLIBS += -lpthread
 
 TEST_GEN_PROGS := membarrier_test_single_thread \
-		membarrier_test_multi_thread
+		membarrier_test_multi_thread \
+		membarrier_rseq_stress
 
 include ../lib.mk
diff --git a/tools/testing/selftests/membarrier/membarrier_rseq_stress.c b/tools/testing/selftests/membarrier/membarrier_rseq_stress.c
new file mode 100644
index 000000000000..c188d7498610
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_rseq_stress.c
@@ -0,0 +1,951 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Membarrier stress test for CFS throttle interactions.
+ *
+ * Reproducer for the interaction between CFS throttle and expedited membarrier.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <syscall.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <errno.h>
+#include <sched.h>
+#include <time.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <dirent.h>
+#include <sys/prctl.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+
+/* -- Architecture-specific rseq signature -- */
+#if defined(__x86_64__) || defined(__i386__)
+# define RSEQ_SIG  0x53053053U
+#elif defined(__aarch64__)
+# define RSEQ_SIG  0xd428bc00U
+#elif defined(__powerpc__) || defined(__powerpc64__)
+# define RSEQ_SIG  0x0f000000U
+#elif defined(__s390__) || defined(__s390x__)
+# define RSEQ_SIG  0x0c000000U
+#else
+# define RSEQ_SIG  0
+# define UNSUPPORTED_ARCH 1
+#endif
+
+/* -- rseq ABI (kernel uapi; define locally for portability) -- */
+#define RSEQ_CPU_ID_UNINITIALIZED       ((__u32)-1)
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct rseq_abi {
+	__u32 cpu_id_start;
+	__u32 cpu_id;
+	__u64 rseq_cs;
+	__u32 flags;
+	__u32 node_id;
+	__u32 mm_cid;
+	char  end[0];
+} __aligned(32);
+
+/* -- membarrier constants (not in all distro headers) -- */
+#ifndef MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
+# define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ          (1 << 7)
+#endif
+#ifndef MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ
+# define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8)
+#endif
+#ifndef MEMBARRIER_CMD_FLAG_CPU
+# define MEMBARRIER_CMD_FLAG_CPU  (1 << 0)
+#endif
+
+/* -- Test parameters -- */
+#define N_SIBLINGS          2000
+#define NEST_DEPTH          5
+static char g_cgroup_path[4096];
+static int use_cgroup_v2;
+
+#define CFS_QUOTA_US        1000
+#define CFS_PERIOD_US       5000
+#define N_HAMMER_PER_CPU    25
+#define N_BURNER_PER_CPU    50
+#define MAX_STRESS_CPUS     1024
+#define TEST_DURATION_SEC   20
+
+/* Latency thresholds for the sentinel */
+#define LATENCY_WARN_MS     50
+#define LATENCY_CRITICAL_MS 200
+
+/* Sentinel sampling interval */
+#define SENTINEL_INTERVAL_US  500
+
+/* -- Shared globals -- */
+static atomic_int  g_stop;
+static atomic_int  g_stop_sentinel;
+static atomic_long g_max_latency_us;
+static atomic_long g_interval_max_latency_us;
+static atomic_long g_mb_ok;
+static atomic_long g_mb_err;
+static int         g_ncpus_stress;
+static int *g_stress_cpus;
+
+static atomic_int  g_test_ready;
+
+/* Per-thread rseq ABI block registered with the kernel */
+static __thread struct rseq_abi tls_rseq
+	__attribute__((tls_model("initial-exec"))) __aligned(32) = {
+	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+};
+
+/* -- Utility -- */
+static int write_file(const char *path, const char *val)
+{
+	int fd = open(path, O_WRONLY | O_CLOEXEC);
+
+	if (fd < 0)
+		return -errno;
+
+	size_t len = strlen(val);
+	ssize_t r = write(fd, val, len);
+
+	close(fd);
+	if (r < 0)
+		return -errno;
+	if ((size_t)r != len)
+		return -EIO;
+	return 0;
+}
+
+static uint64_t monotonic_us(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return (uint64_t)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000ULL;
+}
+
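+/*
+ * Record a latency sample into both running maxima.  The relaxed CAS
+ * loops ensure a larger sample is never overwritten by a smaller one,
+ * even when the reporter concurrently resets the interval maximum.
+ */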
+static void update_max_latency(long lat)
+{
+	long old = atomic_load_explicit(&g_max_latency_us, memory_order_relaxed);
+
+	while (lat > old) {
+		if (atomic_compare_exchange_weak_explicit(&g_max_latency_us, &old, lat,
+				memory_order_relaxed, memory_order_relaxed))
+			break;
+	}
+
+	old = atomic_load_explicit(&g_interval_max_latency_us, memory_order_relaxed);
+	while (lat > old) {
+		if (atomic_compare_exchange_weak_explicit(&g_interval_max_latency_us, &old, lat,
+				memory_order_relaxed, memory_order_relaxed))
+			break;
+	}
+}
+
+static void init_stress_cpus(void)
+{
+	cpu_set_t set;
+	int capacity = MAX_STRESS_CPUS;
+
+	g_stress_cpus = malloc(capacity * sizeof(int));
+	if (!g_stress_cpus)
+		ksft_exit_fail_msg("malloc failed for g_stress_cpus\n");
+
+	if (sched_getaffinity(0, sizeof(set), &set) < 0)
+		ksft_exit_fail_msg("sched_getaffinity failed\n");
+
+	for (int i = 0; i < CPU_SETSIZE && g_ncpus_stress < capacity; i++) {
+		if (CPU_ISSET(i, &set))
+			g_stress_cpus[g_ncpus_stress++] = i;
+	}
+
+	if (g_ncpus_stress == 0)
+		ksft_exit_skip("No CPUs available for stress test\n");
+
+	ksft_print_msg("Stressing %d CPUs discovered via affinity\n", g_ncpus_stress);
+}
+
+/* -- rseq / membarrier helpers -- */
+static int rseq_register_thread(void)
+{
+	int r = syscall(SYS_rseq, &tls_rseq, sizeof(tls_rseq), 0, RSEQ_SIG);
+
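+	/*
+	 * EBUSY means rseq is already registered for this thread (e.g. by
+	 * glibc).  Treat it, like EINVAL, as non-fatal so the test still
+	 * runs on systems where rseq was pre-registered.
+	 */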
+	return (r == 0 || errno == EBUSY || errno == EINVAL) ? 0 : -1;
+}
+
+static int rseq_register_thread_at(struct rseq_abi *rseq)
+{
+	int r = syscall(SYS_rseq, rseq, sizeof(*rseq), 0, RSEQ_SIG);
+
+	return (r == 0 || errno == EBUSY || errno == EINVAL) ? 0 : -1;
+}
+
+static int membarrier_register_rseq_mm(void)
+{
+	return syscall(SYS_membarrier,
+		       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0);
+}
+
+/* -- cgroup helpers -- */
+static void rm_cgroup_recursive(const char *path)
+{
+	DIR *dir = opendir(path);
+
+	if (!dir)
+		return;
+	struct dirent *entry;
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+			continue;
+		if (entry->d_type == DT_DIR) {
+			char sub_path[4096];
+
+			snprintf(sub_path, sizeof(sub_path), "%s/%s", path, entry->d_name);
+			rm_cgroup_recursive(sub_path);
+		}
+	}
+	closedir(dir);
+	rmdir(path);
+}
+
+static void cgroup_teardown(void);
+
+static int cgroup_setup(void)
+{
+	struct stat st;
+
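+	/*
+	 * Probe common cgroup v1 cpu-controller mount points first, then
+	 * fall back to cgroup v2 if cgroup.controllers is present.
+	 */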
+	if (stat("/sys/fs/cgroup/cpu", &st) == 0) {
+		use_cgroup_v2 = 0;
+		snprintf(g_cgroup_path, sizeof(g_cgroup_path),
+			 "/sys/fs/cgroup/cpu/membarrier_stress_test");
+	} else if (stat("/dev/cgroup/cpu", &st) == 0) {
+		use_cgroup_v2 = 0;
+		snprintf(g_cgroup_path, sizeof(g_cgroup_path),
+			 "/dev/cgroup/cpu/membarrier_stress_test");
+	} else if (stat("/cgroup/cpu", &st) == 0) {
+		use_cgroup_v2 = 0;
+		snprintf(g_cgroup_path, sizeof(g_cgroup_path),
+			 "/cgroup/cpu/membarrier_stress_test");
+	} else if (stat("/sys/fs/cgroup/cgroup.controllers", &st) == 0) {
+		use_cgroup_v2 = 1;
+		snprintf(g_cgroup_path, sizeof(g_cgroup_path),
+			 "/sys/fs/cgroup/membarrier_stress_test");
+	} else {
+		ksft_print_msg("WARN: cgroup mount not found. Using v2 at /sys/fs/cgroup\n");
+		use_cgroup_v2 = 1;
+		snprintf(g_cgroup_path, sizeof(g_cgroup_path),
+			 "/sys/fs/cgroup/membarrier_stress_test");
+	}
+
+	/* Robust cleanup before setup */
+	cgroup_teardown();
+
+	if (use_cgroup_v2) {
+		/* Enable cpu controller in root cgroup */
+		if (write_file("/sys/fs/cgroup/cgroup.subtree_control", "+cpu") < 0)
+			ksft_print_msg("WARN: failed to enable cpu controller in /sys/fs/cgroup\n");
+	}
+
+	if (mkdir(g_cgroup_path, 0755) < 0 && errno != EEXIST) {
+		ksft_print_msg("mkdir base %s failed: %s\n", g_cgroup_path, strerror(errno));
+		return -1;
+	}
+
+	if (use_cgroup_v2) {
+		char ctrl_path[4096];
+
+		snprintf(ctrl_path, sizeof(ctrl_path), "%s/cgroup.subtree_control", g_cgroup_path);
+		if (write_file(ctrl_path, "+cpu") < 0)
+			ksft_print_msg("WARN: failed to enable cpu controller in %s\n",
+				       g_cgroup_path);
+	}
+
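+	/*
+	 * Build a wide (N_SIBLINGS) and deep (NEST_DEPTH) hierarchy so
+	 * throttle/unthrottle work touches many cgroups per period.
+	 */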
+	for (int i = 0; i < N_SIBLINGS; i++) {
+		char sibling_path[4096];
+
+		snprintf(sibling_path, sizeof(sibling_path), "%s/n%d", g_cgroup_path, i);
+		if (mkdir(sibling_path, 0755) < 0 && errno != EEXIST) {
+			ksft_print_msg("mkdir wide %s failed: %s\n", sibling_path, strerror(errno));
+			return -1;
+		}
+
+		if (use_cgroup_v2) {
+			char ctrl_path[4096];
+
+			snprintf(ctrl_path, sizeof(ctrl_path),
+				 "%s/cgroup.subtree_control", sibling_path);
+			if (write_file(ctrl_path, "+cpu") < 0)
+				ksft_print_msg("WARN: failed to enable cpu controller in %s\n",
+					       sibling_path);
+		}
+
+		char current_path[4096];
+
+		snprintf(current_path, sizeof(current_path), "%s", sibling_path);
+		for (int j = 0; j < NEST_DEPTH; j++) {
+			snprintf(current_path + strlen(current_path),
+				 sizeof(current_path) - strlen(current_path), "/d%d", j);
+			if (mkdir(current_path, 0755) < 0 && errno != EEXIST) {
+				ksft_print_msg("mkdir deep %s failed: %s\n",
+					       current_path, strerror(errno));
+				return -1;
+			}
+
+			/* Enable for all but the leaf */
+			if (use_cgroup_v2 && j < NEST_DEPTH - 1) {
+				char ctrl_path[4096];
+
+				snprintf(ctrl_path, sizeof(ctrl_path), "%s/cgroup.subtree_control",
+					 current_path);
+				if (write_file(ctrl_path, "+cpu") < 0)
+					ksft_print_msg("WARN: cannot enable cpu controller in %s\n",
+						       current_path);
+			}
+		}
+	}
+
+	char quota[64], period[64], max_str[128];
+
+	snprintf(quota, sizeof(quota), "%d", CFS_QUOTA_US);
+	snprintf(period, sizeof(period), "%d", CFS_PERIOD_US);
+	snprintf(max_str, sizeof(max_str), "%d %d", CFS_QUOTA_US, CFS_PERIOD_US);
+
+	if (use_cgroup_v2) {
+		char max_path[4096];
+
+		snprintf(max_path, sizeof(max_path), "%s/cpu.max", g_cgroup_path);
+		if (write_file(max_path, max_str) < 0) {
+			ksft_print_msg("ERROR: cannot write cpu.max at %s\n", max_path);
+			return -1;
+		}
+		ksft_print_msg("cgroup (v2) %s: cpu.max=%s\n", g_cgroup_path, max_str);
+	} else {
+		char quota_path[4096], period_path[4096];
+
+		snprintf(quota_path, sizeof(quota_path), "%s/cpu.cfs_quota_us", g_cgroup_path);
+		snprintf(period_path, sizeof(period_path), "%s/cpu.cfs_period_us", g_cgroup_path);
+
+		if (write_file(period_path, period) < 0) {
+			ksft_print_msg("ERROR: cannot write cpu.cfs_period_us at %s\n",
+				       period_path);
+			return -1;
+		}
+		if (write_file(quota_path, quota) < 0) {
+			ksft_print_msg("ERROR: cannot write cpu.cfs_quota_us at %s\n", quota_path);
+			return -1;
+		}
+		ksft_print_msg("cgroup (v1) %s: cpu.cfs_quota_us=%d cpu.cfs_period_us=%d\n",
+			       g_cgroup_path, CFS_QUOTA_US, CFS_PERIOD_US);
+	}
+
+	return 0;
+}
+
+static int cgroup_add_pid_to_path(pid_t pid, const char *path)
+{
+	char buf[32], file_path[4096];
+
+	snprintf(buf, sizeof(buf), "%d", (int)pid);
+	if (use_cgroup_v2) {
+		snprintf(file_path, sizeof(file_path), "%s/cgroup.procs", path);
+		return write_file(file_path, buf);
+	}
+	/* In v1, try tasks first, fallback to cgroup.procs */
+	snprintf(file_path, sizeof(file_path), "%s/tasks", path);
+	int r = write_file(file_path, buf);
+
+	if (r < 0) {
+		snprintf(file_path, sizeof(file_path), "%s/cgroup.procs", path);
+		r = write_file(file_path, buf);
+	}
+	return r;
+}
+
+static void cgroup_teardown(void)
+{
+	rm_cgroup_recursive(g_cgroup_path);
+}
+
+static void cgroup_unthrottle(void)
+{
+	if (use_cgroup_v2) {
+		char max_path[4096];
+
+		snprintf(max_path, sizeof(max_path), "%s/cpu.max", g_cgroup_path);
+		write_file(max_path, "max");
+	} else {
+		char quota_path[4096];
+
+		snprintf(quota_path, sizeof(quota_path), "%s/cpu.cfs_quota_us", g_cgroup_path);
+		write_file(quota_path, "-1");
+	}
+}
+
+/* -- CPU burner (inside throttled child process) -- */
+static void *burner_thread_fn(void *arg)
+{
+	struct rseq_abi my_rseq;
+	int cpu = (int)(uintptr_t)arg;
+
+	memset(&my_rseq, 0, sizeof(my_rseq));
+	my_rseq.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
+
+	if (rseq_register_thread_at(&my_rseq) < 0) {
+		perror("rseq_register (burner)");
+		return NULL;
+	}
+
+	cpu_set_t set;
+
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	if (sched_setaffinity(0, sizeof(set), &set) < 0)
+		perror("sched_setaffinity (burner)");
+
+	unsigned long sink = 0;
+
+	while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) {
+		sink++;
+		/* Prevent compiler from optimizing the loop away */
+		asm volatile("" : "+g"(sink));
+	}
+
+	return NULL;
+}
+
+static int burner_thread_fn_wrapper(void *arg)
+{
+	burner_thread_fn(arg);
+	return 0;
+}
+
+static int leaf_child_fn(void *arg)
+{
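+	/*
+	 * Spread burner threads evenly across the leaves; the first
+	 * (total_burners % N_SIBLINGS) leaves take one extra thread.
+	 */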
+	int i = (int)(uintptr_t)arg;
+	int total_burners = g_ncpus_stress * N_BURNER_PER_CPU;
+	int n_threads_per_leaf = total_burners / N_SIBLINGS;
+
+	if (i < (total_burners % N_SIBLINGS))
+		n_threads_per_leaf++;
+
+	prctl(PR_SET_PDEATHSIG, SIGTERM);
+	if (getppid() == 1)
+		_exit(1);
+
+	char leaf_path[4096];
+
+	snprintf(leaf_path, sizeof(leaf_path), "%s/n%d", g_cgroup_path, i);
+	for (int j = 0; j < NEST_DEPTH; j++)
+		snprintf(leaf_path + strlen(leaf_path),
+			 sizeof(leaf_path) - strlen(leaf_path), "/d%d", j);
+
+	int r = cgroup_add_pid_to_path(getpid(), leaf_path);
+
+	if (r < 0) {
+		char buf[512];
+		int len = snprintf(buf, sizeof(buf),
+				   "[leaf child %d] failed to join cgroup %s: err %d\n",
+				   i, leaf_path, -r);
+
+		(void)!write(2, buf, len);
+		_exit(1);
+	}
+
+	for (int j = 0; j < n_threads_per_leaf; j++) {
+		int cpu = g_stress_cpus[(i * n_threads_per_leaf + j) % g_ncpus_stress];
+
+		/* Allocate stack via mmap (bypasses heap) */
+		size_t stack_size = 64 * 1024;
+		void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (stack == MAP_FAILED) {
+			const char *msg = "mmap stack failed\n";
+			(void)!write(2, msg, strlen(msg));
+			_exit(1);
+		}
+
+		/* Use raw clone to create a thread sharing the VM and thread group */
+		pid_t pid = clone(burner_thread_fn_wrapper, stack + stack_size,
+				  CLONE_VM | CLONE_THREAD | CLONE_SIGHAND,
+				  (void *)(uintptr_t)cpu);
+		if (pid < 0) {
+			const char *msg = "clone burner failed\n";
+			(void)!write(2, msg, strlen(msg));
+			_exit(1);
+		}
+	}
+
+	/* Wait for SIGTERM. */
+	sigset_t mask;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGTERM);
+	int sig;
+
+	sigwait(&mask, &sig);
+
+	_exit(0);
+}
+
+struct leaf_info {
+	pid_t pid;
+	void *stack;
+};
+
+static int run_throttle_child(void *arg)
+{
+	(void)arg;
+	prctl(PR_SET_PDEATHSIG, SIGTERM);
+	if (getppid() == 1)
+		_exit(1);
+
+	int n_leafs = N_SIBLINGS;
+
+	/* Block signals before spawning to avoid missing early failures */
+	sigset_t mask;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGTERM);
+	sigaddset(&mask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &mask, NULL);
+
+	/* Use mmap for tracking structures to avoid glibc heap usage */
+	struct leaf_info *leaves = mmap(NULL, n_leafs * sizeof(struct leaf_info),
+					PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (leaves == MAP_FAILED) {
+		const char *msg = "mmap leaves array failed\n";
+		(void)!write(2, msg, strlen(msg));
+		_exit(1);
+	}
+
+	for (int i = 0; i < n_leafs; i++) {
+		size_t stack_size = 64 * 1024;
+		void *stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (stack == MAP_FAILED) {
+			const char *msg = "mmap leaf stack failed\n";
+			(void)!write(2, msg, strlen(msg));
+			_exit(1);
+		}
+
+		leaves[i].stack = stack;
+
+		pid_t pid = clone(leaf_child_fn, stack + stack_size,
+				  CLONE_VM | SIGCHLD, (void *)(uintptr_t)i);
+
+		if (pid < 0) {
+			const char *msg = "clone (leaf child) failed\n";
+			(void)!write(2, msg, strlen(msg));
+
+			/* Clean up successfully spawned children */
+			for (int j = 0; j < i; j++) {
+				kill(leaves[j].pid, SIGTERM);
+				waitpid(leaves[j].pid, NULL, 0);
+				munmap(leaves[j].stack, stack_size);
+			}
+			munmap(leaves, n_leafs * sizeof(struct leaf_info));
+
+			if (errno == EAGAIN)
+				_exit(4);
+			else
+				_exit(1);
+		}
+		leaves[i].pid = pid;
+	}
+
+	int failed = 0;
+
+	while (1) {
+		int sig;
+
+		sigwait(&mask, &sig);
+
+		if (sig == SIGTERM) {
+			break;
+		} else if (sig == SIGCHLD) {
+			int status;
+			pid_t pid;
+
+			/* Reap all exited children. */
+			while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
+				for (int i = 0; i < n_leafs; i++) {
+					if (leaves[i].pid == pid) {
+						leaves[i].pid = 0;
+						break;
+					}
+				}
+				if ((WIFEXITED(status) && WEXITSTATUS(status) != 0) ||
+				    WIFSIGNALED(status)) {
+					char buf[128];
+					int len = snprintf(buf, sizeof(buf),
+							   "[manager] child %d died unexpectedly (status 0x%x)\n",
+							   (int)pid, status);
+
+					(void)!write(2, buf, len);
+					failed = 1;
+				}
+			}
+			if (failed)
+				break;
+		}
+	}
+
+	/* Terminate all leaf children. */
+	for (int i = 0; i < n_leafs; i++) {
+		if (leaves[i].pid > 0)
+			kill(leaves[i].pid, SIGTERM);
+	}
+
+	for (int i = 0; i < n_leafs; i++) {
+		if (leaves[i].pid > 0)
+			waitpid(leaves[i].pid, NULL, 0);
+		munmap(leaves[i].stack, 64 * 1024);
+	}
+
+	munmap(leaves, n_leafs * sizeof(struct leaf_info));
+
+	_exit(failed ? 1 : 0);
+}
+
+/* -- Membarrier hammer thread -- */
+static void *hammer_thread_fn(void *arg)
+{
+	int target_cpu = *(int *)arg;
+	long local_ok = 0;
+	long local_err = 0;
+	int count = 0;
+	const int batch_size = 1024;
+
+	if (rseq_register_thread() < 0) {
+		ksft_print_msg("[hammer] rseq_register failed: %s\n", strerror(errno));
+		return NULL;
+	}
+
+	membarrier_register_rseq_mm();
+
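+	/*
+	 * Hot loop: issue CPU-targeted expedited membarrier commands as
+	 * fast as possible, batching counter flushes so atomic traffic
+	 * stays out of the measured path.
+	 */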
+	while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) {
+		int r = syscall(SYS_membarrier,
+				MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+				MEMBARRIER_CMD_FLAG_CPU,
+				target_cpu);
+		if (__builtin_expect(r == 0, 1))
+			local_ok++;
+		else
+			local_err++;
+
+		count++;
+		if (__builtin_expect(count >= batch_size, 0)) {
+			atomic_fetch_add_explicit(&g_mb_ok, local_ok, memory_order_relaxed);
+			atomic_fetch_add_explicit(&g_mb_err, local_err, memory_order_relaxed);
+			local_ok = 0;
+			local_err = 0;
+			count = 0;
+		}
+	}
+
+	/* Flush any remaining counts on exit */
+	if (local_ok > 0)
+		atomic_fetch_add_explicit(&g_mb_ok, local_ok, memory_order_relaxed);
+	if (local_err > 0)
+		atomic_fetch_add_explicit(&g_mb_err, local_err, memory_order_relaxed);
+
+	return NULL;
+}
+
+/* -- Latency sentinel -- */
+static void *sentinel_thread_fn(void *arg)
+{
+	(void)arg;
+	struct sched_param sp = { .sched_priority = 20 };
+
+	if (sched_setscheduler(0, SCHED_FIFO, &sp) < 0)
+		ksft_print_msg("WARN: no SCHED_FIFO for sentinel (less precise)\n");
+
+	while (!atomic_load_explicit(&g_test_ready, memory_order_relaxed) &&
+	       !atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) {
+		struct timespec ts = {0, 1000 * 1000}; /* 1ms */
+
+		clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
+	}
+
+	uint64_t prev = monotonic_us();
+
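+	/*
+	 * Any oversleep beyond the requested interval is attributed to
+	 * scheduling delay and recorded as a latency sample.
+	 */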
+	while (!atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) {
+		struct timespec ts = {
+			.tv_sec  = 0,
+			.tv_nsec = SENTINEL_INTERVAL_US * 1000L,
+		};
+		clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
+
+		uint64_t now = monotonic_us();
+		long latency_us = (long)(now - prev) - SENTINEL_INTERVAL_US;
+
+		prev = now;
+
+		if (latency_us <= 0)
+			continue;
+
+		update_max_latency(latency_us);
+
+		if (latency_us > LATENCY_CRITICAL_MS * 1000L) {
+			ksft_print_msg("\n[SENTINEL] CRITICAL: %ld ms delay (lockup precursor!)\n",
+				latency_us / 1000);
+		} else if (latency_us > LATENCY_WARN_MS * 1000L) {
+			ksft_print_msg("\n[SENTINEL] WARN: %ld ms latency spike\n",
+				latency_us / 1000);
+		}
+	}
+	return NULL;
+}
+
+/* -- Progress reporter -- */
+static void *reporter_thread_fn(void *arg)
+{
+	(void)arg;
+	int elapsed = 0;
+
+	while (!atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed)) {
+		for (int i = 0; i < 5; i++) {
+			sleep(1);
+			if (atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed))
+				break;
+		}
+		if (atomic_load_explicit(&g_stop_sentinel, memory_order_relaxed))
+			break;
+		elapsed += 5;
+		long interval_max = atomic_exchange_explicit(&g_interval_max_latency_us,
+							     0, memory_order_relaxed);
+
+		ksft_print_msg("[%3ds] mb: ok=%-10ld err=%-8ld | max_lat=%ld us\n",
+		       elapsed,
+		       atomic_load(&g_mb_ok),
+		       atomic_load(&g_mb_err),
+		       interval_max);
+	}
+	return NULL;
+}
+
+/* -- Main -- */
+int main(void)
+{
+	ksft_print_header();
+#ifdef UNSUPPORTED_ARCH
+	ksft_exit_skip("Unsupported architecture\n");
+#endif
+	ksft_set_plan(1);
+
+	if (geteuid() != 0)
+		ksft_exit_skip("Must run as root (cgroup + SCHED_FIFO)\n");
+
+	init_stress_cpus();
+
+	ksft_print_msg("=== membarrier rseq + CFS unthrottle stress ===\n");
+	ksft_print_msg("Stressing CPUs: %d\n", g_ncpus_stress);
+	ksft_print_msg("Quota: %d/%d us  (~%d unthrottles/sec/CPU)\n",
+	       CFS_QUOTA_US, CFS_PERIOD_US,
+	       1000000 / CFS_PERIOD_US);
+	ksft_print_msg("Hammer threads: %d per CPU (%d total)\n",
+	       N_HAMMER_PER_CPU, g_ncpus_stress * N_HAMMER_PER_CPU);
+	ksft_print_msg("Duration: %d seconds\n\n", TEST_DURATION_SEC);
+
+	if (cgroup_setup() < 0) {
+		cgroup_teardown();
+		ksft_exit_skip("cgroup_setup failed (missing permissions or v2 ctrls?)\n");
+	}
+
+	if (rseq_register_thread() < 0) {
+		ksft_print_msg("rseq_register (%s) failed: %s\n", __func__, strerror(errno));
+		cgroup_teardown();
+		ksft_exit_skip("rseq syscall failed or not available\n");
+	}
+	if (membarrier_register_rseq_mm() < 0) {
+		ksft_print_msg("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: %s\n"
+			"Kernel >= 5.10 with CONFIG_RSEQ required.\n",
+			strerror(errno));
+		cgroup_teardown();
+		ksft_exit_skip("membarrier register failed\n");
+	}
+	ksft_print_msg("rseq membarrier registered OK\n");
+
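+	/*
+	 * Block SIGTERM before cloning so the manager child inherits it
+	 * blocked, as sigwait() in the child requires; the parent unblocks
+	 * it again right after the clone.
+	 */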
+	sigset_t sigmask;
+
+	sigemptyset(&sigmask);
+	sigaddset(&sigmask, SIGTERM);
+	sigprocmask(SIG_BLOCK, &sigmask, NULL);
+
+	void *stack = malloc(1024 * 1024);
+
+	if (!stack) {
+		perror("malloc stack");
+		cgroup_teardown();
+		ksft_exit_fail_msg("Malloc stack failed\n");
+	}
+	pid_t child = clone(run_throttle_child, stack + 1024 * 1024, CLONE_VM | SIGCHLD, NULL);
+
+	if (child < 0) {
+		perror("clone");
+		cgroup_teardown();
+		ksft_exit_fail_msg("Clone failed\n");
+	}
+
+	sigprocmask(SIG_UNBLOCK, &sigmask, NULL);
+	ksft_print_msg("Throttle child PID %d started\n", child);
+
+	int n_threads = g_ncpus_stress * N_HAMMER_PER_CPU + 2;
+	pthread_t *threads = (pthread_t *)calloc(n_threads, sizeof(pthread_t));
+	int       *cpuargs = (int *)calloc(g_ncpus_stress * N_HAMMER_PER_CPU, sizeof(int));
+
+	if (!threads || !cpuargs) {
+		perror("calloc");
+		kill(child, SIGTERM);
+		waitpid(child, NULL, 0);
+		cgroup_teardown();
+		ksft_exit_fail_msg("Thread allocation failed\n");
+	}
+
+	int ti = 0, ai = 0;
+	int r;
+
+	ksft_print_msg("Creating sentinel thread...\n");
+	r = pthread_create(&threads[ti], NULL, sentinel_thread_fn, NULL);
+	if (r != 0) {
+		kill(child, SIGTERM);
+		waitpid(child, NULL, 0);
+		cgroup_teardown();
+		free(threads);
+		free(cpuargs);
+		free(g_stress_cpus);
+		ksft_exit_fail_msg("pthread_create (sentinel) failed: %s\n", strerror(r));
+	}
+	ti++;
+
+	ksft_print_msg("Creating reporter thread...\n");
+	r = pthread_create(&threads[ti], NULL, reporter_thread_fn, NULL);
+	if (r != 0) {
+		atomic_store(&g_stop_sentinel, 1);
+		pthread_join(threads[0], NULL);
+		kill(child, SIGTERM);
+		waitpid(child, NULL, 0);
+		cgroup_teardown();
+		free(threads);
+		free(cpuargs);
+		free(g_stress_cpus);
+		ksft_exit_fail_msg("pthread_create (reporter) failed: %s\n", strerror(r));
+	}
+	ti++;
+
+	ksft_print_msg("Creating %d hammer threads...\n", g_ncpus_stress * N_HAMMER_PER_CPU);
+	for (int i = 0; i < g_ncpus_stress; i++) {
+		int cpu = g_stress_cpus[i];
+
+		for (int j = 0; j < N_HAMMER_PER_CPU; j++) {
+			cpuargs[ai] = cpu;
+			r = pthread_create(&threads[ti], NULL, hammer_thread_fn, &cpuargs[ai]);
+			if (r != 0) {
+				ksft_print_msg("pthread_create failed at thread %d: %s\n",
+					       ti, strerror(r));
+
+				atomic_store(&g_stop_sentinel, 1);
+				pthread_join(threads[0], NULL);
+				pthread_join(threads[1], NULL);
+
+				atomic_store(&g_stop, 1);
+				for (int k = 2; k < ti; k++)
+					pthread_join(threads[k], NULL);
+
+				kill(child, SIGTERM);
+				waitpid(child, NULL, 0);
+				cgroup_teardown();
+
+				free(threads);
+				free(cpuargs);
+				free(g_stress_cpus);
+
+				if (r == EAGAIN)
+					ksft_exit_skip("Resource limits prevent threads\n");
+				else
+					ksft_exit_fail_msg("Failed to create hammer thread\n");
+			}
+			ti++;
+			ai++;
+		}
+	}
+
+	ksft_print_msg("All threads running. Tip: monitor dmesg for lockups\n\n");
+
+	atomic_store_explicit(&g_test_ready, 1, memory_order_relaxed);
+	int child_failed = 0;
+	int child_status = 0;
+
+	for (int i = 0; i < TEST_DURATION_SEC; i++) {
+		sleep(1);
+		int r = waitpid(child, &child_status, WNOHANG);
+
+		if (r == child) {
+			child_failed = 1;
+			break;
+		}
+	}
+
+	atomic_store(&g_stop_sentinel, 1);
+	pthread_join(threads[0], NULL);
+	pthread_join(threads[1], NULL);
+
+	atomic_store(&g_stop, 1);
+
+	/* Unthrottle to allow children to exit quickly */
+	cgroup_unthrottle();
+
+	if (!child_failed) {
+		kill(child, SIGTERM);
+		waitpid(child, NULL, 0);
+	}
+	for (int i = 2; i < ti; i++)
+		pthread_join(threads[i], NULL);
+
+	long max_lat   = atomic_load(&g_max_latency_us);
+	long total_ok  = atomic_load(&g_mb_ok);
+	long total_err = atomic_load(&g_mb_err);
+
+	ksft_print_msg("\n=== RESULTS ===\n");
+	ksft_print_msg("membarrier syscalls : %ld ok  %ld errors\n", total_ok, total_err);
+	ksft_print_msg("Max scheduler latency: %ld us  (%ld ms)\n", max_lat, max_lat / 1000);
+	cgroup_teardown();
+	free(threads);
+	free(cpuargs);
+	free(g_stress_cpus);
+
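+	/*
+	 * Verdict: fail on early manager death, zero successful calls,
+	 * any syscall errors, or a latency spike above the warning
+	 * threshold.
+	 */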
+	if (child_failed) {
+		if (WIFEXITED(child_status) && WEXITSTATUS(child_status) == 4)
+			ksft_exit_skip("Manager child skipped (resource limits?)\n");
+		ksft_test_result_fail("membarrier_rseq_stress: Manager child died early\n");
+		ksft_exit_fail();
+	} else if (total_ok == 0) {
+		ksft_test_result_fail("membarrier_rseq_stress: No successful membarrier calls\n");
+		ksft_exit_fail();
+	} else if (total_err > 0) {
+		ksft_test_result_fail("membarrier_rseq_stress: syscall errors\n");
+		ksft_exit_fail();
+	} else if (max_lat > LATENCY_CRITICAL_MS * 1000L) {
+		ksft_test_result_fail("membarrier_rseq_stress: LOCKUP PRECURSOR\n");
+		ksft_exit_fail();
+	} else if (max_lat > LATENCY_WARN_MS * 1000L) {
+		ksft_test_result_fail("membarrier_rseq_stress: significant latency spike\n");
+		ksft_exit_fail();
+	} else {
+		ksft_test_result_pass("membarrier_rseq_stress\n");
+		ksft_exit_pass();
+	}
+
+	return 0;
+}
-- 
2.54.0.rc1.513.gad8abe7a5a-goog

