Netdev List
 help / color / mirror / Atom feed
* [PATCH net 2/2] selftests: net: add reuseport migration wakeup regression tests
From: Zhenzhong Wu @ 2026-04-18  4:16 UTC (permalink / raw)
  To: netdev
  Cc: edumazet, ncardwell, kuniyu, davem, dsahern, kuba, pabeni, horms,
	shuah, tamird, linux-kernel, linux-kselftest, Zhenzhong Wu
In-Reply-To: <20260418041633.691435-1-jt26wzz@gmail.com>

Add selftests that reproduce missing wakeups on the target listener
after SO_REUSEPORT migration from inet_csk_listen_stop().

The epoll case connects while only the first listener is active so the
child lands on its accept queue, registers the second listener with
epoll, then closes the first listener to trigger migration. It verifies
that the target listener both accepts the migrated child and becomes
readable via epoll.

The blocking accept case starts a thread blocked in accept() on the
target listener, closes the first listener to trigger migration, and
verifies that the blocked accept() wakes and returns the migrated
child. Wait until the helper thread is actually asleep in accept()
before triggering migration so the test does not race waiter
registration.

Run the tests in a private network namespace and enable
net.ipv4.tcp_migrate_req=1 there so they can exercise the migration
path without relying on a sk_reuseport/migrate BPF program. Treat a
missing or unwritable tcp_migrate_req sysctl as SKIP. Run both
scenarios for IPv4 and IPv6.

These tests cover the bug fixed by the preceding patch.

Signed-off-by: Zhenzhong Wu <jt26wzz@gmail.com>
---
 tools/testing/selftests/net/Makefile          |   3 +
 .../selftests/net/reuseport_migrate_accept.c  | 533 ++++++++++++++++++
 .../selftests/net/reuseport_migrate_epoll.c   | 353 ++++++++++++
 3 files changed, 889 insertions(+)
 create mode 100644 tools/testing/selftests/net/reuseport_migrate_accept.c
 create mode 100644 tools/testing/selftests/net/reuseport_migrate_epoll.c

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index a275ed584..2f8b6c44d 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -184,6 +184,8 @@ TEST_GEN_PROGS := \
 	reuseport_bpf_cpu \
 	reuseport_bpf_numa \
 	reuseport_dualstack \
+	reuseport_migrate_accept \
+	reuseport_migrate_epoll \
 	sk_bind_sendto_listen \
 	sk_connect_zero_addr \
 	sk_so_peek_off \
@@ -232,6 +234,7 @@ $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
 $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto
 $(OUTPUT)/tcp_inq: LDLIBS += -lpthread
 $(OUTPUT)/bind_bhash: LDLIBS += -lpthread
+$(OUTPUT)/reuseport_migrate_accept: LDLIBS += -lpthread
 $(OUTPUT)/io_uring_zerocopy_tx: CFLAGS += -I../../../include/
 
 include bpf.mk
diff --git a/tools/testing/selftests/net/reuseport_migrate_accept.c b/tools/testing/selftests/net/reuseport_migrate_accept.c
new file mode 100644
index 000000000..a516843a0
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_migrate_accept.c
@@ -0,0 +1,533 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define ACCEPT_BLOCK_TIMEOUT_MS 1000
+#define ACCEPT_CLEANUP_TIMEOUT_MS 1000
+#define ACCEPT_WAKE_TIMEOUT_MS 2000
+#define TCP_MIGRATE_REQ_PATH "/proc/sys/net/ipv4/tcp_migrate_req"
+
+struct reuseport_migrate_case {
+	const char *name;
+	int family;
+	const char *addr;
+};
+
+struct accept_result {
+	int listener_fd;
+	atomic_int started;
+	atomic_int tid;
+	int accepted_fd;
+	int err;
+};
+
+static const struct reuseport_migrate_case test_cases[] = {
+	{
+		.name = "ipv4 blocking accept wake after reuseport migration",
+		.family = AF_INET,
+		.addr = "127.0.0.1",
+	},
+	{
+		.name = "ipv6 blocking accept wake after reuseport migration",
+		.family = AF_INET6,
+		.addr = "::1",
+	},
+};
+
+static void close_fd(int *fd)
+{
+	if (*fd >= 0) {
+		close(*fd);
+		*fd = -1;
+	}
+}
+
+static bool unsupported_addr_err(int family, int err)
+{
+	return family == AF_INET6 &&
+		(err == EAFNOSUPPORT ||
+		 err == EPROTONOSUPPORT ||
+		 err == EADDRNOTAVAIL);
+}
+
+static int make_sockaddr(const struct reuseport_migrate_case *test_case,
+			 unsigned short port,
+			 struct sockaddr_storage *addr,
+			 socklen_t *addrlen)
+{
+	memset(addr, 0, sizeof(*addr));
+
+	if (test_case->family == AF_INET) {
+		struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(port);
+		if (inet_pton(AF_INET, test_case->addr, &addr4->sin_addr) != 1)
+			return -1;
+
+		*addrlen = sizeof(*addr4);
+		return 0;
+	}
+
+	if (test_case->family == AF_INET6) {
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(port);
+		if (inet_pton(AF_INET6, test_case->addr, &addr6->sin6_addr) != 1)
+			return -1;
+
+		*addrlen = sizeof(*addr6);
+		return 0;
+	}
+
+	return -1;
+}
+
+static int create_reuseport_socket(const struct reuseport_migrate_case *test_case)
+{
+	int one = 1;
+	int fd;
+
+	fd = socket(test_case->family, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (fd < 0)
+		return -1;
+
+	if (test_case->family == AF_INET6 &&
+	    setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one))) {
+		close(fd);
+		return -1;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one))) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+static int enable_tcp_migrate_req(void)
+{
+	int len;
+	int fd;
+
+	fd = open(TCP_MIGRATE_REQ_PATH, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno == ENOENT || errno == EACCES ||
+		    errno == EPERM || errno == EROFS)
+			return KSFT_SKIP;
+		return KSFT_FAIL;
+	}
+
+	len = write(fd, "1", 1);
+	if (len != 1) {
+		if (errno == EACCES || errno == EPERM || errno == EROFS) {
+			close(fd);
+			return KSFT_SKIP;
+		}
+
+		close(fd);
+		return KSFT_FAIL;
+	}
+
+	close(fd);
+	return KSFT_PASS;
+}
+
+static void setup_netns(void)
+{
+	int ret;
+
+	if (unshare(CLONE_NEWNET))
+		ksft_exit_skip("unshare(CLONE_NEWNET): %s\n", strerror(errno));
+
+	if (system("ip link set lo up"))
+		ksft_exit_skip("failed to bring up lo interface in netns\n");
+
+	ret = enable_tcp_migrate_req();
+	if (ret == KSFT_SKIP)
+		ksft_exit_skip("failed to enable tcp_migrate_req\n");
+	if (ret == KSFT_FAIL)
+		ksft_exit_fail_msg("failed to enable tcp_migrate_req\n");
+}
+
+static void noop_handler(int sig)
+{
+	(void)sig;
+}
+
+static void *accept_thread(void *arg)
+{
+	struct accept_result *result = arg;
+
+	atomic_store_explicit(&result->tid, (int)syscall(SYS_gettid),
+			      memory_order_release);
+	atomic_store_explicit(&result->started, 1, memory_order_release);
+	result->accepted_fd = accept4(result->listener_fd, NULL, NULL,
+				      SOCK_CLOEXEC);
+	if (result->accepted_fd < 0)
+		result->err = errno;
+
+	return NULL;
+}
+
+static int read_thread_state(int tid, char *state)
+{
+	char *close_paren;
+	char path[64];
+	char buf[256];
+	ssize_t len;
+	int fd;
+
+	snprintf(path, sizeof(path), "/proc/self/task/%d/stat", tid);
+
+	fd = open(path, O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return -errno;
+
+	len = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (len < 0)
+		return -errno;
+	if (!len)
+		return -EINVAL;
+
+	buf[len] = '\0';
+	close_paren = strrchr(buf, ')');
+	if (!close_paren || close_paren[1] != ' ' || !close_paren[2])
+		return -EINVAL;
+
+	*state = close_paren[2];
+	return 0;
+}
+
+static int wait_for_accept_to_block(const struct reuseport_migrate_case *test_case,
+				    int tid)
+{
+	char state = '\0';
+	int ret;
+	int i;
+
+	/*
+	 * A started thread is not enough here: we need to know the waiter
+	 * has actually gone to sleep in accept() before closing listener_a,
+	 * otherwise migration can race ahead of waiter registration. Poll
+	 * /proc task state because the pthread APIs can tell us whether the
+	 * thread has exited, but not whether it is already blocked in the
+	 * target syscall.
+	 */
+	for (i = 0; i < ACCEPT_BLOCK_TIMEOUT_MS; i++) {
+		ret = read_thread_state(tid, &state);
+		if (!ret) {
+			if (state == 'S' || state == 'D')
+				return KSFT_PASS;
+			if (state == 'Z')
+				break;
+		} else if (ret == -ENOENT) {
+			break;
+		}
+
+		usleep(1000);
+	}
+
+	ksft_print_msg("%s: accept waiter never blocked before migration\n",
+		       test_case->name);
+	return KSFT_FAIL;
+}
+
+/*
+ * Join @thread, waiting at most @timeout_ms milliseconds. Returns KSFT_PASS
+ * if the thread was joined, KSFT_FAIL otherwise; *@timed_out is set only
+ * when the join failed because the deadline expired.
+ */
+static int join_thread_with_timeout(pthread_t thread, int timeout_ms,
+				    bool *timed_out)
+{
+	struct timespec deadline;
+	int err;
+
+	*timed_out = false;
+
+	if (clock_gettime(CLOCK_REALTIME, &deadline))
+		return KSFT_FAIL;
+
+	/* Split the timeout so tv_nsec never exceeds a 32-bit long. */
+	deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
+	deadline.tv_sec += timeout_ms / 1000 + deadline.tv_nsec / 1000000000L;
+	deadline.tv_nsec %= 1000000000L;
+
+	err = pthread_timedjoin_np(thread, NULL, &deadline);
+	*timed_out = (err == ETIMEDOUT);
+	return err ? KSFT_FAIL : KSFT_PASS;
+}
+
+static int interrupt_accept_thread(pthread_t thread)
+{
+	int err;
+
+	err = pthread_kill(thread, SIGUSR1);
+	if (err && err != ESRCH)
+		return KSFT_FAIL;
+
+	return KSFT_PASS;
+}
+
+static int stop_accept_thread(pthread_t thread, bool *timed_out)
+{
+	if (interrupt_accept_thread(thread))
+		return KSFT_FAIL;
+
+	return join_thread_with_timeout(thread, ACCEPT_CLEANUP_TIMEOUT_MS,
+					timed_out);
+}
+
+static int run_test(const struct reuseport_migrate_case *test_case)
+{
+	struct accept_result result = {
+		.listener_fd = -1,
+		.started = 0,
+		.tid = -1,
+		.accepted_fd = -1,
+		.err = 0,
+	};
+	struct sockaddr_storage addr;
+	struct sigaction sa = {
+		.sa_handler = noop_handler,
+	};
+	bool thread_joined = false;
+	bool cleanup_timed_out;
+	int listener_a = -1;
+	int listener_b = -1;
+	int ret = KSFT_FAIL;
+	socklen_t addrlen;
+	pthread_t thread;
+	int client = -1;
+	bool timed_out;
+	int probe = -1;
+	int tid;
+
+	if (make_sockaddr(test_case, 0, &addr, &addrlen)) {
+		ksft_print_msg("%s: failed to build socket address\n",
+			       test_case->name);
+		goto out;
+	}
+
+	if (sigemptyset(&sa.sa_mask)) {
+		ksft_perror("sigemptyset");
+		goto out;
+	}
+
+	if (sigaction(SIGUSR1, &sa, NULL)) {
+		ksft_perror("sigaction(SIGUSR1)");
+		goto out;
+	}
+
+	listener_a = create_reuseport_socket(test_case);
+	if (listener_a < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(listener_a)");
+		goto out;
+	}
+
+	if (bind(listener_a, (struct sockaddr *)&addr, addrlen)) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("bind(listener_a)");
+		goto out;
+	}
+
+	if (listen(listener_a, 1)) {
+		ksft_perror("listen(listener_a)");
+		goto out;
+	}
+
+	addrlen = sizeof(addr);
+	if (getsockname(listener_a, (struct sockaddr *)&addr, &addrlen)) {
+		ksft_perror("getsockname(listener_a)");
+		goto out;
+	}
+
+	listener_b = create_reuseport_socket(test_case);
+	if (listener_b < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(listener_b)");
+		goto out;
+	}
+
+	if (bind(listener_b, (struct sockaddr *)&addr, addrlen)) {
+		ksft_perror("bind(listener_b)");
+		goto out;
+	}
+
+	client = socket(test_case->family, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (client < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(client)");
+		goto out;
+	}
+
+	/* Connect while only listener_a is listening, ensuring the
+	 * child lands in listener_a's accept queue deterministically.
+	 */
+	if (connect(client, (struct sockaddr *)&addr, addrlen)) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("connect(client)");
+		goto out;
+	}
+
+	if (listen(listener_b, 1)) {
+		ksft_perror("listen(listener_b)");
+		goto out;
+	}
+
+	result.listener_fd = listener_b;
+	if (pthread_create(&thread, NULL, accept_thread, &result)) {
+		ksft_perror("pthread_create");
+		goto out;
+	}
+
+	while (!atomic_load_explicit(&result.started, memory_order_acquire))
+		sched_yield();
+
+	tid = atomic_load_explicit(&result.tid, memory_order_acquire);
+	if (wait_for_accept_to_block(test_case, tid))
+		goto out_with_thread;
+
+	close_fd(&listener_a);
+
+	ret = join_thread_with_timeout(thread, ACCEPT_WAKE_TIMEOUT_MS, &timed_out);
+	if (ret == KSFT_PASS) {
+		thread_joined = true;
+		if (result.accepted_fd < 0) {
+			ksft_print_msg("%s: blocking accept() returned err=%d (%s)\n",
+				       test_case->name, result.err,
+				       strerror(result.err));
+			ret = KSFT_FAIL;
+		}
+
+		goto out_with_thread;
+	}
+
+	if (!timed_out) {
+		ksft_print_msg("%s: join_thread_with_timeout() failed\n",
+			       test_case->name);
+		goto out_with_thread;
+	}
+
+	if (stop_accept_thread(thread, &cleanup_timed_out) == KSFT_FAIL) {
+		ksft_print_msg("%s: failed to stop blocking accept waiter\n",
+			       test_case->name);
+		goto out_with_thread;
+	}
+	thread_joined = true;
+
+	if (result.accepted_fd >= 0) {
+		ksft_print_msg("%s: blocking accept() completed only in cleanup\n",
+			       test_case->name);
+		goto out_with_thread;
+	}
+
+	if (result.err != EINTR) {
+		ksft_print_msg("%s: blocking accept() returned err=%d (%s)\n",
+			       test_case->name, result.err,
+			       strerror(result.err));
+		goto out_with_thread;
+	}
+
+	probe = accept4(listener_b, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
+	if (probe >= 0) {
+		ksft_print_msg("%s: accept queue was populated, but blocking accept() timed out\n",
+			       test_case->name);
+	} else if (errno == EAGAIN || errno == EWOULDBLOCK) {
+		ksft_print_msg("%s: target listener had no queued child after migration\n",
+			       test_case->name);
+	} else {
+		ksft_perror("accept4(listener_b)");
+	}
+
+out_with_thread:
+	close_fd(&probe);
+	if (!thread_joined) {
+		if (stop_accept_thread(thread, &cleanup_timed_out) == KSFT_FAIL) {
+			ksft_print_msg("%s: failed to stop blocking accept waiter\n",
+				       test_case->name);
+			ret = KSFT_FAIL;
+			goto out;
+		}
+
+		thread_joined = true;
+	}
+	if (thread_joined)
+		close_fd(&result.accepted_fd);
+
+out:
+	close_fd(&client);
+	close_fd(&listener_b);
+	close_fd(&listener_a);
+
+	return ret;
+}
+
+int main(void)
+{
+	int status = KSFT_PASS;
+	int ret;
+	int i;
+
+	setup_netns();
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(test_cases));
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		ret = run_test(&test_cases[i]);
+		ksft_test_result_code(ret, test_cases[i].name, NULL);
+
+		if (ret == KSFT_FAIL)
+			status = KSFT_FAIL;
+	}
+
+	if (status == KSFT_FAIL)
+		ksft_exit_fail();
+
+	ksft_finished();
+}
diff --git a/tools/testing/selftests/net/reuseport_migrate_epoll.c b/tools/testing/selftests/net/reuseport_migrate_epoll.c
new file mode 100644
index 000000000..9cbfb58c4
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_migrate_epoll.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define EPOLL_TIMEOUT_MS 500
+#define TCP_MIGRATE_REQ_PATH "/proc/sys/net/ipv4/tcp_migrate_req"
+
+struct reuseport_migrate_case {
+	const char *name;
+	int family;
+	const char *addr;
+};
+
+static const struct reuseport_migrate_case test_cases[] = {
+	{
+		.name = "ipv4 epoll wake after reuseport migration",
+		.family = AF_INET,
+		.addr = "127.0.0.1",
+	},
+	{
+		.name = "ipv6 epoll wake after reuseport migration",
+		.family = AF_INET6,
+		.addr = "::1",
+	},
+};
+
+static void close_fd(int *fd)
+{
+	if (*fd >= 0) {
+		close(*fd);
+		*fd = -1;
+	}
+}
+
+static bool unsupported_addr_err(int family, int err)
+{
+	return family == AF_INET6 &&
+		(err == EAFNOSUPPORT ||
+		 err == EPROTONOSUPPORT ||
+		 err == EADDRNOTAVAIL);
+}
+
+static int make_sockaddr(const struct reuseport_migrate_case *test_case,
+			 unsigned short port,
+			 struct sockaddr_storage *addr,
+			 socklen_t *addrlen)
+{
+	memset(addr, 0, sizeof(*addr));
+
+	if (test_case->family == AF_INET) {
+		struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(port);
+		if (inet_pton(AF_INET, test_case->addr, &addr4->sin_addr) != 1)
+			return -1;
+
+		*addrlen = sizeof(*addr4);
+		return 0;
+	}
+
+	if (test_case->family == AF_INET6) {
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(port);
+		if (inet_pton(AF_INET6, test_case->addr, &addr6->sin6_addr) != 1)
+			return -1;
+
+		*addrlen = sizeof(*addr6);
+		return 0;
+	}
+
+	return -1;
+}
+
+static int create_reuseport_socket(const struct reuseport_migrate_case *test_case)
+{
+	int one = 1;
+	int fd;
+
+	fd = socket(test_case->family, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (fd < 0)
+		return -1;
+
+	if (test_case->family == AF_INET6 &&
+	    setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &one, sizeof(one))) {
+		close(fd);
+		return -1;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one))) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+static int set_nonblocking(int fd)
+{
+	int flags;
+
+	flags = fcntl(fd, F_GETFL);
+	if (flags < 0)
+		return -1;
+
+	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+static int enable_tcp_migrate_req(void)
+{
+	int len;
+	int fd;
+
+	fd = open(TCP_MIGRATE_REQ_PATH, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno == ENOENT || errno == EACCES ||
+		    errno == EPERM || errno == EROFS)
+			return KSFT_SKIP;
+		return KSFT_FAIL;
+	}
+
+	len = write(fd, "1", 1);
+	if (len != 1) {
+		if (errno == EACCES || errno == EPERM || errno == EROFS) {
+			close(fd);
+			return KSFT_SKIP;
+		}
+
+		close(fd);
+		return KSFT_FAIL;
+	}
+
+	close(fd);
+	return KSFT_PASS;
+}
+
+static void setup_netns(void)
+{
+	int ret;
+
+	if (unshare(CLONE_NEWNET))
+		ksft_exit_skip("unshare(CLONE_NEWNET): %s\n", strerror(errno));
+
+	if (system("ip link set lo up"))
+		ksft_exit_skip("failed to bring up lo interface in netns\n");
+
+	ret = enable_tcp_migrate_req();
+	if (ret == KSFT_SKIP)
+		ksft_exit_skip("failed to enable tcp_migrate_req\n");
+	if (ret == KSFT_FAIL)
+		ksft_exit_fail_msg("failed to enable tcp_migrate_req\n");
+}
+
+static int run_test(const struct reuseport_migrate_case *test_case)
+{
+	struct sockaddr_storage addr;
+	struct epoll_event ev = {
+		.events = EPOLLIN,
+	};
+	int listener_a = -1;
+	int listener_b = -1;
+	int ret = KSFT_FAIL;
+	socklen_t addrlen;
+	int accepted = -1;
+	int client = -1;
+	int epfd = -1;
+	int n;
+
+	if (make_sockaddr(test_case, 0, &addr, &addrlen)) {
+		ksft_print_msg("%s: failed to build socket address\n",
+			       test_case->name);
+		goto out;
+	}
+
+	listener_a = create_reuseport_socket(test_case);
+	if (listener_a < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(listener_a)");
+		goto out;
+	}
+
+	if (bind(listener_a, (struct sockaddr *)&addr, addrlen)) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("bind(listener_a)");
+		goto out;
+	}
+
+	if (listen(listener_a, 1)) {
+		ksft_perror("listen(listener_a)");
+		goto out;
+	}
+
+	addrlen = sizeof(addr);
+	if (getsockname(listener_a, (struct sockaddr *)&addr, &addrlen)) {
+		ksft_perror("getsockname(listener_a)");
+		goto out;
+	}
+
+	listener_b = create_reuseport_socket(test_case);
+	if (listener_b < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(listener_b)");
+		goto out;
+	}
+
+	if (bind(listener_b, (struct sockaddr *)&addr, addrlen)) {
+		ksft_perror("bind(listener_b)");
+		goto out;
+	}
+
+	client = socket(test_case->family, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+	if (client < 0) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("socket(client)");
+		goto out;
+	}
+
+	/* Connect while only listener_a is listening, ensuring the
+	 * child lands in listener_a's accept queue deterministically.
+	 */
+	if (connect(client, (struct sockaddr *)&addr, addrlen)) {
+		if (unsupported_addr_err(test_case->family, errno)) {
+			ret = KSFT_SKIP;
+			goto out;
+		}
+
+		ksft_perror("connect(client)");
+		goto out;
+	}
+
+	if (listen(listener_b, 1)) {
+		ksft_perror("listen(listener_b)");
+		goto out;
+	}
+
+	if (set_nonblocking(listener_b)) {
+		ksft_perror("set_nonblocking(listener_b)");
+		goto out;
+	}
+
+	epfd = epoll_create1(EPOLL_CLOEXEC);
+	if (epfd < 0) {
+		ksft_perror("epoll_create1");
+		goto out;
+	}
+
+	ev.data.fd = listener_b;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listener_b, &ev)) {
+		ksft_perror("epoll_ctl(ADD listener_b)");
+		goto out;
+	}
+
+	close_fd(&listener_a);
+
+	n = epoll_wait(epfd, &ev, 1, EPOLL_TIMEOUT_MS);
+	if (n < 0) {
+		ksft_perror("epoll_wait");
+		goto out;
+	}
+
+	accepted = accept4(listener_b, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);
+	if (accepted < 0) {
+		if (errno == EAGAIN || errno == EWOULDBLOCK) {
+			ksft_print_msg("%s: target listener had no queued child after migration\n",
+				       test_case->name);
+			goto out;
+		}
+
+		ksft_perror("accept4(listener_b)");
+		goto out;
+	}
+
+	if (n != 1) {
+		ksft_print_msg("%s: accept queue was populated, but epoll_wait() timed out\n",
+			       test_case->name);
+		goto out;
+	}
+
+	if (ev.data.fd != listener_b || !(ev.events & EPOLLIN)) {
+		ksft_print_msg("%s: unexpected epoll event fd=%d events=%#x\n",
+			       test_case->name, ev.data.fd, ev.events);
+		goto out;
+	}
+
+	ret = KSFT_PASS;
+
+out:
+	close_fd(&accepted);
+	close_fd(&epfd);
+	close_fd(&client);
+	close_fd(&listener_b);
+	close_fd(&listener_a);
+
+	return ret;
+}
+
+int main(void)
+{
+	int status = KSFT_PASS;
+	int ret;
+	int i;
+
+	setup_netns();
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(test_cases));
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		ret = run_test(&test_cases[i]);
+		ksft_test_result_code(ret, test_cases[i].name, NULL);
+
+		if (ret == KSFT_FAIL)
+			status = KSFT_FAIL;
+	}
+
+	if (status == KSFT_FAIL)
+		ksft_exit_fail();
+
+	ksft_finished();
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net 1/2] tcp: call sk_data_ready() after listener migration
From: Zhenzhong Wu @ 2026-04-18  4:16 UTC (permalink / raw)
  To: netdev
  Cc: edumazet, ncardwell, kuniyu, davem, dsahern, kuba, pabeni, horms,
	shuah, tamird, linux-kernel, linux-kselftest, Zhenzhong Wu,
	stable
In-Reply-To: <20260418041633.691435-1-jt26wzz@gmail.com>

When inet_csk_listen_stop() migrates an established child socket from
a closing listener to another socket in the same SO_REUSEPORT group,
the target listener gets a new accept-queue entry via
inet_csk_reqsk_queue_add(), but that path never notifies the target
listener's waiters.

As a result, a nonblocking accept() still succeeds because it checks
the accept queue directly, but waiters that sleep for listener
readiness can remain asleep until another connection generates a
wakeup. This affects poll()/epoll_wait()-based waiters, and can also
leave a blocking accept() asleep after migration even though the
child is already in the target listener's accept queue.

This was observed in a local test where listener A completed the
handshake, queued the child, and was closed before userspace called
accept(). The child was migrated to listener B, but listener B never
received a wakeup for the migrated accept-queue entry.

Call READ_ONCE(nsk->sk_data_ready)(nsk) after a successful migration
in inet_csk_listen_stop().

The reqsk_timer_handler() path does not need the same change:
half-open requests only become readable to userspace when the final
ACK completes the handshake, and tcp_child_process() already wakes
the listener in that case.

Fixes: 54b92e841937 ("tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.")
Cc: stable@vger.kernel.org
Signed-off-by: Zhenzhong Wu <jt26wzz@gmail.com>
---
 net/ipv4/inet_connection_sock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4ac3ae1bc..da1ce082f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1483,6 +1483,7 @@ void inet_csk_listen_stop(struct sock *sk)
 					__NET_INC_STATS(sock_net(nsk),
 							LINUX_MIB_TCPMIGRATEREQSUCCESS);
 					reqsk_migrate_reset(req);
+					READ_ONCE(nsk->sk_data_ready)(nsk);
 				} else {
 					__NET_INC_STATS(sock_net(nsk),
 							LINUX_MIB_TCPMIGRATEREQFAILURE);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net 0/2] tcp: fix listener wakeup after reuseport migration
From: Zhenzhong Wu @ 2026-04-18  4:16 UTC (permalink / raw)
  To: netdev
  Cc: edumazet, ncardwell, kuniyu, davem, dsahern, kuba, pabeni, horms,
	shuah, tamird, linux-kernel, linux-kselftest, Zhenzhong Wu

Hi,

this small series fixes a missing wakeup after listener migration in
the SO_REUSEPORT close path and adds regression selftests.

The issue shows up when a fully established child has already been
queued on listener A, userspace has not accepted it yet, and
listener A is then closed. The kernel migrates that child to
listener B in the same SO_REUSEPORT group via
inet_csk_reqsk_queue_add(), but the target listener's waiters are
not notified.

As a result, a nonblocking accept() still succeeds because it checks
the accept queue directly, but waiters that sleep for listener
readiness can remain asleep until another connection generates a
wakeup. This affects poll()/epoll_wait()-based waiters, and can also
leave a blocking accept() asleep after migration even though the
child is already in the target listener's accept queue.

The fix is to notify the target listener after a successful
inet_csk_reqsk_queue_add() in inet_csk_listen_stop().

I also checked the half-open migration path in
reqsk_timer_handler(). That path does not need an extra wakeup here
because the listener becomes readable only after the final ACK
completes the handshake, and tcp_child_process() already wakes the
parent listener at that point.

The series adds selftests under tools/testing/selftests/net/ that
reproduce the regression for both IPv4 and IPv6. They cover both
epoll-based waiters and a blocking accept() waiter.

Patch 1 contains only the runtime fix so it can stand on its own and
be considered for stable backporting. Patch 2 adds the selftest
coverage.

Testing:

On an unpatched host kernel:

  unshare -Ur sh -c \
    './tools/testing/selftests/net/reuseport_migrate_epoll'
  unshare -Ur sh -c \
    './tools/testing/selftests/net/reuseport_migrate_accept'

The epoll selftest fails for both IPv4 and IPv6 with:

  accept queue was populated, but epoll_wait() timed out

The blocking accept selftest fails for both IPv4 and IPv6, for example
with:

  blocking accept() completed only in cleanup

On a patched kernel booted under QEMU with a minimal initramfs, both
selftests pass:

  ok 1 ipv4 epoll wake after reuseport migration
  ok 2 ipv6 epoll wake after reuseport migration
  reuseport_migrate_epoll_RC=0

  ok 1 ipv4 blocking accept wake after reuseport migration
  ok 2 ipv6 blocking accept wake after reuseport migration
  reuseport_migrate_accept_RC=0

Zhenzhong Wu (2):
  tcp: call sk_data_ready() after listener migration
  selftests: net: add reuseport migration wakeup regression tests

 net/ipv4/inet_connection_sock.c               |   1 +
 tools/testing/selftests/net/Makefile          |   3 +
 .../selftests/net/reuseport_migrate_accept.c  | 533 ++++++++++++++++++
 .../selftests/net/reuseport_migrate_epoll.c   | 353 ++++++++++++
 4 files changed, 890 insertions(+)
 create mode 100644 tools/testing/selftests/net/reuseport_migrate_accept.c
 create mode 100644 tools/testing/selftests/net/reuseport_migrate_epoll.c


base-commit: 52bcb57a4e8a0865a76c587c2451906342ae1b2d
-- 
2.43.0

^ permalink raw reply

* [PATCH net v8 6/6] net/sched: netem: check for negative latency and jitter
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Dave Taht, open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

Reject requests with negative latency or jitter.
A negative value added to current timestamp (u64) wraps
to an enormous time_to_send, disabling dequeue.
The original UAPI used u32 for these values; the conversion to 64-bit
time values via TCA_NETEM_LATENCY64 and TCA_NETEM_JITTER64
allowed signed values to reach the kernel without validation.

Jitter is already silently clamped by an abs() in netem_change();
that abs() can be removed in a follow-up once this rejection is in
place.

Fixes: 99803171ef04 ("netem: add uapi to express delay and jitter in nanoseconds")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 475c14b3dbdb..bc18e1976b6e 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -826,6 +826,16 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
 	return 0;
 }
 
+static int validate_time(const struct nlattr *attr, const char *name,
+			 struct netlink_ext_ack *extack)
+{
+	if (nla_get_s64(attr) < 0) {
+		NL_SET_ERR_MSG_ATTR_FMT(extack, attr, "negative %s", name);
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack)
 {
 	const struct tc_netem_slot *c = nla_data(attr);
@@ -1068,6 +1078,18 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 			goto table_free;
 	}
 
+	if (tb[TCA_NETEM_LATENCY64]) {
+		ret = validate_time(tb[TCA_NETEM_LATENCY64], "latency", extack);
+		if (ret)
+			goto table_free;
+	}
+
+	if (tb[TCA_NETEM_JITTER64]) {
+		ret = validate_time(tb[TCA_NETEM_JITTER64], "jitter", extack);
+		if (ret)
+			goto table_free;
+	}
+
 	sch_tree_lock(sch);
 	/* backup q->clg and q->loss_model */
 	old_clg = q->clg;
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v8 5/6] net/sched: netem: fix slot delay calculation overflow
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Yousuk Seung,
	Neal Cardwell, open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

get_slot_next() computes a random delay between min_delay and
max_delay using:

  get_random_u32() * (max_delay - min_delay) >> 32

This overflows signed 64-bit arithmetic when the delay range exceeds
approximately 2.1 seconds (2^31 nanoseconds), producing a negative
result that effectively disables slot-based pacing. This is a
realistic configuration for WAN emulation (e.g., slot 1s 5s).

Use mul_u64_u32_shr() which handles the widening multiply without
overflow.

Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 640b51be807a..475c14b3dbdb 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -659,9 +659,8 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
 
 	if (!q->slot_dist)
 		next_delay = q->slot_config.min_delay +
-				(get_random_u32() *
-				 (q->slot_config.max_delay -
-				  q->slot_config.min_delay) >> 32);
+			mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay,
+					get_random_u32(), 32);
 	else
 		next_delay = tabledist(q->slot_config.dist_delay,
 				       (s32)(q->slot_config.dist_jitter),
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v8 4/6] net/sched: netem: validate slot configuration
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Dave Taht, open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

Reject slot configurations that have no defensible meaning:

  - negative min_delay or max_delay
  - min_delay greater than max_delay
  - negative dist_delay or dist_jitter
  - negative max_packets or max_bytes

Negative or out-of-order delays underflow in get_slot_next(),
producing garbage intervals. Negative limits trip the per-slot
accounting (packets_left/bytes_left <= 0) on the first packet of
every slot, defeating the rate-limiting half of the slot feature.

Note that dist_jitter has been silently coerced to its absolute
value by get_slot() since the feature was introduced; rejecting
negatives here converts that silent coercion into -EINVAL. The
abs() can be removed in a follow-up.

Fixes: 836af83b54e3 ("netem: support delivering packets in delayed time slots")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 556f9747f0e7..640b51be807a 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -827,6 +827,29 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
 	return 0;
 }
 
+static int validate_slot(const struct nlattr *attr, struct netlink_ext_ack *extack)
+{
+	const struct tc_netem_slot *c = nla_data(attr);
+
+	if (c->min_delay < 0 || c->max_delay < 0) {
+		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot delay");
+		return -EINVAL;
+	}
+	if (c->min_delay > c->max_delay) {
+		NL_SET_ERR_MSG_ATTR(extack, attr, "slot min delay greater than max delay");
+		return -EINVAL;
+	}
+	if (c->dist_delay < 0 || c->dist_jitter < 0) {
+		NL_SET_ERR_MSG_ATTR(extack, attr, "negative dist delay");
+		return -EINVAL;
+	}
+	if (c->max_packets < 0 || c->max_bytes < 0) {
+		NL_SET_ERR_MSG_ATTR(extack, attr, "negative slot limit");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 {
 	const struct tc_netem_slot *c = nla_data(attr);
@@ -1040,6 +1063,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 			goto table_free;
 	}
 
+	if (tb[TCA_NETEM_SLOT]) {
+		ret = validate_slot(tb[TCA_NETEM_SLOT], extack);
+		if (ret)
+			goto table_free;
+	}
+
 	sch_tree_lock(sch);
 	/* backup q->clg and q->loss_model */
 	old_clg = q->clg;
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v8 3/6] net/sched: netem: only reseed PRNG when seed is explicitly provided
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, François Michel,
	open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

netem_change() unconditionally reseeds the PRNG on every tc change
command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed
is generated, destroying reproducibility for users who set a
deterministic seed on a previous change.

Move the initial random seed generation to netem_init() and only
reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly
provided by the user.

Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index d400a730eadd..556f9747f0e7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1112,11 +1112,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	/* capping jitter to the range acceptable by tabledist() */
 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
 
-	if (tb[TCA_NETEM_PRNG_SEED])
+	if (tb[TCA_NETEM_PRNG_SEED]) {
 		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
-	else
-		q->prng.seed = get_random_u64();
-	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+		prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+	}
 
 unlock:
 	sch_tree_unlock(sch);
@@ -1139,6 +1138,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 
 	q->loss_model = CLG_RANDOM;
+	q->prng.seed = get_random_u64();
+	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+
 	ret = netem_change(sch, opt, extack);
 	if (ret)
 		pr_info("netem: change failed\n");
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v8 2/6] net/sched: netem: fix queue limit check to include reordered packets
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Martin Ottens,
	open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

The queue limit check in netem_enqueue() uses q->t_len which only
counts packets in the internal tfifo. Packets placed in sch->q by
the reorder path (__qdisc_enqueue_head) are not counted, allowing
the total queue occupancy to exceed sch->limit under reordering.

Include sch->q.qlen in the limit check.

Fixes: f8d4bc455047 ("net/sched: netem: account for backlog updates from child qdisc")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8ee72cac1faf..d400a730eadd 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -524,7 +524,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				1 << get_random_u32_below(8);
 	}
 
-	if (unlikely(q->t_len >= sch->limit)) {
+	if (unlikely(sch->q.qlen >= sch->limit)) {
 		/* re-link segs, so that qdisc_drop_all() frees them all */
 		skb->next = segs;
 		qdisc_drop_all(skb, sch, to_free);
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v8 1/6] net/sched: netem: fix probability gaps in 4-state loss model
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev
  Cc: jiri, jhs, horms, Stephen Hemminger, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, open list
In-Reply-To: <20260418032027.900913-1-stephen@networkplumber.org>

The 4-state Markov chain in loss_4state() has gaps at the boundaries
between transition probability ranges. The comparisons use:

  if (rnd < a4)
  else if (a4 < rnd && rnd < a1 + a4)

When rnd equals a boundary value exactly, neither branch matches and
no state transition occurs. The redundant lower-bound check (a4 < rnd)
is already implied by being in the else branch.

Remove the unnecessary lower-bound comparisons so the ranges are
contiguous and every random value produces a transition, matching
the GI (General and Intuitive) loss model specification.

This bug goes back to the original implementation of this model.

Fixes: 661b79725fea ("netem: revised correlated loss generator")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 20df1c08b1e9..8ee72cac1faf 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -227,10 +227,10 @@ static bool loss_4state(struct netem_sched_data *q)
 		if (rnd < clg->a4) {
 			clg->state = LOST_IN_GAP_PERIOD;
 			return true;
-		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+		} else if (rnd < clg->a1 + clg->a4) {
 			clg->state = LOST_IN_BURST_PERIOD;
 			return true;
-		} else if (clg->a1 + clg->a4 < rnd) {
+		} else {
 			clg->state = TX_IN_GAP_PERIOD;
 		}
 
@@ -247,9 +247,9 @@ static bool loss_4state(struct netem_sched_data *q)
 	case LOST_IN_BURST_PERIOD:
 		if (rnd < clg->a3)
 			clg->state = TX_IN_BURST_PERIOD;
-		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+		else if (rnd < clg->a2 + clg->a3) {
 			clg->state = TX_IN_GAP_PERIOD;
-		} else if (clg->a2 + clg->a3 < rnd) {
+		} else {
 			clg->state = LOST_IN_BURST_PERIOD;
 			return true;
 		}
-- 
2.53.0


^ permalink raw reply related

* [PATCH v8 net 0/6] netem: bug fixes
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev; +Cc: jiri, jhs, horms, Stephen Hemminger

These bugs were found when doing AI-assisted review of sch_netem.c
during investigation of the packet duplication recursion problem
addressed in Jamal's series.

The fixes cover:

 - probability gaps in the 4-state Markov loss model
 - queue limit not accounting for reordered packets
 - PRNG reseeded on every tc change, breaking reproducibility
 - slot configuration not validated (inverted ranges, negative
   delays, negative limits)
 - slot delay arithmetic overflow for ranges above ~2.1 seconds
 - negative latency and jitter wrapping to huge time_to_send
   values via u64 arithmetic

v8 - added check for negative TCA_NETEM_LATENCY64 and TCA_NETEM_JITTER64
   - extended slot validation to cover dist_delay, dist_jitter,
     max_packets and max_bytes

Stephen Hemminger (6):
  net/sched: netem: fix probability gaps in 4-state loss model
  net/sched: netem: fix queue limit check to include reordered packets
  net/sched: netem: only reseed PRNG when seed is explicitly provided
  net/sched: netem: validate slot configuration
  net/sched: netem: fix slot delay calculation overflow
  net/sched: netem: check for negative latency and jitter

 net/sched/sch_netem.c | 76 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 12 deletions(-)

-- 
2.53.0


^ permalink raw reply

* [PATCH v1 net] tcp: Disable usec TS for SYN Cookie.
From: Kuniyuki Iwashima @ 2026-04-18  2:49 UTC (permalink / raw)
  To: Eric Dumazet, Neal Cardwell, David S. Miller, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

cookie_tcp_reqsk_alloc() sets tcp_rsk(req)->req_usec_ts to false
unconditionally.

If want_cookie is true in tcp_conn_request(), we should not set
tcp_rsk(req)->req_usec_ts.

Let's not call dst_tcp_usec_ts() for SYN Cookie.

Fixes: 614e8316aa4c ("tcp: add support for usec resolution in TCP TS values")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv4/syncookies.c | 3 ---
 net/ipv4/tcp_input.c  | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b5f0a65c6786..f5cd9e325d01 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -76,12 +76,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 	if (ts > ts_now)
 		ts -= (1UL << TSBITS);
 
-	if (tcp_rsk(req)->req_usec_ts)
-		return ts * NSEC_PER_USEC;
 	return ts * NSEC_PER_MSEC;
 }
 
-
 static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
 				   __be16 dport, __u32 sseq, __u32 data)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cba89733d121..8bf202b95c68 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7720,7 +7720,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		st = af_ops->init_seq_and_ts_off(net, skb);
 
 	if (tmp_opt.tstamp_ok) {
-		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
+		if (!want_cookie)
+			tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 		tcp_rsk(req)->ts_off = st.ts_off;
 	}
 	if (!want_cookie && !isn) {
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH net v3 2/2] bnge: remove unsupported backing store type
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta
In-Reply-To: <20260418023438.1597876-1-vikas.gupta@broadcom.com>

The backing store type BNGE_CTX_MRAV is not applicable to Thor Ultra
devices. Remove it from the backing store configuration: the firmware
does not populate entities for this backing store type, which causes
the driver load to fail.

Fixes: 29c5b358f385 ("bng_en: Add backing store support")
Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Dharmender Garg <dharmender.garg@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnge/bnge_rmem.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
index 94f15e08a88c..b066ee887a09 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
@@ -324,7 +324,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
 	u32 l2_qps, qp1_qps, max_qps;
 	u32 ena, entries_sp, entries;
 	u32 srqs, max_srqs, min;
-	u32 num_mr, num_ah;
 	u32 extra_srqs = 0;
 	u32 extra_qps = 0;
 	u32 fast_qpmd_qps;
@@ -390,21 +389,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
 	if (!bnge_is_roce_en(bd))
 		goto skip_rdma;
 
-	ctxm = &ctx->ctx_arr[BNGE_CTX_MRAV];
-	/* 128K extra is needed to accommodate static AH context
-	 * allocation by f/w.
-	 */
-	num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256);
-	num_ah = min_t(u32, num_mr, 1024 * 128);
-	ctxm->split_entry_cnt = BNGE_CTX_MRAV_AV_SPLIT_ENTRY + 1;
-	if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah)
-		ctxm->mrav_av_entries = num_ah;
-
-	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, num_mr + num_ah, 2);
-	if (rc)
-		return rc;
-	ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV;
-
 	ctxm = &ctx->ctx_arr[BNGE_CTX_TIM];
 	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, l2_qps + qp1_qps + extra_qps, 1);
 	if (rc)
-- 
2.47.1


^ permalink raw reply related

* [PATCH net v3 1/2] bnge: fix initial HWRM sequence
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta
In-Reply-To: <20260418023438.1597876-1-vikas.gupta@broadcom.com>

Firmware may not advertise the correct resources if the backing store is
not enabled before resource information is queried.
Fix the initial sequence of HWRMs so that the driver gets capabilities
and resource information correctly.

Fixes: 3fa9e977a0cd ("bng_en: Initialize default configuration")
Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Rahul Gupta <rahul-rg.gupta@broadcom.com>
---
 .../net/ethernet/broadcom/bnge/bnge_core.c    | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
index 1c14c5fe8d61..68b74eb2c3a2 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
@@ -74,6 +74,13 @@ static int bnge_func_qcaps(struct bnge_dev *bd)
 		return rc;
 	}
 
+	return 0;
+}
+
+static int bnge_func_qrcaps_qcfg(struct bnge_dev *bd)
+{
+	int rc;
+
 	rc = bnge_hwrm_func_resc_qcaps(bd);
 	if (rc) {
 		dev_err(bd->dev, "query resc caps failure rc: %d\n", rc);
@@ -133,23 +140,28 @@ static int bnge_fw_register_dev(struct bnge_dev *bd)
 
 	bnge_hwrm_fw_set_time(bd);
 
-	rc =  bnge_hwrm_func_drv_rgtr(bd);
+	/* Get the resources and configuration from firmware */
+	rc = bnge_func_qcaps(bd);
 	if (rc) {
-		dev_err(bd->dev, "Failed to rgtr with firmware rc: %d\n", rc);
+		dev_err(bd->dev, "Failed querying caps rc: %d\n", rc);
 		return rc;
 	}
 
 	rc = bnge_alloc_ctx_mem(bd);
 	if (rc) {
 		dev_err(bd->dev, "Failed to allocate ctx mem rc: %d\n", rc);
-		goto err_func_unrgtr;
+		goto err_free_ctx_mem;
 	}
 
-	/* Get the resources and configuration from firmware */
-	rc = bnge_func_qcaps(bd);
+	rc = bnge_hwrm_func_drv_rgtr(bd);
 	if (rc) {
-		dev_err(bd->dev, "Failed initial configuration rc: %d\n", rc);
-		rc = -ENODEV;
+		dev_err(bd->dev, "Failed to rgtr with firmware rc: %d\n", rc);
+		goto err_free_ctx_mem;
+	}
+
+	rc = bnge_func_qrcaps_qcfg(bd);
+	if (rc) {
+		dev_err(bd->dev, "Failed querying resources rc: %d\n", rc);
 		goto err_func_unrgtr;
 	}
 
@@ -158,7 +170,9 @@ static int bnge_fw_register_dev(struct bnge_dev *bd)
 	return 0;
 
 err_func_unrgtr:
-	bnge_fw_unregister_dev(bd);
+	bnge_hwrm_func_drv_unrgtr(bd);
+err_free_ctx_mem:
+	bnge_free_ctx_mem(bd);
 	return rc;
 }
 
-- 
2.47.1


^ permalink raw reply related

* [PATCH net v3 0/2] bnge fixes
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta

Hi,
This series fixes two issues.

Patch-1:
    Due to a wrong HWRM sequence, the driver does not get the correct
    information regarding resources and capabilities.
    The patch fixes the initial HWRM sequence.
Patch-2:
    Remove the initialization of a backing store type that is
    not supported on Thor Ultra devices.

Thanks,
Vikas

v2->v3:
  Addressed Jakub Kicinski's comments.
https://lore.kernel.org/netdev/CAHLZf_uARgZzoTPnnPjxRu5AGeHEOw3yyTEbNHYP3brfwuW0Sw@mail.gmail.com/

v1->v2: 
   Include Fixes tags.


Vikas Gupta (2):
  bnge: fix initial HWRM sequence
  bnge: remove unsupported backing store type

 .../net/ethernet/broadcom/bnge/bnge_core.c    | 30 ++++++++++++++-----
 .../net/ethernet/broadcom/bnge/bnge_rmem.c    | 16 ----------
 2 files changed, 22 insertions(+), 24 deletions(-)

-- 
2.47.1


^ permalink raw reply

* Re: [PATCH bpf v3 2/2] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks
From: KaFai Wan @ 2026-04-18  2:19 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: daniel, john.fastabend, sdf, ast, andrii, eddyz87, memxor, song,
	yonghong.song, jolsa, davem, edumazet, kuba, pabeni, horms, shuah,
	jiayuan.chen, bpf, netdev, linux-kernel, linux-kselftest
In-Reply-To: <2026417162132.9MRI.martin.lau@linux.dev>

On Fri, 2026-04-17 at 09:25 -0700, Martin KaFai Lau wrote:
> On Fri, Apr 17, 2026 at 05:20:35PM +0800, KaFai Wan wrote:
> > diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > index 56685fc03c7e..7b9dbbb84316 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > @@ -461,7 +461,7 @@ static void misc(void)
> >  	const unsigned int nr_data = 2;
> >  	struct bpf_link *link;
> >  	struct sk_fds sk_fds;
> > -	int i, ret;
> > +	int i, ret, true_val = 1;
> >  
> >  	lport_linum_map_fd = bpf_map__fd(misc_skel->maps.lport_linum_map);
> >  
> > @@ -477,6 +477,10 @@ static void misc(void)
> >  		return;
> >  	}
> >  
> > +	ret = setsockopt(sk_fds.active_fd, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
> 
> Same comment as in v2. Why this setsockopt is needed?

Sorry, I missed this. It comes from the review of v1: my first version would have broken the
setsockopt() syscall and callbacks other than HDR_OPT_LEN/WRITE_HDR_OPT. So in the test I check
setsockopt() and bpf_setsockopt() in PASSIVE_ESTABLISHED_CB to make sure patch #1 does not break
user space or the other callbacks.

> The setsockopt in userspace is unnecessary. 

Is bpf_setsockopt() in PASSIVE_ESTABLISHED_CB also unnecessary? I'll respin if they are unnecessary.

> In the future,
> we may need to understand why it is needed here in the first place.

Okay, I'll remember that. Thanks for the review and guidance.

-- 
Thanks,
KaFai

^ permalink raw reply

* [PATCH net-next] r8169: report per-queue statistics through netdev qstats
From: Gustavo Arantes @ 2026-04-18  2:12 UTC (permalink / raw)
  To: Heiner Kallweit, nic_swsd
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-kernel

r8169 maintains synchronized per-CPU software counters for packet and byte
accounting, but does not expose them through the netdev qstats interface.

Add netdev_stat_ops callbacks and report the existing software counters
through queue 0 for both Rx and Tx. Provide zero base stats so device-scope
qstats report the packet and byte counters as supported and match the
existing RTNL statistics.

Signed-off-by: Gustavo Arantes <dev.gustavoa@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 70 +++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 791277e750ba..9d833b446383 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5175,6 +5175,75 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	pm_runtime_put_noidle(&pdev->dev);
 }
 
+static void rtl8169_fetch_sw_stats(struct net_device *dev,
+				   struct netdev_queue_stats_rx *rx,
+				   struct netdev_queue_stats_tx *tx)
+{
+	const struct pcpu_sw_netstats *stats;
+	unsigned int start;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+
+		stats = per_cpu_ptr(dev->tstats, cpu);
+		do {
+			start = u64_stats_fetch_begin(&stats->syncp);
+			rx_packets = u64_stats_read(&stats->rx_packets);
+			rx_bytes = u64_stats_read(&stats->rx_bytes);
+			tx_packets = u64_stats_read(&stats->tx_packets);
+			tx_bytes = u64_stats_read(&stats->tx_bytes);
+		} while (u64_stats_fetch_retry(&stats->syncp, start));
+
+		rx->packets += rx_packets;
+		rx->bytes += rx_bytes;
+		tx->packets += tx_packets;
+		tx->bytes += tx_bytes;
+	}
+}
+
+static void rtl8169_get_queue_stats_rx(struct net_device *dev, int idx,
+				       struct netdev_queue_stats_rx *rx)
+{
+	struct netdev_queue_stats_tx tx = {};
+
+	if (idx)
+		return;
+
+	rx->packets = 0;
+	rx->bytes = 0;
+	rtl8169_fetch_sw_stats(dev, rx, &tx);
+}
+
+static void rtl8169_get_queue_stats_tx(struct net_device *dev, int idx,
+				       struct netdev_queue_stats_tx *tx)
+{
+	struct netdev_queue_stats_rx rx = {};
+
+	if (idx)
+		return;
+
+	tx->packets = 0;
+	tx->bytes = 0;
+	rtl8169_fetch_sw_stats(dev, &rx, tx);
+}
+
+static void rtl8169_get_base_stats(struct net_device *dev,
+				   struct netdev_queue_stats_rx *rx,
+				   struct netdev_queue_stats_tx *tx)
+{
+	rx->packets = 0;
+	rx->bytes = 0;
+	tx->packets = 0;
+	tx->bytes = 0;
+}
+
+static const struct netdev_stat_ops rtl8169_stat_ops = {
+	.get_queue_stats_rx	= rtl8169_get_queue_stats_rx,
+	.get_queue_stats_tx	= rtl8169_get_queue_stats_tx,
+	.get_base_stats		= rtl8169_get_base_stats,
+};
+
 static void rtl8169_net_suspend(struct rtl8169_private *tp)
 {
 	netif_device_detach(tp->dev);
@@ -5615,6 +5684,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
 	dev->netdev_ops = &rtl_netdev_ops;
+	dev->stat_ops = &rtl8169_stat_ops;
 	tp = netdev_priv(dev);
 	tp->dev = dev;
 	tp->pci_dev = pdev;
-- 
2.51.2


^ permalink raw reply related

* [v2 PATCH] rhashtable: Restore insecure_elasticity toggle
From: Herbert Xu @ 2026-04-18  1:41 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev, NeilBrown
In-Reply-To: <aeLgjAeJuidWNy3N@gondor.apana.org.au>

This one actually compiles.
---8<---
Some users of rhashtable cannot handle insertion failures, and
are happy to accept the consequences of a hash table that has
very long chains.

Restore the insecure_elasticity toggle for these users.  In
addition to disabling the chain length checks, this also removes
the emergency resize that would otherwise occur when the hash
table occupancy hits 100% (an async resize is still scheduled
at 75%).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 015c8298bebc..72082428d6c6 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
+ * @insecure_elasticity: Set to true to disable chain length checks
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -61,6 +62,7 @@ struct rhashtable_params {
 	u16			head_offset;
 	unsigned int		max_size;
 	u16			min_size;
+	bool			insecure_elasticity;
 	bool			automatic_shrinking;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 0480509a6339..7def3f0f556b 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -821,14 +821,15 @@ static __always_inline void *__rhashtable_insert_fast(
 		goto out;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !params.insecure_elasticity)
 		goto slow_path;
 
 	data = ERR_PTR(-E2BIG);
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		goto out_unlock;
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !params.insecure_elasticity)
 		goto slow_path;
 
 	/* Inserting at head of list makes unlocking free. */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6074ed5f66f3..fb2b7bc137ba 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		return NULL;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !ht->p.insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	return ERR_PTR(-ENOENT);
@@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one(
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		return ERR_PTR(-E2BIG);
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !ht->p.insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	head = rht_ptr(bkt, tbl, hash);
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* [PATCH] rhashtable: Restore insecure_elasticity toggle
From: Herbert Xu @ 2026-04-18  1:38 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev, NeilBrown
In-Reply-To: <aeLWH_HgSHF4buiJ@gondor.apana.org.au>

Some users of rhashtable cannot handle insertion failures, and
are happy to accept the consequences of a hash table that has
very long chains.

Restore the insecure_elasticity toggle for these users.  In
addition to disabling the chain length checks, this also removes
the emergency resize that would otherwise occur when the hash
table occupancy hits 100% (an async resize is still scheduled
at 75%).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 015c8298bebc..72082428d6c6 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
+ * @insecure_elasticity: Set to true to disable chain length checks
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -61,6 +62,7 @@ struct rhashtable_params {
 	u16			head_offset;
 	unsigned int		max_size;
 	u16			min_size;
+	bool			insecure_elasticity;
 	bool			automatic_shrinking;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 0480509a6339..c793849d3f61 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -821,14 +821,15 @@ static __always_inline void *__rhashtable_insert_fast(
 		goto out;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !params->insecure_elasticity)
 		goto slow_path;
 
 	data = ERR_PTR(-E2BIG);
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		goto out_unlock;
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !params->insecure_elasticity)
 		goto slow_path;
 
 	/* Inserting at head of list makes unlocking free. */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6074ed5f66f3..b60d55e5b19b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		return NULL;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !ht->p->insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	return ERR_PTR(-ENOENT);
@@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one(
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		return ERR_PTR(-E2BIG);
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !ht->p->insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	head = rht_ptr(bkt, tbl, hash);
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Kuniyuki Iwashima @ 2026-04-18  1:06 UTC (permalink / raw)
  To: Ahmed, Aaron
  Cc: stable@vger.kernel.org, netdev@vger.kernel.org,
	ncardwell@google.com, edumazet@google.com
In-Reply-To: <CAAVpQUCfMsWBpPpywbwBLRCdHUqWqFBoDK=17dwDkG6T0dQxzw@mail.gmail.com>

On Fri, Apr 17, 2026 at 5:44 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> Hi Aaron :)
>
> Thanks for the report.
>
> On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com> wrote:
> >
> > Hi,
> >
> > We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
> >
> > Overview:
> >
> > The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
> >
> > Reproducer:
> > ```
> > /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
> >  *
> >  * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
> >  * Run:    sudo sysctl -w net.core.wmem_max=4194304
> >  *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
> >  *         ./tcp_linger_memleak
> >  */
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <string.h>
> > #include <unistd.h>
> > #include <errno.h>
> > #include <fcntl.h>
> > #include <signal.h>
> > #include <sys/socket.h>
> > #include <sys/wait.h>
> > #include <netinet/in.h>
> >
> > #define NUM_CONNS 5000
> > #define PORT      6666
> >
> > static void print_mem(const char *label) {
> >     FILE *f;
> >     char line[256];
> >     f = fopen("/proc/meminfo", "r");
> >     while (fgets(line, sizeof(line), f))
> >         if (strncmp(line, "MemAvailable:", 13) == 0)
> >             printf("%s: %s", label, line);
> >     fclose(f);
> >     f = fopen("/proc/net/sockstat", "r");
> >     while (fgets(line, sizeof(line), f))
> >         if (strncmp(line, "TCP:", 4) == 0)
> >             printf("%s: %s", label, line);
> >     fclose(f);
> > }
> >
> > int main(void) {
> >     struct sockaddr_in addr = {
> >         .sin_family = AF_INET,
> >         .sin_port = htons(PORT),
> >         .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
> >     };
> >     int opt = 1;
> >     signal(SIGPIPE, SIG_IGN);
> >
> >     int lsn = socket(AF_INET, SOCK_STREAM, 0);
> >     setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
> >     bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
> >     listen(lsn, NUM_CONNS);
> >
> >     /* Fork client: connect N times, never read */
> >     pid_t child = fork();
> >     if (child == 0) {
> >         int fds[NUM_CONNS];
> >         for (int i = 0; i < NUM_CONNS; i++) {
> >             fds[i] = socket(AF_INET, SOCK_STREAM, 0);
> >             connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
> >         }
> >         pause(); /* sit forever, never read */
> >         _exit(0);
> >     }
> >
> >     /* Accept all connections */
> >     int clients[NUM_CONNS];
> >     for (int i = 0; i < NUM_CONNS; i++)
> >         clients[i] = accept(lsn, NULL, NULL);
> >
> >     /* Freeze client so it stops reading */
> >     kill(child, SIGSTOP);
> >     printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
> >     print_mem("BEFORE");
> >
> >     /* Fill buffers and close with SO_LINGER(1,0) */
> >     char buf[2048];
> >     memset(buf, 'A', sizeof(buf));
> >     for (int i = 0; i < NUM_CONNS; i++) {
> >         int flags = fcntl(clients[i], F_GETFL, 0);
> >         fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
> >         while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
> >         struct linger lg = { .l_onoff = 1, .l_linger = 0 };
> >         setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
> >         close(clients[i]);
> >     }
> >
> >     sleep(2);
> >     printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
> >     print_mem("AFTER");
> >     kill(child, SIGKILL);
> >     waitpid(child, NULL, 0);
> >     close(lsn);
> >     return 0;
> > }
> > ```
> > Output (Tested on 6.18.20):
> > ```
> > === 5000 connections established, client frozen ===
> > BEFORE: MemAvailable:   95491288 kB
> > BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
> >
> > === All sockets closed with SO_LINGER(1,0) ===
> > AFTER: MemAvailable:   95321800 kB
> > AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> > ```
>
> Unfortunately, it dies immediately on my end.
>
> === 5000 connections established, client frozen ===
> Segmentation fault         (core dumped) ./linux/tcp_linger

This was due to a small ulimit -n: fopen() returned NULL,
which was then passed to fgets().

But I don't see any leak of memory nor counter after
the repro.

Note that the tcp_mem counter could be cached in
per-cpu counters, see proto_memory_pcpu_drain() etc.

---8<---
[root@fedora ~]# unshare -n
[root@fedora ~]# ip link set lo up
[root@fedora ~]# echo clear > /sys/kernel/debug/kmemleak
[root@fedora ~]# ulimit -n 100000 && ./linux/tcp_linger
=== 5000 connections established, client frozen ===
BEFORE: MemAvailable:   54683048 kB
BEFORE: TCP: inuse 10001 orphan 0 tw 0 alloc 10008 mem 0
=== All sockets closed with SO_LINGER(1,0) ===
AFTER: MemAvailable:   54616304 kB
AFTER: TCP: inuse 1 orphan 0 tw 0 alloc 5008 mem 3842
[root@fedora ~]# cat /proc/net/sockstat
sockets: used 0
TCP: inuse 0 orphan 0 tw 0 alloc 7 mem 0
UDP: inuse 0 mem 0
RAW: inuse 0
FRAG: inuse 0 memory 0
[root@fedora ~]# cat /proc/meminfo | grep Available
MemAvailable:   54732456 kB
[root@fedora ~]# echo scan > /sys/kernel/debug/kmemleak
[root@fedora ~]#
---8<---

^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Herbert Xu @ 2026-04-18  0:53 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeLV6aDhM0-S4oQ1@slm.duckdns.org>

On Fri, Apr 17, 2026 at 02:52:57PM -1000, Tejun Heo wrote:
>
> I see. Thanks, that should work. How should we go about reverting the
> removal?

I'll work on that today and then you can include it in your
two-patch series.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Tejun Heo @ 2026-04-18  0:52 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeLT8eB_xfzLxqbI@gondor.apana.org.au>

Hello,

On Sat, Apr 18, 2026 at 08:44:33AM +0800, Herbert Xu wrote:
> On Fri, Apr 17, 2026 at 06:25:22AM -1000, Tejun Heo wrote:
> >
> > That'd be great but looking at the commit, I'm not sure it reliably avoids
> > allocation in the synchronous path.
> 
> If insecure_elasticity is set it should skip the slow path
> altogether and just do the insertion unconditionally.  So
> there will be no kmallocs at all.

I see. Thanks, that should work. How should we go about reverting the
removal?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Kuniyuki Iwashima @ 2026-04-18  0:44 UTC (permalink / raw)
  To: Ahmed, Aaron
  Cc: stable@vger.kernel.org, netdev@vger.kernel.org,
	ncardwell@google.com, edumazet@google.com
In-Reply-To: <48BADABE-4DFB-4DAD-8248-E94D8F5238D2@amazon.com>

Hi Aaron :)

Thanks for the report.

On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com> wrote:
>
> Hi,
>
> We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
>
> Overview:
>
> The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
>
> Reproducer:
> ```
> /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
>  *
>  * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
>  * Run:    sudo sysctl -w net.core.wmem_max=4194304
>  *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
>  *         ./tcp_linger_memleak
>  */
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <signal.h>
> #include <sys/socket.h>
> #include <sys/wait.h>
> #include <netinet/in.h>
>
> #define NUM_CONNS 5000
> #define PORT      6666
>
> static void print_mem(const char *label) {
>     FILE *f;
>     char line[256];
>     f = fopen("/proc/meminfo", "r");
>     while (fgets(line, sizeof(line), f))
>         if (strncmp(line, "MemAvailable:", 13) == 0)
>             printf("%s: %s", label, line);
>     fclose(f);
>     f = fopen("/proc/net/sockstat", "r");
>     while (fgets(line, sizeof(line), f))
>         if (strncmp(line, "TCP:", 4) == 0)
>             printf("%s: %s", label, line);
>     fclose(f);
> }
>
> int main(void) {
>     struct sockaddr_in addr = {
>         .sin_family = AF_INET,
>         .sin_port = htons(PORT),
>         .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
>     };
>     int opt = 1;
>     signal(SIGPIPE, SIG_IGN);
>
>     int lsn = socket(AF_INET, SOCK_STREAM, 0);
>     setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
>     bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
>     listen(lsn, NUM_CONNS);
>
>     /* Fork client: connect N times, never read */
>     pid_t child = fork();
>     if (child == 0) {
>         int fds[NUM_CONNS];
>         for (int i = 0; i < NUM_CONNS; i++) {
>             fds[i] = socket(AF_INET, SOCK_STREAM, 0);
>             connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
>         }
>         pause(); /* sit forever, never read */
>         _exit(0);
>     }
>
>     /* Accept all connections */
>     int clients[NUM_CONNS];
>     for (int i = 0; i < NUM_CONNS; i++)
>         clients[i] = accept(lsn, NULL, NULL);
>
>     /* Freeze client so it stops reading */
>     kill(child, SIGSTOP);
>     printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
>     print_mem("BEFORE");
>
>     /* Fill buffers and close with SO_LINGER(1,0) */
>     char buf[2048];
>     memset(buf, 'A', sizeof(buf));
>     for (int i = 0; i < NUM_CONNS; i++) {
>         int flags = fcntl(clients[i], F_GETFL, 0);
>         fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
>         while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
>         struct linger lg = { .l_onoff = 1, .l_linger = 0 };
>         setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
>         close(clients[i]);
>     }
>
>     sleep(2);
>     printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
>     print_mem("AFTER");
>     kill(child, SIGKILL);
>     waitpid(child, NULL, 0);
>     close(lsn);
>     return 0;
> }
> ```
> Output (Tested on 6.18.20):
> ```
> === 5000 connections established, client frozen ===
> BEFORE: MemAvailable:   95491288 kB
> BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
>
> === All sockets closed with SO_LINGER(1,0) ===
> AFTER: MemAvailable:   95321800 kB
> AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> ```

Unfortunately, it dies immediately on my end.

=== 5000 connections established, client frozen ===
Segmentation fault         (core dumped) ./linux/tcp_linger


Did you see an actual memory leak with kmemleak, or is it
just the tcp_mem counter that looks leaked?

# echo clear > /sys/kernel/debug/kmemleak
~ run repro ~
# echo scan > /sys/kernel/debug/kmemleak

^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Herbert Xu @ 2026-04-18  0:44 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeJe8oIyYUi-NtCQ@slm.duckdns.org>

On Fri, Apr 17, 2026 at 06:25:22AM -1000, Tejun Heo wrote:
>
> That'd be great but looking at the commit, I'm not sure it reliably avoids
> allocation in the synchronous path.

If insecure_elasticity is set it should skip the slow path
altogether and just do the insertion unconditionally.  So
there will be no kmallocs at all.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Ahmed, Aaron @ 2026-04-18  0:19 UTC (permalink / raw)
  To: stable@vger.kernel.org, netdev@vger.kernel.org
  Cc: ncardwell@google.com, edumazet@google.com, kuniyu@google.com

Hi,

We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.

Overview:

The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.

Reproducer:
```
/* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
 *
 * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
 * Run:    sudo sysctl -w net.core.wmem_max=4194304
 *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
 *         ulimit -n 100000    (each process holds NUM_CONNS sockets)
 *         ./tcp_linger_memleak
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
	
#define NUM_CONNS 5000
#define PORT      6666

/* Print every line of the file at `path` that starts with `prefix`,
 * each one preceded by "<label>: ".
 *
 * If the file cannot be opened the failure is reported on stderr and the
 * file is skipped, so a NULL stream is never handed to fgets()/fclose().
 */
static void print_matching_lines(const char *label, const char *path,
                                 const char *prefix) {
    char line[256];
    size_t prefix_len = strlen(prefix);
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return;
    }
    while (fgets(line, sizeof(line), f))
        if (strncmp(line, prefix, prefix_len) == 0)
            printf("%s: %s", label, line);
    fclose(f);
}

/* Print a memory snapshot tagged with `label`: the "MemAvailable:" line
 * of /proc/meminfo followed by the "TCP:" line of /proc/net/sockstat.
 *
 * label: text printed in front of each reported line.
 *
 * A file that cannot be opened (for example when the process has run out
 * of file descriptors) is reported on stderr and skipped.
 */
static void print_mem(const char *label) {
    print_matching_lines(label, "/proc/meminfo", "MemAvailable:");
    print_matching_lines(label, "/proc/net/sockstat", "TCP:");
}

/* Reproducer entry point.
 *
 * Listens on 127.0.0.1:PORT, forks a client that opens NUM_CONNS
 * connections and never reads from them, accepts all of them, stops the
 * client with SIGSTOP, then for every accepted socket sends until send()
 * stops succeeding and closes it with SO_LINGER {l_onoff=1, l_linger=0}.
 * print_mem() is called before and after that loop so the two snapshots
 * can be compared.
 *
 * Returns 0 when the whole sequence ran, EXIT_FAILURE when a setup step
 * failed (the failing call is reported on stderr).
 */
int main(void) {
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port = htons(PORT),
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
    };
    int clients[NUM_CONNS];
    char buf[2048];
    int accepted = 0;   /* number of entries in clients[] still open */
    int rc = EXIT_FAILURE;
    int opt = 1;
    pid_t child;
    int lsn;

    signal(SIGPIPE, SIG_IGN);

    lsn = socket(AF_INET, SOCK_STREAM, 0);
    if (lsn < 0) {
        perror("socket");
        return EXIT_FAILURE;
    }
    if (setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
        perror("setsockopt(SO_REUSEADDR)");
        goto out_close_listener;
    }
    if (bind(lsn, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        goto out_close_listener;
    }
    if (listen(lsn, NUM_CONNS) < 0) {
        perror("listen");
        goto out_close_listener;
    }

    /* Fork client: connect N times, never read */
    child = fork();
    if (child < 0) {
        /* Must not fall through: kill(-1, ...) would signal every
         * process this user is allowed to signal.
         */
        perror("fork");
        goto out_close_listener;
    }
    if (child == 0) {
        int fds[NUM_CONNS];
        for (int i = 0; i < NUM_CONNS; i++) {
            fds[i] = socket(AF_INET, SOCK_STREAM, 0);
            if (fds[i] < 0) {
                perror("client: socket");
                _exit(1);
            }
            if (connect(fds[i], (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                perror("client: connect");
                _exit(1);
            }
        }
        pause(); /* sit forever, never read */
        _exit(0);
    }

    /* Accept all connections */
    for (accepted = 0; accepted < NUM_CONNS; accepted++) {
        clients[accepted] = accept(lsn, NULL, NULL);
        if (clients[accepted] < 0) {
            perror("accept");
            fprintf(stderr,
                    "hint: each process needs more than %d file descriptors; check `ulimit -n`\n",
                    NUM_CONNS);
            goto out_reap_child;
        }
    }

    /* Freeze client so it stops reading */
    if (kill(child, SIGSTOP) < 0) {
        perror("kill(SIGSTOP)");
        goto out_reap_child;
    }
    printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
    print_mem("BEFORE");

    /* Fill buffers and close with SO_LINGER(1,0) */
    memset(buf, 'A', sizeof(buf));
    for (int i = 0; i < NUM_CONNS; i++) {
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };
        int flags = fcntl(clients[i], F_GETFL, 0);

        if (flags < 0 || fcntl(clients[i], F_SETFL, flags | O_NONBLOCK) < 0) {
            /* Skip the fill loop: on a blocking socket send() would
             * not return once the buffer is full, because the peer
             * is stopped.
             */
            perror("fcntl(O_NONBLOCK)");
        } else {
            while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0)
                ;
        }
        if (setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg)) < 0)
            perror("setsockopt(SO_LINGER)");
        close(clients[i]);
    }
    accepted = 0; /* every client socket has been closed above */

    sleep(2);
    printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
    print_mem("AFTER");
    rc = 0;

out_reap_child:
    /* Only reached with a valid child pid (fork succeeded). */
    for (int i = 0; i < accepted; i++)
        close(clients[i]);
    kill(child, SIGKILL);
    waitpid(child, NULL, 0);
out_close_listener:
    close(lsn);
    return rc;
}
```
Output (Tested on 6.18.20):
```
=== 5000 connections established, client frozen ===
BEFORE: MemAvailable:   95491288 kB
BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0

=== All sockets closed with SO_LINGER(1,0) ===
AFTER: MemAvailable:   95321800 kB
AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
```

Thanks,
Aaron Ahmed



^ permalink raw reply

* [PATCH net] ipv6: Apply max_dst_opts_cnt to ip6_tnl_parse_tlv_enc_lim
From: Daniel Borkmann @ 2026-04-17 22:03 UTC (permalink / raw)
  To: kuba; +Cc: edumazet, dsahern, tom, willemdebruijn.kernel, idosch, pabeni,
	netdev

Commit 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and
Destination options") added net.ipv6.max_{hbh,dst}_opts_{cnt,len}
and applied them in ip6_parse_tlv(), the generic TLV walker
invoked from ipv6_destopt_rcv() and ipv6_parse_hopopts().

ip6_tnl_parse_tlv_enc_lim() does not go through ip6_parse_tlv();
it has its own hand-rolled TLV scanner inside its NEXTHDR_DEST
branch which looks for IPV6_TLV_TNL_ENCAP_LIMIT. That inner
loop is bounded only by optlen, which can be up to 2048 bytes.
Stuffing the Destination Options header with 2046 Pad1 (type=0)
entries advances the scanner a single byte at a time, yielding
~2000 TLV iterations per extension header.

Reuse max_dst_opts_cnt to bound the TLV iterations, matching
the semantics from 47d3d7ac656a.

Fixes: 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv6/ip6_tunnel.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 907c6a2af331..0ab76f93c136 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -430,11 +430,16 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
 				break;
 		}
 		if (nexthdr == NEXTHDR_DEST) {
+			int tlv_max = READ_ONCE(init_net.ipv6.sysctl.max_dst_opts_cnt);
+			int tlv_cnt = 0;
 			u16 i = 2;
 
 			while (1) {
 				struct ipv6_tlv_tnl_enc_lim *tel;
 
+				if (unlikely(tlv_cnt++ >= tlv_max))
+					break;
+
 				/* No more room for encapsulation limit */
 				if (i + sizeof(*tel) > optlen)
 					break;
-- 
2.43.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox