Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 4/4] selftest: Add tests for useful handling of LSM denials on SCM_RIGHTS
From: Jori Koolstra @ 2026-06-16 14:30 UTC (permalink / raw)
  To: brauner, cyphar, Shuah Khan, Kuniyuki Iwashima, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman
  Cc: linux-fsdevel, Jori Koolstra, open list,
	open list:KERNEL SELFTEST FRAMEWORK,
	open list:NETWORKING [GENERAL]
In-Reply-To: <20260616143020.3458085-1-jkoolstra@xs4all.nl>

Tests SCM_RIGHTS fd passing on a socket with the new socket option
SO_RIGHTS_NOTRUNC turned on.

The test uses the following Smack labels:

   "Sender"   - label for the sending process
   "Receiver" - label for the receiving process
   "SecretX"   - labels for the files being passed

Socket communication (Sender <-> Receiver) is always allowed.
The tests control whether Receiver can access "SecretX"-labeled fds.
When the LSM blocks an fd, the receiver should see, in that fd's slot, a
sentinel holding the error returned by the LSM, such as -EACCES.

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 tools/testing/selftests/Makefile              |   1 +
 .../net/af_unix/scm_rights_denial/.gitignore  |   2 +
 .../net/af_unix/scm_rights_denial/Makefile    |  13 ++
 .../net/af_unix/scm_rights_denial/helper.h    |  38 ++++
 .../net/af_unix/scm_rights_denial/receiver.c  | 195 ++++++++++++++++++
 .../scm_rights_denial/scm_rights_denial.sh    | 171 +++++++++++++++
 .../net/af_unix/scm_rights_denial/sender.c    | 126 +++++++++++
 7 files changed, 546 insertions(+)
 create mode 100644 tools/testing/selftests/net/af_unix/scm_rights_denial/.gitignore
 create mode 100644 tools/testing/selftests/net/af_unix/scm_rights_denial/Makefile
 create mode 100644 tools/testing/selftests/net/af_unix/scm_rights_denial/helper.h
 create mode 100644 tools/testing/selftests/net/af_unix/scm_rights_denial/receiver.c
 create mode 100755 tools/testing/selftests/net/af_unix/scm_rights_denial/scm_rights_denial.sh
 create mode 100644 tools/testing/selftests/net/af_unix/scm_rights_denial/sender.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 8d4db2241cc2..7ff876692267 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -74,6 +74,7 @@ TARGETS += mseal_system_mappings
 TARGETS += nci
 TARGETS += net
 TARGETS += net/af_unix
+TARGETS += net/af_unix/scm_rights_denial
 TARGETS += net/can
 TARGETS += net/forwarding
 TARGETS += net/hsr
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/.gitignore b/tools/testing/selftests/net/af_unix/scm_rights_denial/.gitignore
new file mode 100644
index 000000000000..5a1c58ff005f
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/.gitignore
@@ -0,0 +1,2 @@
+sender
+receiver
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/Makefile b/tools/testing/selftests/net/af_unix/scm_rights_denial/Makefile
new file mode 100644
index 000000000000..03eb6d1427d7
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+top_srcdir := ../../../../../..
+include $(top_srcdir)/scripts/Makefile.compiler
+
+cc-option = $(call __cc-option, $(CC),,$(1),$(2))
+
+CFLAGS += $(KHDR_INCLUDES) -Wall $(call cc-option,-Wflex-array-member-not-at-end)
+
+TEST_PROGS := scm_rights_denial.sh
+
+TEST_GEN_FILES := sender receiver
+
+include ../../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/helper.h b/tools/testing/selftests/net/af_unix/scm_rights_denial/helper.h
new file mode 100644
index 000000000000..2ecdf2b8b973
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/helper.h
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <unistd.h>
+#include <fcntl.h>
+
+#ifndef SO_RIGHTS_NOTRUNC
+#define SO_RIGHTS_NOTRUNC 85
+#endif
+
+#define CMSG_IS_SCM_RIGHTS(cmsg) ({		\
+	typeof(cmsg) _cmsg = (cmsg);		\
+	_cmsg &&				\
+	_cmsg->cmsg_level == SOL_SOCKET &&	\
+	_cmsg->cmsg_type == SCM_RIGHTS;		\
+})
+
+#define MIN(a, b) ({ \
+	typeof(a) _a = (a); \
+	typeof(b) _b = (b); \
+	_a < _b ? _a : _b; \
+})
+
+#define MAX_FDS 10
+
+static inline int read_current_label(char *label, size_t size)
+{
+	int fd = open("/proc/self/attr/current", O_RDONLY);
+	if (fd < 0)
+		return fd;
+
+	ssize_t r = read(fd, label, size - 1);
+	close(fd);
+	if (r < 0)
+		return r;
+
+	label[r] = '\0';
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/receiver.c b/tools/testing/selftests/net/af_unix/scm_rights_denial/receiver.c
new file mode 100644
index 000000000000..a9bd49a6e214
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/receiver.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * receiver.c - Receive a file descriptor over a Unix domain socket via SCM_RIGHTS
+ *
+ * Usage: ./receiver <socket_path>
+ *
+ * Listens on the given Unix socket path, accepts a connection, and
+ * attempts to receive file descriptors via SCM_RIGHTS. Reports
+ * whether the fds were delivered or blocked.
+ *
+ * Used for testing LSM (Smack) blocking of fd passing.
+ */
+
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
+#include "helper.h"
+
+#define RECV_LOG(fmt, ...) printf("receiver: " fmt, ##__VA_ARGS__)
+#define RECV_ERR(fmt, ...) fprintf(stderr, "receiver: " fmt, ##__VA_ARGS__)
+
+static int recv_fds(int sock, int *fds)
+{
+	char buf[1];
+	char ctrl[CMSG_SPACE(MAX_FDS * sizeof(int))];
+
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len  = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_iov        = &iov,
+		.msg_iovlen     = 1,
+		.msg_control    = ctrl,
+		.msg_controllen = sizeof(ctrl),
+	};
+
+	ssize_t bytes_read = recvmsg(sock, &msg, 0);
+	if (bytes_read < 0) {
+		perror("receiver: recvmsg");
+		return -1;
+	}
+	if (bytes_read == 0) {
+		RECV_ERR("connection closed, no data received\n");
+		return -1;
+	}
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	if (!CMSG_IS_SCM_RIGHTS(cmsg)) {
+		RECV_ERR("no SCM_RIGHTS in control message\n");
+		return -1;
+	}
+
+	int num_fd_slots = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+	memcpy(fds, CMSG_DATA(cmsg), num_fd_slots  * sizeof(int));
+
+	RECV_LOG("got %d fd slot(s):", num_fd_slots);
+	for (int i = 0; i < num_fd_slots ; i++) {
+		if (fds[i] < 0)
+			printf(" %s", strerrorname_np(-fds[i]));
+		else
+			printf(" %d", fds[i]);
+	}
+	putchar('\n');
+
+	return num_fd_slots;
+}
+
+static inline int print_current_label(void)
+{
+	char label[256];
+	if (!read_current_label(label, sizeof(label))) {
+		RECV_LOG("running with Smack label '%s'\n", label);
+		return 0;
+	}
+	return -1;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc != 2) {
+		fprintf(stderr, "Usage: %s <socket_path>\n", argv[0]);
+		return -1;
+	}
+
+	if (print_current_label()) {
+		RECV_ERR("cannot read process Smack label");
+		return -1;
+	}
+
+	int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (listen_sock < 0) {
+		perror("receiver: socket");
+		return -1;
+	}
+
+	struct sockaddr_un addr = {};
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, argv[1], sizeof(addr.sun_path) - 1);
+
+	/* Remove any stale socket file */
+	unlink(argv[1]);
+
+	if (bind(listen_sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		perror("receiver: bind");
+		return -1;
+	}
+
+	if (listen(listen_sock, 1) < 0) {
+		perror("receiver: listen");
+		return -1;
+	}
+
+	RECV_LOG("listening on '%s'\n", argv[1]);
+
+	int conn_sock = accept(listen_sock, NULL, NULL);
+	if (conn_sock < 0) {
+		perror("receiver: accept");
+		return -1;
+	}
+
+	RECV_LOG("connection accepted\n");
+
+	int one = 1;
+	if (setsockopt(conn_sock, SOL_SOCKET, SO_RIGHTS_NOTRUNC,
+		       &one, sizeof(one)) < 0) {
+		perror("receiver: setsockopt(SO_RIGHTS_NOTRUNC)");
+		goto out_sock;
+	}
+
+	/* Try to receive the fds */
+	int fds[MAX_FDS];
+	int num_fds = recv_fds(conn_sock, fds);
+	if (num_fds < 0)
+		goto out_sock;
+
+	/* Try to use the received fds -- read and print their contents */
+	RECV_LOG("attempting to read from received fds...\n");
+	int i;
+	for (i = 0; i < num_fds; ++i) {
+		char readbuf[256];
+
+		if (fds[i] < 0) {
+			RECV_LOG("fd in position %i blocked\n", i);
+			continue;
+		} else if (fds[i] == 0) {
+			RECV_LOG("bad fd in position %i\n", i);
+			goto out_recv;
+		}
+
+		ssize_t n = read(fds[i], readbuf, sizeof(readbuf) - 1);
+		if (n < 0) {
+			perror("receiver: read from received fd");
+			goto out_recv;
+		}
+
+		readbuf[n] = '\0';
+		RECV_LOG("read %zd bytes from fd at position %i: '%s'\n", n, i, readbuf);
+	}
+
+	RECV_LOG("final result:\n");
+	for (int j = 0; j < num_fds; ++j) {
+		if (fds[j] < 0) {
+			printf("BLOCKED");
+		} else {
+			printf("PASSED");
+			close(fds[j]);
+		}
+		putchar(' ');
+	}
+
+	close(conn_sock);
+	close(listen_sock);
+	unlink(argv[1]);
+	return 0;
+
+out_recv:
+	for (int j = 0; j < num_fds; ++j) {
+		if (fds[j] > 0)
+			close(fds[j]);
+	}
+
+out_sock:
+	close(conn_sock);
+	close(listen_sock);
+	unlink(argv[1]);
+	return -1;
+}
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/scm_rights_denial.sh b/tools/testing/selftests/net/af_unix/scm_rights_denial/scm_rights_denial.sh
new file mode 100755
index 000000000000..9d7d4530cadd
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/scm_rights_denial.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# scm_rights_denial.sh - Test SCM_RIGHTS fd passing using Smack LSM blocking
+#
+# Must be run as root on a kernel with Smack enabled (security=smack).
+# Requires: capsh (libcap), setfattr/getfattr (attr)
+#
+# We use the following Smack labels:
+#   "Sender"   - label for the sending process
+#   "Receiver" - label for the receiving process
+#   "SecretX"  - labels for the files being passed (Secret1, Secret2)
+#
+# Socket communication (Sender <-> Receiver) is always allowed.
+# The tests control whether Receiver can access "SecretX"-labeled fds.
+#
+
+set -e
+
+readonly SENDER="./sender"
+readonly RECEIVER="./receiver"
+
+readonly TESTDIR="$(mktemp -d)"
+readonly SOCK="$TESTDIR/scm_test.sock"
+readonly TESTFILE1="$TESTDIR/secret_1"
+readonly TESTFILE2="$TESTDIR/secret_2"
+
+trap 'rm -rf "$TESTDIR"' EXIT
+
+run_tests() {
+
+	preflight
+	setup
+
+	run_test "TEST 1" \
+		"Receiver should NOT have access to Secret1." \
+		"Receiver Secret1 ---
+Receiver Secret2 ---" \
+		"$TESTFILE1" \
+		"BLOCKED"
+
+	run_test "TEST 2" \
+		"Receiver should have access to Secret1." \
+		"Receiver Secret1 r--
+Receiver Secret2 ---" \
+		"$TESTFILE1" \
+		"PASSED"
+
+	run_test "TEST 3" \
+		"Receiver should have access to Secret2, but NOT Secret1." \
+		"Receiver Secret1 ---
+Receiver Secret2 r--" \
+		"$TESTFILE1 $TESTFILE2" \
+		"BLOCKED PASSED"
+}
+
+run_test() {
+	local name="$1"
+	local description="$2"
+	local rules="$3"
+	local files="$4"
+	local expected="$5"
+	# Loads $rules into Smack, passes $files (word-split on purpose), checks the verdict line.
+	echo ""
+	echo "$name: $description"
+	echo "Rules:"
+	echo "$rules"
+	echo "Expected: $expected"
+	echo ""
+
+	while IFS= read -r rule; do
+		[ -n "$rule" ] && echo "$rule" > /sys/fs/smackfs/load2
+	done <<< "$rules"
+
+	local output last_line status=0
+	# "|| status=$?" keeps "set -e" from aborting before the failure is reported
+	output=$(send_fds "$SOCK" $files) || status=$?
+	echo "$output"
+	last_line=$(echo "$output" | tail -n 1 | xargs)
+
+	if [ "$status" -ne 0 ]; then
+		echo "TEST FAILED: receiver returned $status"
+		return 1
+	fi
+
+	if [[ "$last_line" == "$expected" ]]; then
+		echo "TEST PASSED: outcome was $expected as expected"
+		return 0
+	else
+		echo "TEST FAILED: expected $expected, got '$last_line'"
+		return 1
+	fi
+}
+
+setup() {
+
+	printf "Secret 1" > "$TESTFILE1"
+	printf "Secret 2" > "$TESTFILE2"
+
+	setfattr -n security.SMACK64 -v "Secret1" "$TESTFILE1"
+	setfattr -n security.SMACK64 -v "Secret2" "$TESTFILE2"
+	setfattr -n security.SMACK64 -v "Tmp" /tmp
+	setfattr -n security.SMACK64 -v "Tmp" "$TESTDIR"
+
+	echo "Sender	Receiver	-w-" > /sys/fs/smackfs/load2
+	echo "Receiver	Sender		-w-" > /sys/fs/smackfs/load2
+	echo "Sender	Tmp 		rwx" > /sys/fs/smackfs/load2
+	echo "Receiver	Tmp		rwx" > /sys/fs/smackfs/load2
+	echo "Sender	Secret1		r--" > /sys/fs/smackfs/load2
+	echo "Sender	Secret2		r--" > /sys/fs/smackfs/load2
+}
+
+send_fds() {
+
+	local sk="$1"
+	shift
+	local files="$*"
+
+	(
+	    echo "Receiver" > /proc/self/attr/current
+	    exec capsh --drop=cap_mac_override,cap_mac_admin -- -c "$RECEIVER $sk"
+	) &
+	local recv_pid=$!
+	sleep 1
+
+	(
+	    echo "Sender" > /proc/self/attr/current
+	    exec capsh --drop=cap_mac_override,cap_mac_admin -- -c "$SENDER $sk $files"
+	) || true
+
+	local recv_status=0
+	wait "$recv_pid" || recv_status=$?
+
+	if [ "$recv_status" -ne 0 ]; then
+	    echo "receiver exited with $recv_status"
+	fi
+	return "$recv_status"
+}
+
+preflight() {
+	# Missing prerequisites exit with 4, the kselftest "skip" code, not a failure.
+	if [ "$(id -u)" -ne 0 ]; then
+	    echo "SKIP: must be run as root"
+	    exit 4
+	fi
+
+	if ! grep -q smack /sys/kernel/security/lsm 2>/dev/null; then
+	    echo "SKIP: Smack is not active"
+	    echo "  Check: cat /sys/kernel/security/lsm"
+	    echo "  Boot with: security=smack"
+	    exit 4
+	fi
+
+	if ! mountpoint -q /sys/fs/smackfs 2>/dev/null; then
+	    echo "Mounting smackfs..."
+	    mount -t smackfs smackfs /sys/fs/smackfs
+	fi
+
+	if ! command -v capsh &>/dev/null || ! command -v setfattr &>/dev/null; then
+	    echo "SKIP: capsh or setfattr not found (install libcap / attr)"
+	    exit 4
+	fi
+
+	if [ ! -x "$SENDER" ] || [ ! -x "$RECEIVER" ]; then
+	    echo "ERROR: $SENDER / $RECEIVER not built (run 'make' first)"
+	    exit 1
+	fi
+
+}
+
+run_tests
diff --git a/tools/testing/selftests/net/af_unix/scm_rights_denial/sender.c b/tools/testing/selftests/net/af_unix/scm_rights_denial/sender.c
new file mode 100644
index 000000000000..b1c76d23b8bd
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights_denial/sender.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * sender.c - Send file descriptors over a Unix domain socket via SCM_RIGHTS
+ *
+ * Usage: ./sender <socket_path> <file_to_send> [<file_to_send>...]
+ *
+ * Opens the specified files and sends their fds to a receiver connected
+ * on the given Unix socket path. Used for testing LSM blocking of fd
+ * passing.
+ */
+
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+
+#include "helper.h"
+
+#define SEND_LOG(fmt, ...) fprintf(stdout, "sender: " fmt, ##__VA_ARGS__)
+#define SEND_ERR(fmt, ...) fprintf(stderr, "sender: " fmt, ##__VA_ARGS__)
+
+static int send_fds(int sock, int *fds, int num_fds)
+{
+	if (num_fds > MAX_FDS)
+		return -1;
+
+	char buf[1] = { 'X' };
+	char ctrl[CMSG_SPACE(MAX_FDS * sizeof(int))] = { 0 };
+
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len  = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_iov        = &iov,
+		.msg_iovlen     = 1,
+		.msg_control    = ctrl,
+		.msg_controllen = CMSG_SPACE(num_fds * sizeof(int)),
+	};
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type  = SCM_RIGHTS;
+	cmsg->cmsg_len   = CMSG_LEN(num_fds * sizeof(int));
+	memcpy(CMSG_DATA(cmsg), fds, num_fds * sizeof(int));
+
+	ssize_t bytes_send = sendmsg(sock, &msg, 0);
+	if (bytes_send < 0) {
+		perror("sender: sendmsg");
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline int print_current_label(void)
+{
+	char label[256];
+	if (!read_current_label(label, sizeof(label))) {
+		SEND_LOG("running with Smack label '%s'\n", label);
+		return 0;
+	}
+	return -1;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc < 3 || argc > 2 + MAX_FDS) {
+		fprintf(stderr, "Usage: %s <socket_path> <file_to_send> [<file_to_send>...]\n"
+			"Up to a maximum of %d files\n", argv[0], MAX_FDS);
+		return -1;
+	}
+
+	if (print_current_label()) {
+		SEND_ERR("cannot read process Smack label\n");
+		return -1;
+	}
+
+	int ret = -1;	/* reported as failure until send_fds() has succeeded */
+	int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0) {
+		perror("sender: socket");
+		return -1;
+	}
+
+	struct sockaddr_un addr = {};
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, argv[1], sizeof(addr.sun_path) - 1);
+
+	if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		perror("sender: connect");
+		goto out_sock;
+	}
+
+	SEND_LOG("connected to '%s'\n", argv[1]);
+	/* i counts the files opened so far, so out_opened closes only those. */
+	int fds[MAX_FDS];
+	int i, num_files = argc - 2;
+	for (i = 0; i < num_files; i++) {
+		fds[i] = open(argv[2 + i], O_RDONLY);
+		if (fds[i] < 0) {
+			perror("sender: open file");
+			goto out_opened;
+		}
+		SEND_LOG("opened '%s' as fd %d\n", argv[2 + i], fds[i]);
+	}
+
+	if (send_fds(sock, fds, num_files) < 0)
+		goto out_opened;
+
+	SEND_LOG("fds successfully sent:");
+	for (int j = 0; j < num_files; j++)
+		printf(" %d", fds[j]);
+	putchar('\n');
+	ret = 0;
+
+out_opened:
+	for (int j = 0; j < i; j++)
+		close(fds[j]);
+out_sock:
+	close(sock);
+	return ret;
+}
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 3/4] net: af_unix: replace copy_from_sockptr() with copy_safe_from_sockptr()
From: Jori Koolstra @ 2026-06-16 14:30 UTC (permalink / raw)
  To: brauner, cyphar, Kuniyuki Iwashima, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman
  Cc: linux-fsdevel, Jori Koolstra, open list:NETWORKING [GENERAL],
	open list
In-Reply-To: <20260616143020.3458085-1-jkoolstra@xs4all.nl>

Replace deprecated call to copy_from_sockptr() with
copy_safe_from_sockptr().

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 net/unix/af_unix.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4e1463ee2815..eb4051f3aae7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -933,6 +933,7 @@ static int unix_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct unix_sock *u = unix_sk(sock->sk);
 	struct sock *sk = sock->sk;
+	int error;
 	int val;
 
 	if (level != SOL_SOCKET)
@@ -941,11 +942,9 @@ static int unix_setsockopt(struct socket *sock, int level, int optname,
 	if (!unix_custom_sockopt(optname))
 		return sock_setsockopt(sock, level, optname, optval, optlen);
 
-	if (optlen != sizeof(int))
-		return -EINVAL;
-
-	if (copy_from_sockptr(&val, optval, sizeof(val)))
-		return -EFAULT;
+	error = copy_safe_from_sockptr(&val, sizeof(val), optval, optlen);
+	if (error)
+		return error;
 
 	switch (optname) {
 	case SO_INQ:
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 2/4] net: af_unix: Useful handling of LSM denials on SCM_RIGHTS
From: Jori Koolstra @ 2026-06-16 14:30 UTC (permalink / raw)
  To: brauner, cyphar, Alexander Viro, Jan Kara, Kuniyuki Iwashima,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Arnd Bergmann, Willem de Bruijn
  Cc: linux-fsdevel, Jori Koolstra, Jeff Layton, open list,
	open list:NETWORKING [GENERAL],
	open list:GENERIC INCLUDE/ASM HEADER FILES
In-Reply-To: <20260616143020.3458085-1-jkoolstra@xs4all.nl>

Right now, if an LSM such as Smack denies an AF_UNIX socket peer
permission to receive an SCM_RIGHTS fd, the SCM_RIGHTS fd array is cut
short at that point, and MSG_CTRUNC is set on return from recvmsg().
This is highly problematic behaviour, because it leaves the receiver
wondering what happened. As per the man page, MSG_CTRUNC is supposed to
indicate that the control buffer was too small, but a permission error
results in the exact same flag being set. Moreover, the receiver has no
way to determine how many fds were originally sent and how many were
suppressed.[1]

Add a SO_RIGHTS_NOTRUNC option to UNIX sockets to enable more useful
handling of LSM denials when receiving SCM_RIGHTS messages: instead of
truncating the message at the first blocked fd, keep every fd slot
and store the LSM errno in the blocked slot.

[1]: https://github.com/uapi-group/kernel-features#useful-handling-of-lsm-denials-on-scm_rights

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 fs/file.c                         | 48 ++++++++++++++++++++-----------
 include/linux/file.h              |  2 ++
 include/net/af_unix.h             |  1 +
 include/net/scm.h                 | 15 +++++++---
 include/uapi/asm-generic/socket.h |  3 ++
 net/compat.c                      |  4 +--
 net/core/scm.c                    | 13 +++++----
 net/unix/af_unix.c                |  9 ++++++
 8 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 628ca07dc4b1..2bc22cc69e84 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1367,6 +1367,25 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
 	return err;
 }
 
+static int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+{
+	int error;
+
+	FD_PREPARE(fdf, o_flags, file);
+	if (fdf.err)
+		return fdf.err;
+	get_file(file);
+
+	if (ufd) {
+		error = put_user(fd_prepare_fd(fdf), ufd);
+		if (error)
+			return error;
+	}
+
+	__receive_sock(fd_prepare_file(fdf));
+	return fd_publish(fdf);
+}
+
 /**
  * receive_fd() - Install received file into file descriptor table
  * @file: struct file that was received from another process
@@ -1384,27 +1403,24 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
  */
 int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
 {
-	int error;
-
-	error = security_file_receive(file);
+	int error = security_file_receive(file);
 	if (error)
 		return error;
+	return __receive_fd(file, ufd, o_flags);
+}
+EXPORT_SYMBOL_GPL(receive_fd);
 
-	FD_PREPARE(fdf, o_flags, file);
-	if (fdf.err)
-		return fdf.err;
-	get_file(file);
-
-	if (ufd) {
-		error = put_user(fd_prepare_fd(fdf), ufd);
-		if (error)
-			return error;
+int receive_fd_filtered(struct file *file, int __user *ufd, unsigned int o_flags,
+		bool *filtered)
+{
+	int error = security_file_receive(file);
+	if (error) {
+		*filtered = true;
+		return error;
 	}
-
-	__receive_sock(fd_prepare_file(fdf));
-	return fd_publish(fdf);
+	*filtered = false;
+	return __receive_fd(file, ufd, o_flags);
 }
-EXPORT_SYMBOL_GPL(receive_fd);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
 {
diff --git a/include/linux/file.h b/include/linux/file.h
index 27484b444d31..748f08470bb4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -119,6 +119,8 @@ DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))
 extern void fd_install(unsigned int fd, struct file *file);
 
 int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);
+int receive_fd_filtered(struct file *file, int __user *ufd, unsigned int o_flags,
+		bool *filtered);
 
 int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);
 
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 34f53dde65ce..bb1b3dee02e8 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -49,6 +49,7 @@ struct unix_sock {
 	struct scm_stat		scm_stat;
 	int			inq_len;
 	bool			recvmsg_inq;
+	bool			scm_rights_notrunc;
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 	struct sk_buff		*oob_skb;
 #endif
diff --git a/include/net/scm.h b/include/net/scm.h
index c52519669349..761cda0803fb 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -50,8 +50,8 @@ struct scm_cookie {
 #endif
 };
 
-void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm);
-void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm);
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm, bool notrunc);
+void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm, bool notrunc);
 int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm);
 void __scm_destroy(struct scm_cookie *scm);
 struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl);
@@ -108,11 +108,18 @@ void scm_recv_unix(struct socket *sock, struct msghdr *msg,
 		   struct scm_cookie *scm, int flags);
 
 static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
-				  unsigned int flags)
+				  unsigned int flags, bool notrunc)
 {
+	bool filtered;
+	int error;
+
 	if (!ufd)
 		return -EFAULT;
-	return receive_fd(f, ufd, flags);
+
+	error = receive_fd_filtered(f, ufd, flags, &filtered);
+	if (filtered && notrunc)
+		return put_user(error, ufd);
+	return error;
 }
 
 #endif /* __LINUX_NET_SCM_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 53b5a8c002b1..c5fb2ee96830 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -150,6 +150,9 @@
 #define SO_INQ			84
 #define SCM_INQ			SO_INQ
 
+#define SO_RIGHTS_NOTRUNC	85
+#define SCM_RIGHTS_NOTRUNC	SO_RIGHTS_NOTRUNC
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/compat.c b/net/compat.c
index d68cf9c3aad5..6bdf4a2c9077 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -286,7 +286,7 @@ static int scm_max_fds_compat(struct msghdr *msg)
 	return (msg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
 }
 
-void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
+void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm, bool notrunc)
 {
 	struct compat_cmsghdr __user *cm =
 		(struct compat_cmsghdr __user *)msg->msg_control_user;
@@ -296,7 +296,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
 	int err = 0, i;
 
 	for (i = 0; i < fdmax; i++) {
-		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
+		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags, notrunc);
 		if (err < 0)
 			break;
 	}
diff --git a/net/core/scm.c b/net/core/scm.c
index a73b1eb30fd2..1ef4e9431661 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -351,7 +351,7 @@ static int scm_max_fds(struct msghdr *msg)
 	return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
 }
 
-void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm, bool notrunc)
 {
 	struct cmsghdr __user *cm =
 		(__force struct cmsghdr __user *)msg->msg_control_user;
@@ -365,12 +365,12 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 		return;
 
 	if (msg->msg_flags & MSG_CMSG_COMPAT) {
-		scm_detach_fds_compat(msg, scm);
+		scm_detach_fds_compat(msg, scm, notrunc);
 		return;
 	}
 
 	for (i = 0; i < fdmax; i++) {
-		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
+		err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags, notrunc);
 		if (err < 0)
 			break;
 	}
@@ -542,8 +542,11 @@ void scm_recv_unix(struct socket *sock, struct msghdr *msg,
 	if (!__scm_recv_common(sock->sk, msg, scm, flags))
 		return;
 
-	if (scm->fp)
-		scm_detach_fds(msg, scm);
+	if (scm->fp) {
+		struct unix_sock *u = unix_sk(sock->sk);
+		bool notrunc = READ_ONCE(u->scm_rights_notrunc);
+		scm_detach_fds(msg, scm, notrunc);
+	}
 
 	if (sock->sk->sk_scm_pidfd)
 		scm_pidfd_recv(msg, scm);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0d9cd977c7b7..4e1463ee2815 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -921,6 +921,7 @@ static bool unix_custom_sockopt(int optname)
 {
 	switch (optname) {
 	case SO_INQ:
+	case SO_RIGHTS_NOTRUNC:
 		return true;
 	default:
 		return false;
@@ -956,6 +957,14 @@ static int unix_setsockopt(struct socket *sock, int level, int optname,
 
 		WRITE_ONCE(u->recvmsg_inq, val);
 		break;
+
+	case SO_RIGHTS_NOTRUNC:
+		if (val > 1 || val < 0)
+			return -EINVAL;
+
+		WRITE_ONCE(u->scm_rights_notrunc, val);
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
2.54.0


^ permalink raw reply related

* [PATCH v2 1/4] net: scm: move scm_detach_fds() from common path to scm_recv_unix()
From: Jori Koolstra @ 2026-06-16 14:30 UTC (permalink / raw)
  To: brauner, cyphar, Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni,
	Willem de Bruijn, David S. Miller, Jakub Kicinski, Simon Horman
  Cc: linux-fsdevel, Jori Koolstra, open list:NETWORKING [GENERAL],
	open list

scm->fp can only be set when using UNIX sockets, so move the
scm_detach_fds() call out of the common path __scm_recv_common() and
into scm_recv_unix().

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 net/core/scm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/core/scm.c b/net/core/scm.c
index eec13f50ecaf..a73b1eb30fd2 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -523,9 +523,6 @@ static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
 
 	scm_passec(sk, msg, scm);
 
-	if (scm->fp)
-		scm_detach_fds(msg, scm);
-
 	return true;
 }
 
@@ -545,6 +542,9 @@ void scm_recv_unix(struct socket *sock, struct msghdr *msg,
 	if (!__scm_recv_common(sock->sk, msg, scm, flags))
 		return;
 
+	if (scm->fp)
+		scm_detach_fds(msg, scm);
+
 	if (sock->sk->sk_scm_pidfd)
 		scm_pidfd_recv(msg, scm);
 

base-commit: 6b5a2b7d9bc156e505f09e698d85d6a1547c1206
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH 0/4] vhost/vsock: add support for VHOST_RESET_OWNER and CPR migration
From: Stefano Garzarella @ 2026-06-16 14:28 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <4fa88fa6-a188-4c63-876c-ed748809bf0b@virtuozzo.com>

On Tue, Jun 16, 2026 at 05:01:34PM +0300, Andrey Drobyshev wrote:
>Hello Stefano,
>
>On 6/16/26 4:35 PM, Stefano Garzarella wrote:
>> Hi Andrey,
>> thanks for the series!
>>
>> On Fri, Jun 12, 2026 at 07:57:14PM +0300, Andrey Drobyshev wrote:
>>> Host<-->guest connections via AF_VSOCK sockets aren't supposed to
>>> outlive VM migration, since VM is moving to another host.  However
>>> there's a special case, which is QEMU live-update, or CPR
>>> (checkpoint-restore) migration.  In this case, VM remains on the same
>>> host, and we'd like such connections to persist.
>>
>> In the spec we have VIRTIO_VSOCK_EVENT_TRANSPORT_RESET which is usually
>> sent by the device after a migration.
>>
>> IIUC the specs don't say this has to be done all the time, so we don't
>> need to change anything in the specs, right?
>>
>> We just need to avoid sending it (which I think is what we're doing
>> here... I still need to look at the patches).
>>
>
>Sending this exact ioctl is guarded by one of my patches in the QEMU
>counterpart series:
>
>https://lore.kernel.org/qemu-devel/20260612165110.431376-6-andrey.drobyshev@virtuozzo.com/
>
>So we indeed avoid sending it on migration target in case of CPR migration.

Great, so we are aligned :-)

>
>>>
>>> For this to work, we need to be able to transfer device ownership from
>>> source QEMU to dest QEMU.  Namely, source needs to reset ownership by
>>> issuing VHOST_RESET_OWNER ioctl, and then target has to claim it by
>>> calling VHOST_SET_OWNER.
>>>
>>> Since VHOST_RESET_OWNER isn't yet implemented for vhost-vsock, let's add
>>> such implementation (patches 1-2).  Also fix regression introduced by
>>> the earlier commit [1] (patch 3), and fix the deadlock bug (commit 4).
>>
>> If it's a regression, should we fix it separately?
>>
>> Or is it related to this series?
>>
>
>Probably my wording wasn't quite correct.  I posted this patch here
>because we found the problem during testing this particular
>functionality, i.e. vsock data transfer + CPR migration.  And the
>problem was introduced by a recent commit, which is fine on its own, but
>breaks the CPR case.

Yeah, I figured out while reviewing the patch.
I'd avoid "regression" here and just use "issue", because in the end it
is only affecting this work, which is not yet merged, so it can't be a
regression.

Thanks,
Stefano


^ permalink raw reply

* Re: [PATCH net 1/1] net: smc: fix splice entry lifetime imbalance in smc_rx_splice
From: Sidraya Jayagond @ 2026-06-16 14:27 UTC (permalink / raw)
  To: Ren Wei, linux-rdma, linux-s390, netdev
  Cc: alibuda, dust.li, wenjia, mjambigi, tonylu, guwen, ubraun,
	stefan.raspl, davem, yuantan098, zcliangcn, bird, lx24,
	d4n.for.sec
In-Reply-To: <192d1b44ed358ca143f44ef167d14153bccc51e9.1781097957.git.d4n.for.sec@gmail.com>



On 10/06/26 11:24 pm, Ren Wei wrote:
> From: Daming Li <d4n.for.sec@gmail.com>
> 
> smc_rx_splice() hands candidate pages to splice_to_pipe() without taking
> references for the lifetime of each splice entry first. That breaks the
> splice ownership contract in the VM-backed RMB path.
> 
> splice_to_pipe() drops unqueued entries through spd_release(), while
> queued entries are later dropped through the pipe buffer release
> callback. The current code only tries to take page references after the
> splice succeeds, and it derives the number of queued VM pages from a
> mutated offset value. This can underflow page refcounts and trigger a
> use-after-free. It also leaves the socket lifetime imbalanced in the
> multi-page VM case, where one sock_hold() can be followed by multiple
> sock_put() calls.
> 
> Fix this by taking the page and socket references for every candidate
> splice entry before calling splice_to_pipe(), and by releasing the
> matching private state, page reference, and socket reference from
> smc_rx_spd_release() for entries that never get queued. This makes the
> SMC splice path follow the normal splice lifetime rules and removes the
> broken post-splice VM page counting entirely.
> 
> Fixes: 9014db202cb7 ("smc: add support for splice()")
> Cc: stable@vger.kernel.org
> Reported-by: Yuan Tan <yuantan098@gmail.com>
> Reported-by: Zhengchuan Liang <zcliangcn@gmail.com>
> Reported-by: Xin Liu <bird@lzu.edu.cn>
> Assisted-by: Codex:GPT-5.4
> Co-developed-by: Liu Xiao <lx24@stu.ynu.edu.cn>
> Signed-off-by: Liu Xiao <lx24@stu.ynu.edu.cn>
> Signed-off-by: Daming Li <d4n.for.sec@gmail.com>
> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> ---
>  net/smc/smc_rx.c | 21 +++++++++++----------
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
> index c1d9b923938d..88aee0d93597 100644
> --- a/net/smc/smc_rx.c
> +++ b/net/smc/smc_rx.c
> @@ -150,18 +150,23 @@ static const struct pipe_buf_operations smc_pipe_ops = {
>  static void smc_rx_spd_release(struct splice_pipe_desc *spd,
>  			       unsigned int i)
>  {
> +	struct smc_spd_priv *priv = (struct smc_spd_priv *)spd->partial[i].private;
> +	struct sock *sk = &priv->smc->sk;
> +
> +	kfree(priv);
>  	put_page(spd->pages[i]);
> +	sock_put(sk);
>  }
>  
>  static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
>  			 struct smc_sock *smc)
>  {
>  	struct smc_link_group *lgr = smc->conn.lgr;
> -	int offset = offset_in_page(src);
>  	struct partial_page *partial;
>  	struct splice_pipe_desc spd;
>  	struct smc_spd_priv **priv;
>  	struct page **pages;
> +	int offset = offset_in_page(src);
>  	int bytes, nr_pages;
>  	int i;
>  
> @@ -209,6 +214,10 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
>  			offset = 0;
>  		}
>  	}
> +	for (i = 0; i < nr_pages; i++) {
> +		get_page(pages[i]);
> +		sock_hold(&smc->sk);
> +	}
>  	spd.nr_pages_max = nr_pages;
>  	spd.nr_pages = nr_pages;
>  	spd.pages = pages;
> @@ -217,16 +226,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
>  	spd.spd_release = smc_rx_spd_release;
>  
>  	bytes = splice_to_pipe(pipe, &spd);
> -	if (bytes > 0) {
> -		sock_hold(&smc->sk);
> -		if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) {
> -			for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++)
> -				get_page(pages[i]);
> -		} else {
> -			get_page(smc->conn.rmb_desc->pages);
> -		}
> +	if (bytes > 0)
>  		atomic_add(bytes, &smc->conn.splice_pending);
> -	}
>  	kfree(priv);
>  	kfree(partial);
>  	kfree(pages);
Code changes look good to me.
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>

^ permalink raw reply

* [syzbot] [net?] WARNING in tls_err_abort
From: syzbot @ 2026-06-16 14:27 UTC (permalink / raw)
  To: davem, edumazet, horms, john.fastabend, kuba, linux-kernel,
	netdev, pabeni, sd, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    f6033078a9e6 ip6_tunnel: annotate data-races around t->err..
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=122a98ae580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=8697a140486f5628
dashboard link: https://syzkaller.appspot.com/bug?extid=cca46a9d1276f38af2ae
compiler:       Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/7af9eb2b9b5a/disk-f6033078.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/4b7e03b76e68/vmlinux-f6033078.xz
kernel image: https://storage.googleapis.com/syzbot-assets/38042dd09caa/bzImage-f6033078.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+cca46a9d1276f38af2ae@syzkaller.appspotmail.com

------------[ cut here ]------------
err >= 0
WARNING: net/tls/tls_sw.c:73 at tls_err_abort+0x5d/0x80 net/tls/tls_sw.c:73, CPU#0: kworker/0:11/6099
Modules linked in:
CPU: 0 UID: 0 PID: 6099 Comm: kworker/0:11 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026
Workqueue: pencrypt_serial padata_serial_worker
RIP: 0010:tls_err_abort+0x5d/0x80 net/tls/tls_sw.c:73
Code: e8 03 48 b9 00 00 00 00 00 fc ff df 0f b6 04 08 84 c0 75 1b 89 ab 9c 01 00 00 48 89 df 5b 5d e9 c9 a2 32 ff e8 a4 60 8a f7 90 <0f> 0b 90 eb c3 89 f9 80 e1 07 80 c1 03 38 c1 7c d9 e8 1d 9f f5 f7
RSP: 0018:ffffc900069379e0 EFLAGS: 00010293
RAX: ffffffff8a3adf8c RBX: ffff88807d1e0d80 RCX: ffff888058bfdd00
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000ffffffff
RBP: 0000000000000000 R08: ffffe8ffffc513e3 R09: 1ffffd1ffff8a27c
R10: dffffc0000000000 R11: ffffffff8a3c4d70 R12: ffff888028eaf400
R13: ffff88804441030c R14: dffffc0000000000 R15: ffff888028eaf460
FS:  0000000000000000(0000) GS:ffff8881252a0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f521f503ff8 CR3: 0000000086fc2000 CR4: 00000000003526f0
Call Trace:
 <TASK>
 tls_encrypt_done+0x223/0x480 net/tls/tls_sw.c:500
 padata_serial_worker+0x2b9/0x430 kernel/padata.c:343
 process_one_work kernel/workqueue.c:3314 [inline]
 process_scheduled_works+0xa8e/0x14e0 kernel/workqueue.c:3397
 worker_thread+0xa47/0xfb0 kernel/workqueue.c:3478
 kthread+0x389/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH 2/4] vhost/vsock: add VHOST_RESET_OWNER ioctl
From: Stefano Garzarella @ 2026-06-16 14:26 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <129f5833-3a7f-4b2d-a965-20903e4e2fb5@virtuozzo.com>

On Tue, Jun 16, 2026 at 05:10:38PM +0300, Andrey Drobyshev wrote:
>On 6/16/26 4:48 PM, Stefano Garzarella wrote:
>> On Fri, Jun 12, 2026 at 07:57:16PM +0300, Andrey Drobyshev wrote:
>>> From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>>>
>>> This ioctl is needed for QEMU's CPR (checkpoint-restore) migration of
>>> the guest with vhost-vsock device.  For this to work, we need to reset
>>> the device ownership on the source side by calling RESET_OWNER, and then
>>> claim it on the dest side by calling SET_OWNER.  We expect not to lose any
>>> AF_VSOCK connection while this happens.
>>>
>>> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>>> ---
>>> drivers/vhost/vsock.c | 28 ++++++++++++++++++++++++++++
>>> 1 file changed, 28 insertions(+)
>>>
>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>> index b12221ce6faf..e629886e5cf8 100644
>>> --- a/drivers/vhost/vsock.c
>>> +++ b/drivers/vhost/vsock.c
>>> @@ -894,6 +894,32 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>>> 	return -EFAULT;
>>> }
>>>
>>> +static int vhost_vsock_reset_owner(struct vhost_vsock *vsock)
>>> +{
>>> +	struct vhost_iotlb *umem;
>>> +	long err;
>>> +
>>> +	mutex_lock(&vsock->dev.mutex);
>>> +	err = vhost_dev_check_owner(&vsock->dev);
>>> +	if (err)
>>> +		goto done;
>>> +	umem = vhost_dev_reset_owner_prepare();
>>> +	if (!umem) {
>>> +		err = -ENOMEM;
>>> +		goto done;
>>> +	}
>>> +	/* Follows vhost_vsock_dev_release closely except for guest_cid drop */
>>> +	vsock_for_each_connected_socket(&vhost_transport.transport,
>>> +					vhost_vsock_reset_orphans);
>>
>> In vhost_vsock_reset_orphans() we have:
>>
>> 	rcu_read_lock();
>>
>> 	/* If the peer is still valid, no need to reset connection */
>> 	if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk))) {
>> 		rcu_read_unlock();
>> 		return;
>> 	}
>>
>> IIUC we are not removing the guest cid from the hash table, so this
>> check will be always true, and nothing is done.
>>
>> So, is this call really useful?
>>
>
>You're right, and it's most probably an artifact from mimicking the
>vhost_vsock_dev_release() implementation, as mentioned in the comment.
>In our case this whole iteration is a no-op, we better remove it.
>
>BTW earlier I received some feedback from Sashiko AI reviewer, which
>also spotted that same issue (and some more interesting races):
>
>https://sashiko.dev/#/patchset/20260612165718.433546-1-andrey.drobyshev@virtuozzo.com

Oh, they seem similar to the Claude comments I included in my comment on
patch 3.

Yeah, we should take a look; they seem to be real issues.

>
>Apparently it only CC's its reviews to kvm@vger.kernel.org so you can't
>see them right away.  Just wanted to let you know to save your time
>here.  I'll send a v2 with respect to Sashiko remarks.  But of course
>would be great if you spot some more issues here.
>

Thanks for pointing that out, but in general I try to do my reviews
before looking at AI reviews (either Sashiko or Claude locally) to avoid
being too biased.

Thanks,
Stefano


^ permalink raw reply

* [PATCH net] net: ena: clean up XDP TX queues when regular TX setup fails
From: Dawei Feng @ 2026-06-16 14:24 UTC (permalink / raw)
  To: akiyano
  Cc: darinzon, andrew+netdev, davem, edumazet, kuba, pabeni, ast,
	daniel, hawk, john.fastabend, sdf, sameehj, netdev, linux-kernel,
	bpf, jianhao.xu, Dawei Feng, stable

create_queues_with_size_backoff() creates XDP TX queues before setting
up the regular TX path. If the subsequent allocation or creation of
regular TX queues fails, the error handling paths omit the teardown of the
XDP TX queues, leading to a resource leak.

Fix this by explicitly destroying the XDP TX queue subset at the two
missing failure points.

The bug was first flagged by an experimental analysis tool we are
developing for kernel memory-management bugs while analyzing
v6.13-rc1. The tool is still under development and is not yet publicly
available. Manual inspection confirms that the bug is still
present in v7.1-rc7.

An x86_64 allyesconfig build showed no new warnings. As we do not have
an ENA device to test with, no runtime testing was able to be performed.

Fixes: 548c4940b9f1 ("net: ena: Implement XDP_TX action")
Cc: stable@vger.kernel.org
Signed-off-by: Dawei Feng <dawei.feng@seu.edu.cn>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 23 ++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 92d149d4f091..5d05020a6d05 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -752,6 +752,18 @@ static void ena_destroy_all_tx_queues(struct ena_adapter *adapter)
 	}
 }
 
+static void ena_destroy_xdp_tx_queues(struct ena_adapter *adapter)
+{
+	u16 ena_qid;
+	int i;
+
+	for (i = adapter->xdp_first_ring;
+	     i < adapter->xdp_first_ring + adapter->xdp_num_queues; i++) {
+		ena_qid = ENA_IO_TXQ_IDX(i);
+		ena_com_destroy_io_queue(adapter->ena_dev, ena_qid);
+	}
+}
+
 static void ena_destroy_all_rx_queues(struct ena_adapter *adapter)
 {
 	u16 ena_qid;
@@ -2078,14 +2090,21 @@ static int create_queues_with_size_backoff(struct ena_adapter *adapter)
 		rc = ena_setup_tx_resources_in_range(adapter,
 						     0,
 						     adapter->num_io_queues);
-		if (rc)
+		if (rc) {
+			ena_destroy_xdp_tx_queues(adapter);
+			ena_free_all_io_tx_resources_in_range(adapter,
+							      adapter->xdp_first_ring,
+							      adapter->xdp_num_queues);
 			goto err_setup_tx;
+		}
 
 		rc = ena_create_io_tx_queues_in_range(adapter,
 						      0,
 						      adapter->num_io_queues);
-		if (rc)
+		if (rc) {
+			ena_destroy_xdp_tx_queues(adapter);
 			goto err_create_tx_queues;
+		}
 
 		rc = ena_setup_all_rx_resources(adapter);
 		if (rc)
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH 4/4] vhost/vsock: re-scan TX virtqueue on device start
From: Stefano Garzarella @ 2026-06-16 14:23 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <20260612165718.433546-5-andrey.drobyshev@virtuozzo.com>

On Fri, Jun 12, 2026 at 07:57:18PM +0300, Andrey Drobyshev wrote:
>During QEMU CPR live-update (and VHOST_RESET_OWNER in general) the guest
>keeps running while the host drops and later re-attaches vhost backends.
>If the guest adds a buffer to the TX virtqueue (guest->host) and kicks
>while the backend is temporarily NULL (between vhost_vsock_drop_backends()
>and the next vhost_vsock_start()), then the kick is delivered to the
>vhost worker, handle_tx_kick() sees a NULL backend and returns, and the
>kick signal is consumed.  The buffer is then left in the ring.
>
>Then upon device start vhost_vsock_start() only re-kicks the RX send
>worker, never the TX VQ, so the buffer is processed only if the guest
>happens to kick again.  But if the guest itself is now waiting for data
>from the host, it will never kick TX VQ again, and we end up in a
>deadlock.
>
>The deadlock is reproduced during active host->guest socat data transfer
>under multiple consecutive CPR live-update's.
>
>To fix this, in vhost_vsock_start(), after kicking the RX send worker, also
>queue the TX vq poll so any buffers the guest enqueued while we were paused
>get scanned.

Again, it seems like we're fixing an issue that existed before this
series, but IIUC without support for VHOST_RESET_OWNER, this could never
have happened, so the wording should be changed to make it clear that
this can happen only with the new VHOST_RESET_OWNER support.

In addition, this patch must also be applied before the 
VHOST_RESET_OWNER support or merged into it.

>
>Signed-off-by: Andrey Drobyshev <andrey.drobyshev@virtuozzo.com>
>---
> drivers/vhost/vsock.c | 6 ++++++
> 1 file changed, 6 insertions(+)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index bcaba36becd7..1fcfe71d18be 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -655,6 +655,12 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
> 	 */
> 	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);
>
>+	/*
>+	 * Some packets might've also been queued in TX VQ.  Re-scan it here,
>+	 * mirroring the RX send-worker kick above.
>+	 */

Can we also mention that this is related to VHOST_RESET_OWNER?

Thanks,
Stefano

>+	vhost_poll_queue(&vsock->vqs[VSOCK_VQ_TX].poll);
>+
> 	mutex_unlock(&vsock->dev.mutex);
> 	return 0;
>
>-- 
>2.47.1
>


^ permalink raw reply

* Re: [PATCH 3/4] vhost/vsock: suppress EHOSTUNREACH fast-fail during CPR pause
From: Stefano Garzarella @ 2026-06-16 14:18 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <20260612165718.433546-4-andrey.drobyshev@virtuozzo.com>

On Fri, Jun 12, 2026 at 07:57:17PM +0300, Andrey Drobyshev wrote:
>From: "Denis V. Lunev" <den@openvz.org>
>
>Earlier commit ("ms/vhost/vsock: Refuse the connection immediately when

Please follow 
https://docs.kernel.org/process/submitting-patches.html#describe-your-changes 
on how to refer to a commit.

>guest isn't ready") added a fast-fail in vhost_transport_send_pkt().  It
>rejects every host send with -EHOSTUNREACH until the destination calls
>SET_RUNNING(1).  The fast-fail condition checks whether device's backends
>are dropped, and if they're, the guest is considered to be not ready.

Okay, so it's not a regression, I mean without this series that patch is 
not adding any regression, no?

If it's the case, I'll change the wording in the cover letter.

>
>However, there might be other reasons for backends to be nulled.  In
>particular, when QEMU is performing CPR (checkpoint-restore) migration,
>device ownership is being RESET and SET again, which leads to backends
>drop and reattach.  If we end up connecting during this window, an
>AF_VSOCK client gets -EHOSTUNREACH, which is wrong.

Please add this change before starting to support VHOST_RESET_OWNER 
ioctl in vhost-vsock, otherwise we are breaking the bisectability.

>
>Add a cpr_paused flag set inside vhost_vsock_drop_backends() when the
>backend was previously live, cleared by vhost_vsock_start(). When set,
>vhost_transport_send_pkt() queues the skb instead of fast-failing; the
>existing kick of send_pkt_work in vhost_vsock_start() drains it on
>resume. A device that has never run keeps cpr_paused == false and the
>boot-time fast-fail behaviour is preserved.
>
>Pair the cpr_paused store with the backend store using an
>smp_wmb()/smp_rmb() pair so a concurrent sender on a weakly-ordered
>architecture never observes (NULL backend, !paused):
>
>Signed-off-by: Denis V. Lunev <den@openvz.org>
>---
> drivers/vhost/vsock.c | 22 +++++++++++++++++++---
> 1 file changed, 19 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index e629886e5cf8..bcaba36becd7 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -61,6 +61,7 @@ struct vhost_vsock {
>
> 	u32 guest_cid;
> 	bool seqpacket_allow;
>+	bool cpr_paused;	/* between stop and next start */
> };
>
> static u32 vhost_transport_get_local_cid(void)
>@@ -311,11 +312,17 @@ vhost_transport_send_pkt(struct sk_buff *skb, struct net *net)
> 	 * the mutex would be too expensive in this hot path, and we already have
> 	 * all the outcomes covered: if the backend becomes NULL right after the check,
> 	 * vhost_transport_do_send_pkt() will check it under the mutex anyway.
>+	 *
>+	 * Don't fast-fail if cpr_paused is set, keep queueing skbs instead.
>+	 * The kick in vhost_vsock_start() will drain them on resume.
> 	 */
> 	if (unlikely(!data_race(vhost_vq_get_backend(&vsock->vqs[VSOCK_VQ_RX])))) {
>-		rcu_read_unlock();
>-		kfree_skb(skb);
>-		return -EHOSTUNREACH;
>+		smp_rmb();	/* pairs with smp_wmb() in start/drop_backends */
>+		if (!READ_ONCE(vsock->cpr_paused)) {

Can we avoid this, which is not really readable, and maybe add a single
variable that controls the fast-fail entirely?

I mean replacing both cpr_paused + backend-pointer with a single 
`started` flag: set it to false at open, true on start via 
smp_store_release(), back to false on normal stop, and leave it true 
during CPR pause.

The reader in send_pkt can do just:

     if (!smp_load_acquire(&vsock->started))
         return -EHOSTUNREACH;

WDYT?

>+			rcu_read_unlock();
>+			kfree_skb(skb);
>+			return -EHOSTUNREACH;
>+		}


That said claude here is reporting a potential issue that I think we 
should consider:
     After VHOST_RESET_OWNER, the guest CID stays in the hash, so 
     vhost_transport_send_pkt() can still find the vsock, skip the 
     fast-fail (cpr_paused=true), and call vhost_vq_work_queue() while 
     vhost_workers_free() is freeing workers without a synchronize_rcu() 
     — risking a use-after-free. Also, any send_pkt_work queued between 
     the last flush and worker teardown gets its VHOST_WORK_QUEUED bit 
     stuck (the vhost task exits without draining), deadlocking 
     host→guest traffic after restart.

     A synchronize_rcu() in vhost_workers_free() between the 
     rcu_assign_pointer(NULL) loop and the destroy loop would close the 
     use-after-free, and reinitializing send_pkt_work via 
     vhost_work_init() after vhost_dev_reset_owner() returns would clear 
     the stuck QUEUED bit.


> 	}
>
> 	if (virtio_vsock_skb_reply(skb))
>@@ -640,6 +647,9 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
> 		mutex_unlock(&vq->mutex);
> 	}
>
>+	smp_wmb();	/* pairs with smp_rmb() in send_pkt */
>+	WRITE_ONCE(vsock->cpr_paused, false);
>+
> 	/* Some packets may have been queued before the device was started,
> 	 * let's kick the send worker to send them.
> 	 */
>@@ -671,6 +681,11 @@ static void vhost_vsock_drop_backends(struct vhost_vsock *vsock)
>
> 	lockdep_assert_held(&vsock->dev.mutex);
>
>+	if (vhost_vq_get_backend(&vsock->vqs[VSOCK_VQ_RX])) {
>+		WRITE_ONCE(vsock->cpr_paused, true);
>+		smp_wmb();	/* pairs with smp_rmb() in send_pkt */
>+	}

Why here and not in vhost_vsock_reset_owner()?

Also having this here will set it to true also with 
VHOST_VSOCK_SET_RUNNING(0), is that right?

Thanks,
Stefano

>+
> 	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
> 		vq = &vsock->vqs[i];
>
>@@ -728,6 +743,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
>
> 	vsock->guest_cid = 0; /* no CID assigned yet */
> 	vsock->seqpacket_allow = false;
>+	vsock->cpr_paused = false;
>
> 	atomic_set(&vsock->queued_replies, 0);
>
>-- 
>2.47.1
>


^ permalink raw reply

* Re: [PATCH RFC 3/9] net: stmmac: qcom-ethqos: fix RGMII_ID mode to use DLL bypass
From: Konrad Dybcio @ 2026-06-16 14:14 UTC (permalink / raw)
  To: Andrew Lunn, Mohd Ayaan Anwar, Bjorn Andersson,
	Bartosz Golaszewski, Eric Chanudet, Lucas Karpinski,
	Andrew Halaney
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Richard Cochran, Bjorn Andersson, Konrad Dybcio, Maxime Coquelin,
	Alexandre Torgue, Russell King, linux-arm-msm, netdev, devicetree,
	linux-kernel, linux-stm32, linux-arm-kernel
In-Reply-To: <82705420-771d-41bf-a4d9-ed94dff86ff0@lunn.ch>

On 6/15/26 6:48 PM, Andrew Lunn wrote:
> On Mon, Jun 15, 2026 at 09:24:07AM +0530, Mohd Ayaan Anwar wrote:
>> Hello Andrew,
>> On Thu, Jun 11, 2026 at 10:54:37PM +0200, Andrew Lunn wrote:
>>> On Fri, Jun 12, 2026 at 12:06:59AM +0530, Mohd Ayaan Anwar wrote:
>>>> When "rgmii-id" is selected the PHY supplies both TX and RX delays, so
>>>> the MAC must not add its own.  The driver currently falls through to the
>>>> generic DLL initialisation path which programs it to add a delay.
>>>>
>>>> Power down the DLL and set DDR bypass mode for RGMII_ID, then program
>>>> the IO_MACRO via a new ethqos_rgmii_id_macro_init() helper.  Also fix
>>>> ethqos_set_clk_tx_rate() to not double the clock rate in bypass mode at
>>>> 100M/10M, and remove RGMII_ID from the phase-shift suppression in
>>>> ethqos_rgmii_macro_init() since RGMII_ID no longer reaches that path.
>>>
>>> I'm curious how this works at the moment? Do no boards make use of
>>> RGMII ID? Are all current boards broken?
>>
>> Searching through the DTS, I found that we have two boards using "rgmii"
>> (qcs404-evb-4000.dts and sa8155-adp.dts) and another board using
>> "rgmii-txid" (sa8540p-ride.dts). No board which uses RGMII ID.
> 
> So this causes problems. We cannot break existing boards, yet it would
> be good to fix the current broken behaviour.

These are a funny bunch... QCS404 is stuck in a perpetual cycle of
"no one has the hardware" and "someone has the hw but zero interest or
time". I think we've considered it for removal at one point..

I'm not sure to what degree the two SA8xxx boards are used. They
may have been stuck in some sort of a limbo. Maybe Bjorn knows?

Also +Cc some of the folks that contributed to them in the past

Konrad

^ permalink raw reply

* [PATCH net] net: serialize netif_running() check in enqueue_to_backlog()
From: Eric Dumazet @ 2026-06-16 14:13 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, netdev, eric.dumazet,
	Eric Dumazet, syzbot+965506b59a2de0b6905c, Julian Anastasov

Syzbot reported a KASAN slab-use-after-free in fib_rules_lookup().

The root cause is a race condition where packets can escape the backlog
flushing during device unregistration (e.g., during netns exit).

Commit e9e4dd3267d0 ("net: do not process device backlog during unregistration")
introduced a lockless netif_running() check in enqueue_to_backlog() to
prevent queuing packets to an unregistering device.

However, this creates a TOCTOU race window.

A lockless transmitter (like veth_xmit) can pass
the check before dev_close() clears IFF_UP. If the transmitter is then
delayed, flush_all_backlogs() can run and finish before the transmitter
grabs the backlog lock and queues the packet. The packet then escapes
the flush and triggers UAF later when processed.

Fix this by moving the netif_running() check inside the backlog lock.
This serializes the check with the flush work (which also grabs the lock).
We then either queue the packet before the flush runs (so it gets flushed),
or check netif_running() after the flush/close completes (so it gets dropped).

Fixes: e9e4dd3267d0 ("net: do not process device backlog during unregistration")
Reported-by: syzbot+965506b59a2de0b6905c@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6a315824.b0403584.28d0ff.0000.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
---
 net/core/dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 731e661d7be6574d5eca4a600e0a5623be4c2485..f81ce83fb3250d591ffa5eeb4c3067f8b75a54ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5381,8 +5381,6 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	u32 tail;

 	reason = SKB_DROP_REASON_DEV_READY;
-	if (unlikely(!netif_running(skb->dev)))
-		goto bad_dev;

 	sd = &per_cpu(softnet_data, cpu);

@@ -5394,6 +5392,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	backlog_lock_irq_save(sd, &flags);
 	qlen = skb_queue_len(&sd->input_pkt_queue);
 	if (likely(qlen <= max_backlog)) {
+		if (unlikely(!netif_running(skb->dev))) {
+			backlog_unlock_irq_restore(sd, flags);
+			goto bad_dev;
+		}
 		if (!qlen) {
 			/* Schedule NAPI for backlog device. We can use
 			 * non atomic operation as we own the queue lock.
-- 
2.54.0.1189.g8c84645362-goog

^ permalink raw reply related

* Re: [PATCH 2/4] vhost/vsock: add VHOST_RESET_OWNER ioctl
From: Andrey Drobyshev @ 2026-06-16 14:10 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <ajFRRmA9req1muX6@sgarzare-redhat>

On 6/16/26 4:48 PM, Stefano Garzarella wrote:
> On Fri, Jun 12, 2026 at 07:57:16PM +0300, Andrey Drobyshev wrote:
>> From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>>
>> This ioctl is needed for QEMU's CPR (checkpoint-restore) migration of
>> the guest with vhost-vsock device.  For this to work, we need to reset
>> the device ownership on the source side by calling RESET_OWNER, and then
>> claim it on the dest side by calling SET_OWNER.  We expect not to lose any
>> AF_VSOCK connection while this happens.
>>
>> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>> ---
>> drivers/vhost/vsock.c | 28 ++++++++++++++++++++++++++++
>> 1 file changed, 28 insertions(+)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index b12221ce6faf..e629886e5cf8 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -894,6 +894,32 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
>> 	return -EFAULT;
>> }
>>
>> +static int vhost_vsock_reset_owner(struct vhost_vsock *vsock)
>> +{
>> +	struct vhost_iotlb *umem;
>> +	long err;
>> +
>> +	mutex_lock(&vsock->dev.mutex);
>> +	err = vhost_dev_check_owner(&vsock->dev);
>> +	if (err)
>> +		goto done;
>> +	umem = vhost_dev_reset_owner_prepare();
>> +	if (!umem) {
>> +		err = -ENOMEM;
>> +		goto done;
>> +	}
>> +	/* Follows vhost_vsock_dev_release closely except for guest_cid drop */
>> +	vsock_for_each_connected_socket(&vhost_transport.transport,
>> +					vhost_vsock_reset_orphans);
> 
> In vhost_vsock_reset_orphans() we have:
> 
> 	rcu_read_lock();
> 
> 	/* If the peer is still valid, no need to reset connection */
> 	if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk))) {
> 		rcu_read_unlock();
> 		return;
> 	}
> 
> IIUC we are not removing the guest cid from the hash table, so this 
> check will always be true, and nothing is done.
> 
> So, is this call really useful?
>

You're right, and it's most probably an artifact from mimicking the
vhost_vsock_dev_release() implementation, as mentioned in the comment.
In our case this whole iteration is a no-op, so we'd better remove it.

BTW earlier I received some feedback from Sashiko AI reviewer, which
also spotted that same issue (and some more interesting races):

https://sashiko.dev/#/patchset/20260612165718.433546-1-andrey.drobyshev@virtuozzo.com

Apparently it only CC's its reviews to kvm@vger.kernel.org so you can't
see them right away.  Just wanted to let you know to save your time
here.  I'll send a v2 addressing Sashiko's remarks.  But of course it
would be great if you spot some more issues here.


>> +	vhost_vsock_drop_backends(vsock);
>> +	vhost_vsock_flush(vsock);
>> +	vhost_dev_stop(&vsock->dev);
>> +	vhost_dev_reset_owner(&vsock->dev, umem);
>> +done:
>> +	mutex_unlock(&vsock->dev.mutex);
>> +	return err;
>> +}
>> +
>> static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
>> 				  unsigned long arg)
>> {
>> @@ -937,6 +963,8 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
>> 			return -EOPNOTSUPP;
>> 		vhost_set_backend_features(&vsock->dev, features);
>> 		return 0;
>> +	case VHOST_RESET_OWNER:
>> +		return vhost_vsock_reset_owner(vsock);
>> 	default:
>> 		mutex_lock(&vsock->dev.mutex);
>> 		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
>> -- 
>> 2.47.1
>>
> 


^ permalink raw reply

* [syzbot] [net?] KASAN: slab-use-after-free Read in fib_rules_lookup
From: syzbot @ 2026-06-16 14:05 UTC (permalink / raw)
  To: davem, dsahern, edumazet, horms, idosch, kuba, linux-kernel,
	netdev, pabeni, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    72dfa4700f78 net: dsa: sja1105: fix lastused timestamp in ..
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=15794bd2580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=a0842261b62cdea8
dashboard link: https://syzkaller.appspot.com/bug?extid=965506b59a2de0b6905c
compiler:       Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/d4e16f50a97c/disk-72dfa470.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/6cd4a736e796/vmlinux-72dfa470.xz
kernel image: https://storage.googleapis.com/syzbot-assets/548b0011c8e8/bzImage-72dfa470.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+965506b59a2de0b6905c@syzkaller.appspotmail.com

bond0 (unregistering): Released all slaves
bond1 (unregistering): Released all slaves
bond2 (unregistering): (slave dummy0): Releasing active interface
bond2 (unregistering): Released all slaves
==================================================================
BUG: KASAN: slab-use-after-free in fib_rules_lookup+0x15e/0xeb0 net/core/fib_rules.c:321
Read of size 8 at addr ffff88804ec4c680 by task kworker/u8:21/12641

CPU: 0 UID: 0 PID: 12641 Comm: kworker/u8:21 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
Workqueue: netns cleanup_net
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 print_address_description+0x55/0x1e0 mm/kasan/report.c:378
 print_report+0x58/0x70 mm/kasan/report.c:482
 kasan_report+0x117/0x150 mm/kasan/report.c:595
 fib_rules_lookup+0x15e/0xeb0 net/core/fib_rules.c:321
 __fib_lookup+0x106/0x210 net/ipv4/fib_rules.c:96
 ip_route_output_key_hash_rcu+0x294/0x2720 net/ipv4/route.c:2811
 ip_route_output_key_hash+0x18d/0x2a0 net/ipv4/route.c:2702
 __ip_route_output_key include/net/route.h:169 [inline]
 ip_route_output_flow+0x2a/0x150 net/ipv4/route.c:2929
 ip4_datagram_release_cb+0x89d/0xbe0 net/ipv4/datagram.c:118
 release_sock+0x206/0x260 net/core/sock.c:3861
 inet_shutdown+0x2b1/0x390 net/ipv4/af_inet.c:950
 udp_tunnel_sock_release+0x6d/0x80 net/ipv4/udp_tunnel_core.c:197
 fou_release net/ipv4/fou_core.c:562 [inline]
 fou_exit_net+0x17d/0x1f0 net/ipv4/fou_core.c:1230
 ops_exit_list net/core/net_namespace.c:199 [inline]
 ops_undo_list+0x43d/0x8d0 net/core/net_namespace.c:252
 cleanup_net+0x572/0x810 net/core/net_namespace.c:702
 process_one_work kernel/workqueue.c:3314 [inline]
 process_scheduled_works+0xa8e/0x14e0 kernel/workqueue.c:3397
 worker_thread+0xa47/0xfb0 kernel/workqueue.c:3478
 kthread+0x389/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>

Allocated by task 19121:
 kasan_save_stack mm/kasan/common.c:57 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
 poison_kmalloc_redzone mm/kasan/common.c:398 [inline]
 __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:415
 kasan_kmalloc include/linux/kasan.h:263 [inline]
 __do_kmalloc_node mm/slub.c:5296 [inline]
 __kmalloc_node_track_caller_noprof+0x4d7/0x7b0 mm/slub.c:5408
 kmemdup_noprof+0x2b/0x70 mm/util.c:138
 kmemdup_noprof include/linux/fortify-string.h:763 [inline]
 fib_rules_register+0x2f/0x400 net/core/fib_rules.c:170
 fib4_rules_init+0x21/0x160 net/ipv4/fib_rules.c:508
 ip_fib_net_init net/ipv4/fib_frontend.c:1578 [inline]
 fib_net_init+0x17a/0x3e0 net/ipv4/fib_frontend.c:1628
 ops_init+0x35d/0x5d0 net/core/net_namespace.c:137
 setup_net+0x118/0x350 net/core/net_namespace.c:446
 copy_net_ns+0x4f9/0x720 net/core/net_namespace.c:579
 create_new_namespaces+0x3f0/0x6b0 kernel/nsproxy.c:132
 unshare_nsproxy_namespaces+0x149/0x190 kernel/nsproxy.c:234
 ksys_unshare+0x57d/0xa00 kernel/fork.c:3242
 __do_sys_unshare kernel/fork.c:3316 [inline]
 __se_sys_unshare kernel/fork.c:3314 [inline]
 __x64_sys_unshare+0x38/0x50 kernel/fork.c:3314
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 12641:
 kasan_save_stack mm/kasan/common.c:57 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:584
 poison_slab_object mm/kasan/common.c:253 [inline]
 __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285
 kasan_slab_free include/linux/kasan.h:235 [inline]
 slab_free_hook mm/slub.c:2689 [inline]
 __rcu_free_sheaf_prepare+0x12d/0x2a0 mm/slub.c:2940
 rcu_free_sheaf+0x31/0x200 mm/slub.c:5850
 rcu_do_batch kernel/rcu/tree.c:2617 [inline]
 rcu_core+0x78b/0x10a0 kernel/rcu/tree.c:2869
 handle_softirqs+0x225/0x840 kernel/softirq.c:622
 do_softirq+0x76/0xd0 kernel/softirq.c:523
 __local_bh_enable_ip+0xf8/0x130 kernel/softirq.c:450
 unregister_netdevice_many_notify+0x1874/0x2150 net/core/dev.c:12445
 ops_exit_rtnl_list net/core/net_namespace.c:187 [inline]
 ops_undo_list+0x391/0x8d0 net/core/net_namespace.c:248
 cleanup_net+0x572/0x810 net/core/net_namespace.c:702
 process_one_work kernel/workqueue.c:3314 [inline]
 process_scheduled_works+0xa8e/0x14e0 kernel/workqueue.c:3397
 worker_thread+0xa47/0xfb0 kernel/workqueue.c:3478
 kthread+0x389/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

The buggy address belongs to the object at ffff88804ec4c600
 which belongs to the cache kmalloc-192 of size 192
The buggy address is located 128 bytes inside of
 freed 192-byte region [ffff88804ec4c600, ffff88804ec4c6c0)

The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x4ec4c
flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff)
page_type: f5(slab)
raw: 00fff00000000000 ffff88813fe163c0 dead000000000100 dead000000000122
raw: 0000000000000000 0000000800100010 00000000f5000000 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 0, migratetype Unmovable, gfp_mask 0xd2cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 13856, tgid 13853 (syz.3.2144), ts 351172300879, free_ts 351133053454
 set_page_owner include/linux/page_owner.h:32 [inline]
 post_alloc_hook+0x22d/0x280 mm/page_alloc.c:1853
 prep_new_page mm/page_alloc.c:1861 [inline]
 get_page_from_freelist+0x24ae/0x2530 mm/page_alloc.c:3941
 __alloc_frozen_pages_noprof+0x18d/0x380 mm/page_alloc.c:5221
 alloc_slab_page mm/slub.c:3278 [inline]
 allocate_slab+0x77/0x660 mm/slub.c:3467
 new_slab mm/slub.c:3525 [inline]
 refill_objects+0x336/0x3d0 mm/slub.c:7272
 refill_sheaf mm/slub.c:2816 [inline]
 __pcs_replace_empty_main+0x320/0x720 mm/slub.c:4652
 alloc_from_pcs mm/slub.c:4750 [inline]
 slab_alloc_node mm/slub.c:4884 [inline]
 __do_kmalloc_node mm/slub.c:5295 [inline]
 __kmalloc_noprof+0x464/0x750 mm/slub.c:5308
 kmalloc_noprof include/linux/slab.h:954 [inline]
 kzalloc_noprof include/linux/slab.h:1188 [inline]
 new_dir fs/proc/proc_sysctl.c:966 [inline]
 get_subdir fs/proc/proc_sysctl.c:1010 [inline]
 sysctl_mkdir_p fs/proc/proc_sysctl.c:1320 [inline]
 __register_sysctl_table+0xc02/0x1370 fs/proc/proc_sysctl.c:1395
 neigh_sysctl_register+0x9b1/0xa90 net/core/neighbour.c:3915
 addrconf_sysctl_register+0xb3/0x1c0 net/ipv6/addrconf.c:7396
 ipv6_add_dev+0xd26/0x13a0 net/ipv6/addrconf.c:460
 addrconf_notify+0x771/0x1050 net/ipv6/addrconf.c:3679
 notifier_call_chain+0x1a5/0x3d0 kernel/notifier.c:85
 call_netdevice_notifiers_extack net/core/dev.c:2288 [inline]
 call_netdevice_notifiers net/core/dev.c:2302 [inline]
 register_netdevice+0x18db/0x1f00 net/core/dev.c:11474
 macsec_newlink+0x706/0x1200 drivers/net/macsec.c:4218
 rtnl_newlink_create+0x310/0xb00 net/core/rtnetlink.c:3905
page last free pid 12657 tgid 12657 stack trace:
 reset_page_owner include/linux/page_owner.h:25 [inline]
 __free_pages_prepare mm/page_alloc.c:1397 [inline]
 __free_frozen_pages+0xc0d/0xd20 mm/page_alloc.c:2938
 __tlb_remove_table_free mm/mmu_gather.c:228 [inline]
 tlb_remove_table_rcu+0x85/0x100 mm/mmu_gather.c:291
 rcu_do_batch kernel/rcu/tree.c:2617 [inline]
 rcu_core+0x78b/0x10a0 kernel/rcu/tree.c:2869
 handle_softirqs+0x225/0x840 kernel/softirq.c:622
 __do_softirq kernel/softirq.c:656 [inline]
 invoke_softirq kernel/softirq.c:496 [inline]
 __irq_exit_rcu+0xca/0x220 kernel/softirq.c:735
 irq_exit_rcu+0x9/0x30 kernel/softirq.c:752
 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1061 [inline]
 sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1061
 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:697

Memory state around the buggy address:
 ffff88804ec4c580: 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff88804ec4c600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88804ec4c680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
                   ^
 ffff88804ec4c700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff88804ec4c780: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH net] nfc: pn533: prevent division by zero in the listen mode timer
From: Simon Horman @ 2026-06-16 14:02 UTC (permalink / raw)
  To: Yinhao Hu
  Cc: netdev, David Heidelberg, Krzysztof Kozlowski, Jakub Kicinski,
	Dan Carpenter, dzm91, hust-os-kernel-patches
In-Reply-To: <20260615103547.1599528-1-dddddd@hust.edu.cn>

On Mon, Jun 15, 2026 at 03:35:47AM -0700, Yinhao Hu wrote:
> The listen-mode timer handler advances the polling state machine through
> pn533_poll_next_mod(), which computes:
> 
> dev->poll_mod_curr = (dev->poll_mod_curr + 1) % dev->poll_mod_count;
> 
> pn533_poll_reset_mod_list() clears dev->poll_mod_count without first
> stopping that timer: pn533_dep_link_down() deletes no timer at all, and
> pn533_stop_poll() uses timer_delete(), which does not wait for a handler
> already running on another CPU. When the handler runs after the count
> has been zeroed, it divides by zero:
> 
> Oops: divide error: 0000 [#1] SMP
> RIP: 0010:pn533_listen_mode_timer+0x9b/0x110
> 
> Delete the timer synchronously in pn533_poll_reset_mod_list(), the single
> place that clears the list, so the handler can no longer run past a reset.
> Also return early when poll_mod_count is already zero, covering the window
> where pn533_wq_poll() re-arms the timer just before a reset.
> 
> Fixes: 6fbbdc16be38 ("NFC: Implement pn533 polling loop")
> Signed-off-by: Yinhao Hu <dddddd@hust.edu.cn>
> ---
>  drivers/nfc/pn533/pn533.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
> index d7bdbc82e2ba..88df99001b4a 100644
> --- a/drivers/nfc/pn533/pn533.c
> +++ b/drivers/nfc/pn533/pn533.c
> @@ -951,6 +951,7 @@ static inline void pn533_poll_next_mod(struct pn533 *dev)
>  
>  static void pn533_poll_reset_mod_list(struct pn533 *dev)
>  {
> +	timer_delete_sync(&dev->listen_timer);
>  	dev->poll_mod_count = 0;
>  }
>  
> @@ -1235,6 +1236,10 @@ static void pn533_listen_mode_timer(struct timer_list *t)
>  {
>  	struct pn533 *dev = timer_container_of(dev, t, listen_timer);
>  
> +	/* Polling may have been stopped while the timer was pending. */
> +	if (!dev->poll_mod_count)
> +		return;
> +

I am concerned that access to poll_mod_count is not synchronised and thus
this may not work as intended.

>  	dev->cancel_listen = 1;
>  
>  	pn533_poll_next_mod(dev);
> -- 
> 2.43.0
> 

^ permalink raw reply

* Re: [PATCH 0/4] vhost/vsock: add support for VHOST_RESET_OWNER and CPR migration
From: Andrey Drobyshev @ 2026-06-16 14:01 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <ajFLvKcT-A0wvLYW@sgarzare-redhat>

Hello Stefano,

On 6/16/26 4:35 PM, Stefano Garzarella wrote:
> Hi Andrey,
> thanks for the series!
> 
> On Fri, Jun 12, 2026 at 07:57:14PM +0300, Andrey Drobyshev wrote:
>> Host<-->guest connections via AF_VSOCK sockets aren't supposed to
>> outlive VM migration, since VM is moving to another host.  However
>> there's a special case, which is QEMU live-update, or CPR
>> (checkpoint-restore) migration.  In this case, VM remains on the same
>> host, and we'd like such connections to persist.
> 
> In the spec we have VIRTIO_VSOCK_EVENT_TRANSPORT_RESET which is usually 
> sent by the device after a migration.
> 
> IIUC the specs don't say this has to be done all the time, so we don't 
> need to change anything in the specs, right?
> 
> We just need to avoid sending it (which I think is what we're doing 
> here... I still need to look at the patches).
>

Sending this exact ioctl is guarded by one of my patches in the QEMU
counterpart series:

https://lore.kernel.org/qemu-devel/20260612165110.431376-6-andrey.drobyshev@virtuozzo.com/

So we indeed avoid sending it on migration target in case of CPR migration.

>>
>> For this to work, we need to be able to transfer device ownership from
>> source QEMU to dest QEMU.  Namely, source needs to reset ownership by
>> issuing VHOST_RESET_OWNER ioctl, and then target has to claim it by
>> calling VHOST_SET_OWNER.
>>
>> Since VHOST_RESET_OWNER isn't yet implemented for vhost-vsock, let's add
>> such implementation (patches 1-2).  Also fix regression introduced by
>> the earlier commit [1] (patch 3), and fix the deadlock bug (patch 4).
> 
> If it's a regression, should we fix it separately?
> 
> Or is it related to this series?
>

Probably my wording wasn't quite correct.  I posted this patch here
because we found the problem while testing this particular
functionality, i.e. vsock data transfer + CPR migration.  And the
problem was introduced by a recent commit, which is fine on its own, but
breaks the CPR case.
>>
>> There's a complementary series for QEMU [0] adding support of vhost-vsock
>> devices during CPR migration.
>>
>> NOTE: this series needs to be applied on top of Michael's vhost/linux-next
>> tree as it contains relevant commit [1], not yet present in master branch.
>>
>> I've tested this (patched QEMU + patched kernel) approximately as follows:
>>
>>  * Run listener in the guest:
>>  socat -u VSOCK-LISTEN:9999 - >/tmp/recv.bin
>>
>>  * Run data transfer from host to guest:
>>  socat -u FILE:/root/bigfile.bin VSOCK-CONNECT:CID:9999
>>
>>  * Perform CPR migration during transfer (either cpr-exec or cpr-transfer)
>>  * Check that file hash sum matches
>>
>> [0] https://lore.kernel.org/qemu-devel/20260612165110.431376-1-andrey.drobyshev@virtuozzo.com
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/commit/?id=bb26ed5f3a8b
>>
>> Andrey Drobyshev (1):
>>  vhost/vsock: re-scan TX virtqueue on device start
>>
>> Denis V. Lunev (1):
>>  vhost/vsock: suppress EHOSTUNREACH fast-fail during CPR pause
>>
>> Pavel Tikhomirov (2):
>>  vhost/vsock: split out vhost_vsock_drop_backends helper
>>  vhost/vsock: add VHOST_RESET_OWNER ioctl
>>
>> drivers/vhost/vsock.c | 80 +++++++++++++++++++++++++++++++++++++------
>> 1 file changed, 69 insertions(+), 11 deletions(-)
>>
>> -- 
>> 2.47.1
>>
> 


^ permalink raw reply

* Re: [PATCH 2/4] vhost/vsock: add VHOST_RESET_OWNER ioctl
From: Stefano Garzarella @ 2026-06-16 13:48 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <20260612165718.433546-3-andrey.drobyshev@virtuozzo.com>

On Fri, Jun 12, 2026 at 07:57:16PM +0300, Andrey Drobyshev wrote:
>From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>
>This ioctl is needed for QEMU's CPR (checkpoint-restore) migration of
>the guest with vhost-vsock device.  For this to work, we need to reset
>the device ownership on the source side by calling RESET_OWNER, and then
>claim it on the dest side by calling SET_OWNER.  We expect not to lose any
>AF_VSOCK connection while this happens.
>
>Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>---
> drivers/vhost/vsock.c | 28 ++++++++++++++++++++++++++++
> 1 file changed, 28 insertions(+)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index b12221ce6faf..e629886e5cf8 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -894,6 +894,32 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
> 	return -EFAULT;
> }
>
>+static int vhost_vsock_reset_owner(struct vhost_vsock *vsock)
>+{
>+	struct vhost_iotlb *umem;
>+	long err;
>+
>+	mutex_lock(&vsock->dev.mutex);
>+	err = vhost_dev_check_owner(&vsock->dev);
>+	if (err)
>+		goto done;
>+	umem = vhost_dev_reset_owner_prepare();
>+	if (!umem) {
>+		err = -ENOMEM;
>+		goto done;
>+	}
>+	/* Follows vhost_vsock_dev_release closely except for guest_cid drop */
>+	vsock_for_each_connected_socket(&vhost_transport.transport,
>+					vhost_vsock_reset_orphans);

In vhost_vsock_reset_orphans() we have:

	rcu_read_lock();

	/* If the peer is still valid, no need to reset connection */
	if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk))) {
		rcu_read_unlock();
		return;
	}

IIUC we are not removing the guest cid from the hash table, so this 
check will always be true, and nothing is done.

So, is this call really useful?

>+	vhost_vsock_drop_backends(vsock);
>+	vhost_vsock_flush(vsock);
>+	vhost_dev_stop(&vsock->dev);
>+	vhost_dev_reset_owner(&vsock->dev, umem);
>+done:
>+	mutex_unlock(&vsock->dev.mutex);
>+	return err;
>+}
>+
> static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
> 				  unsigned long arg)
> {
>@@ -937,6 +963,8 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
> 			return -EOPNOTSUPP;
> 		vhost_set_backend_features(&vsock->dev, features);
> 		return 0;
>+	case VHOST_RESET_OWNER:
>+		return vhost_vsock_reset_owner(vsock);
> 	default:
> 		mutex_lock(&vsock->dev.mutex);
> 		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
>-- 
>2.47.1
>


^ permalink raw reply

* [PATCH net v4 2/2] ipv6: account for fraggap on the paged allocation path
From: Wongi Lee @ 2026-06-16 13:46 UTC (permalink / raw)
  To: netdev
  Cc: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, asml.silence, dhowells,
	willemb, Jungwoo Lee
In-Reply-To: <ajFQn6yh43eDeQm9@DESKTOP-19IMU7U.localdomain>

In __ip6_append_data(), when the paged-allocation branch is taken
(MSG_MORE / NETIF_F_SG / large fraglen), alloclen and pagedlen are
computed as

	alloclen = fragheaderlen + transhdrlen;
	pagedlen = datalen - transhdrlen;

datalen already includes fraggap (datalen = length + fraggap). When
fraggap is non-zero, this is not the first skb and transhdrlen is zero.
The fraggap bytes carried over from the previous skb are copied just past
the fragment headers in the new skb's linear area. The linear area is
therefore undersized by fraggap bytes while pagedlen is overstated by the
same amount, and the copy writes past skb->end into the trailing
skb_shared_info.

An unprivileged user can trigger this via a UDPv6 socket using
MSG_MORE together with MSG_SPLICE_PAGES.

The bad accounting was introduced by commit 773ba4fe9104 ("ipv6:
avoid partial copy for zc"). Before commit ce650a166335 ("udp6: Fix
__ip6_append_data()'s handling of MSG_SPLICE_PAGES"), the negative
copy value caused -EINVAL to be returned. That later commit allowed
MSG_SPLICE_PAGES to proceed in this case, making the corruption
triggerable.

The non-paged branch sets alloclen to fraglen, which already accounts
for fraggap because datalen does. Bring the paged branch in line by
adding fraggap to alloclen and subtracting it from pagedlen.

After this adjustment, copy no longer collapses to -fraggap on the
paged path, so remove the stale comment describing that old arithmetic.
Since a negative copy is no longer expected for a valid MSG_SPLICE_PAGES
case, remove the MSG_SPLICE_PAGES exception from the negative copy check.

Fixes: 773ba4fe9104 ("ipv6: avoid partial copy for zc")
Signed-off-by: Jungwoo Lee <jwlee2217@gmail.com>
Signed-off-by: Wongi Lee <qw3rtyp0@gmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
---
 net/ipv6/ip6_output.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c14adcdd4396..13463c95c7a7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1668,8 +1668,8 @@ static int __ip6_append_data(struct sock *sk,
 				  !(rt->dst.dev->features & NETIF_F_SG)))
 				alloclen = fraglen;
 			else {
-				alloclen = fragheaderlen + transhdrlen;
-				pagedlen = datalen - transhdrlen;
+				alloclen = fragheaderlen + transhdrlen + fraggap;
+				pagedlen = datalen - transhdrlen - fraggap;
 			}
 			alloclen += alloc_extra;

@@ -1684,10 +1684,7 @@ static int __ip6_append_data(struct sock *sk,
 			fraglen = datalen + fragheaderlen;

 			copy = datalen - transhdrlen - fraggap - pagedlen;
-			/* [!] NOTE: copy may be negative if pagedlen>0
-			 * because then the equation may reduces to -fraggap.
-			 */
-			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
+			if (copy < 0) {
 				err = -EINVAL;
 				goto error;
 			}
-- 
2.34.1

^ permalink raw reply related

* Re: [PATCH 1/4] vhost/vsock: split out vhost_vsock_drop_backends helper
From: Stefano Garzarella @ 2026-06-16 13:42 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <20260612165718.433546-2-andrey.drobyshev@virtuozzo.com>

On Fri, Jun 12, 2026 at 07:57:15PM +0300, Andrey Drobyshev wrote:
>From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>
>Split the actual backend dropping part from vhost_vsock_stop.  We're
>going to need it for the VHOST_RESET_OWNER implementation in the
>following patch, when vsock->dev.mutex is already taken and owner is
>checked.
>
>Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
>---
> drivers/vhost/vsock.c | 26 +++++++++++++++++---------
> 1 file changed, 17 insertions(+), 9 deletions(-)

LGTM!

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>

>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 9aaab6bb8061..b12221ce6faf 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -664,9 +664,24 @@ static int vhost_vsock_start(struct vhost_vsock *vsock)
> 	return ret;
> }
>
>-static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
>+static void vhost_vsock_drop_backends(struct vhost_vsock *vsock)
> {
>+	struct vhost_virtqueue *vq;
> 	size_t i;
>+
>+	lockdep_assert_held(&vsock->dev.mutex);
>+
>+	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>+		vq = &vsock->vqs[i];
>+
>+		mutex_lock(&vq->mutex);
>+		vhost_vq_set_backend(vq, NULL);
>+		mutex_unlock(&vq->mutex);
>+	}
>+}
>+
>+static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
>+{
> 	int ret = 0;
>
> 	mutex_lock(&vsock->dev.mutex);
>@@ -677,14 +692,7 @@ static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
> 			goto err;
> 	}
>
>-	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
>-		struct vhost_virtqueue *vq = &vsock->vqs[i];
>-
>-		mutex_lock(&vq->mutex);
>-		vhost_vq_set_backend(vq, NULL);
>-		mutex_unlock(&vq->mutex);
>-	}
>-
>+	vhost_vsock_drop_backends(vsock);
> err:
> 	mutex_unlock(&vsock->dev.mutex);
> 	return ret;
>-- 
>2.47.1
>


^ permalink raw reply

* [PATCH net v4 1/2] ipv4: account for fraggap on the paged allocation path
From: Wongi Lee @ 2026-06-16 13:38 UTC (permalink / raw)
  To: netdev
  Cc: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, asml.silence, dhowells,
	willemb, Jungwoo Lee
In-Reply-To: <ajFQn6yh43eDeQm9@DESKTOP-19IMU7U.localdomain>

In __ip_append_data(), when the paged-allocation branch is taken,
alloclen and pagedlen are computed as

	alloclen = fragheaderlen + transhdrlen;
	pagedlen = datalen - transhdrlen;

datalen already includes fraggap, but the fraggap bytes carried over
from the previous skb are copied into the new skb's linear area at
offset transhdrlen by the subsequent skb_copy_and_csum_bits(). The
linear area is therefore undersized by fraggap bytes while pagedlen is
overstated by the same amount.

The non-paged branch sets alloclen to fraglen, which already accounts
for fraggap because datalen does. Bring the paged branch in line by
adding fraggap to alloclen and subtracting it from pagedlen.

After this adjustment, copy no longer collapses to -fraggap on the
paged path, so remove the stale comment describing that old arithmetic.

Fixes: 8eb77cc73977 ("ipv4: avoid partial copy for zc")
Signed-off-by: Jungwoo Lee <jwlee2217@gmail.com>
Signed-off-by: Wongi Lee <qw3rtyp0@gmail.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
---
 net/ipv4/ip_output.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 5bcd73cbdb41..ec790bad1679 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1117,8 +1117,8 @@ static int __ip_append_data(struct sock *sk,
 				  !(rt->dst.dev->features & NETIF_F_SG)))
 				alloclen = fraglen;
 			else {
-				alloclen = fragheaderlen + transhdrlen;
-				pagedlen = datalen - transhdrlen;
+				alloclen = fragheaderlen + transhdrlen + fraggap;
+				pagedlen = datalen - transhdrlen - fraggap;
 			}
 
 			alloclen += alloc_extra;
@@ -1165,9 +1165,6 @@ static int __ip_append_data(struct sock *sk,
 			}
 
 			copy = datalen - transhdrlen - fraggap - pagedlen;
-			/* [!] NOTE: copy will be negative if pagedlen>0
-			 * because then the equation reduces to -fraggap.
-			 */
 			if (copy > 0 &&
 			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
 					    from, data + transhdrlen, offset,
-- 
2.34.1

^ permalink raw reply related

* Re: [PATCH v2] net: macb: add TX stall timeout callback to recover from lost TSTART write
From: Nicolai Buchwitz @ 2026-06-16 13:37 UTC (permalink / raw)
  To: Andrea della Porta
  Cc: netdev, Theo Lebrun, Nicolas Ferre, Claudiu Beznea, Andrew Lunn,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-kernel, linux-arm-kernel, linux-rpi-kernel, Lukasz Raczylo,
	Steffen Jaeckel
In-Reply-To: <468f480454a314303bac6a54780b153f689f2267.1781598350.git.andrea.porta@suse.com>

On 16.6.2026 15:23, Andrea della Porta wrote:
> From: Lukasz Raczylo <lukasz@raczylo.com>
> 
> The MACB found in the Raspberry Pi RP1 suffers from sporadic stalls on
> the TX queue.
> While the exact root cause is not yet fully understood, it is likely
> related to a hardware issue where a TSTART write to the NCR register
> is missed, preventing the transmission from being kicked off.
> 
> Implement a timeout callback to handle TX queue stalls, triggering the
> existing restart mechanism to recover.
> 
> Link: 
> https://lore.kernel.org/all/20260514215459.36109-1-lukasz@raczylo.com/
> Fixes: dc110d1b23564 ("net: cadence: macb: Add support for Raspberry Pi 
> RP1 ethernet controller")
> Signed-off-by: Lukasz Raczylo <lukasz@raczylo.com>
> Co-developed-by: Steffen Jaeckel <sjaeckel@suse.de>
> Signed-off-by: Steffen Jaeckel <sjaeckel@suse.de>
> Co-developed-by: Andrea della Porta <andrea.porta@suse.com>
> Signed-off-by: Andrea della Porta <andrea.porta@suse.com>
> ---
> 
> CHANGES IN v2:
> 
> - dropped the rate-limited log message
> - avoid incrementing tx_error as this is per packet
> 
> ---
>  drivers/net/ethernet/cadence/macb_main.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/net/ethernet/cadence/macb_main.c 
> b/drivers/net/ethernet/cadence/macb_main.c
> index a12aa21244e83..fd282a1700fb9 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -4522,6 +4522,13 @@ static int macb_setup_tc(struct net_device *dev, 
> enum tc_setup_type type,
>  	}
>  }
> 
> +static void macb_tx_timeout(struct net_device *dev, unsigned int q)
> +{
> +	struct macb *bp = netdev_priv(dev);
> +
> +	macb_tx_restart(&bp->queues[q]);
> +}
> +
>  static const struct net_device_ops macb_netdev_ops = {
>  	.ndo_open		= macb_open,
>  	.ndo_stop		= macb_close,
> @@ -4540,6 +4547,7 @@ static const struct net_device_ops 
> macb_netdev_ops = {
>  	.ndo_hwtstamp_set	= macb_hwtstamp_set,
>  	.ndo_hwtstamp_get	= macb_hwtstamp_get,
>  	.ndo_setup_tc		= macb_setup_tc,
> +	.ndo_tx_timeout		= macb_tx_timeout,
>  };
> 
>  /* Configure peripheral capabilities according to device tree

Reviewed-by: Nicolai Buchwitz <nb@tipi-net.de>

Thanks,
Nicolai

^ permalink raw reply

* [PATCH net-next v6 2/2] dinghai: add hardware register access and PCI capability scanning
From: han.junyang @ 2026-06-16 13:35 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, horms
  Cc: linux-kernel, netdev, han.junyang, ran.ming, han.chengfei,
	zhang.yanze
In-Reply-To: <20260616212106742_trNLb7r-FL04eDlJO8tT@zte.com.cn>

From: Junyang Han <han.junyang@zte.com.cn>

Implement PCI configuration space access, BAR mapping, capability
scanning (common/notify/device), and hardware queue register
definitions for DingHai PF device.

Signed-off-by: Junyang Han <han.junyang@zte.com.cn>
---
 drivers/net/ethernet/zte/dinghai/dh_queue.h |  71 ++++
 drivers/net/ethernet/zte/dinghai/en_pf.c    | 439 ++++++++++++++++++++
 drivers/net/ethernet/zte/dinghai/en_pf.h    |  66 +++
 3 files changed, 576 insertions(+)
 create mode 100644 drivers/net/ethernet/zte/dinghai/dh_queue.h

diff --git a/drivers/net/ethernet/zte/dinghai/dh_queue.h b/drivers/net/ethernet/zte/dinghai/dh_queue.h
new file mode 100644
index 000000000000..5067c73fed33
--- /dev/null
+++ b/drivers/net/ethernet/zte/dinghai/dh_queue.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ZTE DingHai Ethernet driver - PCI capability definitions
+ * Copyright (c) 2022-2026, ZTE Corporation.
+ */
+
+#ifndef __DH_QUEUE_H__
+#define __DH_QUEUE_H__
+
+/* Vector value used to disable MSI for queue */
+#define ZXDH_MSI_NO_VECTOR      0xff
+
+/* Status byte for guest to report progress, and synchronize features */
+/* We have seen device and processed generic fields */
+#define ZXDH_CONFIG_S_ACKNOWLEDGE 1
+/* We have found a driver for the device. */
+#define ZXDH_CONFIG_S_DRIVER      2
+/* Driver has used its parts of the config, and is happy */
+#define ZXDH_CONFIG_S_DRIVER_OK   4
+/* Driver has finished configuring features */
+#define ZXDH_CONFIG_S_FEATURES_OK 8
+/* Device entered invalid state, driver must reset it */
+#define ZXDH_CONFIG_S_NEEDS_RESET 0x40
+/* We've given up on this device */
+#define ZXDH_CONFIG_S_FAILED      0x80
+
+/* This is the PCI capability header: */
+struct zxdh_pf_pci_cap {
+	__u8 cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
+	__u8 cap_next;		/* Generic PCI field: next ptr. */
+	__u8 cap_len;		/* Generic PCI field: capability length */
+	__u8 cfg_type;		/* Identifies the structure. */
+	__u8 bar;		/* Where to find it. */
+	__u8 id;		/* Multiple capabilities of the same type */
+	__u8 padding[2];		/* Pad to full dword. */
+	__le32 offset;		/* Offset within bar. */
+	__le32 length;		/* Length of the structure, in bytes. */
+};
+
+/* Fields in ZXDH_PF_PCI_CAP_COMMON_CFG: */
+struct zxdh_pf_pci_common_cfg {
+	/* About the whole device. */
+	__le32 device_feature_select; /* read-write */
+	__le32 device_feature;	/* read-only */
+	__le32 guest_feature_select; /* read-write */
+	__le32 guest_feature;		/* read-write */
+	__le16 msix_config;		/* read-write */
+	__le16 num_queues;		/* read-only */
+	__u8 device_status;		/* read-write */
+	__u8 config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	__le16 queue_select;		/* read-write */
+	__le16 queue_size;		/* read-write, power of 2. */
+	__le16 queue_msix_vector;	/* read-write */
+	__le16 queue_enable;		/* read-write */
+	__le16 queue_notify_off;	/* read-only */
+	__le32 queue_desc_lo;		/* read-write */
+	__le32 queue_desc_hi;		/* read-write */
+	__le32 queue_avail_lo;		/* read-write */
+	__le32 queue_avail_hi;		/* read-write */
+	__le32 queue_used_lo;		/* read-write */
+	__le32 queue_used_hi;		/* read-write */
+};
+
+struct zxdh_pf_pci_notify_cap {
+	struct zxdh_pf_pci_cap cap;
+	__le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */
+};
+
+#endif /* __DH_QUEUE_H__ */
diff --git a/drivers/net/ethernet/zte/dinghai/en_pf.c b/drivers/net/ethernet/zte/dinghai/en_pf.c
index 99f2a8af5bf4..401876623689 100644
--- a/drivers/net/ethernet/zte/dinghai/en_pf.c
+++ b/drivers/net/ethernet/zte/dinghai/en_pf.c
@@ -9,6 +9,7 @@
 #include <net/devlink.h>
 #include <linux/dma-mapping.h>
 #include "en_pf.h"
+#include "dh_queue.h"

 MODULE_AUTHOR("Junyang Han <han.junyang@zte.com.cn>");
 MODULE_DESCRIPTION("ZTE DingHai series Ethernet driver");
@@ -90,6 +91,444 @@ void dh_pf_pci_close(struct dh_core_dev *dev)
 	pci_disable_device(dev->pdev);
 }

+int zxdh_pf_pci_find_capability(struct pci_dev *pdev, u8 cfg_type,
+				u32 ioresource_types, int *bars)
+{
+	int pos;
+	u8 type;
+	u8 bar;
+
+	for (pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR); pos > 0;
+	     pos = pci_find_next_capability(pdev, pos, PCI_CAP_ID_VNDR)) {
+		pci_read_config_byte(pdev,
+				     pos + offsetof(struct zxdh_pf_pci_cap,
+							cfg_type), &type);
+		pci_read_config_byte(pdev,
+				     pos + offsetof(struct zxdh_pf_pci_cap, bar), &bar);
+
+		/* ignore structures with reserved BAR values */
+		if (bar > ZXDH_PF_MAX_BAR_VAL)
+			continue;
+
+		if (type == cfg_type) {
+			if (pci_resource_len(pdev, bar) &&
+			    pci_resource_flags(pdev, bar) & ioresource_types) {
+				*bars |= (1 << bar);
+				return pos;
+			}
+		}
+	}
+
+	return 0;
+}
+
+void __iomem *zxdh_pf_map_capability(struct dh_core_dev *dh_dev, int off,
+				     size_t minlen, u32 align,
+				     u32 start, u32 size,
+				     size_t *len, resource_size_t *pa,
+				     u32 *bar_off)
+{
+	struct pci_dev *pdev = dh_dev->pdev;
+	void __iomem *p;
+	u32 offset;
+	u32 length;
+	u8 bar;
+
+	pci_read_config_byte(pdev,
+			     off + offsetof(struct zxdh_pf_pci_cap, bar), &bar);
+	pci_read_config_dword(pdev,
+			      off + offsetof(struct zxdh_pf_pci_cap,
+						offset), &offset);
+	pci_read_config_dword(pdev,
+			      off + offsetof(struct zxdh_pf_pci_cap,
+						length), &length);
+
+	if (bar_off)
+		*bar_off = offset;
+
+	if (length <= start) {
+		dev_err(dh_dev->device, "bad capability len %u (>%u expected)\n",
+			length, start);
+		return NULL;
+	}
+
+	if (length - start < minlen) {
+		dev_err(dh_dev->device, "bad capability len %u (>=%zu expected)\n",
+			length, minlen);
+		return NULL;
+	}
+
+	length -= start;
+	if (start + offset < offset) {
+		dev_err(dh_dev->device, "map wrap-around %u+%u\n", start, offset);
+		return NULL;
+	}
+
+	offset += start;
+	if (offset & (align - 1)) {
+		dev_err(dh_dev->device, "offset %u not aligned to %u\n", offset, align);
+		return NULL;
+	}
+
+	if (length > size)
+		length = size;
+
+	if (len)
+		*len = length;
+
+	if (minlen + offset < minlen ||
+	    minlen + offset > pci_resource_len(pdev, bar)) {
+		dev_err(dh_dev->device,
+			"map custom queue %zu@%u out of range on bar %i length %lu\n",
+			minlen, offset, bar,
+			(unsigned long)pci_resource_len(pdev, bar));
+		return NULL;
+	}
+
+	p = pci_iomap_range(pdev, bar, offset, length);
+	if (!p) {
+		dev_err(dh_dev->device, "unable to map custom queue %u@%u on bar %i\n",
+			length, offset, bar);
+	} else if (pa) {
+		*pa = pci_resource_start(pdev, bar) + offset;
+	}
+
+	return p;
+}
+
+int zxdh_pf_common_cfg_init(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	struct pci_dev *pdev = dh_dev->pdev;
+	int common;
+
+	/* check for a common config: if not, use legacy mode (bar 0). */
+	common = zxdh_pf_pci_find_capability(pdev, ZXDH_PCI_CAP_COMMON_CFG,
+					     IORESOURCE_IO | IORESOURCE_MEM,
+					     &pf_dev->modern_bars);
+	if (common == 0) {
+		dev_err(dh_dev->device,
+			"missing capabilities %i, leaving for legacy driver\n",
+			common);
+		return -ENODEV;
+	}
+
+	pf_dev->common = zxdh_pf_map_capability(dh_dev, common,
+						sizeof(struct zxdh_pf_pci_common_cfg),
+						ZXDH_PF_ALIGN4, 0,
+						sizeof(struct zxdh_pf_pci_common_cfg),
+						NULL, NULL, NULL);
+	if (!pf_dev->common) {
+		dev_err(dh_dev->device, "pf_dev->common is null\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int zxdh_pf_notify_cfg_init(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	struct pci_dev *pdev = dh_dev->pdev;
+	u32 notify_length;
+	u32 notify_offset;
+	int notify;
+
+	/* If common is there, these should be too... */
+	notify = zxdh_pf_pci_find_capability(pdev, ZXDH_PCI_CAP_NOTIFY_CFG,
+					     IORESOURCE_IO | IORESOURCE_MEM,
+					     &pf_dev->modern_bars);
+	if (notify == 0) {
+		dev_err(dh_dev->device, "missing capabilities %i\n", notify);
+		return -EINVAL;
+	}
+
+	pci_read_config_dword(pdev,
+			      notify + offsetof(struct zxdh_pf_pci_notify_cap,
+				notify_off_multiplier),
+		&pf_dev->notify_offset_multiplier);
+	pci_read_config_dword(pdev,
+			      notify + offsetof(struct zxdh_pf_pci_notify_cap,
+				cap.length), &notify_length);
+	pci_read_config_dword(pdev,
+			      notify + offsetof(struct zxdh_pf_pci_notify_cap,
+				cap.offset), &notify_offset);
+
+	/* We don't know how many VQs we'll map ahead of time.
+	 * If notify length is small, map it all now. Otherwise,
+	 * map each VQ individually later.
+	 */
+	if (notify_length + (notify_offset % PAGE_SIZE) <= PAGE_SIZE) {
+		pf_dev->notify_base = zxdh_pf_map_capability(dh_dev, notify,
+							     ZXDH_PF_MAP_MINLEN2,
+							    ZXDH_PF_ALIGN2, 0,
+							    notify_length,
+							    &pf_dev->notify_len,
+							    &pf_dev->notify_pa, NULL);
+		if (!pf_dev->notify_base) {
+			dev_err(dh_dev->device, "pf_dev->notify_base is null\n");
+			return -EINVAL;
+		}
+	} else {
+		pf_dev->notify_map_cap = notify;
+	}
+
+	return 0;
+}
+
+int zxdh_pf_device_cfg_init(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	struct pci_dev *pdev = dh_dev->pdev;
+	int device;
+
+	/* Device capability is only mandatory for
+	 * devices that have device-specific configuration.
+	 */
+	device = zxdh_pf_pci_find_capability(pdev, ZXDH_PCI_CAP_DEVICE_CFG,
+					     IORESOURCE_IO | IORESOURCE_MEM,
+					     &pf_dev->modern_bars);
+
+	/* we don't know how much we should map,
+	 * but PAGE_SIZE is more than enough for all existing devices.
+	 */
+	if (device) {
+		pf_dev->device = zxdh_pf_map_capability(dh_dev, device, 0,
+							ZXDH_PF_ALIGN4, 0, PAGE_SIZE,
+						       &pf_dev->device_len, NULL,
+						       &pf_dev->dev_cfg_bar_off);
+		if (!pf_dev->device) {
+			dev_err(dh_dev->device, "pf_dev->device is null\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+void zxdh_pf_modern_cfg_uninit(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	struct pci_dev *pdev = dh_dev->pdev;
+
+	if (pf_dev->device)
+		pci_iounmap(pdev, pf_dev->device);
+	if (pf_dev->notify_base)
+		pci_iounmap(pdev, pf_dev->notify_base);
+	pci_iounmap(pdev, pf_dev->common);
+}
+
+int zxdh_pf_modern_cfg_init(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	struct pci_dev *pdev = dh_dev->pdev;
+	int ret;
+
+	ret = zxdh_pf_common_cfg_init(dh_dev);
+	if (ret) {
+		dev_err(dh_dev->device, "zxdh_pf_common_cfg_init failed: %d\n", ret);
+		return -EINVAL;
+	}
+
+	ret = zxdh_pf_notify_cfg_init(dh_dev);
+	if (ret) {
+		dev_err(dh_dev->device, "zxdh_pf_notify_cfg_init failed: %d\n", ret);
+		goto err_map_notify;
+	}
+
+	ret = zxdh_pf_device_cfg_init(dh_dev);
+	if (ret) {
+		dev_err(dh_dev->device, "zxdh_pf_device_cfg_init failed: %d\n", ret);
+		goto err_map_device;
+	}
+
+	return 0;
+
+err_map_device:
+	if (pf_dev->notify_base)
+		pci_iounmap(pdev, pf_dev->notify_base);
+err_map_notify:
+	pci_iounmap(pdev, pf_dev->common);
+	return -EINVAL;
+}
+
+u16 zxdh_pf_get_queue_notify_off(struct dh_core_dev *dh_dev,
+				 u16 phy_index, u16 index)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	if (pf_dev->packed_status)
+		iowrite16(phy_index, &pf_dev->common->queue_select);
+	else
+		iowrite16(index, &pf_dev->common->queue_select);
+
+	return ioread16(&pf_dev->common->queue_notify_off);
+}
+
+void __iomem *zxdh_pf_map_vq_notify(struct dh_core_dev *dh_dev,
+				    u16 phy_index, u16 index,
+				     resource_size_t *pa)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u16 off;
+
+	off = zxdh_pf_get_queue_notify_off(dh_dev, phy_index, index);
+
+	if (pf_dev->notify_base) {
+		/* offset should not wrap */
+		if ((u64)off *
+			pf_dev->notify_offset_multiplier + 2 > pf_dev->notify_len) {
+			dev_err(dh_dev->device,
+				"bad notification offset %u (x %u) for queue %u > %zd",
+				off, pf_dev->notify_offset_multiplier, phy_index,
+				pf_dev->notify_len);
+			return NULL;
+		}
+
+		if (pa)
+			*pa = pf_dev->notify_pa + off * pf_dev->notify_offset_multiplier;
+
+		return pf_dev->notify_base + off * pf_dev->notify_offset_multiplier;
+	} else {
+		return zxdh_pf_map_capability(dh_dev, pf_dev->notify_map_cap, 2, 2,
+					      off * pf_dev->notify_offset_multiplier,
+					      2, NULL, pa, NULL);
+	}
+}
+
+void zxdh_pf_unmap_vq_notify(struct dh_core_dev *dh_dev, void __iomem *priv)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	if (!pf_dev->notify_base)
+		pci_iounmap(dh_dev->pdev, priv);
+}
+
+void zxdh_pf_set_status(struct dh_core_dev *dh_dev, u8 status)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	iowrite8(status, &pf_dev->common->device_status);
+}
+
+u8 zxdh_pf_get_status(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	return ioread8(&pf_dev->common->device_status);
+}
+
+u8 zxdh_pf_get_cfg_gen(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u8 config_generation;
+
+	config_generation = ioread8(&pf_dev->common->config_generation);
+
+	return config_generation;
+}
+
+void zxdh_pf_get_vf_mac(struct dh_core_dev *dh_dev, u8 *mac, int vf_id)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u32 DEV_MAC_L;
+	u16 DEV_MAC_H;
+
+	if (pf_dev->pf_sriov_cap_base) {
+		DEV_MAC_L = ioread32(pf_dev->pf_sriov_cap_base +
+				     (pf_dev->sriov_bar_size) * vf_id +
+				     pf_dev->dev_cfg_bar_off);
+		mac[0] = DEV_MAC_L & 0xff;
+		mac[1] = (DEV_MAC_L >> 8) & 0xff;
+		mac[2] = (DEV_MAC_L >> 16) & 0xff;
+		mac[3] = (DEV_MAC_L >> 24) & 0xff;
+		DEV_MAC_H = ioread16(pf_dev->pf_sriov_cap_base +
+				      (pf_dev->sriov_bar_size) * vf_id +
+				      pf_dev->dev_cfg_bar_off +
+				      ZXDH_DEV_MAC_HIGH_OFFSET);
+		mac[4] = DEV_MAC_H & 0xff;
+		mac[5] = (DEV_MAC_H >> 8) & 0xff;
+	}
+}
+
+void zxdh_pf_set_vf_mac_reg(struct zxdh_pf_device *pf_dev,
+			    u8 *mac, int vf_id)
+{
+	u32 DEV_MAC_L;
+	u16 DEV_MAC_H;
+
+	if (pf_dev->pf_sriov_cap_base) {
+		DEV_MAC_L = mac[0] | (mac[1] << 8) |
+					(mac[2] << 16) | (mac[3] << 24);
+		DEV_MAC_H = mac[4] | (mac[5] << 8);
+		iowrite32(DEV_MAC_L, (pf_dev->pf_sriov_cap_base +
+			  (pf_dev->sriov_bar_size) * vf_id +
+			  pf_dev->dev_cfg_bar_off));
+		iowrite16(DEV_MAC_H, (pf_dev->pf_sriov_cap_base +
+			  (pf_dev->sriov_bar_size) * vf_id +
+			  pf_dev->dev_cfg_bar_off +
+			  ZXDH_DEV_MAC_HIGH_OFFSET));
+	}
+}
+
+void zxdh_pf_set_vf_mac(struct dh_core_dev *dh_dev, u8 *mac, int vf_id)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	zxdh_pf_set_vf_mac_reg(pf_dev, mac, vf_id);
+}
+
+void zxdh_set_mac(struct dh_core_dev *dh_dev, u8 *mac)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u32 DEV_MAC_L;
+	u16 DEV_MAC_H;
+
+	DEV_MAC_L = mac[0] | (mac[1] << 8) | (mac[2] << 16) | (mac[3] << 24);
+	DEV_MAC_H = mac[4] | (mac[5] << 8);
+	iowrite32(DEV_MAC_L, pf_dev->device);
+	iowrite16(DEV_MAC_H, pf_dev->device + ZXDH_DEV_MAC_HIGH_OFFSET);
+}
+
+void zxdh_get_mac(struct dh_core_dev *dh_dev, u8 *mac)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u32 DEV_MAC_L;
+	u16 DEV_MAC_H;
+
+	DEV_MAC_L = ioread32(pf_dev->device);
+	mac[0] = DEV_MAC_L & 0xff;
+	mac[1] = (DEV_MAC_L >> 8) & 0xff;
+	mac[2] = (DEV_MAC_L >> 16) & 0xff;
+	mac[3] = (DEV_MAC_L >> 24) & 0xff;
+	DEV_MAC_H = ioread16(pf_dev->device + ZXDH_DEV_MAC_HIGH_OFFSET);
+	mac[4] = DEV_MAC_H & 0xff;
+	mac[5] = (DEV_MAC_H >> 8) & 0xff;
+}
+
+u64 zxdh_pf_get_features(struct dh_core_dev *dh_dev)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+	u64 device_feature;
+
+	iowrite32(0, &pf_dev->common->device_feature_select);
+	device_feature = ioread32(&pf_dev->common->device_feature);
+	iowrite32(1, &pf_dev->common->device_feature_select);
+	device_feature |= ((u64)ioread32(&pf_dev->common->device_feature)
+						<< 32);
+
+	return device_feature;
+}
+
+void zxdh_pf_set_features(struct dh_core_dev *dh_dev, u64 features)
+{
+	struct zxdh_pf_device *pf_dev = dh_dev->priv;
+
+	iowrite32(0, &pf_dev->common->guest_feature_select);
+	iowrite32((u32)features, &pf_dev->common->guest_feature);
+	iowrite32(1, &pf_dev->common->guest_feature_select);
+	iowrite32(features >> 32, &pf_dev->common->guest_feature);
+}
+
 static int dh_pf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct zxdh_pf_device *pf_dev;
diff --git a/drivers/net/ethernet/zte/dinghai/en_pf.h b/drivers/net/ethernet/zte/dinghai/en_pf.h
index 80ff1b860b83..434d18944924 100644
--- a/drivers/net/ethernet/zte/dinghai/en_pf.h
+++ b/drivers/net/ethernet/zte/dinghai/en_pf.h
@@ -17,6 +17,24 @@
 #define ZXDH_PF_DEVICE_ID	0x8040
 #define ZXDH_VF_DEVICE_ID	0x8041

+/* Common configuration */
+#define ZXDH_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define ZXDH_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define ZXDH_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define ZXDH_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define ZXDH_PCI_CAP_PCI_CFG		5
+
+#define ZXDH_PF_MAX_BAR_VAL		0x5
+#define ZXDH_PF_ALIGN4			4
+#define ZXDH_PF_ALIGN2			2
+#define ZXDH_PF_MAP_MINLEN2		2
+
+#define ZXDH_DEV_MAC_HIGH_OFFSET	4
+
 enum dh_coredev_type {
 	DH_COREDEV_PF,
 	DH_COREDEV_VF,
@@ -36,7 +54,26 @@ struct dh_core_dev {
 };

 struct zxdh_pf_device {
+	struct zxdh_pf_pci_common_cfg __iomem *common;
+	/* Device-specific data (non-legacy mode). */
+	void __iomem *device;
+	/* Base of vq notifications (non-legacy mode). */
+	void __iomem *notify_base;
+	void __iomem *pf_sriov_cap_base;
+	/* Physical base of vq notifications */
+	resource_size_t notify_pa;
+	/* So we can sanity-check accesses. */
+	size_t notify_len;
+	size_t device_len;
+	/* Capability for when we need to map notifications per-vq. */
+	s32 notify_map_cap;
+	/* Multiply queue_notify_off by this value. (non-legacy mode). */
+	u32 notify_offset_multiplier;
+	s32 modern_bars;
 	void __iomem *pci_ioremap_addr[6];
+	u64 sriov_bar_size;
+	u32 dev_cfg_bar_off;
+	bool packed_status;
 	bool bar_chan_valid;
 	bool vepa;
 	struct mutex irq_lock; /* Protects IRQ operations */
@@ -61,5 +98,34 @@ static inline void dh_core_free_priv(struct dh_core_dev *dh_dev)
 	((pdev)->device == ZXDH_VF_DEVICE_ID ? DH_COREDEV_VF : DH_COREDEV_PF)

 void dh_pf_pci_close(struct dh_core_dev *dev);
+int zxdh_pf_pci_find_capability(struct pci_dev *pdev, u8 cfg_type,
+				u32 ioresource_types, int *bars);
+void __iomem *zxdh_pf_map_capability(struct dh_core_dev *dh_dev, int off,
+				     size_t minlen, u32 align,
+				     u32 start, u32 size,
+				     size_t *len, resource_size_t *pa,
+				     u32 *bar_off);
+int zxdh_pf_common_cfg_init(struct dh_core_dev *dh_dev);
+int zxdh_pf_notify_cfg_init(struct dh_core_dev *dh_dev);
+int zxdh_pf_device_cfg_init(struct dh_core_dev *dh_dev);
+void zxdh_pf_modern_cfg_uninit(struct dh_core_dev *dh_dev);
+int zxdh_pf_modern_cfg_init(struct dh_core_dev *dh_dev);
+u16 zxdh_pf_get_queue_notify_off(struct dh_core_dev *dh_dev,
+				 u16 phy_index, u16 index);
+void __iomem *zxdh_pf_map_vq_notify(struct dh_core_dev *dh_dev,
+				     u16 phy_index, u16 index,
+				     resource_size_t *pa);
+void zxdh_pf_unmap_vq_notify(struct dh_core_dev *dh_dev, void __iomem *priv);
+void zxdh_pf_set_status(struct dh_core_dev *dh_dev, u8 status);
+u8 zxdh_pf_get_status(struct dh_core_dev *dh_dev);
+u8 zxdh_pf_get_cfg_gen(struct dh_core_dev *dh_dev);
+void zxdh_pf_get_vf_mac(struct dh_core_dev *dh_dev, u8 *mac, int vf_id);
+void zxdh_pf_set_vf_mac_reg(struct zxdh_pf_device *pf_dev,
+			    u8 *mac, int vf_id);
+void zxdh_pf_set_vf_mac(struct dh_core_dev *dh_dev, u8 *mac, int vf_id);
+void zxdh_set_mac(struct dh_core_dev *dh_dev, u8 *mac);
+void zxdh_get_mac(struct dh_core_dev *dh_dev, u8 *mac);
+u64 zxdh_pf_get_features(struct dh_core_dev *dh_dev);
+void zxdh_pf_set_features(struct dh_core_dev *dh_dev, u64 features);

 #endif /* __ZXDH_EN_PF_H__ */
-- 
2.27.0

^ permalink raw reply related

* Re: [PATCH 0/4] vhost/vsock: add support for VHOST_RESET_OWNER and CPR migration
From: Stefano Garzarella @ 2026-06-16 13:35 UTC (permalink / raw)
  To: Andrey Drobyshev
  Cc: linux-kernel, kvm, virtualization, netdev, mst, stefanha,
	maciej.szmigiero, bchaney, mark.kanda, ptikhomirov, den
In-Reply-To: <20260612165718.433546-1-andrey.drobyshev@virtuozzo.com>

Hi Andrey,
thanks for the series!

On Fri, Jun 12, 2026 at 07:57:14PM +0300, Andrey Drobyshev wrote:
>Host<-->guest connections via AF_VSOCK sockets aren't supposed to
>outlive VM migration, since VM is moving to another host.  However
>there's a special case, which is QEMU live-update, or CPR
>(checkpoint-restore) migration.  In this case, VM remains on the same
>host, and we'd like such connections to persist.

In the spec we have VIRTIO_VSOCK_EVENT_TRANSPORT_RESET which is usually 
sent by the device after a migration.

IIUC the specs don't say this has to be done all the time, so we don't 
need to change anything in the specs, right?

We just need to avoid sending it (which I think is what we're doing 
here... I still need to look at the patches).

>
>For this to work, we need to be able to transfer device ownership from
>source QEMU to dest QEMU.  Namely, source needs to reset ownership by
>issuing VHOST_RESET_OWNER ioctl, and then target has to claim it by
>calling VHOST_SET_OWNER.
>
>Since VHOST_RESET_OWNER isn't yet implemented for vhost-vsock, let's add
>such implementation (patches 1-2).  Also fix regression introduced by
>the earlier commit [1] (patch 3), and fix the deadlock bug (patch 4).

If it's a regression, should we fix it separately?

Or is it related to this series?

>
>There's a complementary series for QEMU [0] adding support of vhost-vsock
>devices during CPR migration.
>
>NOTE: this series needs to be applied on top of Michael's vhost/linux-next
>tree as it contains relevant commit [1], not yet present in master branch.
>
>I've tested this (patched QEMU + patched kernel) approximately as follows:
>
>  * Run listener in the guest:
>  socat -u VSOCK-LISTEN:9999 - >/tmp/recv.bin
>
>  * Run data transfer from host to guest:
>  socat -u FILE:/root/bigfile.bin VSOCK-CONNECT:CID:9999
>
>  * Perform CPR migration during transfer (either cpr-exec or cpr-transfer)
>  * Check that file hash sum matches
>
>[0] https://lore.kernel.org/qemu-devel/20260612165110.431376-1-andrey.drobyshev@virtuozzo.com
>[1] https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/commit/?id=bb26ed5f3a8b
>
>Andrey Drobyshev (1):
>  vhost/vsock: re-scan TX virtqueue on device start
>
>Denis V. Lunev (1):
>  vhost/vsock: suppress EHOSTUNREACH fast-fail during CPR pause
>
>Pavel Tikhomirov (2):
>  vhost/vsock: split out vhost_vsock_drop_backends helper
>  vhost/vsock: add VHOST_RESET_OWNER ioctl
>
> drivers/vhost/vsock.c | 80 +++++++++++++++++++++++++++++++++++++------
> 1 file changed, 69 insertions(+), 11 deletions(-)
>
>-- 
>2.47.1
>


^ permalink raw reply

* Re: [PATCH bpf v2 1/2] bpf: Fix partial copy of non-linear test_run output
From: Paul Chaignon @ 2026-06-16 13:33 UTC (permalink / raw)
  To: Sun Jian
  Cc: bpf, netdev, linux-kselftest, linux-kernel, ast, daniel, andrii,
	martin.lau, eddyz87, memxor, song, yonghong.song, jolsa, davem,
	edumazet, kuba, pabeni, horms, shuah, hawk, john.fastabend, sdf,
	toke, lorenzo
In-Reply-To: <20260616093103.471444-2-sun.jian.kdev@gmail.com>

On Tue, Jun 16, 2026 at 05:31:02PM +0800, Sun Jian wrote:
> For non-linear test_run output, bpf_test_finish() derives the linear
> data copy length from copy_size - frag_size. This only matches the
> linear data length when copy_size is the full packet size.
> 
> When userspace provides a short data_out buffer, copy_size is clamped to
> that buffer size. If copy_size is smaller than frag_size, the computed
> length becomes negative and bpf_test_finish() returns -ENOSPC before
> copying the packet prefix or updating data_size_out.
> 
> Compute the linear data length from the packet layout instead, and clamp
> the linear copy length to copy_size. This preserves the expected
> partial-copy semantics: return -ENOSPC, copy the packet prefix that fits
> in data_out, and report the full packet length through data_size_out.
> 
> Fixes: 7855e0db150ad ("bpf: test_run: add xdp_shared_info pointer in bpf_test_finish signature")
> Signed-off-by: Sun Jian <sun.jian.kdev@gmail.com>
> ---
>  net/bpf/test_run.c | 11 ++++-------
>  1 file changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
> index 2bc04feadfab..976e8fa31bc9 100644
> --- a/net/bpf/test_run.c
> +++ b/net/bpf/test_run.c
> @@ -453,19 +453,16 @@ static int bpf_test_finish(const union bpf_attr *kattr,
>  	}
>  
>  	if (data_out) {
> -		int len = sinfo ? copy_size - frag_size : copy_size;
> -
> -		if (len < 0) {
> -			err = -ENOSPC;
> -			goto out;
> -		}
> +		u32 head_len = size - frag_size;
> +		u32 len = min(copy_size, head_len);
>  
>  		if (copy_to_user(data_out, data, len))
>  			goto out;
>  
>  		if (sinfo) {
> -			int i, offset = len;
> +			u32 offset = len;
>  			u32 data_len;
> +			int i;

That doesn't look needed.

>  
>  			for (i = 0; i < sinfo->nr_frags; i++) {
>  				skb_frag_t *frag = &sinfo->frags[i];
> -- 
> 2.43.0
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox