* [PATCH net-next v3 1/2] selftests: net: rds: add getsockopt() conversion test
2026-06-08 9:44 [PATCH net-next v3 0/2] net: rds: convert rds to getsockopt_iter Breno Leitao
@ 2026-06-08 9:44 ` Breno Leitao
2026-06-08 9:44 ` [PATCH net-next v3 2/2] rds: convert to getsockopt_iter Breno Leitao
1 sibling, 0 replies; 3+ messages in thread
From: Breno Leitao @ 2026-06-08 9:44 UTC (permalink / raw)
To: Allison Henderson, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Shuah Khan, Andy Grover
Cc: linux-kernel, netdev, linux-rdma, rds-devel, linux-kselftest,
Breno Leitao, kernel-team
Add a kselftest that exercises the RDS getsockopt() paths converted to
the getsockopt_iter() / sockopt_t callback:
- RDS_RECVERR and SO_RDS_TRANSPORT, which return their int value through
copy_to_iter() and report the written length in opt->optlen.
- RDS_INFO_*, which obtains the userspace buffer pages with
iov_iter_extract_pages() (including a non-zero starting page offset)
and lets the info producers copy the snapshot in under a spinlock.
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Allison Henderson <achender@kernel.org>
---
tools/testing/selftests/net/rds/.gitignore | 1 +
tools/testing/selftests/net/rds/Makefile | 4 +
tools/testing/selftests/net/rds/getsockopt.c | 208 +++++++++++++++++++++++++++
3 files changed, 213 insertions(+)
diff --git a/tools/testing/selftests/net/rds/.gitignore b/tools/testing/selftests/net/rds/.gitignore
index 1c6f04e2aa11..7ca4b1440f51 100644
--- a/tools/testing/selftests/net/rds/.gitignore
+++ b/tools/testing/selftests/net/rds/.gitignore
@@ -1 +1,2 @@
include.sh
+getsockopt
diff --git a/tools/testing/selftests/net/rds/Makefile b/tools/testing/selftests/net/rds/Makefile
index ec10ae24e4cf..ab9e92399a6d 100644
--- a/tools/testing/selftests/net/rds/Makefile
+++ b/tools/testing/selftests/net/rds/Makefile
@@ -5,6 +5,8 @@ all:
TEST_PROGS := rds_run.sh
+TEST_GEN_PROGS := getsockopt
+
TEST_FILES := \
include.sh \
settings \
@@ -16,4 +18,6 @@ EXTRA_CLEAN := \
/tmp/rds_logs \
# end of EXTRA_CLEAN
+CFLAGS += $(KHDR_INCLUDES)
+
include ../../lib.mk
diff --git a/tools/testing/selftests/net/rds/getsockopt.c b/tools/testing/selftests/net/rds/getsockopt.c
new file mode 100644
index 000000000000..93ff252c69b8
--- /dev/null
+++ b/tools/testing/selftests/net/rds/getsockopt.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Exercise the RDS getsockopt() paths that were converted to the
+ * getsockopt_iter() / sockopt_t callback.
+ *
+ * Three distinct paths are covered:
+ *
+ * - RDS_RECVERR and SO_RDS_TRANSPORT, which now return their int value
+ * through copy_to_iter() and report the written length in opt->optlen.
+ *
+ * - RDS_INFO_*, which pins the userspace buffer with
+ * iov_iter_extract_pages() (including a non-zero starting page offset)
+ * and lets the info producers memcpy the snapshot in under a spinlock.
+ *
+ * The kvec (in-kernel buffer) -> -EOPNOTSUPP path of rds_info_getsockopt()
+ * is not reachable from a userspace getsockopt() and so is not tested here.
+ */
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <linux/rds.h>
+
+#include "../../kselftest_harness.h"
+
+#ifndef AF_RDS
+#define AF_RDS 21
+#endif
+
+FIXTURE(rds) {
+ int fd;
+};
+
+FIXTURE_SETUP(rds)
+{
+ self->fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
+ if (self->fd < 0)
+ SKIP(return, "AF_RDS unavailable (errno %d) - load the rds module",
+ errno);
+}
+
+FIXTURE_TEARDOWN(rds)
+{
+ if (self->fd >= 0)
+ close(self->fd);
+}
+
+/* RDS_RECVERR defaults to 0 and is reported back as a 4-byte int. */
+TEST_F(rds, recverr_default)
+{
+ socklen_t len = sizeof(int);
+ int val = 0xdeadbeef;
+
+ ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, &len));
+ EXPECT_EQ(sizeof(int), len);
+ EXPECT_EQ(0, val);
+}
+
+/* A value set via setsockopt() must be readable back unchanged. */
+TEST_F(rds, recverr_set_get)
+{
+ socklen_t len = sizeof(int);
+ int val = 1;
+
+ ASSERT_EQ(0, setsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, len));
+
+ val = 0;
+ ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, &val, &len));
+ EXPECT_EQ(sizeof(int), len);
+ EXPECT_EQ(1, val);
+}
+
+/* A buffer smaller than an int is rejected with EINVAL, not silently. */
+TEST_F(rds, recverr_short_buffer)
+{
+ socklen_t len = sizeof(int) - 1;
+ char buf[sizeof(int)];
+
+ EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_RECVERR, buf, &len));
+ EXPECT_EQ(EINVAL, errno);
+}
+
+/* An unbound socket reports RDS_TRANS_NONE for SO_RDS_TRANSPORT. */
+TEST_F(rds, transport_unbound)
+{
+ socklen_t len = sizeof(int);
+ int val = 0;
+
+ ASSERT_EQ(0, getsockopt(self->fd, SOL_RDS, SO_RDS_TRANSPORT, &val,
+ &len));
+ EXPECT_EQ(sizeof(int), len);
+ EXPECT_EQ(RDS_TRANS_NONE, (unsigned int)val);
+}
+
+TEST_F(rds, transport_short_buffer)
+{
+ socklen_t len = sizeof(int) - 1;
+ char buf[sizeof(int)];
+
+ EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, SO_RDS_TRANSPORT, buf,
+ &len));
+ EXPECT_EQ(EINVAL, errno);
+}
+
+/*
+ * RDS_INFO_COUNTERS with a zero-length buffer is the "probe" call: it must
+ * fail with ENOSPC and report the required snapshot size in optlen.
+ */
+TEST_F(rds, info_counters_probe)
+{
+ socklen_t len = 0;
+
+ EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL,
+ &len));
+ EXPECT_EQ(ENOSPC, errno);
+ EXPECT_GT(len, 0);
+ /* The snapshot is an array of fixed-size counter records. */
+ EXPECT_EQ(0, len % (socklen_t)sizeof(struct rds_info_counter));
+}
+
+/*
+ * A real snapshot into an unaligned userspace buffer exercises the
+ * iov_iter_extract_pages() path, including the non-zero offset0 handling
+ * that the patch reworked. Place the buffer at a non-page-aligned address
+ * spanning into the next page to make sure multi-page pinning works too.
+ */
+TEST_F(rds, info_counters_snapshot)
+{
+ struct rds_info_counter *ctr;
+ socklen_t need = 0, len;
+ long pagesz = sysconf(_SC_PAGESIZE);
+ size_t offset, map_len;
+ unsigned int i, n;
+ char *region, *buf;
+ int ret;
+
+ /* Probe for the required size. */
+ getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, &need);
+ ASSERT_GT(need, 0);
+
+ /*
+ * Place the buffer at a non-page-aligned offset that runs past the
+ * first page boundary, and size the mapping from the probed length so
+ * the test keeps working if the counter set grows.
+ */
+ offset = pagesz - 64;
+ map_len = ((offset + need + pagesz - 1) / pagesz) * pagesz;
+
+ region = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, region);
+
+ buf = region + offset;
+
+ /*
+ * On success the RDS_INFO path returns the positive per-element size
+ * (lens.each) rather than 0, and writes the full snapshot length back
+ * into optlen.
+ */
+ len = need;
+ ret = getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len);
+ ASSERT_GE(ret, 0) {
+ TH_LOG("getsockopt snapshot failed: errno %d", errno);
+ }
+ EXPECT_EQ(sizeof(struct rds_info_counter), ret);
+ EXPECT_EQ(need, len);
+
+ /* The counter names must be NUL-terminated, non-empty strings. */
+ ctr = (struct rds_info_counter *)buf;
+ n = len / sizeof(*ctr);
+ ASSERT_GT(n, 0);
+ for (i = 0; i < n; i++) {
+ size_t namelen = strnlen((char *)ctr[i].name,
+ sizeof(ctr[i].name));
+
+ EXPECT_GT(namelen, 0);
+ EXPECT_LT(namelen, sizeof(ctr[i].name));
+ }
+
+ munmap(region, map_len);
+}
+
+/*
+ * A non-zero but too-small buffer must report ENOSPC and the full required
+ * length, without corrupting memory past the buffer.
+ */
+TEST_F(rds, info_counters_short_buffer)
+{
+ socklen_t need = 0, len;
+ char small[sizeof(struct rds_info_counter)];
+
+ getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, &need);
+ ASSERT_GT(need, 0);
+
+ /* Ask with a buffer guaranteed smaller than the full snapshot. */
+ if (need <= (socklen_t)sizeof(small))
+ SKIP(return, "snapshot fits in one record; nothing to test");
+
+ len = 1; /* < sizeof(struct rds_info_counter) */
+ EXPECT_EQ(-1, getsockopt(self->fd, SOL_RDS, RDS_INFO_COUNTERS, small,
+ &len));
+ EXPECT_EQ(ENOSPC, errno);
+ EXPECT_EQ(need, len);
+}
+
+TEST_HARNESS_MAIN
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 3+ messages in thread* [PATCH net-next v3 2/2] rds: convert to getsockopt_iter
2026-06-08 9:44 [PATCH net-next v3 0/2] net: rds: convert rds to getsockopt_iter Breno Leitao
2026-06-08 9:44 ` [PATCH net-next v3 1/2] selftests: net: rds: add getsockopt() conversion test Breno Leitao
@ 2026-06-08 9:44 ` Breno Leitao
1 sibling, 0 replies; 3+ messages in thread
From: Breno Leitao @ 2026-06-08 9:44 UTC (permalink / raw)
To: Allison Henderson, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Shuah Khan, Andy Grover
Cc: linux-kernel, netdev, linux-rdma, rds-devel, linux-kselftest,
Breno Leitao, kernel-team
Convert RDS socket's getsockopt implementation to use the new
getsockopt_iter callback with sockopt_t.
Key changes:
- Replace (char __user *optval, int __user *optlen) with sockopt_t *opt
- Use opt->optlen for buffer length (input) and returned size (output)
- Use copy_to_iter() instead of put_user()/copy_to_user()
The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the
userspace buffer with pin_user_pages_fast() on the raw optval address;
the info producers then memcpy into those pages under a spinlock via
kmap_atomic() and so must not fault. Obtain the same page array and
starting offset from opt->iter_out with iov_iter_extract_pages(), which
pins for write because iter_out is ITER_DEST.
The page array is preallocated here (sized with iov_iter_npages()) and
passed in, so iov_iter_extract_pages() fills it in place rather than
allocating one for us; RDS therefore keeps ownership of the array on
every return path and frees it itself. The rds_info_iterator /
rds_info_copy machinery and all producer callbacks are unchanged.
Kernel buffers (ITER_KVEC) are not page-backed in a way the info
producers can use, so the RDS_INFO path returns -EOPNOTSUPP for them;
this matches the previous behaviour, where a kernel-buffer getsockopt
hit the WARN_ONCE() path in do_sock_getsockopt() and returned
-EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep
working for kernel buffers via copy_to_iter().
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Allison Henderson <achender@kernel.org>
---
net/rds/af_rds.c | 36 +++++++++++++++------------
net/rds/info.c | 76 ++++++++++++++++++++++++++++++++------------------------
net/rds/info.h | 3 +--
3 files changed, 65 insertions(+), 50 deletions(-)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6f4f9cf352bd..d5defe9172e3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -37,6 +37,7 @@
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
+#include <linux/uio.h>
#include <net/sock.h>
#include "rds.h"
@@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
}
static int rds_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+ sockopt_t *opt)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len;
int trans;
+ int val;
if (level != SOL_RDS)
goto out;
- if (get_user(len, optlen)) {
- ret = -EFAULT;
- goto out;
- }
+ len = opt->optlen;
switch (optname) {
case RDS_INFO_FIRST ... RDS_INFO_LAST:
- ret = rds_info_getsockopt(sock, optname, optval,
- optlen);
+ ret = rds_info_getsockopt(sock, optname, opt);
break;
case RDS_RECVERR:
- if (len < sizeof(int))
+ if (len < sizeof(int)) {
ret = -EINVAL;
- else
- if (put_user(rs->rs_recverr, (int __user *) optval) ||
- put_user(sizeof(int), optlen))
+ break;
+ }
+ val = rs->rs_recverr;
+ if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
+ sizeof(int)) {
ret = -EFAULT;
- else
+ } else {
+ opt->optlen = sizeof(int);
ret = 0;
+ }
break;
case SO_RDS_TRANSPORT:
if (len < sizeof(int)) {
@@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
}
trans = (rs->rs_transport ? rs->rs_transport->t_type :
RDS_TRANS_NONE); /* unbound */
- if (put_user(trans, (int __user *)optval) ||
- put_user(sizeof(int), optlen))
+ if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
+ sizeof(int)) {
ret = -EFAULT;
- else
+ } else {
+ opt->optlen = sizeof(int);
ret = 0;
+ }
break;
default:
break;
@@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = {
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = rds_setsockopt,
- .getsockopt = rds_getsockopt,
+ .getsockopt_iter = rds_getsockopt,
.sendmsg = rds_sendmsg,
.recvmsg = rds_recvmsg,
.mmap = sock_no_mmap,
diff --git a/net/rds/info.c b/net/rds/info.c
index f1b29994934a..499b3774860e 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
+#include <linux/uio.h>
#include "rds.h"
@@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
EXPORT_SYMBOL_GPL(rds_info_copy);
/*
- * @optval points to the userspace buffer that the information snapshot
- * will be copied into.
- *
- * @optlen on input is the size of the buffer in userspace. @optlen
- * on output is the size of the requested snapshot in bytes.
+ * @opt->iter_out describes the buffer that the information snapshot will be
+ * copied into, and @opt->optlen is the size of that buffer on input. On
+ * output @opt->optlen is set to the size of the requested snapshot in bytes.
*
* This function returns -errno if there is a failure, particularly -ENOSPC
- * if the given userspace buffer was not large enough to fit the snapshot.
- * On success it returns the positive number of bytes of each array element
- * in the snapshot.
+ * if the given buffer was not large enough to fit the snapshot. On success
+ * it returns the positive number of bytes of each array element in the
+ * snapshot.
*/
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
- int __user *optlen)
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt)
{
struct rds_info_iterator iter;
struct rds_info_lengths lens;
unsigned long nr_pages = 0;
- unsigned long start;
rds_info_func func;
struct page **pages = NULL;
+ size_t offset0 = 0;
+ int npages = 0;
int ret;
int len;
int total;
- if (get_user(len, optlen)) {
- ret = -EFAULT;
- goto out;
- }
+ len = opt->optlen;
/* check for all kinds of wrapping and the like */
- start = (unsigned long)optval;
- if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
+ if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) {
ret = -EINVAL;
goto out;
}
+ /* The info producers write into the pages with kmap_atomic() while
+ * holding a spinlock, so they need a genuine page-backed user buffer.
+ */
+ if (!user_backed_iter(&opt->iter_out)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
/* a 0 len call is just trying to probe its length */
if (len == 0)
goto call_func;
- nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
- >> PAGE_SHIFT;
-
- pages = kmalloc_objs(struct page *, nr_pages);
+ /*
+ * Preallocate the page array and pass it in so that
+ * iov_iter_extract_pages() fills it in place rather than allocating
+ * one for us. Handing it a non-NULL array keeps ownership of the
+ * array with us on every return path, instead of depending on the
+ * iterator code to allocate and hand it back.
+ */
+ npages = iov_iter_npages(&opt->iter_out, INT_MAX);
+ pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
- ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
- if (ret != nr_pages) {
- if (ret > 0)
- nr_pages = ret;
- else
- nr_pages = 0;
+
+ ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages,
+ 0, &offset0);
+ if (ret < 0)
+ goto out;
+ nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE);
+ if (ret != len) {
ret = -EAGAIN; /* XXX ? */
goto out;
}
@@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
iter.pages = pages;
iter.addr = NULL;
- iter.offset = start & (PAGE_SIZE - 1);
+ iter.offset = offset0;
func(sock, len, &iter, &lens);
BUG_ON(lens.each == 0);
@@ -230,13 +239,16 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = lens.each;
}
- if (put_user(len, optlen))
- ret = -EFAULT;
+ opt->optlen = len;
out:
- if (pages)
+ /*
+ * iov_iter_extract_pages() pins only user-backed (ubuf) iters;
+ * iov_iter_extract_will_pin() reports whether an unpin is owed here.
+ */
+ if (pages && iov_iter_extract_will_pin(&opt->iter_out))
unpin_user_pages(pages, nr_pages);
- kfree(pages);
+ kvfree(pages);
return ret;
}
diff --git a/net/rds/info.h b/net/rds/info.h
index a069b51c4679..1aab62ab6d00 100644
--- a/net/rds/info.h
+++ b/net/rds/info.h
@@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
void rds_info_register_func(int optname, rds_info_func func);
void rds_info_deregister_func(int optname, rds_info_func func);
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
- int __user *optlen);
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt);
void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long bytes);
void rds_info_iter_unmap(struct rds_info_iterator *iter);
--
2.53.0-Meta
^ permalink raw reply related [flat|nested] 3+ messages in thread