From: Breno Leitao <leitao@debian.org>
To: Allison Henderson <achender@kernel.org>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>,
Paolo Abeni <pabeni@redhat.com>, Simon Horman <horms@kernel.org>,
Shuah Khan <shuah@kernel.org>,
Andy Grover <andy.grover@oracle.com>
Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
linux-rdma@vger.kernel.org, rds-devel@oss.oracle.com,
linux-kselftest@vger.kernel.org,
Breno Leitao <leitao@debian.org>,
kernel-team@meta.com
Subject: [PATCH net-next v3 2/2] rds: convert to getsockopt_iter
Date: Mon, 08 Jun 2026 02:44:58 -0700 [thread overview]
Message-ID: <20260608-getsock_more-v3-2-706ecf2ea332@debian.org> (raw)
In-Reply-To: <20260608-getsock_more-v3-0-706ecf2ea332@debian.org>
Convert RDS socket's getsockopt implementation to use the new
getsockopt_iter callback with sockopt_t.
Key changes:
- Replace (char __user *optval, int __user *optlen) with sockopt_t *opt
- Use opt->optlen for buffer length (input) and returned size (output)
- Use copy_to_iter() instead of put_user() to return the option value
The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the
userspace buffer with pin_user_pages_fast() on the raw optval address;
the info producers then memcpy into those pages under a spinlock via
kmap_atomic() and so must not fault. Obtain the same page array and
starting offset from opt->iter_out with iov_iter_extract_pages(), which
pins for write because iter_out is ITER_DEST.
The page array is preallocated here (sized with iov_iter_npages()) and
passed in, so iov_iter_extract_pages() fills it in place rather than
allocating one for us; RDS therefore keeps ownership of the array on
every return path and frees it itself. The rds_info_iterator /
rds_info_copy machinery and all producer callbacks are unchanged.
Kernel buffers (ITER_KVEC) are not page-backed in a way the info
producers can use, so the RDS_INFO path returns -EOPNOTSUPP for any
iterator that is not user-backed (checked with user_backed_iter());
this matches the previous behaviour, where a kernel-buffer getsockopt
hit the WARN_ONCE() path in do_sock_getsockopt() and returned
-EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep
working for kernel buffers via copy_to_iter().
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Allison Henderson <achender@kernel.org>
---
net/rds/af_rds.c | 36 +++++++++++++++------------
net/rds/info.c | 76 ++++++++++++++++++++++++++++++++------------------------
net/rds/info.h | 3 +--
3 files changed, 65 insertions(+), 50 deletions(-)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6f4f9cf352bd..d5defe9172e3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -37,6 +37,7 @@
#include <linux/in.h>
#include <linux/ipv6.h>
#include <linux/poll.h>
+#include <linux/uio.h>
#include <net/sock.h>
#include "rds.h"
@@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
}
static int rds_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+ sockopt_t *opt)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len;
int trans;
+ int val;
if (level != SOL_RDS)
goto out;
- if (get_user(len, optlen)) {
- ret = -EFAULT;
- goto out;
- }
+ len = opt->optlen;
switch (optname) {
case RDS_INFO_FIRST ... RDS_INFO_LAST:
- ret = rds_info_getsockopt(sock, optname, optval,
- optlen);
+ ret = rds_info_getsockopt(sock, optname, opt);
break;
case RDS_RECVERR:
- if (len < sizeof(int))
+ if (len < sizeof(int)) {
ret = -EINVAL;
- else
- if (put_user(rs->rs_recverr, (int __user *) optval) ||
- put_user(sizeof(int), optlen))
+ break;
+ }
+ val = rs->rs_recverr;
+ if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
+ sizeof(int)) {
ret = -EFAULT;
- else
+ } else {
+ opt->optlen = sizeof(int);
ret = 0;
+ }
break;
case SO_RDS_TRANSPORT:
if (len < sizeof(int)) {
@@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
}
trans = (rs->rs_transport ? rs->rs_transport->t_type :
RDS_TRANS_NONE); /* unbound */
- if (put_user(trans, (int __user *)optval) ||
- put_user(sizeof(int), optlen))
+ if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
+ sizeof(int)) {
ret = -EFAULT;
- else
+ } else {
+ opt->optlen = sizeof(int);
ret = 0;
+ }
break;
default:
break;
@@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = {
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = rds_setsockopt,
- .getsockopt = rds_getsockopt,
+ .getsockopt_iter = rds_getsockopt,
.sendmsg = rds_sendmsg,
.recvmsg = rds_recvmsg,
.mmap = sock_no_mmap,
diff --git a/net/rds/info.c b/net/rds/info.c
index f1b29994934a..499b3774860e 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
+#include <linux/uio.h>
#include "rds.h"
@@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
EXPORT_SYMBOL_GPL(rds_info_copy);
/*
- * @optval points to the userspace buffer that the information snapshot
- * will be copied into.
- *
- * @optlen on input is the size of the buffer in userspace. @optlen
- * on output is the size of the requested snapshot in bytes.
+ * @opt->iter_out describes the buffer that the information snapshot will be
+ * copied into, and @opt->optlen is the size of that buffer on input. On
+ * output @opt->optlen is set to the size of the requested snapshot in bytes.
*
* This function returns -errno if there is a failure, particularly -ENOSPC
- * if the given userspace buffer was not large enough to fit the snapshot.
- * On success it returns the positive number of bytes of each array element
- * in the snapshot.
+ * if the given buffer was not large enough to fit the snapshot. On success
+ * it returns the positive number of bytes of each array element in the
+ * snapshot.
*/
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
- int __user *optlen)
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt)
{
struct rds_info_iterator iter;
struct rds_info_lengths lens;
unsigned long nr_pages = 0;
- unsigned long start;
rds_info_func func;
struct page **pages = NULL;
+ size_t offset0 = 0;
+ int npages = 0;
int ret;
int len;
int total;
- if (get_user(len, optlen)) {
- ret = -EFAULT;
- goto out;
- }
+ len = opt->optlen;
/* check for all kinds of wrapping and the like */
- start = (unsigned long)optval;
- if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
+ if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) {
ret = -EINVAL;
goto out;
}
+ /* The info producers write into the pages with kmap_atomic() while
+ * holding a spinlock, so they need a genuine page-backed user buffer.
+ */
+ if (!user_backed_iter(&opt->iter_out)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
/* a 0 len call is just trying to probe its length */
if (len == 0)
goto call_func;
- nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
- >> PAGE_SHIFT;
-
- pages = kmalloc_objs(struct page *, nr_pages);
+ /*
+ * Preallocate the page array and pass it in so that
+ * iov_iter_extract_pages() fills it in place rather than allocating
+ * one for us. Handing it a non-NULL array keeps ownership of the
+ * array with us on every return path, instead of depending on the
+ * iterator code to allocate and hand it back.
+ */
+ npages = iov_iter_npages(&opt->iter_out, INT_MAX);
+ pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
- ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
- if (ret != nr_pages) {
- if (ret > 0)
- nr_pages = ret;
- else
- nr_pages = 0;
+
+ ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages,
+ 0, &offset0);
+ if (ret < 0)
+ goto out;
+ nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE);
+ if (ret != len) {
ret = -EAGAIN; /* XXX ? */
goto out;
}
@@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
iter.pages = pages;
iter.addr = NULL;
- iter.offset = start & (PAGE_SIZE - 1);
+ iter.offset = offset0;
func(sock, len, &iter, &lens);
BUG_ON(lens.each == 0);
@@ -230,13 +239,16 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = lens.each;
}
- if (put_user(len, optlen))
- ret = -EFAULT;
+ opt->optlen = len;
out:
- if (pages)
+ /*
+ * iov_iter_extract_pages() pins only user-backed (ubuf) iters;
+ * iov_iter_extract_will_pin() reports whether an unpin is owed here.
+ */
+ if (pages && iov_iter_extract_will_pin(&opt->iter_out))
unpin_user_pages(pages, nr_pages);
- kfree(pages);
+ kvfree(pages);
return ret;
}
diff --git a/net/rds/info.h b/net/rds/info.h
index a069b51c4679..1aab62ab6d00 100644
--- a/net/rds/info.h
+++ b/net/rds/info.h
@@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
void rds_info_register_func(int optname, rds_info_func func);
void rds_info_deregister_func(int optname, rds_info_func func);
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
- int __user *optlen);
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt);
void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long bytes);
void rds_info_iter_unmap(struct rds_info_iterator *iter);
--
2.53.0-Meta
prev parent reply other threads:[~2026-06-08 9:46 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-08 9:44 [PATCH net-next v3 0/2] net: rds: convert rds to getsockopt_iter Breno Leitao
2026-06-08 9:44 ` [PATCH net-next v3 1/2] selftests: net: rds: add getsockopt() conversion test Breno Leitao
2026-06-08 9:44 ` Breno Leitao [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260608-getsock_more-v3-2-706ecf2ea332@debian.org \
--to=leitao@debian.org \
--cc=achender@kernel.org \
--cc=andy.grover@oracle.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=kernel-team@meta.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rds-devel@oss.oracle.com \
--cc=shuah@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox