linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v5 01/16] tcp_bpf, smc, tls, espintcp, siw: Reduce MSG_SENDPAGE_NOTLAST usage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
@ 2023-06-23 22:54 ` David Howells
  2023-06-23 22:54 ` [PATCH net-next v5 02/16] net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock() David Howells
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:54 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Bernard Metzler, Jason Gunthorpe, Leon Romanovsky, John Fastabend,
	Jakub Sitnicki, Karsten Graul, Wenjia Zhang, Jan Karcher,
	D. Wythe, Tony Lu, Wen Gu, Boris Pismenny, Steffen Klassert,
	Herbert Xu, bpf, linux-s390, linux-rdma

As MSG_SENDPAGE_NOTLAST is being phased out along with sendpage(), don't
use it further in than the sendpage methods, but rather translate it to
MSG_MORE and use that instead.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
cc: Bernard Metzler <bmt@zurich.ibm.com>
cc: Jason Gunthorpe <jgg@ziepe.ca>
cc: Leon Romanovsky <leon@kernel.org>
cc: John Fastabend <john.fastabend@gmail.com>
cc: Jakub Sitnicki <jakub@cloudflare.com>
cc: Eric Dumazet <edumazet@google.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: David Ahern <dsahern@kernel.org>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Karsten Graul <kgraul@linux.ibm.com>
cc: Wenjia Zhang <wenjia@linux.ibm.com>
cc: Jan Karcher <jaka@linux.ibm.com>
cc: "D. Wythe" <alibuda@linux.alibaba.com>
cc: Tony Lu <tonylu@linux.alibaba.com>
cc: Wen Gu <guwen@linux.alibaba.com>
cc: Boris Pismenny <borisp@nvidia.com>
cc: Steffen Klassert <steffen.klassert@secunet.com>
cc: Herbert Xu <herbert@gondor.apana.org.au>
cc: netdev@vger.kernel.org
cc: bpf@vger.kernel.org
cc: linux-s390@vger.kernel.org
cc: linux-rdma@vger.kernel.org
---

Notes:
    ver #3)
     - In tcp_bpf, reset msg_flags on each iteration to clear MSG_MORE.
     - In tcp_bpf, set MSG_MORE if there's more data in the sk_msg.

 drivers/infiniband/sw/siw/siw_qp_tx.c |  5 ++---
 net/ipv4/tcp_bpf.c                    |  5 +++--
 net/smc/smc_tx.c                      |  6 ++++--
 net/tls/tls_device.c                  |  4 ++--
 net/xfrm/espintcp.c                   | 10 ++++++----
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index ffb16beb6c30..7c7a51d36d0c 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -325,8 +325,7 @@ static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
 {
 	struct bio_vec bvec;
 	struct msghdr msg = {
-		.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST |
-			      MSG_SPLICE_PAGES),
+		.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES),
 	};
 	struct sock *sk = s->sk;
 	int i = 0, rv = 0, sent = 0;
@@ -335,7 +334,7 @@ static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
 		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
 
 		if (size + offset <= PAGE_SIZE)
-			msg.msg_flags &= ~MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags &= ~MSG_MORE;
 
 		tcp_rate_check_app_limited(sk);
 		bvec_set_page(&bvec, page[i], bytes, offset);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 5a84053ac62b..31d6005cea9b 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -88,9 +88,9 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
 			int flags, bool uncharge)
 {
+	struct msghdr msghdr = {};
 	bool apply = apply_bytes;
 	struct scatterlist *sge;
-	struct msghdr msghdr = { .msg_flags = flags | MSG_SPLICE_PAGES, };
 	struct page *page;
 	int size, ret = 0;
 	u32 off;
@@ -107,11 +107,12 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
 
 		tcp_rate_check_app_limited(sk);
 retry:
+		msghdr.msg_flags = flags | MSG_SPLICE_PAGES;
 		has_tx_ulp = tls_sw_has_ctx_tx(sk);
 		if (has_tx_ulp)
 			msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY;
 
-		if (flags & MSG_SENDPAGE_NOTLAST)
+		if (size < sge->length && msg->sg.start != msg->sg.end)
 			msghdr.msg_flags |= MSG_MORE;
 
 		bvec_set_page(&bvec, page, size, off);
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 45128443f1f1..9b9e0a190734 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -168,8 +168,7 @@ static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
 	 * should known how/when to uncork it.
 	 */
 	if ((msg->msg_flags & MSG_MORE ||
-	     smc_tx_is_corked(smc) ||
-	     msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
+	     smc_tx_is_corked(smc)) &&
 	    atomic_read(&conn->sndbuf_space))
 		return true;
 
@@ -306,6 +305,9 @@ int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
 	struct kvec iov;
 	int rc;
 
+	if (flags & MSG_SENDPAGE_NOTLAST)
+		msg.msg_flags |= MSG_MORE;
+
 	iov.iov_base = kaddr + offset;
 	iov.iov_len = size;
 	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b82770f68807..975299d7213b 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -449,7 +449,7 @@ static int tls_push_data(struct sock *sk,
 		return -sk->sk_err;
 
 	flags |= MSG_SENDPAGE_DECRYPTED;
-	tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
+	tls_push_record_flags = flags | MSG_MORE;
 
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 	if (tls_is_partially_sent_record(tls_ctx)) {
@@ -532,7 +532,7 @@ static int tls_push_data(struct sock *sk,
 		if (!size) {
 last_record:
 			tls_push_record_flags = flags;
-			if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) {
+			if (flags & MSG_MORE) {
 				more = true;
 				break;
 			}
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index 3504925babdb..d3b3f9e720b3 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -205,13 +205,15 @@ static int espintcp_sendskb_locked(struct sock *sk, struct espintcp_msg *emsg,
 static int espintcp_sendskmsg_locked(struct sock *sk,
 				     struct espintcp_msg *emsg, int flags)
 {
-	struct msghdr msghdr = { .msg_flags = flags | MSG_SPLICE_PAGES, };
+	struct msghdr msghdr = {
+		.msg_flags = flags | MSG_SPLICE_PAGES | MSG_MORE,
+	};
 	struct sk_msg *skmsg = &emsg->skmsg;
+	bool more = flags & MSG_MORE;
 	struct scatterlist *sg;
 	int done = 0;
 	int ret;
 
-	msghdr.msg_flags |= MSG_SENDPAGE_NOTLAST;
 	sg = &skmsg->sg.data[skmsg->sg.start];
 	do {
 		struct bio_vec bvec;
@@ -221,8 +223,8 @@ static int espintcp_sendskmsg_locked(struct sock *sk,
 
 		emsg->offset = 0;
 
-		if (sg_is_last(sg))
-			msghdr.msg_flags &= ~MSG_SENDPAGE_NOTLAST;
+		if (sg_is_last(sg) && !more)
+			msghdr.msg_flags &= ~MSG_MORE;
 
 		p = sg_page(sg);
 retry:



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 02/16] net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock()
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
  2023-06-23 22:54 ` [PATCH net-next v5 01/16] tcp_bpf, smc, tls, espintcp, siw: Reduce MSG_SENDPAGE_NOTLAST usage David Howells
@ 2023-06-23 22:54 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:54 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel

Use sendmsg() with MSG_SPLICE_PAGES rather than sendpage in
skb_send_sock().  This causes pages to be spliced from the source iterator
if possible.

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.

Note that this could perhaps be improved to fill out a bvec array with all
the frags and then make a single sendmsg call, possibly sticking the header
on the front also.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
---

Notes:
    ver #2)
     - Wrap lines at 80.

 net/core/skbuff.c | 50 ++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fee2b1c105fe..6c5915efbc17 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2989,32 +2989,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 }
 EXPORT_SYMBOL_GPL(skb_splice_bits);
 
-static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
-			    struct kvec *vec, size_t num, size_t size)
+static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
 {
 	struct socket *sock = sk->sk_socket;
+	size_t size = msg_data_left(msg);
 
 	if (!sock)
 		return -EINVAL;
-	return kernel_sendmsg(sock, msg, vec, num, size);
+
+	if (!sock->ops->sendmsg_locked)
+		return sock_no_sendmsg_locked(sk, msg, size);
+
+	return sock->ops->sendmsg_locked(sk, msg, size);
 }
 
-static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
-			     size_t size, int flags)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
 {
 	struct socket *sock = sk->sk_socket;
 
 	if (!sock)
 		return -EINVAL;
-	return kernel_sendpage(sock, page, offset, size, flags);
+	return sock_sendmsg(sock, msg);
 }
 
-typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
-			    struct kvec *vec, size_t num, size_t size);
-typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
-			     size_t size, int flags);
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
-			   int len, sendmsg_func sendmsg, sendpage_func sendpage)
+			   int len, sendmsg_func sendmsg)
 {
 	unsigned int orig_len = len;
 	struct sk_buff *head = skb;
@@ -3034,8 +3034,9 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
 		memset(&msg, 0, sizeof(msg));
 		msg.msg_flags = MSG_DONTWAIT;
 
-		ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
-				      sendmsg_unlocked, sk, &msg, &kv, 1, slen);
+		iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
+		ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+				      sendmsg_unlocked, sk, &msg);
 		if (ret <= 0)
 			goto error;
 
@@ -3066,11 +3067,18 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
 		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
 
 		while (slen) {
-			ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
-					      sendpage_unlocked, sk,
-					      skb_frag_page(frag),
-					      skb_frag_off(frag) + offset,
-					      slen, MSG_DONTWAIT);
+			struct bio_vec bvec;
+			struct msghdr msg = {
+				.msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+			};
+
+			bvec_set_page(&bvec, skb_frag_page(frag), slen,
+				      skb_frag_off(frag) + offset);
+			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
+				      slen);
+
+			ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+					      sendmsg_unlocked, sk, &msg);
 			if (ret <= 0)
 				goto error;
 
@@ -3107,16 +3115,14 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len)
 {
-	return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
-			       kernel_sendpage_locked);
+	return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
 }
 EXPORT_SYMBOL_GPL(skb_send_sock_locked);
 
 /* Send skb data on a socket. Socket must be unlocked. */
 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
 {
-	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
-			       sendpage_unlocked);
+	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
 }
 
 /**



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
  2023-06-23 22:54 ` [PATCH net-next v5 01/16] tcp_bpf, smc, tls, espintcp, siw: Reduce MSG_SENDPAGE_NOTLAST usage David Howells
  2023-06-23 22:54 ` [PATCH net-next v5 02/16] net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock() David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-25 12:20   ` Ilya Dryomov
                     ` (2 more replies)
  2023-06-23 22:55 ` [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
                   ` (12 subsequent siblings)
  15 siblings, 3 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Ilya Dryomov,
	Xiubo Li, Jeff Layton, ceph-devel

Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data.  For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #5)
     - Switch condition for setting MSG_MORE in write_partial_message_data()

 net/ceph/messenger_v1.c | 60 ++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index d664cb1593a7..814579f27f04 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -74,37 +74,6 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 	return r;
 }
 
-/*
- * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
- */
-static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
-			     int offset, size_t size, int more)
-{
-	ssize_t (*sendpage)(struct socket *sock, struct page *page,
-			    int offset, size_t size, int flags);
-	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
-	int ret;
-
-	/*
-	 * sendpage cannot properly handle pages with page_count == 0,
-	 * we need to fall back to sendmsg if that's the case.
-	 *
-	 * Same goes for slab pages: skb_can_coalesce() allows
-	 * coalescing neighboring slab objects into a single frag which
-	 * triggers one of hardened usercopy checks.
-	 */
-	if (sendpage_ok(page))
-		sendpage = sock->ops->sendpage;
-	else
-		sendpage = sock_no_sendpage;
-
-	ret = sendpage(sock, page, offset, size, flags);
-	if (ret == -EAGAIN)
-		ret = 0;
-
-	return ret;
-}
-
 static void con_out_kvec_reset(struct ceph_connection *con)
 {
 	BUG_ON(con->v1.out_skip);
@@ -464,7 +433,6 @@ static int write_partial_message_data(struct ceph_connection *con)
 	struct ceph_msg *msg = con->out_msg;
 	struct ceph_msg_data_cursor *cursor = &msg->cursor;
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
-	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
 	u32 crc;
 
 	dout("%s %p msg %p\n", __func__, con, msg);
@@ -482,6 +450,10 @@ static int write_partial_message_data(struct ceph_connection *con)
 	 */
 	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
 	while (cursor->total_resid) {
+		struct bio_vec bvec;
+		struct msghdr msghdr = {
+			.msg_flags = MSG_SPLICE_PAGES,
+		};
 		struct page *page;
 		size_t page_offset;
 		size_t length;
@@ -493,10 +465,13 @@ static int write_partial_message_data(struct ceph_connection *con)
 		}
 
 		page = ceph_msg_data_next(cursor, &page_offset, &length);
-		if (length == cursor->total_resid)
-			more = MSG_MORE;
-		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
-					more);
+		if (length != cursor->total_resid)
+			msghdr.msg_flags |= MSG_MORE;
+
+		bvec_set_page(&bvec, page, length, page_offset);
+		iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, length);
+
+		ret = sock_sendmsg(con->sock, &msghdr);
 		if (ret <= 0) {
 			if (do_datacrc)
 				msg->footer.data_crc = cpu_to_le32(crc);
@@ -526,7 +501,10 @@ static int write_partial_message_data(struct ceph_connection *con)
  */
 static int write_partial_skip(struct ceph_connection *con)
 {
-	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+	struct bio_vec bvec;
+	struct msghdr msghdr = {
+		.msg_flags = MSG_SPLICE_PAGES | MSG_MORE,
+	};
 	int ret;
 
 	dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
@@ -534,9 +512,11 @@ static int write_partial_skip(struct ceph_connection *con)
 		size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
 
 		if (size == con->v1.out_skip)
-			more = MSG_MORE;
-		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
-					more);
+			msghdr.msg_flags &= ~MSG_MORE;
+		bvec_set_page(&bvec, ZERO_PAGE(0), size, 0);
+		iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+		ret = sock_sendmsg(con->sock, &msghdr);
 		if (ret <= 0)
 			goto out;
 		con->v1.out_skip -= ret;



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (2 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-25 12:34   ` Ilya Dryomov
  2023-06-26 15:30   ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 05/16] rds: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
                   ` (11 subsequent siblings)
  15 siblings, 2 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Ilya Dryomov,
	Xiubo Li, Jeff Layton, ceph-devel

Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
transmitting data.  For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: ceph-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---
 net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
 1 file changed, 19 insertions(+), 72 deletions(-)

diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 301a991dc6a6..87ac97073e75 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
 	return ret;
 }
 
-static int do_sendmsg(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	int ret;
-
-	msg.msg_iter = *it;
-	while (iov_iter_count(it)) {
-		ret = sock_sendmsg(sock, &msg);
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	WARN_ON(msg_data_left(&msg));
-	return 1;
-}
-
-static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
-{
-	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
-	struct bio_vec bv;
-	int ret;
-
-	if (WARN_ON(!iov_iter_is_bvec(it)))
-		return -EINVAL;
-
-	while (iov_iter_count(it)) {
-		/* iov_iter_iovec() for ITER_BVEC */
-		bvec_set_page(&bv, it->bvec->bv_page,
-			      min(iov_iter_count(it),
-				  it->bvec->bv_len - it->iov_offset),
-			      it->bvec->bv_offset + it->iov_offset);
-
-		/*
-		 * sendpage cannot properly handle pages with
-		 * page_count == 0, we need to fall back to sendmsg if
-		 * that's the case.
-		 *
-		 * Same goes for slab pages: skb_can_coalesce() allows
-		 * coalescing neighboring slab objects into a single frag
-		 * which triggers one of hardened usercopy checks.
-		 */
-		if (sendpage_ok(bv.bv_page)) {
-			ret = sock->ops->sendpage(sock, bv.bv_page,
-						  bv.bv_offset, bv.bv_len,
-						  CEPH_MSG_FLAGS);
-		} else {
-			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-			ret = sock_sendmsg(sock, &msg);
-		}
-		if (ret <= 0) {
-			if (ret == -EAGAIN)
-				ret = 0;
-			return ret;
-		}
-
-		iov_iter_advance(it, ret);
-	}
-
-	return 1;
-}
-
 /*
  * Write as much as possible.  The socket is expected to be corked,
- * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ * so we don't bother with MSG_MORE here.
  *
  * Return:
- *   1 - done, nothing (else) to write
+ *  >0 - done, nothing (else) to write
  *   0 - socket is full, need to wait
  *  <0 - error
  */
 static int ceph_tcp_send(struct ceph_connection *con)
 {
+	struct msghdr msg = {
+		.msg_iter	= con->v2.out_iter,
+		.msg_flags	= CEPH_MSG_FLAGS,
+	};
 	int ret;
 
+	if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
+		return -EINVAL;
+
+	if (con->v2.out_iter_sendpage)
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
 	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
 	     iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
-	if (con->v2.out_iter_sendpage)
-		ret = do_try_sendpage(con->sock, &con->v2.out_iter);
-	else
-		ret = do_sendmsg(con->sock, &con->v2.out_iter);
+
+	ret = sock_sendmsg(con->sock, &msg);
+	if (ret > 0)
+		iov_iter_advance(&con->v2.out_iter, ret);
+	else if (ret == -EAGAIN)
+		ret = 0;
+
 	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
 	     iov_iter_count(&con->v2.out_iter));
 	return ret;



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 05/16] rds: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (3 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 06/16] dlm: " David Howells
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Santosh Shilimkar, linux-rdma, rds-devel

When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced.

To make this work, the data is assembled in a bio_vec array and attached to
a BVEC-type iterator.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-rdma@vger.kernel.org
cc: rds-devel@oss.oracle.com
cc: netdev@vger.kernel.org
---

Notes:
    ver #4)
     - Reduce change to only call sendmsg on a page at a time.

 net/rds/tcp_send.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 8c4d1d6e9249..7d284ac7e81a 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -72,9 +72,10 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 {
 	struct rds_conn_path *cp = rm->m_inc.i_conn_path;
 	struct rds_tcp_connection *tc = cp->cp_transport_data;
+	struct msghdr msg = {};
+	struct bio_vec bvec;
 	int done = 0;
 	int ret = 0;
-	int more;
 
 	if (hdr_off == 0) {
 		/*
@@ -111,15 +112,17 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			goto out;
 	}
 
-	more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0;
 	while (sg < rm->data.op_nents) {
-		int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
-
-		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-						sg_page(&rm->data.op_sg[sg]),
-						rm->data.op_sg[sg].offset + off,
-						rm->data.op_sg[sg].length - off,
-						flags);
+		msg.msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL;
+		if (sg + 1 < rm->data.op_nents)
+			msg.msg_flags |= MSG_MORE;
+
+		bvec_set_page(&bvec, sg_page(&rm->data.op_sg[sg]),
+			      rm->data.op_sg[sg].length - off,
+			      rm->data.op_sg[sg].offset + off);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
+			      rm->data.op_sg[sg].length - off);
+		ret = sock_sendmsg(tc->t_sock, &msg);
 		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
 			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
 			 ret);
@@ -132,8 +135,6 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			off = 0;
 			sg++;
 		}
-		if (sg == rm->data.op_nents - 1)
-			more = 0;
 	}
 
 out:



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 06/16] dlm: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (4 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 05/16] rds: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 07/16] nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage David Howells
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Christine Caulfield, David Teigland, cluster-devel

When transmitting data, call down a layer using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather using
sendpage.  This allows ->sendpage() to be replaced by something that can
handle multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Christine Caulfield <ccaulfie@redhat.com>
cc: David Teigland <teigland@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: cluster-devel@redhat.com
cc: netdev@vger.kernel.org
---
 fs/dlm/lowcomms.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3d3802c47b8b..5c12d8cdfc16 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1395,8 +1395,11 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
 /* Send a message */
 static int send_to_sock(struct connection *con)
 {
-	const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
 	struct writequeue_entry *e;
+	struct bio_vec bvec;
+	struct msghdr msg = {
+		.msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL,
+	};
 	int len, offset, ret;
 
 	spin_lock_bh(&con->writequeue_lock);
@@ -1412,8 +1415,9 @@ static int send_to_sock(struct connection *con)
 	WARN_ON_ONCE(len == 0 && e->users == 0);
 	spin_unlock_bh(&con->writequeue_lock);
 
-	ret = kernel_sendpage(con->sock, e->page, offset, len,
-			      msg_flags);
+	bvec_set_page(&bvec, e->page, len, offset);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(con->sock, &msg);
 	trace_dlm_send(con->nodeid, ret);
 	if (ret == -EAGAIN || ret == 0) {
 		lock_sock(con->sock->sk);



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 07/16] nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (5 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 06/16] dlm: " David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 08/16] nvmet-tcp: " David Howells
                   ` (8 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Sagi Grimberg,
	Willem de Bruijn, Keith Busch, Jens Axboe, Christoph Hellwig,
	Chaitanya Kulkarni, linux-nvme

When transmitting data, call down into TCP using a sendmsg with
MSG_SPLICE_PAGES instead of sendpage.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Willem de Bruijn <willemb@google.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nvme@lists.infradead.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #4)
     - Cancel MSG_SPLICE_PAGES if the page being sent fails sendpage_ok().
    
    ver #3)
     - Split nvme/host from nvme/target changes.
    
    ver #2)
     - Wrap lines at 80.

 drivers/nvme/host/tcp.c | 49 +++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..47ae17f16c05 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -997,25 +997,28 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 	u32 h2cdata_left = req->h2cdata_left;
 
 	while (true) {
+		struct bio_vec bvec;
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = nvme_tcp_req_cur_page(req);
 		size_t offset = nvme_tcp_req_cur_offset(req);
 		size_t len = nvme_tcp_req_cur_length(req);
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
-		int ret, flags = MSG_DONTWAIT;
+		int ret;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
-			flags |= MSG_EOR;
+			msg.msg_flags |= MSG_EOR;
 		else
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		if (sendpage_ok(page)) {
-			ret = kernel_sendpage(queue->sock, page, offset, len,
-					flags);
-		} else {
-			ret = sock_no_sendpage(queue->sock, page, offset, len,
-					flags);
-		}
+		if (!sendpage_ok(page))
+			msg.msg_flags &= ~MSG_SPLICE_PAGES,
+
+		bvec_set_page(&bvec, page, len, offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+		ret = sock_sendmsg(queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -1054,22 +1057,24 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
-	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-			offset_in_page(pdu) + req->offset, len,  flags);
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1093,6 +1098,8 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 {
 	struct nvme_tcp_queue *queue = req->queue;
 	struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
+	struct bio_vec bvec;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
@@ -1101,13 +1108,11 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
 	if (!req->h2cdata_left)
-		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
-	else
-		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
-				offset_in_page(pdu) + req->offset, len,
-				MSG_DONTWAIT | MSG_MORE);
+		msg.msg_flags |= MSG_SPLICE_PAGES;
+
+	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+	ret = sock_sendmsg(queue->sock, &msg);
 	if (unlikely(ret <= 0))
 		return ret;
 



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 08/16] nvmet-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (6 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 07/16] nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 09/16] smc: Drop smc_sendpage() in favour of smc_sendmsg() + MSG_SPLICE_PAGES David Howells
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Sagi Grimberg,
	Willem de Bruijn, Keith Busch, Jens Axboe, Christoph Hellwig,
	Chaitanya Kulkarni, linux-nvme

When transmitting data, call down into TCP using a single sendmsg with
MSG_SPLICE_PAGES to indicate that content should be spliced rather than
copied instead of calling sendpage.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Willem de Bruijn <willemb@google.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-nvme@lists.infradead.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #3)
     - Split nvme/host from nvme/target changes.
    
    ver #2)
     - Wrap lines at 80.

 drivers/nvme/target/tcp.c | 46 ++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index ed98df72c76b..868aa4de2e4c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -576,13 +576,17 @@ static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
 
 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
 {
+	struct msghdr msg = {
+		.msg_flags = MSG_DONTWAIT | MSG_MORE | MSG_SPLICE_PAGES,
+	};
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
 	int ret;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
-			offset_in_page(cmd->data_pdu) + cmd->offset,
-			left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
+	bvec_set_virt(&bvec, (void *)cmd->data_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 
@@ -603,17 +607,21 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 	int ret;
 
 	while (cmd->cur_sg) {
+		struct msghdr msg = {
+			.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+		};
 		struct page *page = sg_page(cmd->cur_sg);
+		struct bio_vec bvec;
 		u32 left = cmd->cur_sg->length - cmd->offset;
-		int flags = MSG_DONTWAIT;
 
 		if ((!last_in_batch && cmd->queue->send_list_len) ||
 		    cmd->wbytes_done + left < cmd->req.transfer_len ||
 		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
-		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
-					left, flags);
+		bvec_set_page(&bvec, page, left, cmd->offset);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+		ret = sock_sendmsg(cmd->queue->sock, &msg);
 		if (ret <= 0)
 			return ret;
 
@@ -649,18 +657,20 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 		bool last_in_batch)
 {
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (!last_in_batch && cmd->queue->send_list_len)
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
-		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+	bvec_set_virt(&bvec, (void *)cmd->rsp_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 	cmd->offset += ret;
@@ -677,18 +687,20 @@ static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 
 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 {
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
+	struct bio_vec bvec;
 	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
-	int flags = MSG_DONTWAIT;
 	int ret;
 
 	if (!last_in_batch && cmd->queue->send_list_len)
-		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+		msg.msg_flags |= MSG_MORE;
 	else
-		flags |= MSG_EOR;
+		msg.msg_flags |= MSG_EOR;
 
-	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
-		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+	bvec_set_virt(&bvec, (void *)cmd->r2t_pdu + cmd->offset, left);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, left);
+	ret = sock_sendmsg(cmd->queue->sock, &msg);
 	if (ret <= 0)
 		return ret;
 	cmd->offset += ret;



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 09/16] smc: Drop smc_sendpage() in favour of smc_sendmsg() + MSG_SPLICE_PAGES
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (7 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 08/16] nvmet-tcp: " David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 10/16] drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Karsten Graul,
	Wenjia Zhang, Jan Karcher, D. Wythe, Tony Lu, Wen Gu, linux-s390

Drop the smc_sendpage() code as smc_sendmsg() just passes the call down to
the underlying TCP socket and smc_tx_sendpage() is just a wrapper around
its sendmsg implementation.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Karsten Graul <kgraul@linux.ibm.com>
cc: Wenjia Zhang <wenjia@linux.ibm.com>
cc: Jan Karcher <jaka@linux.ibm.com>
cc: "D. Wythe" <alibuda@linux.alibaba.com>
cc: Tony Lu <tonylu@linux.alibaba.com>
cc: Wen Gu <guwen@linux.alibaba.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-s390@vger.kernel.org
cc: netdev@vger.kernel.org
---
 net/smc/af_smc.c    | 29 -----------------------------
 net/smc/smc_stats.c |  2 +-
 net/smc/smc_stats.h |  1 -
 net/smc/smc_tx.c    | 19 -------------------
 net/smc/smc_tx.h    |  2 --
 5 files changed, 1 insertion(+), 52 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 538e9c6ec8c9..a7f887d91d89 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3133,34 +3133,6 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
 	return put_user(answ, (int __user *)arg);
 }
 
-static ssize_t smc_sendpage(struct socket *sock, struct page *page,
-			    int offset, size_t size, int flags)
-{
-	struct sock *sk = sock->sk;
-	struct smc_sock *smc;
-	int rc = -EPIPE;
-
-	smc = smc_sk(sk);
-	lock_sock(sk);
-	if (sk->sk_state != SMC_ACTIVE) {
-		release_sock(sk);
-		goto out;
-	}
-	release_sock(sk);
-	if (smc->use_fallback) {
-		rc = kernel_sendpage(smc->clcsock, page, offset,
-				     size, flags);
-	} else {
-		lock_sock(sk);
-		rc = smc_tx_sendpage(smc, page, offset, size, flags);
-		release_sock(sk);
-		SMC_STAT_INC(smc, sendpage_cnt);
-	}
-
-out:
-	return rc;
-}
-
 /* Map the affected portions of the rmbe into an spd, note the number of bytes
  * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
  * updates till whenever a respective page has been fully processed.
@@ -3232,7 +3204,6 @@ static const struct proto_ops smc_sock_ops = {
 	.sendmsg	= smc_sendmsg,
 	.recvmsg	= smc_recvmsg,
 	.mmap		= sock_no_mmap,
-	.sendpage	= smc_sendpage,
 	.splice_read	= smc_splice_read,
 };
 
diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c
index e80e34f7ac15..ca14c0f3a07d 100644
--- a/net/smc/smc_stats.c
+++ b/net/smc/smc_stats.c
@@ -227,7 +227,7 @@ static int smc_nl_fill_stats_tech_data(struct sk_buff *skb,
 			      SMC_NLA_STATS_PAD))
 		goto errattr;
 	if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT,
-			      smc_tech->sendpage_cnt,
+			      0,
 			      SMC_NLA_STATS_PAD))
 		goto errattr;
 	if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT,
diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h
index 84b7ecd8c05c..b60fe1eb37ab 100644
--- a/net/smc/smc_stats.h
+++ b/net/smc/smc_stats.h
@@ -71,7 +71,6 @@ struct smc_stats_tech {
 	u64			clnt_v2_succ_cnt;
 	u64			srv_v1_succ_cnt;
 	u64			srv_v2_succ_cnt;
-	u64			sendpage_cnt;
 	u64			urg_data_cnt;
 	u64			splice_cnt;
 	u64			cork_cnt;
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 9b9e0a190734..3b0ff3b589c7 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -297,25 +297,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 	return rc;
 }
 
-int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
-		    size_t size, int flags)
-{
-	struct msghdr msg = {.msg_flags = flags};
-	char *kaddr = kmap(page);
-	struct kvec iov;
-	int rc;
-
-	if (flags & MSG_SENDPAGE_NOTLAST)
-		msg.msg_flags |= MSG_MORE;
-
-	iov.iov_base = kaddr + offset;
-	iov.iov_len = size;
-	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size);
-	rc = smc_tx_sendmsg(smc, &msg, size);
-	kunmap(page);
-	return rc;
-}
-
 /***************************** sndbuf consumer *******************************/
 
 /* sndbuf consumer: actual data transfer of one target chunk with ISM write */
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index 34b578498b1f..a59f370b8b43 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -31,8 +31,6 @@ void smc_tx_pending(struct smc_connection *conn);
 void smc_tx_work(struct work_struct *work);
 void smc_tx_init(struct smc_sock *smc);
 int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
-int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset,
-		    size_t size, int flags);
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
 void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
 void smc_tx_consumer_update(struct smc_connection *conn, bool force);



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 10/16] drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (8 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 09/16] smc: Drop smc_sendpage() in favour of smc_sendmsg() + MSG_SPLICE_PAGES David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Philipp Reisner, Lars Ellenberg, Christoph Böhmwalder,
	drbd-dev, linux-block

Use sendmsg() conditionally with MSG_SPLICE_PAGES in _drbd_send_page()
rather than calling sendpage() or _drbd_no_send_page().

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Philipp Reisner <philipp.reisner@linbit.com>
cc: Lars Ellenberg <lars.ellenberg@linbit.com>
cc: "Christoph Böhmwalder" <christoph.boehmwalder@linbit.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: drbd-dev@lists.linbit.com
cc: linux-block@vger.kernel.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #4)
     - Don't look at msg.msg_iter after calling sendmsg.  There's no guarantee
       it has changed.
    
    ver #2)
     - Wrap lines at 80.

 drivers/block/drbd/drbd_main.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 83987e7a5ef2..ea82d6733313 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1540,6 +1540,8 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa
 		    int offset, size_t size, unsigned msg_flags)
 {
 	struct socket *socket = peer_device->connection->data.socket;
+	struct msghdr msg = { .msg_flags = msg_flags, };
+	struct bio_vec bvec;
 	int len = size;
 	int err = -EIO;
 
@@ -1549,15 +1551,17 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa
 	 * put_page(); and would cause either a VM_BUG directly, or
 	 * __page_cache_release a page that would actually still be referenced
 	 * by someone, leading to some obscure delayed Oops somewhere else. */
-	if (drbd_disable_sendpage || !sendpage_ok(page))
-		return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
+	if (!drbd_disable_sendpage && sendpage_ok(page))
+		msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES;
 
-	msg_flags |= MSG_NOSIGNAL;
 	drbd_update_congested(peer_device->connection);
 	do {
 		int sent;
 
-		sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
+		bvec_set_page(&bvec, page, offset, len);
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
+
+		sent = sock_sendmsg(socket, &msg);
 		if (sent <= 0) {
 			if (sent == -EAGAIN) {
 				if (we_should_drop_the_connection(peer_device->connection, socket))



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (9 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 10/16] drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-27 19:02   ` Chris Leech
  2023-06-23 22:55 ` [PATCH net-next v5 12/16] scsi: target: iscsi: " David Howells
                   ` (4 subsequent siblings)
  15 siblings, 1 reply; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Mike Christie,
	Lee Duncan, Chris Leech, James E.J. Bottomley, Martin K. Petersen,
	Al Viro, open-iscsi, linux-scsi, target-devel

Use sendmsg() with MSG_SPLICE_PAGES rather than sendpage.  This allows
multiple pages and multipage folios to be passed through.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Mike Christie <michael.christie@oracle.com>
cc: Lee Duncan <lduncan@suse.com>
cc: Chris Leech <cleech@redhat.com>
cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
cc: "Martin K. Petersen" <martin.petersen@oracle.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: open-iscsi@googlegroups.com
cc: linux-scsi@vger.kernel.org
cc: target-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #5)
     - Split iscsi changes into client and target patches

 drivers/scsi/iscsi_tcp.c | 26 ++++++++++----------------
 drivers/scsi/iscsi_tcp.h |  2 --
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 9637d4bc2bc9..9ab8555180a3 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -301,35 +301,32 @@ static int iscsi_sw_tcp_xmit_segment(struct iscsi_tcp_conn *tcp_conn,
 
 	while (!iscsi_tcp_segment_done(tcp_conn, segment, 0, r)) {
 		struct scatterlist *sg;
+		struct msghdr msg = {};
+		struct bio_vec bv;
 		unsigned int offset, copy;
-		int flags = 0;
 
 		r = 0;
 		offset = segment->copied;
 		copy = segment->size - offset;
 
 		if (segment->total_copied + segment->size < segment->total_size)
-			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+			msg.msg_flags |= MSG_MORE;
 
 		if (tcp_sw_conn->queue_recv)
-			flags |= MSG_DONTWAIT;
+			msg.msg_flags |= MSG_DONTWAIT;
 
-		/* Use sendpage if we can; else fall back to sendmsg */
 		if (!segment->data) {
+			if (!tcp_conn->iscsi_conn->datadgst_en)
+				msg.msg_flags |= MSG_SPLICE_PAGES;
 			sg = segment->sg;
 			offset += segment->sg_offset + sg->offset;
-			r = tcp_sw_conn->sendpage(sk, sg_page(sg), offset,
-						  copy, flags);
+			bvec_set_page(&bv, sg_page(sg), copy, offset);
 		} else {
-			struct msghdr msg = { .msg_flags = flags };
-			struct kvec iov = {
-				.iov_base = segment->data + offset,
-				.iov_len = copy
-			};
-
-			r = kernel_sendmsg(sk, &msg, &iov, 1, copy);
+			bvec_set_virt(&bv, segment->data + offset, copy);
 		}
+		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, copy);
 
+		r = sock_sendmsg(sk, &msg);
 		if (r < 0) {
 			iscsi_tcp_segment_unmap(segment);
 			return r;
@@ -746,7 +743,6 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
 	sock_no_linger(sk);
 
 	iscsi_sw_tcp_conn_set_callbacks(conn);
-	tcp_sw_conn->sendpage = tcp_sw_conn->sock->ops->sendpage;
 	/*
 	 * set receive state machine into initial state
 	 */
@@ -777,8 +773,6 @@ static int iscsi_sw_tcp_conn_set_param(struct iscsi_cls_conn *cls_conn,
 			return -ENOTCONN;
 		}
 		iscsi_set_param(cls_conn, param, buf, buflen);
-		tcp_sw_conn->sendpage = conn->datadgst_en ?
-			sock_no_sendpage : tcp_sw_conn->sock->ops->sendpage;
 		mutex_unlock(&tcp_sw_conn->sock_lock);
 		break;
 	case ISCSI_PARAM_MAX_R2T:
diff --git a/drivers/scsi/iscsi_tcp.h b/drivers/scsi/iscsi_tcp.h
index 68e14a344904..89a6fc552f0b 100644
--- a/drivers/scsi/iscsi_tcp.h
+++ b/drivers/scsi/iscsi_tcp.h
@@ -47,8 +47,6 @@ struct iscsi_sw_tcp_conn {
 	/* MIB custom statistics */
 	uint32_t		sendpage_failures_cnt;
 	uint32_t		discontiguous_hdr_cnt;
-
-	ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
 };
 
 struct iscsi_sw_tcp_host {



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 12/16] scsi: target: iscsi: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (10 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 13/16] ocfs2: Fix use of slab data with sendpage David Howells
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Mike Christie,
	Maurizio Lombardi, James E.J. Bottomley, Martin K. Petersen,
	Al Viro, open-iscsi, linux-scsi, target-devel

Use sendmsg() with MSG_SPLICE_PAGES rather than sendpage.  This allows
multiple pages and multipage folios to be passed through.

TODO: iscsit_fe_sendpage_sg() should perhaps set up a bio_vec array for the
entire set of pages it's going to transfer plus two for the header and
trailer and page fragments to hold the header and trailer - and then call
sendmsg once for the entire message.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Mike Christie <michael.christie@oracle.com>
cc: Maurizio Lombardi <mlombard@redhat.com>
cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
cc: "Martin K. Petersen" <martin.petersen@oracle.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: open-iscsi@googlegroups.com
cc: linux-scsi@vger.kernel.org
cc: target-devel@vger.kernel.org
cc: netdev@vger.kernel.org
---

Notes:
    ver #5)
     - Split iscsi changes into client and target patches
    
    ver #2)
     - Wrap lines at 80.

 drivers/target/iscsi/iscsi_target_util.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index b14835fcb033..6231fa4ef5c6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -1129,6 +1129,8 @@ int iscsit_fe_sendpage_sg(
 	struct iscsit_conn *conn)
 {
 	struct scatterlist *sg = cmd->first_data_sg;
+	struct bio_vec bvec;
+	struct msghdr msghdr = { .msg_flags = MSG_SPLICE_PAGES,	};
 	struct kvec iov;
 	u32 tx_hdr_size, data_len;
 	u32 offset = cmd->first_data_sg_off;
@@ -1172,17 +1174,18 @@ int iscsit_fe_sendpage_sg(
 		u32 space = (sg->length - offset);
 		u32 sub_len = min_t(u32, data_len, space);
 send_pg:
-		tx_sent = conn->sock->ops->sendpage(conn->sock,
-					sg_page(sg), sg->offset + offset, sub_len, 0);
+		bvec_set_page(&bvec, sg_page(sg), sub_len, sg->offset + offset);
+		iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, sub_len);
+
+		tx_sent = conn->sock->ops->sendmsg(conn->sock, &msghdr,
+						   sub_len);
 		if (tx_sent != sub_len) {
 			if (tx_sent == -EAGAIN) {
-				pr_err("tcp_sendpage() returned"
-						" -EAGAIN\n");
+				pr_err("sendmsg/splice returned -EAGAIN\n");
 				goto send_pg;
 			}
 
-			pr_err("tcp_sendpage() failure: %d\n",
-					tx_sent);
+			pr_err("sendmsg/splice failure: %d\n", tx_sent);
 			return -1;
 		}
 



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 13/16] ocfs2: Fix use of slab data with sendpage
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (11 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 12/16] scsi: target: iscsi: " David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 14/16] ocfs2: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Mark Fasheh,
	Kurt Hackel, Joel Becker, Joseph Qi, ocfs2-devel

ocfs2 uses kzalloc() to allocate buffers for o2net_hand, o2net_keep_req and
o2net_keep_resp and then passes these to sendpage.  This isn't really
allowed as the lifetime of slab objects is not controlled by page ref -
though in this case it will probably work.  sendmsg() with MSG_SPLICE_PAGES
will, however, print a warning and give an error.

Fix it to use folio_alloc() instead to allocate a buffer for the handshake
message, keepalive request and reply messages.

Fixes: 98211489d414 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Mark Fasheh <mark@fasheh.com>
cc: Kurt Hackel <kurt.hackel@oracle.com>
cc: Joel Becker <jlbec@evilplan.org>
cc: Joseph Qi <joseph.qi@linux.alibaba.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: ocfs2-devel@oss.oracle.com
cc: netdev@vger.kernel.org
---
 fs/ocfs2/cluster/tcp.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aecbd712a00c..929a1133bc18 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2087,18 +2087,24 @@ void o2net_stop_listening(struct o2nm_node *node)
 
 int o2net_init(void)
 {
+	struct folio *folio;
+	void *p;
 	unsigned long i;
 
 	o2quo_init();
-
 	o2net_debugfs_init();
 
-	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
-	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
-	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
-	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+	folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0);
+	if (!folio)
 		goto out;
 
+	p = folio_address(folio);
+	o2net_hand = p;
+	p += sizeof(struct o2net_handshake);
+	o2net_keep_req = p;
+	p += sizeof(struct o2net_msg);
+	o2net_keep_resp = p;
+
 	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
 	o2net_hand->connector_id = cpu_to_be64(1);
 
@@ -2124,9 +2130,6 @@ int o2net_init(void)
 	return 0;
 
 out:
-	kfree(o2net_hand);
-	kfree(o2net_keep_req);
-	kfree(o2net_keep_resp);
 	o2net_debugfs_exit();
 	o2quo_exit();
 	return -ENOMEM;
@@ -2135,8 +2138,6 @@ int o2net_init(void)
 void o2net_exit(void)
 {
 	o2quo_exit();
-	kfree(o2net_hand);
-	kfree(o2net_keep_req);
-	kfree(o2net_keep_resp);
 	o2net_debugfs_exit();
+	folio_put(virt_to_folio(o2net_hand));
 }



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 14/16] ocfs2: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (12 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 13/16] ocfs2: Fix use of slab data with sendpage David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-23 22:55 ` [PATCH net-next v5 16/16] net: Kill MSG_SENDPAGE_NOTLAST David Howells
  2023-06-24 23:00 ` [PATCH net-next v5 00/16] splice, net: Switch over users of sendpage() and remove it patchwork-bot+netdevbpf
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Mark Fasheh,
	Joel Becker, Joseph Qi, ocfs2-devel

Switch ocfs2 from using sendpage() to using sendmsg() + MSG_SPLICE_PAGES so
that sendpage can be phased out.

Signed-off-by: David Howells <dhowells@redhat.com>

cc: Mark Fasheh <mark@fasheh.com>
cc: Joel Becker <jlbec@evilplan.org>
cc: Joseph Qi <joseph.qi@linux.alibaba.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: ocfs2-devel@oss.oracle.com
cc: netdev@vger.kernel.org
---

Notes:
    ver #4)
     - Use folio_alloc() for o2net_hand, o2net_keep_req and o2net_keep_resp.
    
    ver #2)
     - Wrap lines at 80.

 fs/ocfs2/cluster/tcp.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 929a1133bc18..960080753d3b 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -930,19 +930,22 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 }
 
 static void o2net_sendpage(struct o2net_sock_container *sc,
-			   void *kmalloced_virt,
-			   size_t size)
+			   void *virt, size_t size)
 {
 	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+	struct msghdr msg = {};
+	struct bio_vec bv;
 	ssize_t ret;
 
+	bvec_set_virt(&bv, virt, size);
+	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size);
+
 	while (1) {
+		msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES;
 		mutex_lock(&sc->sc_send_lock);
-		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
-						 virt_to_page(kmalloced_virt),
-						 offset_in_page(kmalloced_virt),
-						 size, MSG_DONTWAIT);
+		ret = sock_sendmsg(sc->sc_sock, &msg);
 		mutex_unlock(&sc->sc_send_lock);
+
 		if (ret == size)
 			break;
 		if (ret == (ssize_t)-EAGAIN) {



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH net-next v5 16/16] net: Kill MSG_SENDPAGE_NOTLAST
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (13 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 14/16] ocfs2: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
@ 2023-06-23 22:55 ` David Howells
  2023-06-24 23:00 ` [PATCH net-next v5 00/16] splice, net: Switch over users of sendpage() and remove it patchwork-bot+netdevbpf
  15 siblings, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-23 22:55 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, bpf, dccp,
	linux-afs, linux-arm-msm, linux-can, linux-crypto, linux-doc,
	linux-hams, linux-perf-users, linux-rdma, linux-sctp, linux-wpan,
	linux-x25, mptcp, rds-devel, tipc-discussion, virtualization

Now that ->sendpage() has been removed, MSG_SENDPAGE_NOTLAST can be cleaned
up.  Things were converted to use MSG_MORE instead, but the protocol
sendpage stubs still convert MSG_SENDPAGE_NOTLAST to MSG_MORE, which is now
unnecessary.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
cc: bpf@vger.kernel.org
cc: dccp@vger.kernel.org
cc: linux-afs@lists.infradead.org
cc: linux-arm-msm@vger.kernel.org
cc: linux-can@vger.kernel.org
cc: linux-crypto@vger.kernel.org
cc: linux-doc@vger.kernel.org
cc: linux-hams@vger.kernel.org
cc: linux-perf-users@vger.kernel.org
cc: linux-rdma@vger.kernel.org
cc: linux-sctp@vger.kernel.org
cc: linux-wpan@vger.kernel.org
cc: linux-x25@vger.kernel.org
cc: mptcp@lists.linux.dev
cc: netdev@vger.kernel.org
cc: rds-devel@oss.oracle.com
cc: tipc-discussion@lists.sourceforge.net
cc: virtualization@lists.linux-foundation.org
---

Notes:
    ver #3)
     - tcp_bpf is now handled by an earlier patch.

 include/linux/socket.h                         | 4 +---
 net/tls/tls_device.c                           | 3 +--
 net/tls/tls_main.c                             | 2 +-
 net/tls/tls_sw.c                               | 2 +-
 tools/perf/trace/beauty/include/linux/socket.h | 1 -
 tools/perf/trace/beauty/msg_flags.c            | 5 +----
 6 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 58204700018a..39b74d83c7c4 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -319,7 +319,6 @@ struct ucred {
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
 #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
-#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
@@ -341,8 +340,7 @@ struct ucred {
 
 /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
 #define MSG_INTERNAL_SENDMSG_FLAGS \
-	(MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_NOTLAST | \
-	 MSG_SENDPAGE_DECRYPTED)
+	(MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED)
 
 /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
 #define SOL_IP		0
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 840ee06f1708..2021fe557e50 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -441,8 +441,7 @@ static int tls_push_data(struct sock *sk,
 	long timeo;
 
 	if (flags &
-	    ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST |
-	      MSG_SPLICE_PAGES))
+	    ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SPLICE_PAGES))
 		return -EOPNOTSUPP;
 
 	if (unlikely(sk->sk_err))
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d5ed4d47b16e..b6896126bb92 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -127,7 +127,7 @@ int tls_push_sg(struct sock *sk,
 {
 	struct bio_vec bvec;
 	struct msghdr msg = {
-		.msg_flags = MSG_SENDPAGE_NOTLAST | MSG_SPLICE_PAGES | flags,
+		.msg_flags = MSG_SPLICE_PAGES | flags,
 	};
 	int ret = 0;
 	struct page *p;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 9b3aa89a4292..53f944e6d8ef 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1194,7 +1194,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 
 	if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
 			       MSG_CMSG_COMPAT | MSG_SPLICE_PAGES |
-			       MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY))
+			       MSG_SENDPAGE_NOPOLICY))
 		return -EOPNOTSUPP;
 
 	ret = mutex_lock_interruptible(&tls_ctx->tx_lock);
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index 13c3a237b9c9..3bef212a24d7 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -318,7 +318,6 @@ struct ucred {
 #define MSG_MORE	0x8000	/* Sender will send more */
 #define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
 #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
-#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
 #define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c
index ea68db08b8e7..5cdebd7ece7e 100644
--- a/tools/perf/trace/beauty/msg_flags.c
+++ b/tools/perf/trace/beauty/msg_flags.c
@@ -8,9 +8,6 @@
 #ifndef MSG_WAITFORONE
 #define MSG_WAITFORONE		   0x10000
 #endif
-#ifndef MSG_SENDPAGE_NOTLAST
-#define MSG_SENDPAGE_NOTLAST	   0x20000
-#endif
 #ifndef MSG_FASTOPEN
 #define MSG_FASTOPEN		0x20000000
 #endif
@@ -50,7 +47,7 @@ static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
 	P_MSG_FLAG(NOSIGNAL);
 	P_MSG_FLAG(MORE);
 	P_MSG_FLAG(WAITFORONE);
-	P_MSG_FLAG(SENDPAGE_NOTLAST);
+	P_MSG_FLAG(SPLICE_PAGES);
 	P_MSG_FLAG(FASTOPEN);
 	P_MSG_FLAG(CMSG_CLOEXEC);
 #undef P_MSG_FLAG



^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 00/16] splice, net: Switch over users of sendpage() and remove it
       [not found] <20230623225513.2732256-1-dhowells@redhat.com>
                   ` (14 preceding siblings ...)
  2023-06-23 22:55 ` [PATCH net-next v5 16/16] net: Kill MSG_SENDPAGE_NOTLAST David Howells
@ 2023-06-24 23:00 ` patchwork-bot+netdevbpf
  15 siblings, 0 replies; 27+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-06-24 23:00 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, alexander.duyck, davem, edumazet, kuba, pabeni,
	willemdebruijn.kernel, dsahern, willy, axboe, linux-mm,
	linux-kernel

Hello:

This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Fri, 23 Jun 2023 23:54:57 +0100 you wrote:
> Here's the final set of patches towards the removal of sendpage.  All the
> drivers that use sendpage() get switched over to using sendmsg() with
> MSG_SPLICE_PAGES.
> 
> The following changes are made:
> 
>  (1) Make the protocol drivers behave according to MSG_MORE, not
>      MSG_SENDPAGE_NOTLAST.  The latter is restricted to turning on MSG_MORE
>      in the sendpage() wrappers.
> 
> [...]

Here is the summary with links:
  - [net-next,v5,01/16] tcp_bpf, smc, tls, espintcp, siw: Reduce MSG_SENDPAGE_NOTLAST usage
    https://git.kernel.org/netdev/net-next/c/f8dd95b29d7e
  - [net-next,v5,02/16] net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock()
    https://git.kernel.org/netdev/net-next/c/c729ed6f5be5
  - [net-next,v5,03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
    https://git.kernel.org/netdev/net-next/c/40a8c17aa770
  - [net-next,v5,04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
    https://git.kernel.org/netdev/net-next/c/fa094ccae1e7
  - [net-next,v5,05/16] rds: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
    https://git.kernel.org/netdev/net-next/c/572efade27c5
  - [net-next,v5,06/16] dlm: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
    https://git.kernel.org/netdev/net-next/c/a1a5e8752786
  - [net-next,v5,07/16] nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage
    https://git.kernel.org/netdev/net-next/c/7769887817c3
  - [net-next,v5,08/16] nvmet-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage
    https://git.kernel.org/netdev/net-next/c/c336a79983c7
  - [net-next,v5,09/16] smc: Drop smc_sendpage() in favour of smc_sendmsg() + MSG_SPLICE_PAGES
    https://git.kernel.org/netdev/net-next/c/2f8bc2bbb0fa
  - [net-next,v5,10/16] drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
    https://git.kernel.org/netdev/net-next/c/eeac7405c735
  - [net-next,v5,11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
    https://git.kernel.org/netdev/net-next/c/fa8df3435727
  - [net-next,v5,12/16] scsi: target: iscsi: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
    https://git.kernel.org/netdev/net-next/c/d2fe21077d6d
  - [net-next,v5,13/16] ocfs2: Fix use of slab data with sendpage
    https://git.kernel.org/netdev/net-next/c/86d7bd6e66e9
  - [net-next,v5,14/16] ocfs2: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
    https://git.kernel.org/netdev/net-next/c/e52828cc0109
  - [net-next,v5,15/16] sock: Remove ->sendpage*() in favour of sendmsg(MSG_SPLICE_PAGES)
    https://git.kernel.org/netdev/net-next/c/dc97391e6610
  - [net-next,v5,16/16] net: Kill MSG_SENDPAGE_NOTLAST
    https://git.kernel.org/netdev/net-next/c/b848b26c6672

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html




^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-06-25 12:20   ` Ilya Dryomov
  2023-06-26 14:00   ` David Howells
  2023-06-26 15:12   ` David Howells
  2 siblings, 0 replies; 27+ messages in thread
From: Ilya Dryomov @ 2023-06-25 12:20 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

On Sat, Jun 24, 2023 at 12:55 AM David Howells <dhowells@redhat.com> wrote:
>
> Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
> transmitting data.  For the moment, this can only transmit one page at a
> time because of the architecture of net/ceph/, but if
> write_partial_message_data() can be given a bvec[] at a time by the
> iteration code, this would allow pages to be sent in a batch.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: Xiubo Li <xiubli@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: ceph-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
>
> Notes:
>     ver #5)
>      - Switch condition for setting MSG_MORE in write_partial_message_data()
>
>  net/ceph/messenger_v1.c | 60 ++++++++++++++---------------------------
>  1 file changed, 20 insertions(+), 40 deletions(-)
>
> diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
> index d664cb1593a7..814579f27f04 100644
> --- a/net/ceph/messenger_v1.c
> +++ b/net/ceph/messenger_v1.c
> @@ -74,37 +74,6 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
>         return r;
>  }
>
> -/*
> - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
> - */
> -static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
> -                            int offset, size_t size, int more)
> -{
> -       ssize_t (*sendpage)(struct socket *sock, struct page *page,
> -                           int offset, size_t size, int flags);
> -       int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
> -       int ret;
> -
> -       /*
> -        * sendpage cannot properly handle pages with page_count == 0,
> -        * we need to fall back to sendmsg if that's the case.
> -        *
> -        * Same goes for slab pages: skb_can_coalesce() allows
> -        * coalescing neighboring slab objects into a single frag which
> -        * triggers one of hardened usercopy checks.
> -        */
> -       if (sendpage_ok(page))
> -               sendpage = sock->ops->sendpage;
> -       else
> -               sendpage = sock_no_sendpage;
> -
> -       ret = sendpage(sock, page, offset, size, flags);
> -       if (ret == -EAGAIN)
> -               ret = 0;
> -
> -       return ret;
> -}
> -
>  static void con_out_kvec_reset(struct ceph_connection *con)
>  {
>         BUG_ON(con->v1.out_skip);
> @@ -464,7 +433,6 @@ static int write_partial_message_data(struct ceph_connection *con)
>         struct ceph_msg *msg = con->out_msg;
>         struct ceph_msg_data_cursor *cursor = &msg->cursor;
>         bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
> -       int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
>         u32 crc;
>
>         dout("%s %p msg %p\n", __func__, con, msg);
> @@ -482,6 +450,10 @@ static int write_partial_message_data(struct ceph_connection *con)
>          */
>         crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
>         while (cursor->total_resid) {
> +               struct bio_vec bvec;
> +               struct msghdr msghdr = {
> +                       .msg_flags = MSG_SPLICE_PAGES,

Hi David,

This appears to be losing MSG_DONTWAIT | MSG_NOSIGNAL flags which were
set previously?

> +               };
>                 struct page *page;
>                 size_t page_offset;
>                 size_t length;
> @@ -493,10 +465,13 @@ static int write_partial_message_data(struct ceph_connection *con)
>                 }
>
>                 page = ceph_msg_data_next(cursor, &page_offset, &length);
> -               if (length == cursor->total_resid)
> -                       more = MSG_MORE;
> -               ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
> -                                       more);
> +               if (length != cursor->total_resid)
> +                       msghdr.msg_flags |= MSG_MORE;
> +
> +               bvec_set_page(&bvec, page, length, page_offset);
> +               iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, length);
> +
> +               ret = sock_sendmsg(con->sock, &msghdr);
>                 if (ret <= 0) {

And this is losing munging -EAGAIN -> 0?

>                         if (do_datacrc)
>                                 msg->footer.data_crc = cpu_to_le32(crc);
> @@ -526,7 +501,10 @@ static int write_partial_message_data(struct ceph_connection *con)
>   */
>  static int write_partial_skip(struct ceph_connection *con)
>  {
> -       int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +       struct bio_vec bvec;
> +       struct msghdr msghdr = {
> +               .msg_flags = MSG_SPLICE_PAGES | MSG_MORE,
> +       };
>         int ret;
>
>         dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
> @@ -534,9 +512,11 @@ static int write_partial_skip(struct ceph_connection *con)
>                 size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
>
>                 if (size == con->v1.out_skip)
> -                       more = MSG_MORE;
> -               ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
> -                                       more);
> +                       msghdr.msg_flags &= ~MSG_MORE;
> +               bvec_set_page(&bvec, ZERO_PAGE(0), size, 0);
> +               iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size);
> +
> +               ret = sock_sendmsg(con->sock, &msghdr);
>                 if (ret <= 0)

Same here...  I would suggest that you keep ceph_tcp_sendpage() function
and make only minimal modifications to avoid regressions.

Thanks,

                Ilya


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
  2023-06-23 22:55 ` [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
@ 2023-06-25 12:34   ` Ilya Dryomov
  2023-06-26 15:30   ` David Howells
  1 sibling, 0 replies; 27+ messages in thread
From: Ilya Dryomov @ 2023-06-25 12:34 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

On Sat, Jun 24, 2023 at 12:55 AM David Howells <dhowells@redhat.com> wrote:
>
> Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage in ceph when
> transmitting data.  For the moment, this can only transmit one page at a
> time because of the architecture of net/ceph/, but if
> write_partial_message_data() can be given a bvec[] at a time by the

Hi David,

write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
doesn't apply here.  I would suggest squashing the two net/ceph patches
into one since even the titles are the same.

Also, we tend to use "libceph: " prefix for net/ceph changes.

> iteration code, this would allow pages to be sent in a batch.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> cc: Ilya Dryomov <idryomov@gmail.com>
> cc: Xiubo Li <xiubli@redhat.com>
> cc: Jeff Layton <jlayton@kernel.org>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: ceph-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
>  net/ceph/messenger_v2.c | 91 +++++++++--------------------------------
>  1 file changed, 19 insertions(+), 72 deletions(-)
>
> diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
> index 301a991dc6a6..87ac97073e75 100644
> --- a/net/ceph/messenger_v2.c
> +++ b/net/ceph/messenger_v2.c
> @@ -117,91 +117,38 @@ static int ceph_tcp_recv(struct ceph_connection *con)
>         return ret;
>  }
>
> -static int do_sendmsg(struct socket *sock, struct iov_iter *it)
> -{
> -       struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -       int ret;
> -
> -       msg.msg_iter = *it;
> -       while (iov_iter_count(it)) {
> -               ret = sock_sendmsg(sock, &msg);
> -               if (ret <= 0) {
> -                       if (ret == -EAGAIN)
> -                               ret = 0;
> -                       return ret;
> -               }
> -
> -               iov_iter_advance(it, ret);
> -       }
> -
> -       WARN_ON(msg_data_left(&msg));
> -       return 1;
> -}
> -
> -static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
> -{
> -       struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
> -       struct bio_vec bv;
> -       int ret;
> -
> -       if (WARN_ON(!iov_iter_is_bvec(it)))
> -               return -EINVAL;
> -
> -       while (iov_iter_count(it)) {
> -               /* iov_iter_iovec() for ITER_BVEC */
> -               bvec_set_page(&bv, it->bvec->bv_page,
> -                             min(iov_iter_count(it),
> -                                 it->bvec->bv_len - it->iov_offset),
> -                             it->bvec->bv_offset + it->iov_offset);
> -
> -               /*
> -                * sendpage cannot properly handle pages with
> -                * page_count == 0, we need to fall back to sendmsg if
> -                * that's the case.
> -                *
> -                * Same goes for slab pages: skb_can_coalesce() allows
> -                * coalescing neighboring slab objects into a single frag
> -                * which triggers one of hardened usercopy checks.
> -                */
> -               if (sendpage_ok(bv.bv_page)) {
> -                       ret = sock->ops->sendpage(sock, bv.bv_page,
> -                                                 bv.bv_offset, bv.bv_len,
> -                                                 CEPH_MSG_FLAGS);
> -               } else {
> -                       iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
> -                       ret = sock_sendmsg(sock, &msg);
> -               }
> -               if (ret <= 0) {
> -                       if (ret == -EAGAIN)
> -                               ret = 0;
> -                       return ret;
> -               }
> -
> -               iov_iter_advance(it, ret);
> -       }
> -
> -       return 1;
> -}
> -
>  /*
>   * Write as much as possible.  The socket is expected to be corked,
> - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> + * so we don't bother with MSG_MORE here.
>   *
>   * Return:
> - *   1 - done, nothing (else) to write
> + *  >0 - done, nothing (else) to write

It would be nice to avoid making tweaks like this to the outer
interface as part of switching to a new internal API.

>   *   0 - socket is full, need to wait
>   *  <0 - error
>   */
>  static int ceph_tcp_send(struct ceph_connection *con)
>  {
> +       struct msghdr msg = {
> +               .msg_iter       = con->v2.out_iter,
> +               .msg_flags      = CEPH_MSG_FLAGS,
> +       };
>         int ret;
>
> +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> +               return -EINVAL;

Previously, this WARN_ON + error applied only to the "try sendpage"
path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
pretty sure that placing it here breaks everything.

> +
> +       if (con->v2.out_iter_sendpage)
> +               msg.msg_flags |= MSG_SPLICE_PAGES;
> +
>         dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
>              iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
> -       if (con->v2.out_iter_sendpage)
> -               ret = do_try_sendpage(con->sock, &con->v2.out_iter);
> -       else
> -               ret = do_sendmsg(con->sock, &con->v2.out_iter);
> +
> +       ret = sock_sendmsg(con->sock, &msg);
> +       if (ret > 0)
> +               iov_iter_advance(&con->v2.out_iter, ret);
> +       else if (ret == -EAGAIN)
> +               ret = 0;

Hrm, is sock_sendmsg() now guaranteed to exhaust the iterator (i.e.
a "short write" is no longer possible)?  Unless that is the case, this
is not an equivalent transformation.

This is actually the reason for

>   * Return:
>   *   1 - done, nothing (else) to write

specification which you also tweaked.  It doesn't make sense for
ceph_tcp_send() to return the number of bytes sent because the caller
expects everything to be sent when a positive number is returned.

Thanks,

                Ilya


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
  2023-06-25 12:20   ` Ilya Dryomov
@ 2023-06-26 14:00   ` David Howells
  2023-06-26 15:41     ` Ilya Dryomov
  2023-06-26 16:44     ` David Howells
  2023-06-26 15:12   ` David Howells
  2 siblings, 2 replies; 27+ messages in thread
From: David Howells @ 2023-06-26 14:00 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: dhowells, netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

Ilya Dryomov <idryomov@gmail.com> wrote:

> Same here...  I would suggest that you keep ceph_tcp_sendpage() function
> and make only minimal modifications to avoid regressions.

This is now committed to net-next.  I can bring ceph_tcp_sendpage() back into
existence or fix it in place for now if you have a preference.

Note that I'm working on patches to rework the libceph transmission path so
that it isn't dealing with transmitting a single page at a time, but it's not
ready yet.

David



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
  2023-06-25 12:20   ` Ilya Dryomov
  2023-06-26 14:00   ` David Howells
@ 2023-06-26 15:12   ` David Howells
  2023-06-26 15:52     ` Ilya Dryomov
  2 siblings, 1 reply; 27+ messages in thread
From: David Howells @ 2023-06-26 15:12 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: dhowells, netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

Ilya Dryomov <idryomov@gmail.com> wrote:

> > -       int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;

Btw, why are you setting MSG_DONTWAIT?  If you're in the middle of
transmitting a message on a TCP socket, surely you can't just switch to
transmitting a different message on the same socket without doing some sort of
reframing?

David



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
  2023-06-23 22:55 ` [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
  2023-06-25 12:34   ` Ilya Dryomov
@ 2023-06-26 15:30   ` David Howells
  2023-06-26 16:07     ` Ilya Dryomov
  2023-06-26 17:01     ` David Howells
  1 sibling, 2 replies; 27+ messages in thread
From: David Howells @ 2023-06-26 15:30 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: dhowells, netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

Ilya Dryomov <idryomov@gmail.com> wrote:

> write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> doesn't apply here.  I would suggest squashing the two net/ceph patches
> into one since even the titles are the same.

I would, but they're now applied to net-next, so we need to patch that.

> >   * Write as much as possible.  The socket is expected to be corked,
> > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > + * so we don't bother with MSG_MORE here.
> >   *
> >   * Return:
> > - *   1 - done, nothing (else) to write
> > + *  >0 - done, nothing (else) to write
> 
> It would be nice to avoid making tweaks like this to the outer
> interface as part of switching to a new internal API.

Ok.  I'll change that and wrap the sendmsg in a loop.  Though, as I asked in
an earlier reply, why is MSG_DONTWAIT used here?

> > +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > +               return -EINVAL;
> 
> Previously, this WARN_ON + error applied only to the "try sendpage"
> path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> pretty sure that placing it here breaks everything.

This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
iterators also.

Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
to the iterator to be transmitted as a whole?  It seems to be set depending on
iterator type.

David



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-26 14:00   ` David Howells
@ 2023-06-26 15:41     ` Ilya Dryomov
  2023-06-26 16:44     ` David Howells
  1 sibling, 0 replies; 27+ messages in thread
From: Ilya Dryomov @ 2023-06-26 15:41 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

On Mon, Jun 26, 2023 at 4:00 PM David Howells <dhowells@redhat.com> wrote:
>
> Ilya Dryomov <idryomov@gmail.com> wrote:
>
> > Same here...  I would suggest that you keep ceph_tcp_sendpage() function
> > and make only minimal modifications to avoid regressions.
>
> This is now committed to net-next.

This needs to be dropped from linux-next because both this and
especially the other (net/ceph/messenger_v2.c) patch introduce
regressions.

> I can bring ceph_tcp_sendpage() back into
> existence or fix it in place for now if you have a preference.

I already mentioned that I would prefer if ceph_tcp_sendpage() was
brought back into existence.

>
> Note that I'm working on patches to rework the libceph transmission path so
> that it isn't dealing with transmitting a single page at a time, but it's not
> ready yet.

That is a worthwhile improvement now that sock_sendmsg() can take
advantage of multiple pages!  It would be pretty invasive though so
I think ceph_tcp_sendpage() is better to remain in place until then.

Thanks,

                Ilya


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-26 15:12   ` David Howells
@ 2023-06-26 15:52     ` Ilya Dryomov
  0 siblings, 0 replies; 27+ messages in thread
From: Ilya Dryomov @ 2023-06-26 15:52 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

On Mon, Jun 26, 2023 at 5:12 PM David Howells <dhowells@redhat.com> wrote:
>
> Ilya Dryomov <idryomov@gmail.com> wrote:
>
> > > -       int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
>
> Btw, why are you setting MSG_DONTWAIT?  If you're in the middle of
> transmitting a message on a TCP socket, surely you can't just switch to
> transmitting a different message on the same socket without doing some sort of
> reframing?

We don't want to hog kworker threads.  You are correct that we can't
switch to transmitting a different message on the same socket but Ceph
is massively parallel and there can be dozens or even hundreds of other
sockets to work on.

Thanks,

                Ilya


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
  2023-06-26 15:30   ` David Howells
@ 2023-06-26 16:07     ` Ilya Dryomov
  2023-06-26 17:01     ` David Howells
  1 sibling, 0 replies; 27+ messages in thread
From: Ilya Dryomov @ 2023-06-26 16:07 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

On Mon, Jun 26, 2023 at 5:30 PM David Howells <dhowells@redhat.com> wrote:
>
> Ilya Dryomov <idryomov@gmail.com> wrote:
>
> > write_partial_message_data() is net/ceph/messenger_v1.c specific, so it
> > doesn't apply here.  I would suggest squashing the two net/ceph patches
> > into one since even the titles are the same.
>
> I would, but they're now applied to net-next, so we need to patch that.

I don't see a problem with that given that the patches themselves have
major issues (i.e. it's not just a commit message/title nit).

>
> > >   * Write as much as possible.  The socket is expected to be corked,
> > > - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
> > > + * so we don't bother with MSG_MORE here.
> > >   *
> > >   * Return:
> > > - *   1 - done, nothing (else) to write
> > > + *  >0 - done, nothing (else) to write
> >
> > It would be nice to avoid making tweaks like this to the outer
> > interface as part of switching to a new internal API.
>
> Ok.  I'll change that and wrap the sendmsg in a loop.  Though, as I asked in
> an earlier reply, why is MSG_DONTWAIT used here?

See my reply there.

>
> > > +       if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)))
> > > +               return -EINVAL;
> >
> > Previously, this WARN_ON + error applied only to the "try sendpage"
> > path.  There is a ton of kvec usage in net/ceph/messenger_v2.c, so I'm
> > pretty sure that placing it here breaks everything.
>
> This should have been removed as MSG_SPLICE_PAGES now accepts KVEC and XARRAY
> iterators also.
>
> Btw, is it feasible to use con->v2.out_iter_sendpage to apply MSG_SPLICE_PAGES
> to the iterator to be transmitted as a whole?  It seems to be set depending on
> iterator type.

I'm not sure I understand what you mean by "transmitted as a whole".
con->v2.out_iter_sendpage is set only when zerocopy is desired.  If the
underlying data is not guaranteed to remain stable, zerocopy behavior
is not safe.

Thanks,

                Ilya


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-26 14:00   ` David Howells
  2023-06-26 15:41     ` Ilya Dryomov
@ 2023-06-26 16:44     ` David Howells
  1 sibling, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-26 16:44 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: dhowells, netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

Ilya Dryomov <idryomov@gmail.com> wrote:

> > This is now committed to net-next.
> 
> This needs to be dropped from linux-next because both this and
> especially the other (net/ceph/messenger_v2.c) patch introduce
> regressions.

net-next, not linux-next.  I'm not sure they drop things from there rather
than reverting them.

David



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()
  2023-06-26 15:30   ` David Howells
  2023-06-26 16:07     ` Ilya Dryomov
@ 2023-06-26 17:01     ` David Howells
  1 sibling, 0 replies; 27+ messages in thread
From: David Howells @ 2023-06-26 17:01 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: dhowells, netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Xiubo Li,
	Jeff Layton, ceph-devel

Ilya Dryomov <idryomov@gmail.com> wrote:

> > Btw, is it feasible to use con->v2.out_iter_sendpage to apply
> > MSG_SPLICE_PAGES to the iterator to be transmitted as a whole?  It seems
> > to be set depending on iterator type.
> 
> I'm not sure I understand what you mean by "transmitted as a whole".
> con->v2.out_iter_sendpage is set only when zerocopy is desired.  If the
> underlying data is not guaranteed to remain stable, zerocopy behavior
> is not safe.

I think I need to reinstate the per-page sendpage_ok() check here also -
though Al pointed out it isn't sufficiently exhaustive.  There are pages that
sendpage_ok() will return true on that you shouldn't be passing to sendpage().

I'll whip up a patch to partially revert this also.

David



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage
  2023-06-23 22:55 ` [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
@ 2023-06-27 19:02   ` Chris Leech
  0 siblings, 0 replies; 27+ messages in thread
From: Chris Leech @ 2023-06-27 19:02 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, Alexander Duyck, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel, Mike Christie,
	Lee Duncan, James E.J. Bottomley, Martin K. Petersen, Al Viro,
	open-iscsi, linux-scsi, target-devel

On Fri, Jun 23, 2023 at 11:55:08PM +0100, David Howells wrote:
> Use sendmsg() with MSG_SPLICE_PAGES rather than sendpage.  This allows
> multiple pages and multipage folios to be passed through.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> Reviewed-by: Mike Christie <michael.christie@oracle.com>
> cc: Lee Duncan <lduncan@suse.com>
> cc: Chris Leech <cleech@redhat.com>
> cc: "James E.J. Bottomley" <jejb@linux.ibm.com>
> cc: "Martin K. Petersen" <martin.petersen@oracle.com>
> cc: "David S. Miller" <davem@davemloft.net>
> cc: Eric Dumazet <edumazet@google.com>
> cc: Jakub Kicinski <kuba@kernel.org>
> cc: Paolo Abeni <pabeni@redhat.com>
> cc: Jens Axboe <axboe@kernel.dk>
> cc: Matthew Wilcox <willy@infradead.org>
> cc: Al Viro <viro@zeniv.linux.org.uk>
> cc: open-iscsi@googlegroups.com
> cc: linux-scsi@vger.kernel.org
> cc: target-devel@vger.kernel.org
> cc: netdev@vger.kernel.org
> ---
> 
> Notes:
>     ver #5)
>      - Split iscsi changes into client and target patches
> 
>  drivers/scsi/iscsi_tcp.c | 26 ++++++++++----------------
>  drivers/scsi/iscsi_tcp.h |  2 --
>  2 files changed, 10 insertions(+), 18 deletions(-)

This seems good to me.

Reviewed-by: Chris Leech <cleech@redhat.com>
 
> diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
> index 9637d4bc2bc9..9ab8555180a3 100644
> --- a/drivers/scsi/iscsi_tcp.c
> +++ b/drivers/scsi/iscsi_tcp.c
> @@ -301,35 +301,32 @@ static int iscsi_sw_tcp_xmit_segment(struct iscsi_tcp_conn *tcp_conn,
>  
>  	while (!iscsi_tcp_segment_done(tcp_conn, segment, 0, r)) {
>  		struct scatterlist *sg;
> +		struct msghdr msg = {};
> +		struct bio_vec bv;
>  		unsigned int offset, copy;
> -		int flags = 0;
>  
>  		r = 0;
>  		offset = segment->copied;
>  		copy = segment->size - offset;
>  
>  		if (segment->total_copied + segment->size < segment->total_size)
> -			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +			msg.msg_flags |= MSG_MORE;
>  
>  		if (tcp_sw_conn->queue_recv)
> -			flags |= MSG_DONTWAIT;
> +			msg.msg_flags |= MSG_DONTWAIT;
>  
> -		/* Use sendpage if we can; else fall back to sendmsg */
>  		if (!segment->data) {
> +			if (!tcp_conn->iscsi_conn->datadgst_en)
> +				msg.msg_flags |= MSG_SPLICE_PAGES;
>  			sg = segment->sg;
>  			offset += segment->sg_offset + sg->offset;
> -			r = tcp_sw_conn->sendpage(sk, sg_page(sg), offset,
> -						  copy, flags);
> +			bvec_set_page(&bv, sg_page(sg), copy, offset);
>  		} else {
> -			struct msghdr msg = { .msg_flags = flags };
> -			struct kvec iov = {
> -				.iov_base = segment->data + offset,
> -				.iov_len = copy
> -			};
> -
> -			r = kernel_sendmsg(sk, &msg, &iov, 1, copy);
> +			bvec_set_virt(&bv, segment->data + offset, copy);
>  		}
> +		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, copy);
>  
> +		r = sock_sendmsg(sk, &msg);
>  		if (r < 0) {
>  			iscsi_tcp_segment_unmap(segment);
>  			return r;
> @@ -746,7 +743,6 @@ iscsi_sw_tcp_conn_bind(struct iscsi_cls_session *cls_session,
>  	sock_no_linger(sk);
>  
>  	iscsi_sw_tcp_conn_set_callbacks(conn);
> -	tcp_sw_conn->sendpage = tcp_sw_conn->sock->ops->sendpage;
>  	/*
>  	 * set receive state machine into initial state
>  	 */
> @@ -777,8 +773,6 @@ static int iscsi_sw_tcp_conn_set_param(struct iscsi_cls_conn *cls_conn,
>  			return -ENOTCONN;
>  		}
>  		iscsi_set_param(cls_conn, param, buf, buflen);
> -		tcp_sw_conn->sendpage = conn->datadgst_en ?
> -			sock_no_sendpage : tcp_sw_conn->sock->ops->sendpage;
>  		mutex_unlock(&tcp_sw_conn->sock_lock);
>  		break;
>  	case ISCSI_PARAM_MAX_R2T:
> diff --git a/drivers/scsi/iscsi_tcp.h b/drivers/scsi/iscsi_tcp.h
> index 68e14a344904..89a6fc552f0b 100644
> --- a/drivers/scsi/iscsi_tcp.h
> +++ b/drivers/scsi/iscsi_tcp.h
> @@ -47,8 +47,6 @@ struct iscsi_sw_tcp_conn {
>  	/* MIB custom statistics */
>  	uint32_t		sendpage_failures_cnt;
>  	uint32_t		discontiguous_hdr_cnt;
> -
> -	ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int);
>  };
>  
>  struct iscsi_sw_tcp_host {
> 



^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2023-06-27 19:03 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20230623225513.2732256-1-dhowells@redhat.com>
2023-06-23 22:54 ` [PATCH net-next v5 01/16] tcp_bpf, smc, tls, espintcp, siw: Reduce MSG_SENDPAGE_NOTLAST usage David Howells
2023-06-23 22:54 ` [PATCH net-next v5 02/16] net: Use sendmsg(MSG_SPLICE_PAGES) not sendpage in skb_send_sock() David Howells
2023-06-23 22:55 ` [PATCH net-next v5 03/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
2023-06-25 12:20   ` Ilya Dryomov
2023-06-26 14:00   ` David Howells
2023-06-26 15:41     ` Ilya Dryomov
2023-06-26 16:44     ` David Howells
2023-06-26 15:12   ` David Howells
2023-06-26 15:52     ` Ilya Dryomov
2023-06-23 22:55 ` [PATCH net-next v5 04/16] ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
2023-06-25 12:34   ` Ilya Dryomov
2023-06-26 15:30   ` David Howells
2023-06-26 16:07     ` Ilya Dryomov
2023-06-26 17:01     ` David Howells
2023-06-23 22:55 ` [PATCH net-next v5 05/16] rds: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
2023-06-23 22:55 ` [PATCH net-next v5 06/16] dlm: " David Howells
2023-06-23 22:55 ` [PATCH net-next v5 07/16] nvme-tcp: Use sendmsg(MSG_SPLICE_PAGES) rather then sendpage David Howells
2023-06-23 22:55 ` [PATCH net-next v5 08/16] nvmet-tcp: " David Howells
2023-06-23 22:55 ` [PATCH net-next v5 09/16] smc: Drop smc_sendpage() in favour of smc_sendmsg() + MSG_SPLICE_PAGES David Howells
2023-06-23 22:55 ` [PATCH net-next v5 10/16] drbd: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
2023-06-23 22:55 ` [PATCH net-next v5 11/16] scsi: iscsi_tcp: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage David Howells
2023-06-27 19:02   ` Chris Leech
2023-06-23 22:55 ` [PATCH net-next v5 12/16] scsi: target: iscsi: " David Howells
2023-06-23 22:55 ` [PATCH net-next v5 13/16] ocfs2: Fix use of slab data with sendpage David Howells
2023-06-23 22:55 ` [PATCH net-next v5 14/16] ocfs2: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage() David Howells
2023-06-23 22:55 ` [PATCH net-next v5 16/16] net: Kill MSG_SENDPAGE_NOTLAST David Howells
2023-06-24 23:00 ` [PATCH net-next v5 00/16] splice, net: Switch over users of sendpage() and remove it patchwork-bot+netdevbpf

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).