public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: David Howells <dhowells@redhat.com>
To: Viacheslav Dubeyko <slava@dubeyko.com>,
	Alex Markuze <amarkuze@redhat.com>
Cc: David Howells <dhowells@redhat.com>,
	Ilya Dryomov <idryomov@gmail.com>,
	Jeff Layton <jlayton@kernel.org>,
	Dongsheng Yang <dongsheng.yang@easystack.cn>,
	ceph-devel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [RFC PATCH 12/35] libceph: Bypass the messenger-v1 Tx loop for databuf/iter data blobs
Date: Thu, 13 Mar 2025 23:33:04 +0000	[thread overview]
Message-ID: <20250313233341.1675324-13-dhowells@redhat.com> (raw)
In-Reply-To: <20250313233341.1675324-1-dhowells@redhat.com>

Don't use the messenger-v1 Tx loop, which sends page fragments
individually, for databuf/iter data blobs; instead, pass the entire
iterator to the socket in one go.  This uses the loop inside of
tcp_sendmsg() to do the work and allows TCP to make better choices.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Viacheslav Dubeyko <slava@dubeyko.com>
cc: Alex Markuze <amarkuze@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: ceph-devel@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
---
 include/linux/ceph/messenger.h |  1 +
 net/ceph/messenger.c           |  1 +
 net/ceph/messenger_v1.c        | 76 ++++++++++++++++++++++++++++------
 3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 864aad369c91..1b646d0dff39 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -255,6 +255,7 @@ struct ceph_msg_data_cursor {
 		};
 		struct {
 			struct iov_iter		iov_iter;
+			struct iov_iter		crc_iter;
 			unsigned int		lastlen;
 		};
 	};
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 02439b38ec94..dc8082575e4f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -975,6 +975,7 @@ static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
 	struct ceph_msg_data *data = cursor->data;
 
 	cursor->iov_iter = data->iter;
+	cursor->crc_iter = data->iter;
 	cursor->lastlen = 0;
 	iov_iter_truncate(&cursor->iov_iter, length);
 	cursor->resid = iov_iter_count(&cursor->iov_iter);
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 0cb61c76b9b8..d6464ac62b09 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -3,6 +3,7 @@
 
 #include <linux/bvec.h>
 #include <linux/crc32c.h>
+#include <linux/iov_iter.h>
 #include <linux/net.h>
 #include <linux/socket.h>
 #include <net/sock.h>
@@ -74,6 +75,21 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
 	return r;
 }
 
+static int ceph_tcp_sock_sendmsg(struct socket *sock, struct iov_iter *iter,
+				 unsigned int flags)
+{	/* Hand the data described by @iter to @sock via a single sock_sendmsg() call. */
+	struct msghdr msg = {
+		.msg_iter  = *iter,	/* struct copy: sock_sendmsg() is given msg, so the caller's *iter is not written here */
+		.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | flags,	/* caller's @flags are OR'd into the fixed set */
+	};
+	int r;
+
+	r = sock_sendmsg(sock, &msg);
+	if (r == -EAGAIN)	/* -EAGAIN is reported to the caller as 0 rather than as an error */
+		r = 0;
+	return r;	/* otherwise the sock_sendmsg() result is passed through unchanged */
+}
+
 /*
  * @more: MSG_MORE or 0.
  */
@@ -455,6 +471,24 @@ static int write_partial_kvec(struct ceph_connection *con)
 	return ret;  /* done! */
 }
 
+static size_t ceph_crc_from_iter(void *iter_from, size_t progress,
+				 size_t len, void *priv, void *priv2)
+{	/* Step function passed by ceph_calc_crc(): folds @len bytes at @iter_from into the CRC at @priv; @progress and @priv2 are unused. */
+	u32 *crc = priv;	/* @priv is the caller's running CRC */
+
+	*crc = crc32c(*crc, iter_from, len);
+	return 0;	/* NOTE(review): presumably means "no bytes left unprocessed" - confirm against the iov_step_f contract */
+}
+
+static void ceph_calc_crc(struct iov_iter *iter, size_t count, u32 *crc)
+{	/* Update *@crc with crc32c over @count bytes walked from @iter. */
+	size_t done;
+
+	done = iterate_and_advance_kernel(iter, count, crc, NULL,	/* @crc is passed as priv; priv2 is NULL */
+					  ceph_crc_from_iter);
+	WARN_ON(done != count);	/* warn if the returned byte count differs from @count */
+}
+
 /*
  * Write as much message data payload as we can.  If we finish, queue
  * up the footer.
@@ -467,7 +501,7 @@ static int write_partial_message_data(struct ceph_connection *con)
 	struct ceph_msg *msg = con->out_msg;
 	struct ceph_msg_data_cursor *cursor = &msg->cursor;
 	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
-	u32 crc;
+	u32 crc = 0;
 
 	dout("%s %p msg %p\n", __func__, con, msg);
 
@@ -484,9 +518,6 @@ static int write_partial_message_data(struct ceph_connection *con)
 	 */
 	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
 	while (cursor->total_resid) {
-		struct page *page;
-		size_t page_offset;
-		size_t length;
 		int ret;
 
 		if (!cursor->resid) {
@@ -494,17 +525,36 @@ static int write_partial_message_data(struct ceph_connection *con)
 			continue;
 		}
 
-		page = ceph_msg_data_next(cursor, &page_offset, &length);
-		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
-					MSG_MORE);
-		if (ret <= 0) {
-			if (do_datacrc)
-				msg->footer.data_crc = cpu_to_le32(crc);
+		if (cursor->data->type == CEPH_MSG_DATA_DATABUF ||
+		    cursor->data->type == CEPH_MSG_DATA_ITER) {
+			ret = ceph_tcp_sock_sendmsg(con->sock, &cursor->iov_iter,
+						    MSG_MORE);
+			if (ret <= 0) {
+				if (do_datacrc)
+					msg->footer.data_crc = cpu_to_le32(crc);
 
-			return ret;
+				return ret;
+			}
+			if (do_datacrc && cursor->need_crc)
+				ceph_calc_crc(&cursor->crc_iter, ret, &crc);
+		} else {
+			struct page *page;
+			size_t page_offset;
+			size_t length;
+
+			page = ceph_msg_data_next(cursor, &page_offset, &length);
+			ret = ceph_tcp_sendpage(con->sock, page, page_offset,
+						length, MSG_MORE);
+			if (ret <= 0) {
+				if (do_datacrc)
+					msg->footer.data_crc = cpu_to_le32(crc);
+
+				return ret;
+			}
+			if (do_datacrc && cursor->need_crc)
+				crc = ceph_crc32c_page(crc, page, page_offset,
+						       length);
 		}
-		if (do_datacrc && cursor->need_crc)
-			crc = ceph_crc32c_page(crc, page, page_offset, length);
 		ceph_msg_data_advance(cursor, (size_t)ret);
 	}
 


  parent reply	other threads:[~2025-03-13 23:34 UTC|newest]

Thread overview: 72+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-03-13 23:32 [RFC PATCH 00/35] ceph, rbd, netfs: Make ceph fully use netfslib David Howells
2025-03-13 23:32 ` [RFC PATCH 01/35] ceph: Fix incorrect flush end position calculation David Howells
2025-03-13 23:32 ` [RFC PATCH 02/35] libceph: Rename alignment to offset David Howells
2025-03-14 19:04   ` Viacheslav Dubeyko
2025-03-14 20:01     ` David Howells
2025-03-13 23:32 ` [RFC PATCH 03/35] libceph: Add a new data container type, ceph_databuf David Howells
2025-03-14 20:06   ` Viacheslav Dubeyko
2025-03-17 11:27     ` David Howells
2025-03-13 23:32 ` [RFC PATCH 04/35] ceph: Convert ceph_mds_request::r_pagelist to a databuf David Howells
2025-03-14 22:27   ` slava
2025-03-17 11:52     ` David Howells
2025-03-20 20:34       ` Viacheslav Dubeyko
2025-03-20 22:01         ` David Howells
2025-03-13 23:32 ` [RFC PATCH 05/35] libceph: Add functions to add ceph_databufs to requests David Howells
2025-03-13 23:32 ` [RFC PATCH 06/35] rbd: Use ceph_databuf for rbd_obj_read_sync() David Howells
2025-03-17 19:08   ` Viacheslav Dubeyko
2025-04-11 13:48     ` David Howells
2025-03-13 23:32 ` [RFC PATCH 07/35] libceph: Change ceph_osdc_call()'s reply to a ceph_databuf David Howells
2025-03-17 19:41   ` Viacheslav Dubeyko
2025-03-17 22:12     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 08/35] libceph: Unexport osd_req_op_cls_request_data_pages() David Howells
2025-03-13 23:33 ` [RFC PATCH 09/35] libceph: Remove osd_req_op_cls_response_data_pages() David Howells
2025-03-13 23:33 ` [RFC PATCH 10/35] libceph: Convert notify_id_pages to a ceph_databuf David Howells
2025-03-13 23:33 ` [RFC PATCH 11/35] ceph: Use ceph_databuf in DIO David Howells
2025-03-17 20:03   ` Viacheslav Dubeyko
2025-03-17 22:26     ` David Howells
2025-03-13 23:33 ` David Howells [this message]
2025-03-13 23:33 ` [RFC PATCH 13/35] rbd: Switch from using bvec_iter to iov_iter David Howells
2025-03-18 19:38   ` Viacheslav Dubeyko
2025-03-18 22:13     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 14/35] libceph: Remove bvec and bio data container types David Howells
2025-03-13 23:33 ` [RFC PATCH 15/35] libceph: Make osd_req_op_cls_init() use a ceph_databuf and map it David Howells
2025-03-13 23:33 ` [RFC PATCH 16/35] libceph: Convert req_page of ceph_osdc_call() to ceph_databuf David Howells
2025-03-13 23:33 ` [RFC PATCH 17/35] libceph, rbd: Use ceph_databuf encoding start/stop David Howells
2025-03-18 19:59   ` Viacheslav Dubeyko
2025-03-18 22:19     ` David Howells
2025-03-20 21:45       ` Viacheslav Dubeyko
2025-03-13 23:33 ` [RFC PATCH 18/35] libceph, rbd: Convert some page arrays to ceph_databuf David Howells
2025-03-18 20:02   ` Viacheslav Dubeyko
2025-03-18 22:25     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 19/35] libceph, ceph: Convert users of ceph_pagelist " David Howells
2025-03-18 20:09   ` Viacheslav Dubeyko
2025-03-18 22:27     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 20/35] libceph: Remove ceph_pagelist David Howells
2025-03-13 23:33 ` [RFC PATCH 21/35] libceph: Make notify code use ceph_databuf_enc_start/stop David Howells
2025-03-18 20:12   ` Viacheslav Dubeyko
2025-03-18 22:36     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 22/35] libceph, rbd: Convert ceph_osdc_notify() reply to ceph_databuf David Howells
2025-03-19  0:08   ` Viacheslav Dubeyko
2025-03-20 14:44     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 23/35] rbd: Use ceph_databuf_enc_start/stop() David Howells
2025-03-19  0:32   ` Viacheslav Dubeyko
2025-03-20 14:59     ` Why use plain numbers and totals rather than predef'd constants for RPC sizes? David Howells
2025-03-20 21:48       ` Viacheslav Dubeyko
2025-03-13 23:33 ` [RFC PATCH 24/35] ceph: Make ceph_calc_file_object_mapping() return size as size_t David Howells
2025-03-13 23:33 ` [RFC PATCH 25/35] ceph: Wrap POSIX_FADV_WILLNEED to get caps David Howells
2025-03-13 23:33 ` [RFC PATCH 26/35] ceph: Kill ceph_rw_context David Howells
2025-03-13 23:33 ` [RFC PATCH 27/35] netfs: Pass extra write context to write functions David Howells
2025-03-13 23:33 ` [RFC PATCH 28/35] netfs: Adjust group handling David Howells
2025-03-19 18:57   ` Viacheslav Dubeyko
2025-03-20 15:22     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 29/35] netfs: Allow fs-private data to be handed through to request alloc David Howells
2025-03-13 23:33 ` [RFC PATCH 30/35] netfs: Make netfs_page_mkwrite() use folio_mkwrite_check_truncate() David Howells
2025-03-13 23:33 ` [RFC PATCH 31/35] netfs: Fix netfs_unbuffered_read() to return ssize_t rather than int David Howells
2025-03-13 23:33 ` [RFC PATCH 32/35] netfs: Add some more RMW support for ceph David Howells
2025-03-19 19:14   ` Viacheslav Dubeyko
2025-03-20 15:25     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 33/35] ceph: Use netfslib [INCOMPLETE] David Howells
2025-03-19 19:54   ` Viacheslav Dubeyko
2025-03-20 15:38     ` David Howells
2025-03-13 23:33 ` [RFC PATCH 34/35] ceph: Enable multipage folios for ceph files David Howells
2025-03-13 23:33 ` [RFC PATCH 35/35] ceph: Remove old I/O API bits David Howells

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250313233341.1675324-13-dhowells@redhat.com \
    --to=dhowells@redhat.com \
    --cc=amarkuze@redhat.com \
    --cc=ceph-devel@vger.kernel.org \
    --cc=dongsheng.yang@easystack.cn \
    --cc=idryomov@gmail.com \
    --cc=jlayton@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=slava@dubeyko.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox