From: Xiubo Li <xiubli@redhat.com>
To: Jeff Layton <jlayton@kernel.org>, idryomov@gmail.com
Cc: ceph-devel@vger.kernel.org, dhowells@redhat.com,
viro@zeniv.linux.org.uk, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v2 1/2] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type
Date: Fri, 1 Jul 2022 10:04:29 +0800 [thread overview]
Message-ID: <86b15eae-66f8-2865-bca4-74c207b30100@redhat.com>
In-Reply-To: <20220627155449.383989-2-jlayton@kernel.org>
On 6/27/22 11:54 PM, Jeff Layton wrote:
> Add an iov_iter to the unions in ceph_msg_data and ceph_msg_data_cursor.
> Instead of requiring a list of pages or bvecs, we can just use an
> iov_iter directly, and avoid extra allocations.
>
> We assume that the pages represented by the iter are pinned such that
> they shouldn't incur page faults, which is the case for the iov_iters
> created by netfs.
>
> While working on this, Al Viro informed me that he was going to change
> iov_iter_get_pages to auto-advance the iterator as that pattern is more
> or less required for ITER_PIPE anyway. We emulate that here for now by
> advancing in the _next op and tracking that amount in the "lastlen"
> field.
>
> In the event that _next is called twice without an intervening
> _advance, we revert the iov_iter by the remaining lastlen before
> calling iov_iter_get_pages.
>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: David Howells <dhowells@redhat.com>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> include/linux/ceph/messenger.h | 8 ++++
> include/linux/ceph/osd_client.h | 4 ++
> net/ceph/messenger.c | 85 +++++++++++++++++++++++++++++++++
> net/ceph/osd_client.c | 27 +++++++++++
> 4 files changed, 124 insertions(+)
>
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 9fd7255172ad..2eaaabbe98cb 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -123,6 +123,7 @@ enum ceph_msg_data_type {
> CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
> #endif /* CONFIG_BLOCK */
> CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
> + CEPH_MSG_DATA_ITER, /* data source/destination is an iov_iter */
> };
>
> #ifdef CONFIG_BLOCK
> @@ -224,6 +225,7 @@ struct ceph_msg_data {
> bool own_pages;
> };
> struct ceph_pagelist *pagelist;
> + struct iov_iter iter;
> };
> };
>
> @@ -248,6 +250,10 @@ struct ceph_msg_data_cursor {
> struct page *page; /* page from list */
> size_t offset; /* bytes from list */
> };
> + struct {
> + struct iov_iter iov_iter;
> + unsigned int lastlen;
> + };
> };
> };
>
> @@ -605,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
> #endif /* CONFIG_BLOCK */
> void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
> struct ceph_bvec_iter *bvec_pos);
> +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> + struct iov_iter *iter);
>
> struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
> gfp_t flags, bool can_fail);
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 6ec3cb2ac457..ef0ad534b6c5 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -108,6 +108,7 @@ enum ceph_osd_data_type {
> CEPH_OSD_DATA_TYPE_BIO,
> #endif /* CONFIG_BLOCK */
> CEPH_OSD_DATA_TYPE_BVECS,
> + CEPH_OSD_DATA_TYPE_ITER,
> };
>
> struct ceph_osd_data {
> @@ -131,6 +132,7 @@ struct ceph_osd_data {
> struct ceph_bvec_iter bvec_pos;
> u32 num_bvecs;
> };
> + struct iov_iter iter;
> };
> };
>
> @@ -501,6 +503,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
> void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
> unsigned int which,
> struct ceph_bvec_iter *bvec_pos);
> +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> + unsigned int which, struct iov_iter *iter);
>
> extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
> unsigned int which,
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 6056c8f7dd4c..604f025034ab 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -964,6 +964,69 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
> return true;
> }
>
> +static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
> + size_t length)
> +{
> + struct ceph_msg_data *data = cursor->data;
> +
> + cursor->iov_iter = data->iter;
> + cursor->lastlen = 0;
> + iov_iter_truncate(&cursor->iov_iter, length);
> + cursor->resid = iov_iter_count(&cursor->iov_iter);
> +}
> +
> +static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
> + size_t *page_offset,
> + size_t *length)
> +{
> + struct page *page;
> + ssize_t len;
> +
> + if (cursor->lastlen)
> + iov_iter_revert(&cursor->iov_iter, cursor->lastlen);
> +
> + len = iov_iter_get_pages(&cursor->iov_iter, &page, PAGE_SIZE,
> + 1, page_offset);
> + BUG_ON(len < 0);
> +
> + cursor->lastlen = len;
> +
> + /*
> + * FIXME: Al Viro says that he will soon change iov_iter_get_pages
> + * to auto-advance the iterator. Emulate that here for now.
> + */
> + iov_iter_advance(&cursor->iov_iter, len);
> +
> + /*
> + * FIXME: The assumption is that the pages represented by the iov_iter
> + * are pinned, with the references held by the upper-level
> + * callers, or by virtue of being under writeback. Eventually,
> + * we'll get an iov_iter_get_pages variant that doesn't take page
> + * refs. Until then, just put the page ref.
> + */
> + VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
> + put_page(page);
> +
> + *length = min_t(size_t, len, cursor->resid);
> + return page;
> +}
> +
> +static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
> + size_t bytes)
> +{
> + BUG_ON(bytes > cursor->resid);
> + cursor->resid -= bytes;
> +
> + if (bytes < cursor->lastlen) {
> + cursor->lastlen -= bytes;
> + } else {
> + iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
> + cursor->lastlen = 0;
> + }
> +
> + return cursor->resid;
> +}
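Just to check my reading of the lastlen bookkeeping, here is the
sequence I have in mind for a page-aligned 8K iter when the messenger
asks for the next piece twice without an intervening advance (my own
hand-written trace of the code above, not anything taken from the
patch; off/len are size_t locals):

    /* after cursor_init: resid == 8192, lastlen == 0 */
    page = ceph_msg_data_iter_next(cursor, &off, &len);
            /* get_pages returns 4096, lastlen = 4096,
             * iov_iter advanced to 4096, *length = 4096 */
    page = ceph_msg_data_iter_next(cursor, &off, &len);
            /* reverts by lastlen first, so we hand back the
             * same page/offset again */
    ceph_msg_data_iter_advance(cursor, 1024);
            /* bytes < lastlen: lastlen = 3072, iov_iter untouched */
    ceph_msg_data_iter_advance(cursor, 3072);
            /* advances the iov_iter by 0 and clears lastlen, so the
             * next piece starts cleanly at offset 4096 */

If I am misreading the revert/advance interplay here, please correct me.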
> +
> /*
> * Message data is handled (sent or received) in pieces, where each
> * piece resides on a single page. The network layer might not
> @@ -991,6 +1054,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
> case CEPH_MSG_DATA_BVECS:
> ceph_msg_data_bvecs_cursor_init(cursor, length);
> break;
> + case CEPH_MSG_DATA_ITER:
> + ceph_msg_data_iter_cursor_init(cursor, length);
> + break;
> case CEPH_MSG_DATA_NONE:
> default:
> /* BUG(); */
> @@ -1038,6 +1104,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
> case CEPH_MSG_DATA_BVECS:
> page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
> break;
> + case CEPH_MSG_DATA_ITER:
> + page = ceph_msg_data_iter_next(cursor, page_offset, length);
> + break;
> case CEPH_MSG_DATA_NONE:
> default:
> page = NULL;
> @@ -1076,6 +1145,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
> case CEPH_MSG_DATA_BVECS:
> new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
> break;
> + case CEPH_MSG_DATA_ITER:
> + new_piece = ceph_msg_data_iter_advance(cursor, bytes);
> + break;
> case CEPH_MSG_DATA_NONE:
> default:
> BUG();
> @@ -1874,6 +1946,19 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
> }
> EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
>
> +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> + struct iov_iter *iter)
> +{
> + struct ceph_msg_data *data;
> +
> + data = ceph_msg_data_add(msg);
> + data->type = CEPH_MSG_DATA_ITER;
> + data->iter = *iter;
> +
> + msg->data_length += iov_iter_count(&data->iter);
> +}
> +EXPORT_SYMBOL(ceph_msg_data_add_iter);
> +
Will this be used outside of libceph.ko? It seems it never will be. The
same also applies to some of the other existing exports, such as
'ceph_msg_data_add_bvecs'.
> /*
> * construct a new message with given type, size
> * the new msg has a ref count of 1.
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 75761537c644..2a7e46524e71 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
> osd_data->num_bvecs = num_bvecs;
> }
>
> +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
> + struct iov_iter *iter)
> +{
> + osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
> + osd_data->iter = *iter;
> +}
> +
> static struct ceph_osd_data *
> osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
> {
> @@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
> }
> EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
>
> +/**
> + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
> + * @osd_req: The request to set up
> + * @which: ?
Could you explain this a bit more? The '?' placeholder here should
probably be filled in.
> + * @iter: The buffer iterator
> + */
> +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> + unsigned int which, struct iov_iter *iter)
> +{
> + struct ceph_osd_data *osd_data;
> +
> + osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
> + ceph_osd_iter_init(osd_data, iter);
> +}
> +EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
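Also, tying this to my question about @which above: my guess is that it
is the index of the op within the request, so a caller doing a single
read op would end up with something roughly like this (my own sketch of
how I expect the 2/2 patch to use it, not code taken from that patch;
the iter would presumably be the netfs destination buffer):

    struct ceph_osd_request *req;  /* hypothetical, e.g. from ceph_osdc_new_request() */
    struct iov_iter iter;          /* destination buffer, e.g. an ITER_XARRAY iter */

    ...
    osd_req_op_extent_osd_iter(req, 0, &iter);  /* 'which' == 0: the first (only) op */

If that is the intent, spelling it out in the kernel-doc instead of the
'?' would be enough for me.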
> +
> static void osd_req_op_cls_request_info_pagelist(
> struct ceph_osd_request *osd_req,
> unsigned int which, struct ceph_pagelist *pagelist)
> @@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
> #endif /* CONFIG_BLOCK */
> case CEPH_OSD_DATA_TYPE_BVECS:
> return osd_data->bvec_pos.iter.bi_size;
> + case CEPH_OSD_DATA_TYPE_ITER:
> + return iov_iter_count(&osd_data->iter);
> default:
> WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
> return 0;
> @@ -954,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
> #endif
> } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
> ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
> + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
> + ceph_msg_data_add_iter(msg, &osd_data->iter);
> } else {
> BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
> }
Thread overview: 5+ messages in thread
2022-06-27 15:54 [PATCH v2 0/2] libceph: add new iov_iter msg_data type and use it for reads Jeff Layton
2022-06-27 15:54 ` [PATCH v2 1/2] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type Jeff Layton
2022-07-01 2:04 ` Xiubo Li [this message]
2022-07-01 10:16 ` Jeff Layton
2022-06-27 15:54 ` [PATCH v2 2/2] ceph: use osd_req_op_extent_osd_iter for netfs reads Jeff Layton