linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: Xiubo Li <xiubli@redhat.com>, idryomov@gmail.com
Cc: ceph-devel@vger.kernel.org, dhowells@redhat.com,
	viro@zeniv.linux.org.uk, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v2 1/2] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type
Date: Fri, 01 Jul 2022 06:16:51 -0400	[thread overview]
Message-ID: <2431ec5495612912399471c94dbec6246866359a.camel@kernel.org> (raw)
In-Reply-To: <86b15eae-66f8-2865-bca4-74c207b30100@redhat.com>

On Fri, 2022-07-01 at 10:04 +0800, Xiubo Li wrote:
> On 6/27/22 11:54 PM, Jeff Layton wrote:
> > Add an iov_iter to the unions in ceph_msg_data and ceph_msg_data_cursor.
> > Instead of requiring a list of pages or bvecs, we can just use an
> > iov_iter directly, and avoid extra allocations.
> > 
> > We assume that the pages represented by the iter are pinned such that
> > they shouldn't incur page faults, which is the case for the iov_iters
> > created by netfs.
> > 
> > While working on this, Al Viro informed me that he was going to change
> > iov_iter_get_pages to auto-advance the iterator as that pattern is more
> > or less required for ITER_PIPE anyway. We emulate that here for now by
> > advancing in the _next op and tracking that amount in the "lastlen"
> > field.
> > 
> > In the event that _next is called twice without an intervening
> > _advance, we revert the iov_iter by the remaining lastlen before
> > calling iov_iter_get_pages.
> > 
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: David Howells <dhowells@redhat.com>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >   include/linux/ceph/messenger.h  |  8 ++++
> >   include/linux/ceph/osd_client.h |  4 ++
> >   net/ceph/messenger.c            | 85 +++++++++++++++++++++++++++++++++
> >   net/ceph/osd_client.c           | 27 +++++++++++
> >   4 files changed, 124 insertions(+)
> > 
> > diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> > index 9fd7255172ad..2eaaabbe98cb 100644
> > --- a/include/linux/ceph/messenger.h
> > +++ b/include/linux/ceph/messenger.h
> > @@ -123,6 +123,7 @@ enum ceph_msg_data_type {
> >   	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
> >   #endif /* CONFIG_BLOCK */
> >   	CEPH_MSG_DATA_BVECS,	/* data source/destination is a bio_vec array */
> > +	CEPH_MSG_DATA_ITER,	/* data source/destination is an iov_iter */
> >   };
> >   
> >   #ifdef CONFIG_BLOCK
> > @@ -224,6 +225,7 @@ struct ceph_msg_data {
> >   			bool		own_pages;
> >   		};
> >   		struct ceph_pagelist	*pagelist;
> > +		struct iov_iter		iter;
> >   	};
> >   };
> >   
> > @@ -248,6 +250,10 @@ struct ceph_msg_data_cursor {
> >   			struct page	*page;		/* page from list */
> >   			size_t		offset;		/* bytes from list */
> >   		};
> > +		struct {
> > +			struct iov_iter		iov_iter;
> > +			unsigned int		lastlen;
> > +		};
> >   	};
> >   };
> >   
> > @@ -605,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
> >   #endif /* CONFIG_BLOCK */
> >   void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
> >   			     struct ceph_bvec_iter *bvec_pos);
> > +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> > +			    struct iov_iter *iter);
> >   
> >   struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
> >   			       gfp_t flags, bool can_fail);
> > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> > index 6ec3cb2ac457..ef0ad534b6c5 100644
> > --- a/include/linux/ceph/osd_client.h
> > +++ b/include/linux/ceph/osd_client.h
> > @@ -108,6 +108,7 @@ enum ceph_osd_data_type {
> >   	CEPH_OSD_DATA_TYPE_BIO,
> >   #endif /* CONFIG_BLOCK */
> >   	CEPH_OSD_DATA_TYPE_BVECS,
> > +	CEPH_OSD_DATA_TYPE_ITER,
> >   };
> >   
> >   struct ceph_osd_data {
> > @@ -131,6 +132,7 @@ struct ceph_osd_data {
> >   			struct ceph_bvec_iter	bvec_pos;
> >   			u32			num_bvecs;
> >   		};
> > +		struct iov_iter		iter;
> >   	};
> >   };
> >   
> > @@ -501,6 +503,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
> >   void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
> >   					 unsigned int which,
> >   					 struct ceph_bvec_iter *bvec_pos);
> > +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> > +				unsigned int which, struct iov_iter *iter);
> >   
> >   extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
> >   					unsigned int which,
> > diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> > index 6056c8f7dd4c..604f025034ab 100644
> > --- a/net/ceph/messenger.c
> > +++ b/net/ceph/messenger.c
> > @@ -964,6 +964,69 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
> >   	return true;
> >   }
> >   
> > +static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
> > +					size_t length)
> > +{
> > +	struct ceph_msg_data *data = cursor->data;
> > +
> > +	cursor->iov_iter = data->iter;
> > +	cursor->lastlen = 0;
> > +	iov_iter_truncate(&cursor->iov_iter, length);
> > +	cursor->resid = iov_iter_count(&cursor->iov_iter);
> > +}
> > +
> > +static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
> > +						size_t *page_offset,
> > +						size_t *length)
> > +{
> > +	struct page *page;
> > +	ssize_t len;
> > +
> > +	if (cursor->lastlen)
> > +		iov_iter_revert(&cursor->iov_iter, cursor->lastlen);
> > +
> > +	len = iov_iter_get_pages(&cursor->iov_iter, &page, PAGE_SIZE,
> > +				 1, page_offset);
> > +	BUG_ON(len < 0);
> > +
> > +	cursor->lastlen = len;
> > +
> > +	/*
> > +	 * FIXME: Al Viro says that he will soon change iov_iter_get_pages
> > +	 * to auto-advance the iterator. Emulate that here for now.
> > +	 */
> > +	iov_iter_advance(&cursor->iov_iter, len);
> > +
> > +	/*
> > +	 * FIXME: The assumption is that the pages represented by the iov_iter
> > +	 * 	  are pinned, with the references held by the upper-level
> > +	 * 	  callers, or by virtue of being under writeback. Eventually,
> > +	 * 	  we'll get an iov_iter_get_pages variant that doesn't take page
> > +	 * 	  refs. Until then, just put the page ref.
> > +	 */
> > +	VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
> > +	put_page(page);
> > +
> > +	*length = min_t(size_t, len, cursor->resid);
> > +	return page;
> > +}
> > +
> > +static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
> > +					size_t bytes)
> > +{
> > +	BUG_ON(bytes > cursor->resid);
> > +	cursor->resid -= bytes;
> > +
> > +	if (bytes < cursor->lastlen) {
> > +		cursor->lastlen -= bytes;
> > +	} else {
> > +		iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
> > +		cursor->lastlen = 0;
> > +	}
> > +
> > +	return cursor->resid;
> > +}
> > +
> >   /*
> >    * Message data is handled (sent or received) in pieces, where each
> >    * piece resides on a single page.  The network layer might not
> > @@ -991,6 +1054,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
> >   	case CEPH_MSG_DATA_BVECS:
> >   		ceph_msg_data_bvecs_cursor_init(cursor, length);
> >   		break;
> > +	case CEPH_MSG_DATA_ITER:
> > +		ceph_msg_data_iter_cursor_init(cursor, length);
> > +		break;
> >   	case CEPH_MSG_DATA_NONE:
> >   	default:
> >   		/* BUG(); */
> > @@ -1038,6 +1104,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
> >   	case CEPH_MSG_DATA_BVECS:
> >   		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
> >   		break;
> > +	case CEPH_MSG_DATA_ITER:
> > +		page = ceph_msg_data_iter_next(cursor, page_offset, length);
> > +		break;
> >   	case CEPH_MSG_DATA_NONE:
> >   	default:
> >   		page = NULL;
> > @@ -1076,6 +1145,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
> >   	case CEPH_MSG_DATA_BVECS:
> >   		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
> >   		break;
> > +	case CEPH_MSG_DATA_ITER:
> > +		new_piece = ceph_msg_data_iter_advance(cursor, bytes);
> > +		break;
> >   	case CEPH_MSG_DATA_NONE:
> >   	default:
> >   		BUG();
> > @@ -1874,6 +1946,19 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
> >   }
> >   EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
> >   
> > +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> > +			    struct iov_iter *iter)
> > +{
> > +	struct ceph_msg_data *data;
> > +
> > +	data = ceph_msg_data_add(msg);
> > +	data->type = CEPH_MSG_DATA_ITER;
> > +	data->iter = *iter;
> > +
> > +	msg->data_length += iov_iter_count(&data->iter);
> > +}
> > +EXPORT_SYMBOL(ceph_msg_data_add_iter);
> > +
> 
> Will this be used outside the libceph.ko ? It seem will never ? And also 
> some other existing ones like 'ceph_msg_data_add_bvecs'.
> 

No, I don't think it will. We can probably remove that.

> >   /*
> >    * construct a new message with given type, size
> >    * the new msg has a ref count of 1.
> > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> > index 75761537c644..2a7e46524e71 100644
> > --- a/net/ceph/osd_client.c
> > +++ b/net/ceph/osd_client.c
> > @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
> >   	osd_data->num_bvecs = num_bvecs;
> >   }
> >   
> > +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
> > +			       struct iov_iter *iter)
> > +{
> > +	osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
> > +	osd_data->iter = *iter;
> > +}
> > +
> >   static struct ceph_osd_data *
> >   osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
> >   {
> > @@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
> >   }
> >   EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
> >   
> > +/**
> > + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
> > + * @osd_req: The request to set up
> > + * @which: ?
> 
> For this could you explain more ?
> 
> 

Sure. I'll respin and flesh out this comment.

> > + * @iter: The buffer iterator
> > + */
> > +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> > +				unsigned int which, struct iov_iter *iter)
> > +{
> > +	struct ceph_osd_data *osd_data;
> > +
> > +	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
> > +	ceph_osd_iter_init(osd_data, iter);
> > +}
> > +EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
> > +
> >   static void osd_req_op_cls_request_info_pagelist(
> >   			struct ceph_osd_request *osd_req,
> >   			unsigned int which, struct ceph_pagelist *pagelist)
> > @@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
> >   #endif /* CONFIG_BLOCK */
> >   	case CEPH_OSD_DATA_TYPE_BVECS:
> >   		return osd_data->bvec_pos.iter.bi_size;
> > +	case CEPH_OSD_DATA_TYPE_ITER:
> > +		return iov_iter_count(&osd_data->iter);
> >   	default:
> >   		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
> >   		return 0;
> > @@ -954,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
> >   #endif
> >   	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
> >   		ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
> > +	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
> > +		ceph_msg_data_add_iter(msg, &osd_data->iter);
> >   	} else {
> >   		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
> >   	}
> 

-- 
Jeff Layton <jlayton@kernel.org>

  reply	other threads:[~2022-07-01 10:16 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-27 15:54 [PATCH v2 0/2] libceph: add new iov_iter msg_data type and use it for reads Jeff Layton
2022-06-27 15:54 ` [PATCH v2 1/2] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type Jeff Layton
2022-07-01  2:04   ` Xiubo Li
2022-07-01 10:16     ` Jeff Layton [this message]
2022-06-27 15:54 ` [PATCH v2 2/2] ceph: use osd_req_op_extent_osd_iter for netfs reads Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2431ec5495612912399471c94dbec6246866359a.camel@kernel.org \
    --to=jlayton@kernel.org \
    --cc=ceph-devel@vger.kernel.org \
    --cc=dhowells@redhat.com \
    --cc=idryomov@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=xiubli@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).