public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Hannes Reinecke <hare@suse.de>
To: Nitesh Shetty <nj.shetty@samsung.com>
Cc: chaitanyak@nvidia.com, linux-block@vger.kernel.org,
	linux-scsi@vger.kernel.org, dm-devel@redhat.com,
	linux-nvme@lists.infradead.org, linux-fsdevel@vger.kernel.org,
	axboe@kernel.dk, msnitzer@redhat.com, bvanassche@acm.org,
	martin.petersen@oracle.com, kbusch@kernel.org, hch@lst.de,
	Frederick.Knight@netapp.com, osandov@fb.com,
	lsf-pc@lists.linux-foundation.org, djwong@kernel.org,
	josef@toxicpanda.com, clm@fb.com, dsterba@suse.com,
	tytso@mit.edu, jack@suse.com, nitheshshetty@gmail.com,
	gost.dev@samsung.com, Arnav Dawn <arnav.dawn@samsung.com>,
	Alasdair Kergon <agk@redhat.com>,
	Mike Snitzer <snitzer@kernel.org>,
	Sagi Grimberg <sagi@grimberg.me>,
	James Smart <james.smart@broadcom.com>,
	Chaitanya Kulkarni <kch@nvidia.com>,
	Damien Le Moal <damien.lemoal@opensource.wdc.com>,
	Naohiro Aota <naohiro.aota@wdc.com>,
	Johannes Thumshirn <jth@kernel.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v4 02/10] block: Add copy offload support infrastructure
Date: Wed, 27 Apr 2022 12:29:15 +0200	[thread overview]
Message-ID: <2082148f-890f-e5f4-c304-b99212aa377e@suse.de> (raw)
In-Reply-To: <20220426101241.30100-3-nj.shetty@samsung.com>

On 4/26/22 12:12, Nitesh Shetty wrote:
> Introduce blkdev_issue_copy which supports source and destination bdevs,
> and an array of (source, destination and copy length) tuples.
> Introduce REQ_COPY copy offload operation flag. Create a read-write
> bio pair with a token as payload and submitted to the device in order.
> Read request populates token with source specific information which
> is then passed with write request.
> This design is courtesy Mikulas Patocka's token based copy
> 
> Larger copy will be divided, based on max_copy_sectors,
> max_copy_range_sector limits.
> 
> Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> Signed-off-by: Arnav Dawn <arnav.dawn@samsung.com>
> ---
>   block/blk-lib.c           | 232 ++++++++++++++++++++++++++++++++++++++
>   block/blk.h               |   2 +
>   include/linux/blk_types.h |  21 ++++
>   include/linux/blkdev.h    |   2 +
>   include/uapi/linux/fs.h   |  14 +++
>   5 files changed, 271 insertions(+)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 09b7e1200c0f..ba9da2d2f429 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -117,6 +117,238 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>   }
>   EXPORT_SYMBOL(blkdev_issue_discard);
>   
> +/*
> + * Wait on and process all in-flight BIOs.  This must only be called once
> + * all bios have been issued so that the refcount can only decrease.
> + * This just waits for all bios to make it through bio_copy_end_io. IO
> + * errors are propagated through cio->io_error.
> + */
> +static int cio_await_completion(struct cio *cio)
> +{
> +	int ret = 0;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (cio->refcount) {
> +		cio->waiter = current;
> +		__set_current_state(TASK_UNINTERRUPTIBLE);
> +		spin_unlock_irqrestore(&cio->lock, flags);
> +		blk_io_schedule();
> +		/* wake up sets us TASK_RUNNING */
> +		spin_lock_irqsave(&cio->lock, flags);
> +		cio->waiter = NULL;
> +		ret = cio->io_err;
> +	}
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	kvfree(cio);
> +
> +	return ret;
> +}
> +
> +static void bio_copy_end_io(struct bio *bio)
> +{
> +	struct copy_ctx *ctx = bio->bi_private;
> +	struct cio *cio = ctx->cio;
> +	sector_t clen;
> +	int ri = ctx->range_idx;
> +	unsigned long flags;
> +	bool wake = false;
> +
> +	if (bio->bi_status) {
> +		cio->io_err = bio->bi_status;
> +		clen = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - ctx->start_sec;
> +		cio->rlist[ri].comp_len = min_t(sector_t, clen, cio->rlist[ri].comp_len);
> +	}
> +	__free_page(bio->bi_io_vec[0].bv_page);
> +	kfree(ctx);
> +	bio_put(bio);
> +
> +	spin_lock_irqsave(&cio->lock, flags);
> +	if (((--cio->refcount) <= 0) && cio->waiter)
> +		wake = true;
> +	spin_unlock_irqrestore(&cio->lock, flags);
> +	if (wake)
> +		wake_up_process(cio->waiter);
> +}
> +
> +/*
> + * blk_copy_offload	- Use device's native copy offload feature
> + * Go through user provide payload, prepare new payload based on device's copy offload limits.
> + */
> +int blk_copy_offload(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *rlist, struct block_device *dst_bdev, gfp_t gfp_mask)
> +{
> +	struct request_queue *sq = bdev_get_queue(src_bdev);
> +	struct request_queue *dq = bdev_get_queue(dst_bdev);
> +	struct bio *read_bio, *write_bio;
> +	struct copy_ctx *ctx;
> +	struct cio *cio;
> +	struct page *token;
> +	sector_t src_blk, copy_len, dst_blk;
> +	sector_t remaining, max_copy_len = LONG_MAX;
> +	unsigned long flags;
> +	int ri = 0, ret = 0;
> +
> +	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
> +	if (!cio)
> +		return -ENOMEM;
> +	cio->rlist = rlist;
> +	spin_lock_init(&cio->lock);
> +
> +	max_copy_len = min_t(sector_t, sq->limits.max_copy_sectors, dq->limits.max_copy_sectors);
> +	max_copy_len = min3(max_copy_len, (sector_t)sq->limits.max_copy_range_sectors,
> +			(sector_t)dq->limits.max_copy_range_sectors) << SECTOR_SHIFT;
> +
> +	for (ri = 0; ri < nr_srcs; ri++) {
> +		cio->rlist[ri].comp_len = rlist[ri].len;
> +		src_blk = rlist[ri].src;
> +		dst_blk = rlist[ri].dst;
> +		for (remaining = rlist[ri].len; remaining > 0; remaining -= copy_len) {
> +			copy_len = min(remaining, max_copy_len);
> +
> +			token = alloc_page(gfp_mask);
> +			if (unlikely(!token)) {
> +				ret = -ENOMEM;
> +				goto err_token;
> +			}
> +
> +			ctx = kzalloc(sizeof(struct copy_ctx), gfp_mask);
> +			if (!ctx) {
> +				ret = -ENOMEM;
> +				goto err_ctx;
> +			}
> +			ctx->cio = cio;
> +			ctx->range_idx = ri;
> +			ctx->start_sec = dst_blk;
> +
> +			read_bio = bio_alloc(src_bdev, 1, REQ_OP_READ | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!read_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			read_bio->bi_iter.bi_sector = src_blk >> SECTOR_SHIFT;
> +			__bio_add_page(read_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			read_bio->bi_iter.bi_size = copy_len;
> +			ret = submit_bio_wait(read_bio);
> +			bio_put(read_bio);
> +			if (ret)
> +				goto err_read_bio;
> +
> +			write_bio = bio_alloc(dst_bdev, 1, REQ_OP_WRITE | REQ_COPY | REQ_NOMERGE,
> +					gfp_mask);
> +			if (!write_bio) {
> +				ret = -ENOMEM;
> +				goto err_read_bio;
> +			}
> +			write_bio->bi_iter.bi_sector = dst_blk >> SECTOR_SHIFT;
> +			__bio_add_page(write_bio, token, PAGE_SIZE, 0);
> +			/*__bio_add_page increases bi_size by len, so overwrite it with copy len*/
> +			write_bio->bi_iter.bi_size = copy_len;
> +			write_bio->bi_end_io = bio_copy_end_io;
> +			write_bio->bi_private = ctx;
> +
> +			spin_lock_irqsave(&cio->lock, flags);
> +			++cio->refcount;
> +			spin_unlock_irqrestore(&cio->lock, flags);
> +
> +			submit_bio(write_bio);
> +			src_blk += copy_len;
> +			dst_blk += copy_len;
> +		}
> +	}
> +

Hmm. I'm not sure if I like the copy loop.
What I would definitely do is allocate the write bio before reading
the data; after all, if we can't allocate the write bio, reading is
pretty much pointless.

But the real issue I have with this is that it's doing synchronous
reads, thereby limiting performance.

Can't you submit the write bio from the end_io function of the read bio?
That would disentangle the read and write paths, and we should get
better performance.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


  parent reply	other threads:[~2022-04-27 12:50 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20220426101804epcas5p4a0a325d3ce89e868e4924bbdeeba6d15@epcas5p4.samsung.com>
2022-04-26 10:12 ` [PATCH v4 00/10] Add Copy offload support Nitesh Shetty
2022-04-26 10:12   ` [PATCH v4 01/10] block: Introduce queue limits for copy-offload support Nitesh Shetty
2022-04-27  1:59     ` Damien Le Moal
2022-04-27 15:30       ` Nitesh Shetty
2022-04-27 21:57         ` Damien Le Moal
2022-04-27 10:30     ` Hannes Reinecke
2022-04-26 10:12   ` [PATCH v4 02/10] block: Add copy offload support infrastructure Nitesh Shetty
2022-04-27  0:11     ` kernel test robot
2022-04-27  2:45     ` Damien Le Moal
2022-04-27 15:15       ` Nitesh Shetty
2022-04-27 22:04         ` Damien Le Moal
2022-04-28  8:01           ` Nitesh Shetty
2022-04-27 10:29     ` Hannes Reinecke [this message]
2022-04-27 15:48       ` Nitesh Shetty
2022-04-26 10:12   ` [PATCH v4 03/10] block: Introduce a new ioctl for copy Nitesh Shetty
2022-04-27  2:48     ` Damien Le Moal
2022-04-27 13:03       ` Nitesh Shetty
2022-04-27 10:37     ` Hannes Reinecke
2022-04-26 10:12   ` [PATCH v4 04/10] block: add emulation " Nitesh Shetty
2022-04-27  1:33     ` kernel test robot
2022-04-26 10:12   ` [PATCH v4 05/10] nvme: add copy offload support Nitesh Shetty
2022-04-28 14:02     ` kernel test robot
2022-04-26 10:12   ` [PATCH v4 06/10] nvmet: add copy command support for bdev and file ns Nitesh Shetty
2022-04-28 14:53     ` kernel test robot
2022-04-26 10:12   ` [PATCH v4 07/10] dm: Add support for copy offload Nitesh Shetty
2022-04-28 15:54     ` kernel test robot
2022-04-26 10:12   ` [PATCH v4 08/10] dm: Enable copy offload for dm-linear target Nitesh Shetty
2022-04-26 10:12   ` [PATCH v4 09/10] dm kcopyd: use copy offload support Nitesh Shetty
2022-04-26 10:12   ` [PATCH v4 10/10] fs: add support for copy file range in zonefs Nitesh Shetty
2022-04-27  1:42     ` Damien Le Moal
2022-04-27  1:46   ` [PATCH v4 00/10] Add Copy offload support Damien Le Moal
2022-04-27 15:38     ` Nitesh Shetty
2022-04-27 21:56       ` Damien Le Moal
2022-04-27  2:00   ` Damien Le Moal
2022-04-27  2:19   ` Damien Le Moal
2022-04-27 12:49     ` Nitesh Shetty
2022-04-27 22:05       ` Damien Le Moal
2022-04-28  7:49         ` Nitesh Shetty
2022-04-28 21:37           ` Damien Le Moal
2022-04-29  3:39             ` [dm-devel] " Bart Van Assche
2022-05-02  4:09       ` Dave Chinner
2022-05-02 12:54         ` Damien Le Moal
2022-05-02 23:20           ` Dave Chinner
2022-05-02 12:14       ` [dm-devel] " Damien Le Moal
2022-05-02 12:16         ` Damien Le Moal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2082148f-890f-e5f4-c304-b99212aa377e@suse.de \
    --to=hare@suse.de \
    --cc=Frederick.Knight@netapp.com \
    --cc=agk@redhat.com \
    --cc=arnav.dawn@samsung.com \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=chaitanyak@nvidia.com \
    --cc=clm@fb.com \
    --cc=damien.lemoal@opensource.wdc.com \
    --cc=djwong@kernel.org \
    --cc=dm-devel@redhat.com \
    --cc=dsterba@suse.com \
    --cc=gost.dev@samsung.com \
    --cc=hch@lst.de \
    --cc=jack@suse.com \
    --cc=james.smart@broadcom.com \
    --cc=josef@toxicpanda.com \
    --cc=jth@kernel.org \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=lsf-pc@lists.linux-foundation.org \
    --cc=martin.petersen@oracle.com \
    --cc=msnitzer@redhat.com \
    --cc=naohiro.aota@wdc.com \
    --cc=nitheshshetty@gmail.com \
    --cc=nj.shetty@samsung.com \
    --cc=osandov@fb.com \
    --cc=sagi@grimberg.me \
    --cc=snitzer@kernel.org \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox