public inbox for linux-block@vger.kernel.org
 help / color / mirror / Atom feed
From: Mikulas Patocka <mpatocka@redhat.com>
To: Jens Axboe <axboe@kernel.dk>
Cc: Li Nan <linan666@huaweicloud.com>,
	Zdenek Kabelac <zkabelac@redhat.com>,
	Christoph Hellwig <hch@infradead.org>,
	Chaitanya Kulkarni <chaitanyak@nvidia.com>,
	linux-block@vger.kernel.org, dm-devel@redhat.com
Subject: [PATCH v3 3/4] brd: enable discard
Date: Thu, 10 Aug 2023 12:09:33 +0200 (CEST)	[thread overview]
Message-ID: <10a61c-8c51-11f3-83b-dbffe688d68d@redhat.com> (raw)
In-Reply-To: <2dacc73-854-e71c-1746-99b017401c9a@redhat.com>

This patch implements discard in the brd driver. We use RCU to free the
page, so that if there are any concurrent readers or writers, they won't
touch the page after it is freed.

Calling "call_rcu" for each page is inefficient, so we attempt to batch
multiple pages into a single "call_rcu" call.

Note that we replace "BUG_ON(!page);" with "if (page) ..." in copy_to_brd.
The page can't be NULL under normal circumstances; it can only be NULL
if REQ_OP_WRITE races with REQ_OP_DISCARD. If these two bios race with
each other on the same page, the result is undefined, so we can handle
this race condition just by skipping the copy.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/block/brd.c |  144 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 131 insertions(+), 13 deletions(-)

Index: linux-2.6/drivers/block/brd.c
===================================================================
--- linux-2.6.orig/drivers/block/brd.c
+++ linux-2.6/drivers/block/brd.c
@@ -46,6 +46,8 @@ struct brd_device {
 	u64			brd_nr_pages;
 };
 
+static bool discard;
+
 /*
  * Look up and return a brd's page for a given sector.
  */
@@ -100,6 +102,54 @@ static int brd_insert_page(struct brd_de
 	return ret;
 }
 
+struct free_page_batch {
+	struct rcu_head rcu;
+	struct list_head list;
+};
+
+static void brd_free_page_rcu(struct rcu_head *head)
+{
+	__free_page(container_of(head, struct page, rcu_head));
+}
+
+static void brd_free_pages_rcu(struct rcu_head *head)
+{
+	struct free_page_batch *batch = container_of(head, struct free_page_batch, rcu);
+
+	while (!list_empty(&batch->list)) {
+		struct page *page = list_entry(batch->list.prev, struct page, lru);
+
+		list_del(&page->lru);
+
+		__free_page(page);
+	}
+
+	kfree(batch);
+}
+
+static void brd_free_page(struct brd_device *brd, sector_t sector,
+			  struct free_page_batch **batch)
+{
+	struct page *page;
+	pgoff_t idx;
+
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	page = xa_erase(&brd->brd_pages, idx);
+
+	if (page) {
+		BUG_ON(page->index != idx);
+		if (!*batch) {
+			*batch = kmalloc(sizeof(struct free_page_batch), GFP_NOIO);
+			if (unlikely(!*batch)) {
+				call_rcu(&page->rcu_head, brd_free_page_rcu);
+				return;
+			}
+			INIT_LIST_HEAD(&(*batch)->list);
+		}
+		list_add(&page->lru, &(*batch)->list);
+	}
+}
+
 /*
  * Free all backing store pages and xarray. This must only be called when
  * there are no other users of the device.
@@ -152,11 +202,11 @@ static void copy_to_brd(struct brd_devic
 	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	rcu_read_lock();
 	page = brd_lookup_page(brd, sector);
-	BUG_ON(!page);
-
-	dst = kmap_atomic(page);
-	memcpy(dst + offset, src, copy);
-	kunmap_atomic(dst);
+	if (page) {
+		dst = kmap_atomic(page);
+		memcpy(dst + offset, src, copy);
+		kunmap_atomic(dst);
+	}
 	rcu_read_unlock();
 
 	if (copy < n) {
@@ -165,11 +215,11 @@ static void copy_to_brd(struct brd_devic
 		copy = n - copy;
 		rcu_read_lock();
 		page = brd_lookup_page(brd, sector);
-		BUG_ON(!page);
-
-		dst = kmap_atomic(page);
-		memcpy(dst, src, copy);
-		kunmap_atomic(dst);
+		if (page) {
+			dst = kmap_atomic(page);
+			memcpy(dst, src, copy);
+			kunmap_atomic(dst);
+		}
 		rcu_read_unlock();
 	}
 }
@@ -248,6 +298,34 @@ out:
 	return err;
 }
 
+void brd_do_discard(struct brd_device *brd, struct bio *bio)
+{
+	struct free_page_batch *batch = NULL;
+	sector_t sector, len, front_pad;
+
+	if (unlikely(!discard)) {
+		bio->bi_status = BLK_STS_NOTSUPP;
+		return;
+	}
+
+	sector = bio->bi_iter.bi_sector;
+	len = bio_sectors(bio);
+	front_pad = -sector & (PAGE_SECTORS - 1);
+	sector += front_pad;
+	if (unlikely(len <= front_pad))
+		return;
+	len -= front_pad;
+	len = round_down(len, PAGE_SECTORS);
+	while (len) {
+		brd_free_page(brd, sector, &batch);
+		sector += PAGE_SECTORS;
+		len -= PAGE_SECTORS;
+		cond_resched();
+	}
+	if (batch)
+		call_rcu(&batch->rcu, brd_free_pages_rcu);
+}
+
 static void brd_submit_bio(struct bio *bio)
 {
 	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
@@ -280,6 +358,9 @@ static void brd_submit_bio(struct bio *b
 				sector += len >> SECTOR_SHIFT;
 			}
 			break;
+		case REQ_OP_DISCARD:
+			brd_do_discard(brd, bio);
+			break;
 		default:
 			bio->bi_status = BLK_STS_NOTSUPP;
 			break;
@@ -293,6 +374,40 @@ static const struct block_device_operati
 	.submit_bio =		brd_submit_bio,
 };
 
+static LIST_HEAD(brd_devices);
+static struct dentry *brd_debugfs_dir;
+
+static void brd_set_discard_limits(struct brd_device *brd)
+{
+	struct request_queue *queue = brd->brd_disk->queue;
+	if (discard) {
+		queue->limits.discard_granularity = PAGE_SIZE;
+		blk_queue_max_discard_sectors(queue, round_down(UINT_MAX, PAGE_SECTORS));
+	} else {
+		queue->limits.discard_granularity = 0;
+		blk_queue_max_discard_sectors(queue, 0);
+	}
+}
+
+static int discard_set_bool(const char *val, const struct kernel_param *kp)
+{
+	struct brd_device *brd;
+
+	int r = param_set_bool(val, kp);
+	if (r)
+		return r;
+
+	list_for_each_entry(brd, &brd_devices, brd_list)
+		brd_set_discard_limits(brd);
+
+	return 0;
+}
+
+static const struct kernel_param_ops discard_ops = {
+	.set = discard_set_bool,
+	.get = param_get_bool,
+};
+
 /*
  * And now the modules code and kernel interface.
  */
@@ -308,6 +423,10 @@ static int max_part = 1;
 module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
+static bool discard = false;
+module_param_cb(discard, &discard_ops, &discard, 0644);
+MODULE_PARM_DESC(discard, "Support discard");
+
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
 MODULE_ALIAS("rd");
@@ -326,9 +445,6 @@ __setup("ramdisk_size=", ramdisk_size);
  * The device scheme is derived from loop.c. Keep them in synch where possible
  * (should share code eventually).
  */
-static LIST_HEAD(brd_devices);
-static struct dentry *brd_debugfs_dir;
-
 static int brd_alloc(int i)
 {
 	struct brd_device *brd;
@@ -373,6 +489,8 @@ static int brd_alloc(int i)
 	 */
 	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
 
+	brd_set_discard_limits(brd);
+
 	/* Tell the block layer that this is not a rotational device */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);


  parent reply	other threads:[~2023-08-10 10:10 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-10 10:07 [PATCH v3 0/4] brd discard patches Mikulas Patocka
2023-08-10 10:08 ` [PATCH v3 1/4] brd: use a switch statement in brd_submit_bio Mikulas Patocka
2023-08-10 10:09 ` [PATCH v3 2/4] brd: extend the rcu regions to cover read and write Mikulas Patocka
2023-08-10 10:09 ` Mikulas Patocka [this message]
2023-08-10 10:10 ` [PATCH v3 4/4] brd: implement write zeroes Mikulas Patocka
2023-11-10  1:22 ` [PATCH v3 0/4] brd discard patches Li Nan
2023-11-14 13:59   ` Mikulas Patocka
2024-01-19  8:41 ` Ming Lei
2024-01-22 16:30   ` Mikulas Patocka
2024-01-23  2:49     ` Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=10a61c-8c51-11f3-83b-dbffe688d68d@redhat.com \
    --to=mpatocka@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=chaitanyak@nvidia.com \
    --cc=dm-devel@redhat.com \
    --cc=hch@infradead.org \
    --cc=linan666@huaweicloud.com \
    --cc=linux-block@vger.kernel.org \
    --cc=zkabelac@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox