All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
To: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>,
	Kent Overstreet <kent.overstreet@gmail.com>,
	linux-bcache@vger.kernel.org
Subject: Re: [RFC] Live resize of backing device
Date: Fri, 27 Jan 2023 13:44:51 +0100	[thread overview]
Message-ID: <50e64fcd-3bd8-4175-c96e-5fa2ffe051d4@devo.com> (raw)
In-Reply-To: <755CAB25-BC58-4100-A524-6F922E1C13DC@suse.de>

 From 83f490ec8e81c840bdaf69e66021d661751975f2 Mon Sep 17 00:00:00 2001
From: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
Date: Thu, 8 Sep 2022 09:47:55 +0200
Subject: [PATCH v2] bcache: Add support for live resize of backing devices

Signed-off-by: Andrea Tomassetti <andrea.tomassetti-opensource@devo.com>
---
Hi Coly,
this is the second version of the patch. Following your suggestion,
I implemented roll-back functionality in case of error.
I'm testing this functionality using a QEMU/KVM VM via libvirt.
Here are the steps:
   1. make-bcache --writeback -B /dev/vdb -C /dev/vdc
   2. mkfs.xfs /dev/bcache0
   3. mount /dev/bcache0 /mnt
   4. dd if=/dev/random of=/mnt/random0 bs=1M count=1000
   5. md5sum /mnt/random0 | tee /mnt/random0.md5
   6. [HOST] virsh blockresize <vm-name> --path <disk-path> --size <new-size>
   7. xfs_growfs /dev/bcache0
   8. Repeat steps 4 and 5 with a different file name (e.g. random1.md5)
   9. umount/reboot/remount and check that the md5 hashes are correct with
         md5sum -c /mnt/random?.md5

  drivers/md/bcache/super.c | 84 ++++++++++++++++++++++++++++++++++++++-
  1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ba3909bb6bea..1435a3f605f8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2443,6 +2443,85 @@ static bool bch_is_open(dev_t dev)
  	return bch_is_open_cache(dev) || bch_is_open_backing(dev);
  }

+static bool bch_update_capacity(dev_t dev)
+{
+	const size_t max_stripes = min_t(size_t, INT_MAX,
+					 SIZE_MAX / sizeof(atomic_t));
+
+	uint64_t n, n_old, orig_cached_sectors = 0;
+	void *tmp_realloc;
+
+	int nr_stripes_old;
+	bool res = false;
+
+	struct bcache_device *d;
+	struct cache_set *c, *tc;
+	struct cached_dev *dcp, *t, *dc = NULL;
+
+	uint64_t parent_nr_sectors;
+
+	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
+		list_for_each_entry_safe(dcp, t, &c->cached_devs, list)
+			if (dcp->bdev->bd_dev == dev) {
+				dc = dcp;
+				goto dc_found;
+			}
+
+dc_found:
+	if (!dc)
+		return false;
+
+	parent_nr_sectors = bdev_nr_sectors(dc->bdev) - dc->sb.data_offset;
+
+	if (parent_nr_sectors == bdev_nr_sectors(dc->disk.disk->part0))
+		return false;
+
+	d = &dc->disk;
+	orig_cached_sectors = d->c->cached_dev_sectors;
+
+	/* Force cached device sectors re-calc */
+	calc_cached_dev_sectors(d->c);
+
+	/* Block writeback thread */
+	down_write(&dc->writeback_lock);
+	nr_stripes_old = d->nr_stripes;
+	n = DIV_ROUND_UP_ULL(parent_nr_sectors, d->stripe_size);
+	if (!n || n > max_stripes) {
+		pr_err("nr_stripes too large or invalid: %llu (start sector beyond 
end of disk?)\n",
+			n);
+		goto restore_dev_sectors;
+	}
+	d->nr_stripes = n;
+
+	n = d->nr_stripes * sizeof(atomic_t);
+	n_old = nr_stripes_old * sizeof(atomic_t);
+	tmp_realloc = kvrealloc(d->stripe_sectors_dirty, n_old,
+		n, GFP_KERNEL);
+	if (!tmp_realloc)
+		goto restore_nr_stripes;
+
+	d->stripe_sectors_dirty = (atomic_t *) tmp_realloc;
+
+	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
+	n_old = BITS_TO_LONGS(nr_stripes_old) * sizeof(unsigned long);
+	tmp_realloc = kvrealloc(d->full_dirty_stripes, n_old, n, GFP_KERNEL);
+	if (!tmp_realloc)
+		goto restore_nr_stripes;
+
+	d->full_dirty_stripes = (unsigned long *) tmp_realloc;
+
+	if ((res = set_capacity_and_notify(dc->disk.disk, parent_nr_sectors)))
+		goto unblock_and_exit;
+
+restore_nr_stripes:
+	d->nr_stripes = nr_stripes_old;
+restore_dev_sectors:
+	d->c->cached_dev_sectors = orig_cached_sectors;
+unblock_and_exit:
+	up_write(&dc->writeback_lock);
+	return res;
+}
+
  struct async_reg_args {
  	struct delayed_work reg_work;
  	char *path;
@@ -2569,7 +2648,10 @@ static ssize_t register_bcache(struct kobject *k, 
struct kobj_attribute *attr,
  			mutex_lock(&bch_register_lock);
  			if (lookup_bdev(strim(path), &dev) == 0 &&
  			    bch_is_open(dev))
-				err = "device already registered";
+				if (bch_update_capacity(dev))
+					err = "capacity changed";
+				else
+					err = "device already registered";
  			else
  				err = "device busy";
  			mutex_unlock(&bch_register_lock);
--
2.39.0



On 25/1/23 18:59, Coly Li wrote:
> 
> 
>> 2023年1月25日 18:07,Andrea Tomassetti <andrea.tomassetti-opensource@devo.com> 写道:
>>
>> On Tue, Jan 17, 2023 at 5:18 PM Coly Li <colyli@suse.de> wrote:
>>>>
> 
>>>>>
>>>>>> struct async_reg_args {
>>>>>>     struct delayed_work reg_work;
>>>>>>     char *path;
>>>>>> @@ -2569,7 +2639,10 @@ static ssize_t register_bcache(struct kobject
>>>>>> *k, struct kobj_attribute *attr,
>>>>>>             mutex_lock(&bch_register_lock);
>>>>>>             if (lookup_bdev(strim(path), &dev) == 0 &&
>>>>>>                 bch_is_open(dev))
>>>>>> -                err = "device already registered";
>>>>>> +                if (bch_update_capacity(dev))
>>>>>> +                    err = "capacity changed";
>>>>>> +                else
>>>>>> +                    err = "device already registered";
>>>>>
>>>>>
>>>>> As I said, it should be a separated write-only sysfile under the cache
>>>>> device's directory.
>>>> Can I ask why you don't like the automatic resize way? Why should the
>>>> resize be manual?
>>>
>>> Most system administrators don’t like such silent, automatic behavior. They want to extend the size explicitly, especially when there are other dependencies in their configurations.
>>>
>> What I was trying to say is that, in order to resize a block device, a
>> manual command should be executed. So, this is already a "non-silent"
>> automatic thing.
>> Moreover, if the block device has a FS on it, the FS needs to be
>> manually grown with some special utilities, e.g. xfs_growfs. So,
>> again, another non-silent automatic step. Don't you agree?
>> For example, to resize a qcow device attached to a VM I'm manually
>> doing a `virsh blockresize`. As soon as I issue that command, the
>> virtio_blk driver inside the VM detects the disk size change and calls
>> the `set_capacity_and_notify` function. Why then should bcache behave
>> differently?
> 
> The above VM example makes sense, I am almost convinced.
> 
>>
>> If you're concerned that this can somehow break the
>> behaviour-compatibility with older versions of the driver, can we
>> protect this automatic discovery with an optional parameter? Will this
>> be an option you will take into account?
> 
> Then let’s forget the sysfs option for now. Once you feel the patch is ready for me to test, please notify me with detailed steps to reproduce your testing.
> Then, during my testing, let’s discuss whether an extra option is necessary; for now, just keep your idea of automatically resizing the cached device.
> 
> Thanks for your detailed explanation.
> 
> Coly Li
> 

  reply	other threads:[~2023-01-27 12:45 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-03 10:05 [RFC] Live resize of backing device Andrea Tomassetti
2022-08-04 14:32 ` Coly Li
2022-08-05 19:38 ` Eric Wheeler
2022-09-06 13:22   ` Andrea Tomassetti
2022-09-08  8:32     ` Andrea Tomassetti
2022-09-19 11:42       ` Andrea Tomassetti
2022-09-19 12:16         ` Coly Li
2022-12-09  8:57           ` Andrea Tomassetti
2022-12-09  9:36             ` Coly Li
2022-12-30 10:40       ` Coly Li
2023-01-11 16:01         ` Andrea Tomassetti
2023-01-17 13:08           ` Error messages with kernel 6.1.[56] Pierre Juhen
2023-01-17 16:08             ` Coly Li
2023-01-17 16:18           ` [RFC] Live resize of backing device Coly Li
2023-01-25 10:07             ` Andrea Tomassetti
2023-01-25 17:59               ` Coly Li
2023-01-27 12:44                 ` Andrea Tomassetti [this message]
2023-01-27 22:40                   ` Eric Wheeler
2023-01-31 10:20                     ` Andrea Tomassetti
2023-02-02 17:18                   ` Coly Li
2023-02-02 20:48                     ` Eric Wheeler
2023-02-03  2:41                       ` Coly Li
2023-02-19  9:39                   ` Coly Li
2023-02-20  8:27                     ` mingzhe
2023-02-20 12:29                       ` Coly Li
2023-02-22  8:42                         ` Andrea Tomassetti
2023-02-27 22:08                       ` Eric Wheeler
2023-02-28  2:46                         ` mingzhe
2023-01-27  2:53               ` [RFC] Live resize of bcache " Eric Wheeler

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=50e64fcd-3bd8-4175-c96e-5fa2ffe051d4@devo.com \
    --to=andrea.tomassetti-opensource@devo.com \
    --cc=bcache@lists.ewheeler.net \
    --cc=colyli@suse.de \
    --cc=kent.overstreet@gmail.com \
    --cc=linux-bcache@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.