* [PATCH 1/3] btrfs: convert volume rotating flag into bitmap
@ 2018-06-04 15:00 Anand Jain
2018-06-04 15:00 ` [PATCH 2/3] btrfs: convert volume seeding " Anand Jain
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Anand Jain @ 2018-06-04 15:00 UTC (permalink / raw)
To: linux-btrfs
Add bitmap btrfs_fs_devices::volume_state to maintain the volume states and
flags. This patch in perticular converts btrfs_fs_devices::rotating into
flag BTRFS_VOLUME_STATE_ROTATING.
Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
fs/btrfs/disk-io.c | 3 ++-
fs/btrfs/volumes.c | 7 ++++---
fs/btrfs/volumes.h | 15 ++++++++++-----
3 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 002d20b3ed61..b3c9e1f1a11d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3146,7 +3146,8 @@ int open_ctree(struct super_block *sb,
goto fail_cleaner;
if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
+ !test_bit(BTRFS_VOLUME_STATE_ROTATING,
+ &fs_info->fs_devices->volume_state)) {
btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 16c320a7661d..0513ecd694e6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -726,7 +726,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ set_bit(BTRFS_VOLUME_STATE_ROTATING, &fs_devices->volume_state);
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -2328,7 +2328,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->rotating = 0;
+ clear_bit(BTRFS_VOLUME_STATE_ROTATING, &fs_devices->volume_state);
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2524,7 +2524,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
- fs_info->fs_devices->rotating = 1;
+ set_bit(BTRFS_VOLUME_STATE_ROTATING,
+ &fs_info->fs_devices->volume_state);
tmp = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139ec8daf4c..6bedfd0e918f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -206,6 +206,14 @@ BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
+/*
+ * btrfs_fs_devices::volume_state bitmap
+ */
+/*
+ * Set when we find or add a device that doesn't have the nonrot flag set
+ */
+#define BTRFS_VOLUME_STATE_ROTATING (0)
+
struct btrfs_fs_devices {
u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
struct list_head fs_list;
@@ -232,15 +240,12 @@ struct btrfs_fs_devices {
struct list_head alloc_list;
struct btrfs_fs_devices *seed;
+
+ unsigned long volume_state;
int seeding;
int opened;
- /* set when we find or add a device that doesn't have the
- * nonrot flag set
- */
- int rotating;
-
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
struct kobject fsid_kobj;
--
2.15.0
^ permalink raw reply related [flat|nested] 13+ messages in thread* [PATCH 2/3] btrfs: convert volume seeding flag into bitmap 2018-06-04 15:00 [PATCH 1/3] btrfs: convert volume rotating flag into bitmap Anand Jain @ 2018-06-04 15:00 ` Anand Jain 2018-06-04 15:00 ` [PATCH 3/3] btrfs: fix race between mkfs and mount Anand Jain 2018-06-20 14:01 ` [PATCH 1/3] btrfs: convert volume rotating flag into bitmap David Sterba 2 siblings, 0 replies; 13+ messages in thread From: Anand Jain @ 2018-06-04 15:00 UTC (permalink / raw) To: linux-btrfs This patch converts btrfs_fs_devices::seeding into flag BTRFS_VOL_FLAG_SEEDING. Signed-off-by: Anand Jain <anand.jain@oracle.com> --- fs/btrfs/dev-replace.c | 3 ++- fs/btrfs/volumes.c | 25 +++++++++++++++---------- fs/btrfs/volumes.h | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e2ba0419297a..9635a67d8836 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -195,7 +195,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (test_bit(BTRFS_VOLUME_STATE_SEEDING, + &fs_info->fs_devices->volume_state)) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0513ecd694e6..87a4b12f98e3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -716,7 +716,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - fs_devices->seeding = 1; + set_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state); } else { if (bdev_read_only(bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); @@ -1088,7 +1088,7 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; - fs_devices->seeding = 0; + clear_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state); return 0; } @@ -2152,7 +2152,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, * target added to the sprout FS, so there will be no more * device left under the seed FS. */ - ASSERT(fs_devices->seeding); + ASSERT(test_bit(BTRFS_VOLUME_STATE_SEEDING, + &fs_devices->volume_state)); tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { @@ -2293,7 +2294,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) u64 super_flags; lockdep_assert_held(&uuid_mutex); - if (!fs_devices->seeding) + if (!test_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state)) return -EINVAL; seed_devices = alloc_fs_devices(NULL); @@ -2324,7 +2325,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = 0; + clear_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state); fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; @@ -2402,7 +2403,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); BUG_ON(!device); /* Logic error */ - if (device->fs_devices->seeding) { + if (test_bit(BTRFS_VOLUME_STATE_SEEDING, + &device->fs_devices->volume_state)) { btrfs_set_device_generation(leaf, dev_item, device->generation); btrfs_mark_buffer_dirty(leaf); @@ -2432,7 +2434,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path int ret = 0; bool unlocked = false; - if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) + if (sb_rdonly(sb) && + !test_bit(BTRFS_VOLUME_STATE_SEEDING, + &fs_info->fs_devices->volume_state)) return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, @@ -2440,7 +2444,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(bdev)) return PTR_ERR(bdev); - if (fs_info->fs_devices->seeding) { + if (test_bit(BTRFS_VOLUME_STATE_SEEDING, + &fs_info->fs_devices->volume_state)) { seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); @@ -6636,7 +6641,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->seeding = 1; + set_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state); fs_devices->opened = 1; return fs_devices; } @@ -6652,7 +6657,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, goto out; } - if (!fs_devices->seeding) { + if (!test_bit(BTRFS_VOLUME_STATE_SEEDING, &fs_devices->volume_state)) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); fs_devices = ERR_PTR(-EINVAL); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6bedfd0e918f..cd18916f2bbc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -213,6 +213,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); * Set when we find or add a device that doesn't have the nonrot flag set */ #define BTRFS_VOLUME_STATE_ROTATING (0) +#define BTRFS_VOLUME_STATE_SEEDING (1) struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ @@ -242,7 +243,6 @@ struct btrfs_fs_devices { struct btrfs_fs_devices *seed; unsigned long volume_state; - int seeding; int opened; -- 2.15.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-04 15:00 [PATCH 1/3] btrfs: convert volume rotating flag into bitmap Anand Jain 2018-06-04 15:00 ` [PATCH 2/3] btrfs: convert volume seeding " Anand Jain @ 2018-06-04 15:00 ` Anand Jain 2018-06-07 16:39 ` David Sterba 2018-06-20 14:06 ` David Sterba 2018-06-20 14:01 ` [PATCH 1/3] btrfs: convert volume rotating flag into bitmap David Sterba 2 siblings, 2 replies; 13+ messages in thread From: Anand Jain @ 2018-06-04 15:00 UTC (permalink / raw) To: linux-btrfs In an instrumented testing it is possible that the mount and a newer mkfs.btrfs thread on the same device can race and if the new mkfs.btrfs wins it will free the older fs_devices, then the mount thread will lead to oops. Thread1 Thread2 ------- ------- mkfs.btrfs -fq /dev/sdb mount /dev/sdb /btrfs |_btrfs_mount_root() |_btrfs_scan_one_device(... &fs_devices) mkfs.btrfs -fq /dev/sdb |_btrfs_contol_ioctl() |_btrfs_scan_one_device(... &fs_devices) |_:: |_btrfs_free_stale_devices() |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. Signed-off-by: Anand Jain <anand.jain@oracle.com> --- fs/btrfs/super.c | 6 ++++++ fs/btrfs/volumes.c | 10 +++++++++- fs/btrfs/volumes.h | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f0c13defc9eb..b60e7cbe39f5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1565,7 +1565,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_fs_info; } + if (test_and_set_bit(BTRFS_VOLUME_STATE_EXCL_OPS, &fs_devices->volume_state)) { + error = -EBUSY; + goto error_fs_info; + } + error = btrfs_open_devices(fs_devices, mode, fs_type); + clear_bit(BTRFS_VOLUME_STATE_EXCL_OPS, &fs_devices->volume_state); if (error) goto error_fs_info; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 87a4b12f98e3..3137cc990550 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -635,7 +635,7 @@ static void pending_bios_fn(struct btrfs_work *work) * devices. */ static void free_stale_devices(const char *path, - struct btrfs_device *skip_device) + struct btrfs_device *skip_device) { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; @@ -643,9 +643,15 @@ static void free_stale_devices(const char *path, list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { + if (test_and_set_bit(BTRFS_VOLUME_STATE_EXCL_OPS, + &fs_devices->volume_state)) + continue; + mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); + clear_bit(BTRFS_VOLUME_STATE_EXCL_OPS, + &fs_devices->volume_state); continue; } @@ -680,6 +686,8 @@ static void free_stale_devices(const char *path, list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } + clear_bit(BTRFS_VOLUME_STATE_EXCL_OPS, + &fs_devices->volume_state); } } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index cd18916f2bbc..60eea973a501 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -214,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); */ #define BTRFS_VOLUME_STATE_ROTATING (0) #define BTRFS_VOLUME_STATE_SEEDING (1) +#define BTRFS_VOLUME_STATE_EXCL_OPS (2) struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ -- 2.15.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-04 15:00 ` [PATCH 3/3] btrfs: fix race between mkfs and mount Anand Jain @ 2018-06-07 16:39 ` David Sterba 2018-06-19 13:53 ` David Sterba 2018-06-20 14:06 ` David Sterba 1 sibling, 1 reply; 13+ messages in thread From: David Sterba @ 2018-06-07 16:39 UTC (permalink / raw) To: Anand Jain; +Cc: linux-btrfs On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: > In an instrumented testing it is possible that the mount and > a newer mkfs.btrfs thread on the same device can race and if the new > mkfs.btrfs wins it will free the older fs_devices, then the mount thread > will lead to oops. > > Thread1 Thread2 > ------- ------- > mkfs.btrfs -fq /dev/sdb > mount /dev/sdb /btrfs > |_btrfs_mount_root() > |_btrfs_scan_one_device(... &fs_devices) > > mkfs.btrfs -fq /dev/sdb > |_btrfs_contol_ioctl() > |_btrfs_scan_one_device(... &fs_devices) > |_:: > |_btrfs_free_stale_devices() > > |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. > > Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. I'm not sure this is the right way to fix it, adding another bit to the uuid_mutex and device_list_mutex combo. To fix the race between mount and mkfs we can add a bit of logic to the device scanning so that mount calling scan will track the purpose and mkfs scan will not be allowed to free that device. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-07 16:39 ` David Sterba @ 2018-06-19 13:53 ` David Sterba 2018-06-26 6:25 ` Anand Jain 0 siblings, 1 reply; 13+ messages in thread From: David Sterba @ 2018-06-19 13:53 UTC (permalink / raw) To: dsterba, Anand Jain, linux-btrfs On Thu, Jun 07, 2018 at 06:39:32PM +0200, David Sterba wrote: > On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: > > In an instrumented testing it is possible that the mount and > > a newer mkfs.btrfs thread on the same device can race and if the new > > mkfs.btrfs wins it will free the older fs_devices, then the mount thread > > will lead to oops. > > > > Thread1 Thread2 > > ------- ------- > > mkfs.btrfs -fq /dev/sdb > > mount /dev/sdb /btrfs > > |_btrfs_mount_root() > > |_btrfs_scan_one_device(... &fs_devices) > > > > mkfs.btrfs -fq /dev/sdb > > |_btrfs_contol_ioctl() > > |_btrfs_scan_one_device(... &fs_devices) > > |_:: > > |_btrfs_free_stale_devices() > > > > |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. > > > > Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. > > I'm not sure this is the right way to fix it, adding another bit to the > uuid_mutex and device_list_mutex combo. > > To fix the race between mount and mkfs we can add a bit of logic to the > device scanning so that mount calling scan will track the purpose and > mkfs scan will not be allowed to free that device. Last version of the proposed fix is to extend the uuid_mutex over the whole mount callback and use it around calls to btrfs_scan_one_device. That way we'll be sure the mount will always get to the device it scanned and that will not be freed by the parallel scan. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-19 13:53 ` David Sterba @ 2018-06-26 6:25 ` Anand Jain 2018-06-26 12:19 ` David Sterba 0 siblings, 1 reply; 13+ messages in thread From: Anand Jain @ 2018-06-26 6:25 UTC (permalink / raw) To: dsterba, linux-btrfs (sorry for the delay in reply due to my vacation and, other urgent things took my priority too). Please see inline below. On 06/19/2018 09:53 PM, David Sterba wrote: > On Thu, Jun 07, 2018 at 06:39:32PM +0200, David Sterba wrote: >> On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: >>> In an instrumented testing it is possible that the mount and >>> a newer mkfs.btrfs thread on the same device can race and if the new >>> mkfs.btrfs wins it will free the older fs_devices, then the mount thread >>> will lead to oops. >>> >>> Thread1 Thread2 >>> ------- ------- >>> mkfs.btrfs -fq /dev/sdb >>> mount /dev/sdb /btrfs >>> |_btrfs_mount_root() >>> |_btrfs_scan_one_device(... &fs_devices) >>> >>> mkfs.btrfs -fq /dev/sdb >>> |_btrfs_contol_ioctl() >>> |_btrfs_scan_one_device(... &fs_devices) >>> |_:: >>> |_btrfs_free_stale_devices() >>> >>> |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. >>> >>> Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. >> >> I'm not sure this is the right way to fix it, adding another bit to the >> uuid_mutex and device_list_mutex combo. Hmm I wonder why? >> To fix the race between mount and mkfs we can add a bit of logic to the >> device scanning so that mount calling scan will track the purpose and >> mkfs scan will not be allowed to free that device. Right. To implement such a logic requisites are.. #1 The lock must be FSID local so that concurrent mount and or scan of two independent FSID+DEV is possible. #2 It should not return EBUSY when either of scan or mount is in progress but smart enough to complete the (re)scan and or mount in parallel #1 is must, but #2 is good to have and if EBUSY is returned its not wrong as well. > Last version of the proposed fix is to extend the uuid_mutex over the > whole mount callback and use it around calls to btrfs_scan_one_device. > That way we'll be sure the mount will always get to the device it > scanned and that will not be freed by the parallel scan. That will break the requisite #1 as above. Thanks, Anand ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-26 6:25 ` Anand Jain @ 2018-06-26 12:19 ` David Sterba 2018-06-26 14:42 ` Anand Jain 0 siblings, 1 reply; 13+ messages in thread From: David Sterba @ 2018-06-26 12:19 UTC (permalink / raw) To: Anand Jain; +Cc: dsterba, linux-btrfs On Tue, Jun 26, 2018 at 02:25:11PM +0800, Anand Jain wrote: > > > (sorry for the delay in reply due to my vacation and, other > urgent things took my priority too). > > Please see inline below. > > On 06/19/2018 09:53 PM, David Sterba wrote: > > On Thu, Jun 07, 2018 at 06:39:32PM +0200, David Sterba wrote: > >> On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: > >>> In an instrumented testing it is possible that the mount and > >>> a newer mkfs.btrfs thread on the same device can race and if the new > >>> mkfs.btrfs wins it will free the older fs_devices, then the mount thread > >>> will lead to oops. > >>> > >>> Thread1 Thread2 > >>> ------- ------- > >>> mkfs.btrfs -fq /dev/sdb > >>> mount /dev/sdb /btrfs > >>> |_btrfs_mount_root() > >>> |_btrfs_scan_one_device(... &fs_devices) > >>> > >>> mkfs.btrfs -fq /dev/sdb > >>> |_btrfs_contol_ioctl() > >>> |_btrfs_scan_one_device(... &fs_devices) > >>> |_:: > >>> |_btrfs_free_stale_devices() > >>> > >>> |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. > >>> > >>> Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. > >> > >> I'm not sure this is the right way to fix it, adding another bit to the > >> uuid_mutex and device_list_mutex combo. > > Hmm I wonder why? > > >> To fix the race between mount and mkfs we can add a bit of logic to the > >> device scanning so that mount calling scan will track the purpose and > >> mkfs scan will not be allowed to free that device. > > Right. To implement such a logic requisites are.. > #1 The lock must be FSID local so that concurrent mount and or scan > of two independent FSID+DEV is possible. > #2 It should not return EBUSY when either of scan or mount is in > progress but smart enough to complete the (re)scan and or mount > in parallel > > #1 is must, but #2 is good to have and if EBUSY is returned its not > wrong as well. > > > > Last version of the proposed fix is to extend the uuid_mutex over the > > whole mount callback and use it around calls to btrfs_scan_one_device. > > That way we'll be sure the mount will always get to the device it > > scanned and that will not be freed by the parallel scan. > > That will break the requisite #1 as above. I don't see how it breaks 'mount and or scan of two independent fsid+dev is possible'. It is possible, but the lock does not need to be filesystem local. Concurrent mount or scan will block on the uuid_mutex, so all callers will proceed once the lock is released and there's no unexpected behaviour. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-26 12:19 ` David Sterba @ 2018-06-26 14:42 ` Anand Jain 2018-06-29 12:06 ` David Sterba 0 siblings, 1 reply; 13+ messages in thread From: Anand Jain @ 2018-06-26 14:42 UTC (permalink / raw) To: dsterba, linux-btrfs On 06/26/2018 08:19 PM, David Sterba wrote: > On Tue, Jun 26, 2018 at 02:25:11PM +0800, Anand Jain wrote: >> >> >> (sorry for the delay in reply due to my vacation and, other >> urgent things took my priority too). >> >> Please see inline below. >> >> On 06/19/2018 09:53 PM, David Sterba wrote: >>> On Thu, Jun 07, 2018 at 06:39:32PM +0200, David Sterba wrote: >>>> On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: >>>>> In an instrumented testing it is possible that the mount and >>>>> a newer mkfs.btrfs thread on the same device can race and if the new >>>>> mkfs.btrfs wins it will free the older fs_devices, then the mount thread >>>>> will lead to oops. >>>>> >>>>> Thread1 Thread2 >>>>> ------- ------- >>>>> mkfs.btrfs -fq /dev/sdb >>>>> mount /dev/sdb /btrfs >>>>> |_btrfs_mount_root() >>>>> |_btrfs_scan_one_device(... &fs_devices) >>>>> >>>>> mkfs.btrfs -fq /dev/sdb >>>>> |_btrfs_contol_ioctl() >>>>> |_btrfs_scan_one_device(... &fs_devices) >>>>> |_:: >>>>> |_btrfs_free_stale_devices() >>>>> >>>>> |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. >>>>> >>>>> Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. >>>> >>>> I'm not sure this is the right way to fix it, adding another bit to the >>>> uuid_mutex and device_list_mutex combo. >> >> Hmm I wonder why? >> >>>> To fix the race between mount and mkfs we can add a bit of logic to the >>>> device scanning so that mount calling scan will track the purpose and >>>> mkfs scan will not be allowed to free that device. >> >> Right. To implement such a logic requisites are.. >> #1 The lock must be FSID local so that concurrent mount and or scan >> of two independent FSID+DEV is possible. >> #2 It should not return EBUSY when either of scan or mount is in >> progress but smart enough to complete the (re)scan and or mount >> in parallel >> >> #1 is must, but #2 is good to have and if EBUSY is returned its not >> wrong as well. >> >> >>> Last version of the proposed fix is to extend the uuid_mutex over the >>> whole mount callback and use it around calls to btrfs_scan_one_device. >>> That way we'll be sure the mount will always get to the device it >>> scanned and that will not be freed by the parallel scan. >> >> That will break the requisite #1 as above. > > I don't see how it breaks 'mount and or scan of two independent fsid+dev > is possible'. It is possible, but the lock does not need to be > filesystem local. > > Concurrent mount or scan will block on the uuid_mutex, As uuid_mutex is global, if fsid1 is being mounted, the fsid2 mount will wait upto certain extent with this code. My efforts here was to use uuid_mutex only to protect the fs_uuid update part, in this way there is concurrency in the mount process of fsid1 and fsid2 and provides shorter bootup time when the user uses the mount at boot option. Thanks, Anand > so all callers > will proceed once the lock is released and there's no unexpected > behaviour. > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-26 14:42 ` Anand Jain @ 2018-06-29 12:06 ` David Sterba 2018-06-30 2:16 ` Anand Jain 0 siblings, 1 reply; 13+ messages in thread From: David Sterba @ 2018-06-29 12:06 UTC (permalink / raw) To: Anand Jain; +Cc: dsterba, linux-btrfs On Tue, Jun 26, 2018 at 10:42:32PM +0800, Anand Jain wrote: > >>> Last version of the proposed fix is to extend the uuid_mutex over the > >>> whole mount callback and use it around calls to btrfs_scan_one_device. > >>> That way we'll be sure the mount will always get to the device it > >>> scanned and that will not be freed by the parallel scan. > >> > >> That will break the requisite #1 as above. > > > > I don't see how it breaks 'mount and or scan of two independent fsid+dev > > is possible'. It is possible, but the lock does not need to be > > filesystem local. > > > > Concurrent mount or scan will block on the uuid_mutex, > > As uuid_mutex is global, if fsid1 is being mounted, the fsid2 mount > will wait upto certain extent with this code. Yes it will wait a bit, but the critical section is short. There's some IO done and it's reading of 4K in btrfs_read_disk_super. The rest is cpu-bound and hardly measurable in practice, in context of concurrent mount and scanning. I took the approach to fix the bug first and then make it faster or cleaner, also to fix it in a way that's still acceptable for current development cycle. > My efforts here was to > use uuid_mutex only to protect the fs_uuid update part, in this way > there is concurrency in the mount process of fsid1 and fsid2 and > provides shorter bootup time when the user uses the mount at boot > option. The locking can be still improved but the uuid_mutex is not a contended lock, mount is an operation that does not happen thousand times a second, same for the scanning. So even if there are several mounts happening during boot, it's just a few and the delay is IMO unnoticeable. If the boot time is longer by say 100ms at worst, I'm still ok with my patches as a fix. But not as a final fix, so the updates to locking you mentioned are still possible. We now have a point of reference where syzbot does is not able to reproduce the bugs. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-29 12:06 ` David Sterba @ 2018-06-30 2:16 ` Anand Jain 0 siblings, 0 replies; 13+ messages in thread From: Anand Jain @ 2018-06-30 2:16 UTC (permalink / raw) To: dsterba, linux-btrfs On 06/29/2018 08:06 PM, David Sterba wrote: > On Tue, Jun 26, 2018 at 10:42:32PM +0800, Anand Jain wrote: >>>>> Last version of the proposed fix is to extend the uuid_mutex over the >>>>> whole mount callback and use it around calls to btrfs_scan_one_device. >>>>> That way we'll be sure the mount will always get to the device it >>>>> scanned and that will not be freed by the parallel scan. >>>> >>>> That will break the requisite #1 as above. >>> >>> I don't see how it breaks 'mount and or scan of two independent fsid+dev >>> is possible'. It is possible, but the lock does not need to be >>> filesystem local. >>> >>> Concurrent mount or scan will block on the uuid_mutex, >> >> As uuid_mutex is global, if fsid1 is being mounted, the fsid2 mount >> will wait upto certain extent with this code. > > Yes it will wait a bit, but the critical section is short. There's some > IO done and it's reading of 4K in btrfs_read_disk_super. The rest is > cpu-bound and hardly measurable in practice, in context of concurrent > mount and scanning. > > I took the approach to fix the bug first and then make it faster or > cleaner, also to fix it in a way that's still acceptable for current > development cycle. >> My efforts here was to >> use uuid_mutex only to protect the fs_uuid update part, in this way >> there is concurrency in the mount process of fsid1 and fsid2 and >> provides shorter bootup time when the user uses the mount at boot >> option. > > The locking can be still improved but the uuid_mutex is not a contended > lock, mount is an operation that does not happen thousand times a > second, same for the scanning. My concern is about the boot up time when there are larger number of LUN configured with independent btrfs FSIDs to mount at boot. Since BTRFS is a kind of infrastructure for the servers, we can't rule out that these kind of configuration won't exists at all. Anyway as you said we can use uuid_mutex for the current development cycle. However a review on [1] which does fix your earlier concern of returning -EBUSY is appreciated. And pls let me know if going ahead with this approach would be accepted in the current development cycle.? > So even if there are several mounts happening during boot, it's just a > few and the delay is IMO unnoticeable. If the boot time is longer by > say 100ms at worst, I'm still ok with my patches as a fix. > > But not as a final fix, so the updates to locking you mentioned are > still possible. We now have a point of reference where syzbot does is > not able to reproduce the bugs. [1] ---------------------------------- When %fs_devices::opened > 0 the device is mounted, %fs_devices is never freed so its safe to use %fs_devices and it does not need any locks or flags. However, when %fs_devices::opened == 0 (idle) that means device is not yet mounted, and it can be either transition to opened or freed. When it transitions to freed, fs_devices gets freed and any pointer accessing will endup with UAF error. Here are places where we access fs_devcies and it needs locking and using uuid_mutex is one of method 1. READY ioctl 2. Mount 3. SCAN ioctl 4. Stale cleanup during SCAN 5. planned device FORGET ioctl #4 and #5 may free the fs_devices while #1, #2, and #3 are still accessing the fs_devices. using uuid_mutex is one choice however it would slow down the mount at boot when there are larger number of independent btrfs fsid being mounted at boot. Proposed Fix ------------- This does not return the -EBUSY. If there is any better way I am ok to use it. struct btrfs_fs_devices { :: int ref_count; :: } To acquire a fs_devices.. volume_devices_excl_state_enter(x) { fs_devices = NULL lock uuid_mutex if (fs_devices = find fs_devices(x)) fs_devices::ref_count++ unlock uuid_mutex return fs_devices } To release a fs_devices.. volume_devices_excl_state_exit(fs_devices) { lock uuid_mutex fs_devices::ref_count-- unlock uuid_mutex } To delete a fs_devices.. again: lock uuid_mutex find fs_devices if (fs_devices::ref_count != 0) { unlock uuid_mutex goto agin; } if (fs_devices->opened > 0) { unlock uuid_mutex return -EBUSY } free_fs_devices() unlock uuid_mutex ------------------------------------------ Thanks, Anand > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-04 15:00 ` [PATCH 3/3] btrfs: fix race between mkfs and mount Anand Jain 2018-06-07 16:39 ` David Sterba @ 2018-06-20 14:06 ` David Sterba 2018-06-26 6:38 ` Anand Jain 1 sibling, 1 reply; 13+ messages in thread From: David Sterba @ 2018-06-20 14:06 UTC (permalink / raw) To: Anand Jain; +Cc: linux-btrfs On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: > In an instrumented testing it is possible that the mount and > a newer mkfs.btrfs thread on the same device can race and if the new > mkfs.btrfs wins it will free the older fs_devices, then the mount thread > will lead to oops. > > Thread1 Thread2 > ------- ------- > mkfs.btrfs -fq /dev/sdb > mount /dev/sdb /btrfs > |_btrfs_mount_root() > |_btrfs_scan_one_device(... &fs_devices) > > mkfs.btrfs -fq /dev/sdb > |_btrfs_contol_ioctl() > |_btrfs_scan_one_device(... &fs_devices) > |_:: > |_btrfs_free_stale_devices() > > |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. > > Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. > > Signed-off-by: Anand Jain <anand.jain@oracle.com> > --- > fs/btrfs/super.c | 6 ++++++ > fs/btrfs/volumes.c | 10 +++++++++- > fs/btrfs/volumes.h | 1 + > 3 files changed, 16 insertions(+), 1 deletion(-) > > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c > index f0c13defc9eb..b60e7cbe39f5 100644 > --- a/fs/btrfs/super.c > +++ b/fs/btrfs/super.c > @@ -1565,7 +1565,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, > goto error_fs_info; > } > > + if (test_and_set_bit(BTRFS_VOLUME_STATE_EXCL_OPS, &fs_devices->volume_state)) { > + error = -EBUSY; We'd need to wait until the bit is not set instead of BUSY, as the parallel scan is not really a reason to fail the whole mount. I'll post the patch series to address this problem today, it utilizes the uuid_mutex in a similar way you try to do with the new bit, but it will not lead to EBUSY. ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 3/3] btrfs: fix race between mkfs and mount 2018-06-20 14:06 ` David Sterba @ 2018-06-26 6:38 ` Anand Jain 0 siblings, 0 replies; 13+ messages in thread From: Anand Jain @ 2018-06-26 6:38 UTC (permalink / raw) To: dsterba, linux-btrfs On 06/20/2018 10:06 PM, David Sterba wrote: > On Mon, Jun 04, 2018 at 11:00:30PM +0800, Anand Jain wrote: >> In an instrumented testing it is possible that the mount and >> a newer mkfs.btrfs thread on the same device can race and if the new >> mkfs.btrfs wins it will free the older fs_devices, then the mount thread >> will lead to oops. >> >> Thread1 Thread2 >> ------- ------- >> mkfs.btrfs -fq /dev/sdb >> mount /dev/sdb /btrfs >> |_btrfs_mount_root() >> |_btrfs_scan_one_device(... &fs_devices) >> >> mkfs.btrfs -fq /dev/sdb >> |_btrfs_contol_ioctl() >> |_btrfs_scan_one_device(... &fs_devices) >> |_:: >> |_btrfs_free_stale_devices() >> >> |_btrfs_open_devices(fs_devices ..) <-- stale fs_devices. >> >> Fix this with a mutually exclusive flag BTRFS_VOL_FLAG_EXCL_OPS. >> >> Signed-off-by: Anand Jain <anand.jain@oracle.com> >> --- >> fs/btrfs/super.c | 6 ++++++ >> fs/btrfs/volumes.c | 10 +++++++++- >> fs/btrfs/volumes.h | 1 + >> 3 files changed, 16 insertions(+), 1 deletion(-) >> >> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c >> index f0c13defc9eb..b60e7cbe39f5 100644 >> --- a/fs/btrfs/super.c >> +++ b/fs/btrfs/super.c >> @@ -1565,7 +1565,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, >> goto error_fs_info; >> } >> >> + if (test_and_set_bit(BTRFS_VOLUME_STATE_EXCL_OPS, &fs_devices->volume_state)) { >> + error = -EBUSY; > > We'd need to wait until the bit is not set instead of BUSY, as the > parallel scan is not really a reason to fail the whole mount. > I'll post the patch series to address this problem today, it utilizes > the uuid_mutex in a similar way you try to do with the new bit, but it > will not lead to EBUSY. Ok. Shall review. Thanks, Anand > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH 1/3] btrfs: convert volume rotating flag into bitmap 2018-06-04 15:00 [PATCH 1/3] btrfs: convert volume rotating flag into bitmap Anand Jain 2018-06-04 15:00 ` [PATCH 2/3] btrfs: convert volume seeding " Anand Jain 2018-06-04 15:00 ` [PATCH 3/3] btrfs: fix race between mkfs and mount Anand Jain @ 2018-06-20 14:01 ` David Sterba 2 siblings, 0 replies; 13+ messages in thread From: David Sterba @ 2018-06-20 14:01 UTC (permalink / raw) To: Anand Jain; +Cc: linux-btrfs On Mon, Jun 04, 2018 at 11:00:28PM +0800, Anand Jain wrote: > Add bitmap btrfs_fs_devices::volume_state to maintain the volume states and > flags. This patch in perticular converts btrfs_fs_devices::rotating into > flag BTRFS_VOLUME_STATE_ROTATING. I'm not sure we need this. There are 2 flags, we don't need the atomicity of test/set _bit, the values don't change too often so protecting by the mutex should be suffictient. The size of btrfs_device is also not a big concern, it will be stored in the 512 byte kmalloc bin and it's size is close to the size so replacing 2 ints with one long will not gain anything. The 3rd flag would cause other problems and is not the right solution to the scan/mount problem. ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2018-06-30 2:13 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2018-06-04 15:00 [PATCH 1/3] btrfs: convert volume rotating flag into bitmap Anand Jain 2018-06-04 15:00 ` [PATCH 2/3] btrfs: convert volume seeding " Anand Jain 2018-06-04 15:00 ` [PATCH 3/3] btrfs: fix race between mkfs and mount Anand Jain 2018-06-07 16:39 ` David Sterba 2018-06-19 13:53 ` David Sterba 2018-06-26 6:25 ` Anand Jain 2018-06-26 12:19 ` David Sterba 2018-06-26 14:42 ` Anand Jain 2018-06-29 12:06 ` David Sterba 2018-06-30 2:16 ` Anand Jain 2018-06-20 14:06 ` David Sterba 2018-06-26 6:38 ` Anand Jain 2018-06-20 14:01 ` [PATCH 1/3] btrfs: convert volume rotating flag into bitmap David Sterba
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).