* [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction
@ 2022-03-08 5:36 Qu Wenruo
2022-03-09 2:12 ` Anand Jain
2022-03-14 20:03 ` David Sterba
0 siblings, 2 replies; 5+ messages in thread
From: Qu Wenruo @ 2022-03-08 5:36 UTC (permalink / raw)
To: linux-btrfs; +Cc: Luca Béla Palkovics
[BUG]
There is a report that a btrfs has a bad super block num devices.
This makes btrfs to reject the fs completely.
BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here
BTRFS error (device sdd3): failed to read chunk tree: -22
BTRFS error (device sdd3): open_ctree failed
[CAUSE]
During btrfs device removal, chunk tree and super block num devs are
updated in two different transactions:
btrfs_rm_device()
|- btrfs_rm_dev_item(device)
| |- trans = btrfs_start_transaction()
| | Now we got transaction X
| |
| |- btrfs_del_item()
| | Now device item is removed from chunk tree
| |
| |- btrfs_commit_transaction()
| Transaction X got committed, super num devs untouched,
| but device item removed from chunk tree.
| (AKA, super num devs is already incorrect)
|
|- cur_devices->num_devices--;
|- cur_devices->total_devices--;
|- btrfs_set_super_num_devices()
All those operations are not in transaction X, thus it will
only be written back to disk in next transaction.
So after the transaction X in btrfs_rm_dev_item() committed, but before
transaction X+1 (which can be minutes away), a power loss happen, then
we got the super num mismatch.
[FIX]
Instead of starting and committing a transaction inside
btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and
pass it to btrfs_rm_dev_item().
And only commit the transaction after everything is done.
Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/
Signed-off-by: Qu Wenruo <wqu@suse.com>
---
fs/btrfs/volumes.c | 65 ++++++++++++++++++++--------------------------
1 file changed, 28 insertions(+), 37 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 57a754b33f10..6115c302f4ae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path)
path_put(&path);
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
+ return ret;
device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) {
@@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = -ENOENT;
- goto out;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
free_fs_devices(cur_devices);
}
-out:
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
@@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
--
2.35.1
^ permalink raw reply related [flat|nested] 5+ messages in thread* Re: [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction 2022-03-08 5:36 [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction Qu Wenruo @ 2022-03-09 2:12 ` Anand Jain 2022-03-09 2:58 ` Qu Wenruo 2022-03-14 20:03 ` David Sterba 1 sibling, 1 reply; 5+ messages in thread From: Anand Jain @ 2022-03-09 2:12 UTC (permalink / raw) To: Qu Wenruo, linux-btrfs; +Cc: Luca Béla Palkovics On 08/03/2022 13:36, Qu Wenruo wrote: > [BUG] > There is a report that a btrfs has a bad super block num devices. > > This makes btrfs to reject the fs completely. > > BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here > BTRFS error (device sdd3): failed to read chunk tree: -22 > BTRFS error (device sdd3): open_ctree failed > > [CAUSE] > During btrfs device removal, chunk tree and super block num devs are > updated in two different transactions: > > btrfs_rm_device() > |- btrfs_rm_dev_item(device) > | |- trans = btrfs_start_transaction() > | | Now we got transaction X > | | > | |- btrfs_del_item() > | | Now device item is removed from chunk tree > | | > | |- btrfs_commit_transaction() > | Transaction X got committed, super num devs untouched, > | but device item removed from chunk tree. > | (AKA, super num devs is already incorrect) > | > |- cur_devices->num_devices--; > |- cur_devices->total_devices--; > |- btrfs_set_super_num_devices() > All those operations are not in transaction X, thus it will > only be written back to disk in next transaction. > > So after the transaction X in btrfs_rm_dev_item() committed, but before > transaction X+1 (which can be minutes away), a power loss happen, then > we got the super num mismatch. > > [FIX] > Instead of starting and committing a transaction inside > btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and > pass it to btrfs_rm_dev_item(). > > And only commit the transaction after everything is done. > > Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com> > Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/ > Signed-off-by: Qu Wenruo <wqu@suse.com> > --- > fs/btrfs/volumes.c | 65 ++++++++++++++++++++-------------------------- > 1 file changed, 28 insertions(+), 37 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 57a754b33f10..6115c302f4ae 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) > path_put(&path); > } > > -static int btrfs_rm_dev_item(struct btrfs_device *device) > +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, > + struct btrfs_device *device) > { > struct btrfs_root *root = device->fs_info->chunk_root; > int ret; > struct btrfs_path *path; > struct btrfs_key key; > - struct btrfs_trans_handle *trans; > > path = btrfs_alloc_path(); > if (!path) > return -ENOMEM; > > - trans = btrfs_start_transaction(root, 0); > - if (IS_ERR(trans)) { > - btrfs_free_path(path); > - return PTR_ERR(trans); > - } > key.objectid = BTRFS_DEV_ITEMS_OBJECTID; > key.type = BTRFS_DEV_ITEM_KEY; > key.offset = device->devid; > @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) > if (ret) { > if (ret > 0) > ret = -ENOENT; > - btrfs_abort_transaction(trans, ret); > - btrfs_end_transaction(trans); > goto out; > } > > ret = btrfs_del_item(trans, root, path); > - if (ret) { > - btrfs_abort_transaction(trans, ret); > - btrfs_end_transaction(trans); > - } > - > out: > btrfs_free_path(path); > - if (!ret) > - ret = btrfs_commit_transaction(trans); > return ret; > } > > @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > struct btrfs_dev_lookup_args *args, > struct block_device **bdev, fmode_t *mode) > { > + struct btrfs_trans_handle *trans; > struct btrfs_device *device; > struct btrfs_fs_devices *cur_devices; > struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; > @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > > ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); > if (ret) > - goto out; > + return ret; > > device = btrfs_find_device(fs_info->fs_devices, args); > if (!device) { > @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; > else > ret = -ENOENT; > - goto out; > + return ret; > } > > if (btrfs_pinned_by_swapfile(fs_info, device)) { > btrfs_warn_in_rcu(fs_info, > "cannot remove device %s (devid %llu) due to active swapfile", > rcu_str_deref(device->name), device->devid); > - ret = -ETXTBSY; > - goto out; > + return -ETXTBSY; > } > > - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { > - ret = BTRFS_ERROR_DEV_TGT_REPLACE; > - goto out; > - } > + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) > + return BTRFS_ERROR_DEV_TGT_REPLACE; > > if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && > - fs_info->fs_devices->rw_devices == 1) { > - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; > - goto out; > - } > + fs_info->fs_devices->rw_devices == 1) > + return BTRFS_ERROR_DEV_ONLY_WRITABLE; > > if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { > mutex_lock(&fs_info->chunk_mutex); > @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > if (ret) > goto error_undo; > > - /* > - * TODO: the superblock still includes this device in its num_devices > - * counter although write_all_supers() is not locked out. This > - * could give a filesystem state which requires a degraded mount. > - */ > - ret = btrfs_rm_dev_item(device); > - if (ret) > + trans = btrfs_start_transaction(fs_info->chunk_root, 0); > + if (IS_ERR(trans)) { > + ret = PTR_ERR(trans); > goto error_undo; > + } > + > + ret = btrfs_rm_dev_item(trans, device); > + if (ret) { > + /* Any error in dev item removal is critical */ > + btrfs_crit(fs_info, > + "failed to remove device item for devid %llu: %d", > + device->devid, ret); > + btrfs_abort_transaction(trans, ret); > + btrfs_end_transaction(trans); > + return ret; Missed error_undo part of the undo here. Thanks, Anand > + } > > clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); > btrfs_scrub_cancel_dev(device); > @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > free_fs_devices(cur_devices); > } > > -out: > + ret = btrfs_commit_transaction(trans); > + > return ret; > > error_undo: > @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, > device->fs_devices->rw_devices++; > mutex_unlock(&fs_info->chunk_mutex); > } > - goto out; > + return ret; > } > > void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction 2022-03-09 2:12 ` Anand Jain @ 2022-03-09 2:58 ` Qu Wenruo 2022-03-09 7:31 ` Anand Jain 0 siblings, 1 reply; 5+ messages in thread From: Qu Wenruo @ 2022-03-09 2:58 UTC (permalink / raw) To: Anand Jain, Qu Wenruo, linux-btrfs; +Cc: Luca Béla Palkovics On 2022/3/9 10:12, Anand Jain wrote: > On 08/03/2022 13:36, Qu Wenruo wrote: >> [BUG] >> There is a report that a btrfs has a bad super block num devices. >> >> This makes btrfs to reject the fs completely. >> >> BTRFS error (device sdd3): super_num_devices 3 mismatch with >> num_devices 2 found here >> BTRFS error (device sdd3): failed to read chunk tree: -22 >> BTRFS error (device sdd3): open_ctree failed >> >> [CAUSE] >> During btrfs device removal, chunk tree and super block num devs are >> updated in two different transactions: >> >> btrfs_rm_device() >> |- btrfs_rm_dev_item(device) >> | |- trans = btrfs_start_transaction() >> | | Now we got transaction X >> | | >> | |- btrfs_del_item() >> | | Now device item is removed from chunk tree >> | | >> | |- btrfs_commit_transaction() >> | Transaction X got committed, super num devs untouched, >> | but device item removed from chunk tree. >> | (AKA, super num devs is already incorrect) >> | >> |- cur_devices->num_devices--; >> |- cur_devices->total_devices--; >> |- btrfs_set_super_num_devices() >> All those operations are not in transaction X, thus it will >> only be written back to disk in next transaction. >> >> So after the transaction X in btrfs_rm_dev_item() committed, but before >> transaction X+1 (which can be minutes away), a power loss happen, then >> we got the super num mismatch. >> >> [FIX] >> Instead of starting and committing a transaction inside >> btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and >> pass it to btrfs_rm_dev_item(). >> >> And only commit the transaction after everything is done. >> > Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com> >> Link: >> https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/ >> >> Signed-off-by: Qu Wenruo <wqu@suse.com> >> --- >> fs/btrfs/volumes.c | 65 ++++++++++++++++++++-------------------------- >> 1 file changed, 28 insertions(+), 37 deletions(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index 57a754b33f10..6115c302f4ae 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -1896,23 +1896,18 @@ static void update_dev_time(const char >> *device_path) >> path_put(&path); >> } >> -static int btrfs_rm_dev_item(struct btrfs_device *device) >> +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, >> + struct btrfs_device *device) >> { >> struct btrfs_root *root = device->fs_info->chunk_root; >> int ret; >> struct btrfs_path *path; >> struct btrfs_key key; >> - struct btrfs_trans_handle *trans; >> path = btrfs_alloc_path(); >> if (!path) >> return -ENOMEM; >> - trans = btrfs_start_transaction(root, 0); >> - if (IS_ERR(trans)) { >> - btrfs_free_path(path); >> - return PTR_ERR(trans); >> - } >> key.objectid = BTRFS_DEV_ITEMS_OBJECTID; >> key.type = BTRFS_DEV_ITEM_KEY; >> key.offset = device->devid; >> @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct >> btrfs_device *device) >> if (ret) { >> if (ret > 0) >> ret = -ENOENT; >> - btrfs_abort_transaction(trans, ret); >> - btrfs_end_transaction(trans); >> goto out; >> } >> ret = btrfs_del_item(trans, root, path); >> - if (ret) { >> - btrfs_abort_transaction(trans, ret); >> - btrfs_end_transaction(trans); >> - } >> - >> out: >> btrfs_free_path(path); >> - if (!ret) >> - ret = btrfs_commit_transaction(trans); >> return ret; >> } >> @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >> struct btrfs_dev_lookup_args *args, >> struct block_device **bdev, fmode_t *mode) >> { >> + struct btrfs_trans_handle *trans; >> struct btrfs_device *device; >> struct btrfs_fs_devices *cur_devices; >> struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; >> @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >> ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); >> if (ret) >> - goto out; >> + return ret; >> device = btrfs_find_device(fs_info->fs_devices, args); >> if (!device) { >> @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info >> *fs_info, >> ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; >> else >> ret = -ENOENT; >> - goto out; >> + return ret; >> } >> if (btrfs_pinned_by_swapfile(fs_info, device)) { >> btrfs_warn_in_rcu(fs_info, >> "cannot remove device %s (devid %llu) due to active >> swapfile", >> rcu_str_deref(device->name), device->devid); >> - ret = -ETXTBSY; >> - goto out; >> + return -ETXTBSY; >> } >> - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { >> - ret = BTRFS_ERROR_DEV_TGT_REPLACE; >> - goto out; >> - } >> + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) >> + return BTRFS_ERROR_DEV_TGT_REPLACE; >> if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && >> - fs_info->fs_devices->rw_devices == 1) { >> - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; >> - goto out; >> - } >> + fs_info->fs_devices->rw_devices == 1) >> + return BTRFS_ERROR_DEV_ONLY_WRITABLE; >> if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { >> mutex_lock(&fs_info->chunk_mutex); >> @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info >> *fs_info, >> if (ret) >> goto error_undo; >> - /* >> - * TODO: the superblock still includes this device in its >> num_devices >> - * counter although write_all_supers() is not locked out. This >> - * could give a filesystem state which requires a degraded mount. >> - */ >> - ret = btrfs_rm_dev_item(device); >> - if (ret) >> + trans = btrfs_start_transaction(fs_info->chunk_root, 0); >> + if (IS_ERR(trans)) { >> + ret = PTR_ERR(trans); >> goto error_undo; >> + } >> + >> + ret = btrfs_rm_dev_item(trans, device); >> + if (ret) { >> + /* Any error in dev item removal is critical */ >> + btrfs_crit(fs_info, >> + "failed to remove device item for devid %llu: %d", >> + device->devid, ret); >> + btrfs_abort_transaction(trans, ret); >> + btrfs_end_transaction(trans); >> + return ret; > > Missed error_undo part of the undo here. Nope, that's exactly expected. We abort transaction, thus nothing committed, no need to undo. In fact, after the btrfs_rm_dev_item() call, there is no real way to rollback the delete. Thanks, Qu > > Thanks, Anand > >> + } >> clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); >> btrfs_scrub_cancel_dev(device); >> @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >> free_fs_devices(cur_devices); >> } >> -out: >> + ret = btrfs_commit_transaction(trans); >> + >> return ret; >> error_undo: >> @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >> device->fs_devices->rw_devices++; >> mutex_unlock(&fs_info->chunk_mutex); >> } >> - goto out; >> + return ret; >> } >> void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) > ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction 2022-03-09 2:58 ` Qu Wenruo @ 2022-03-09 7:31 ` Anand Jain 0 siblings, 0 replies; 5+ messages in thread From: Anand Jain @ 2022-03-09 7:31 UTC (permalink / raw) To: Qu Wenruo, Qu Wenruo, linux-btrfs; +Cc: Luca Béla Palkovics On 09/03/2022 10:58, Qu Wenruo wrote: > > > On 2022/3/9 10:12, Anand Jain wrote: >> On 08/03/2022 13:36, Qu Wenruo wrote: >>> [BUG] >>> There is a report that a btrfs has a bad super block num devices. >>> >>> This makes btrfs to reject the fs completely. >>> >>> BTRFS error (device sdd3): super_num_devices 3 mismatch with >>> num_devices 2 found here >>> BTRFS error (device sdd3): failed to read chunk tree: -22 >>> BTRFS error (device sdd3): open_ctree failed >>> >>> [CAUSE] >>> During btrfs device removal, chunk tree and super block num devs are >>> updated in two different transactions: >>> >>> btrfs_rm_device() >>> |- btrfs_rm_dev_item(device) >>> | |- trans = btrfs_start_transaction() >>> | | Now we got transaction X >>> | | >>> | |- btrfs_del_item() >>> | | Now device item is removed from chunk tree >>> | | >>> | |- btrfs_commit_transaction() >>> | Transaction X got committed, super num devs untouched, >>> | but device item removed from chunk tree. >>> | (AKA, super num devs is already incorrect) >>> | >>> |- cur_devices->num_devices--; >>> |- cur_devices->total_devices--; >>> |- btrfs_set_super_num_devices() >>> All those operations are not in transaction X, thus it will >>> only be written back to disk in next transaction. >>> >>> So after the transaction X in btrfs_rm_dev_item() committed, but before >>> transaction X+1 (which can be minutes away), a power loss happen, then >>> we got the super num mismatch. >>> >>> [FIX] >>> Instead of starting and committing a transaction inside >>> btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and >>> pass it to btrfs_rm_dev_item(). >>> >>> And only commit the transaction after everything is done. >>> > Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com> >>> Link: >>> https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/ >>> >>> Signed-off-by: Qu Wenruo <wqu@suse.com> >>> --- >>> fs/btrfs/volumes.c | 65 ++++++++++++++++++++-------------------------- >>> 1 file changed, 28 insertions(+), 37 deletions(-) >>> >>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >>> index 57a754b33f10..6115c302f4ae 100644 >>> --- a/fs/btrfs/volumes.c >>> +++ b/fs/btrfs/volumes.c >>> @@ -1896,23 +1896,18 @@ static void update_dev_time(const char >>> *device_path) >>> path_put(&path); >>> } >>> -static int btrfs_rm_dev_item(struct btrfs_device *device) >>> +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, >>> + struct btrfs_device *device) >>> { >>> struct btrfs_root *root = device->fs_info->chunk_root; >>> int ret; >>> struct btrfs_path *path; >>> struct btrfs_key key; >>> - struct btrfs_trans_handle *trans; >>> path = btrfs_alloc_path(); >>> if (!path) >>> return -ENOMEM; >>> - trans = btrfs_start_transaction(root, 0); >>> - if (IS_ERR(trans)) { >>> - btrfs_free_path(path); >>> - return PTR_ERR(trans); >>> - } >>> key.objectid = BTRFS_DEV_ITEMS_OBJECTID; >>> key.type = BTRFS_DEV_ITEM_KEY; >>> key.offset = device->devid; >>> @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct >>> btrfs_device *device) >>> if (ret) { >>> if (ret > 0) >>> ret = -ENOENT; >>> - btrfs_abort_transaction(trans, ret); >>> - btrfs_end_transaction(trans); >>> goto out; >>> } >>> ret = btrfs_del_item(trans, root, path); >>> - if (ret) { >>> - btrfs_abort_transaction(trans, ret); >>> - btrfs_end_transaction(trans); >>> - } >>> - >>> out: >>> btrfs_free_path(path); >>> - if (!ret) >>> - ret = btrfs_commit_transaction(trans); >>> return ret; >>> } >>> @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >>> struct btrfs_dev_lookup_args *args, >>> struct block_device **bdev, fmode_t *mode) >>> { >>> + struct btrfs_trans_handle *trans; >>> struct btrfs_device *device; >>> struct btrfs_fs_devices *cur_devices; >>> struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; >>> @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >>> ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); >>> if (ret) >>> - goto out; >>> + return ret; >>> device = btrfs_find_device(fs_info->fs_devices, args); >>> if (!device) { >>> @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info >>> *fs_info, >>> ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; >>> else >>> ret = -ENOENT; >>> - goto out; >>> + return ret; >>> } >>> if (btrfs_pinned_by_swapfile(fs_info, device)) { >>> btrfs_warn_in_rcu(fs_info, >>> "cannot remove device %s (devid %llu) due to active >>> swapfile", >>> rcu_str_deref(device->name), device->devid); >>> - ret = -ETXTBSY; >>> - goto out; >>> + return -ETXTBSY; >>> } >>> - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { >>> - ret = BTRFS_ERROR_DEV_TGT_REPLACE; >>> - goto out; >>> - } >>> + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) >>> + return BTRFS_ERROR_DEV_TGT_REPLACE; >>> if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && >>> - fs_info->fs_devices->rw_devices == 1) { >>> - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; >>> - goto out; >>> - } >>> + fs_info->fs_devices->rw_devices == 1) >>> + return BTRFS_ERROR_DEV_ONLY_WRITABLE; >>> if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { >>> mutex_lock(&fs_info->chunk_mutex); >>> @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info >>> *fs_info, >>> if (ret) >>> goto error_undo; >>> - /* >>> - * TODO: the superblock still includes this device in its >>> num_devices >>> - * counter although write_all_supers() is not locked out. This >>> - * could give a filesystem state which requires a degraded mount. >>> - */ >>> - ret = btrfs_rm_dev_item(device); >>> - if (ret) >>> + trans = btrfs_start_transaction(fs_info->chunk_root, 0); >>> + if (IS_ERR(trans)) { >>> + ret = PTR_ERR(trans); >>> goto error_undo; >>> + } >>> + >>> + ret = btrfs_rm_dev_item(trans, device); >>> + if (ret) { >>> + /* Any error in dev item removal is critical */ >>> + btrfs_crit(fs_info, >>> + "failed to remove device item for devid %llu: %d", >>> + device->devid, ret); >>> + btrfs_abort_transaction(trans, ret); >>> + btrfs_end_transaction(trans); >>> + return ret; >> >> Missed error_undo part of the undo here. > > Nope, that's exactly expected. > > We abort transaction, thus nothing committed, no need to undo. My concern is device->fs_devices->rw_devices is not equal to device->fs_devices->num_devices and fs is ro at this stage. I am a bit nervous if our close devices would be ok. But it looks ok. Anyway, after the unmount and mount recycle the rw_devices == num_devices again. But a device shall have zero disk_total_bytes. Which is fine. The user can try rm device again. Reviewed-by: Anand Jain <anand.jain@oracle.com> Thanks, Anand > In fact, after the btrfs_rm_dev_item() call, there is no real way to > rollback the delete. > Thanks, > Qu >> >> Thanks, Anand >> >>> + } >>> clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); >>> btrfs_scrub_cancel_dev(device); >>> @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >>> free_fs_devices(cur_devices); >>> } >>> -out: >>> + ret = btrfs_commit_transaction(trans); >>> + >>> return ret; >>> error_undo: >>> @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, >>> device->fs_devices->rw_devices++; >>> mutex_unlock(&fs_info->chunk_mutex); >>> } >>> - goto out; >>> + return ret; >>> } >>> void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) >> ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction 2022-03-08 5:36 [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction Qu Wenruo 2022-03-09 2:12 ` Anand Jain @ 2022-03-14 20:03 ` David Sterba 1 sibling, 0 replies; 5+ messages in thread From: David Sterba @ 2022-03-14 20:03 UTC (permalink / raw) To: Qu Wenruo; +Cc: linux-btrfs, Luca Béla Palkovics On Tue, Mar 08, 2022 at 01:36:38PM +0800, Qu Wenruo wrote: > [BUG] > There is a report that a btrfs has a bad super block num devices. People have reported this on IRC in the past too. In some cases it was a report with two devices expected but one found and "I have never added nor removed a device on this filesystem", so it was a bit mysterious. The split update over the transaction is otherwise a clear cause. Added to misc-next, thanks. ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-03-14 20:07 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2022-03-08 5:36 [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction Qu Wenruo 2022-03-09 2:12 ` Anand Jain 2022-03-09 2:58 ` Qu Wenruo 2022-03-09 7:31 ` Anand Jain 2022-03-14 20:03 ` David Sterba
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox