public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Anand Jain <anand.jain@oracle.com>
To: Qu Wenruo <wqu@suse.com>, linux-btrfs@vger.kernel.org
Cc: "Luca Béla Palkovics" <luca.bela.palkovics@gmail.com>
Subject: Re: [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction
Date: Wed, 9 Mar 2022 10:12:43 +0800	[thread overview]
Message-ID: <760da013-f15b-805f-ceaa-c8be838eb9ff@oracle.com> (raw)
In-Reply-To: <b86647c31a09ccc44447367865ecac8d5b358b7c.1646717720.git.wqu@suse.com>

On 08/03/2022 13:36, Qu Wenruo wrote:
> [BUG]
> There is a report that a btrfs has a bad super block num devices.
> 
> This makes btrfs to reject the fs completely.
> 
>    BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here
>    BTRFS error (device sdd3): failed to read chunk tree: -22
>    BTRFS error (device sdd3): open_ctree failed
> 
> [CAUSE]
> During btrfs device removal, chunk tree and super block num devs are
> updated in two different transactions:
> 
>    btrfs_rm_device()
>    |- btrfs_rm_dev_item(device)
>    |  |- trans = btrfs_start_transaction()
>    |  |  Now we got transaction X
>    |  |
>    |  |- btrfs_del_item()
>    |  |  Now device item is removed from chunk tree
>    |  |
>    |  |- btrfs_commit_transaction()
>    |     Transaction X got committed, super num devs untouched,
>    |     but device item removed from chunk tree.
>    |     (AKA, super num devs is already incorrect)
>    |
>    |- cur_devices->num_devices--;
>    |- cur_devices->total_devices--;
>    |- btrfs_set_super_num_devices()
>       All those operations are not in transaction X, thus it will
>       only be written back to disk in next transaction.
> 
> So after the transaction X in btrfs_rm_dev_item() committed, but before
> transaction X+1 (which can be minutes away), a power loss happen, then
> we got the super num mismatch.
> 
> [FIX]
> Instead of starting and committing a transaction inside
> btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and
> pass it to btrfs_rm_dev_item().
> 
> And only commit the transaction after everything is done.
>  > Reported-by: Luca Béla Palkovics <luca.bela.palkovics@gmail.com>
> Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46CKmp3igr44okQ@mail.gmail.com/
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
>   fs/btrfs/volumes.c | 65 ++++++++++++++++++++--------------------------
>   1 file changed, 28 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 57a754b33f10..6115c302f4ae 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path)
>   	path_put(&path);
>   }
>   
> -static int btrfs_rm_dev_item(struct btrfs_device *device)
> +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
> +			     struct btrfs_device *device)
>   {
>   	struct btrfs_root *root = device->fs_info->chunk_root;
>   	int ret;
>   	struct btrfs_path *path;
>   	struct btrfs_key key;
> -	struct btrfs_trans_handle *trans;
>   
>   	path = btrfs_alloc_path();
>   	if (!path)
>   		return -ENOMEM;
>   
> -	trans = btrfs_start_transaction(root, 0);
> -	if (IS_ERR(trans)) {
> -		btrfs_free_path(path);
> -		return PTR_ERR(trans);
> -	}
>   	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
>   	key.type = BTRFS_DEV_ITEM_KEY;
>   	key.offset = device->devid;
> @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
>   	if (ret) {
>   		if (ret > 0)
>   			ret = -ENOENT;
> -		btrfs_abort_transaction(trans, ret);
> -		btrfs_end_transaction(trans);
>   		goto out;
>   	}
>   
>   	ret = btrfs_del_item(trans, root, path);
> -	if (ret) {
> -		btrfs_abort_transaction(trans, ret);
> -		btrfs_end_transaction(trans);
> -	}
> -
>   out:
>   	btrfs_free_path(path);
> -	if (!ret)
> -		ret = btrfs_commit_transaction(trans);
>   	return ret;
>   }
>   
> @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   		    struct btrfs_dev_lookup_args *args,
>   		    struct block_device **bdev, fmode_t *mode)
>   {
> +	struct btrfs_trans_handle *trans;
>   	struct btrfs_device *device;
>   	struct btrfs_fs_devices *cur_devices;
>   	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   
>   	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
>   	if (ret)
> -		goto out;
> +		return ret;
>   
>   	device = btrfs_find_device(fs_info->fs_devices, args);
>   	if (!device) {
> @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
>   		else
>   			ret = -ENOENT;
> -		goto out;
> +		return ret;
>   	}
>   
>   	if (btrfs_pinned_by_swapfile(fs_info, device)) {
>   		btrfs_warn_in_rcu(fs_info,
>   		  "cannot remove device %s (devid %llu) due to active swapfile",
>   				  rcu_str_deref(device->name), device->devid);
> -		ret = -ETXTBSY;
> -		goto out;
> +		return -ETXTBSY;
>   	}
>   
> -	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
> -		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
> -		goto out;
> -	}
> +	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
> +		return BTRFS_ERROR_DEV_TGT_REPLACE;
>   
>   	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
> -	    fs_info->fs_devices->rw_devices == 1) {
> -		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
> -		goto out;
> -	}
> +	    fs_info->fs_devices->rw_devices == 1)
> +		return BTRFS_ERROR_DEV_ONLY_WRITABLE;
>   
>   	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
>   		mutex_lock(&fs_info->chunk_mutex);
> @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   	if (ret)
>   		goto error_undo;
>   
> -	/*
> -	 * TODO: the superblock still includes this device in its num_devices
> -	 * counter although write_all_supers() is not locked out. This
> -	 * could give a filesystem state which requires a degraded mount.
> -	 */
> -	ret = btrfs_rm_dev_item(device);
> -	if (ret)
> +	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
> +	if (IS_ERR(trans)) {
> +		ret = PTR_ERR(trans);
>   		goto error_undo;
> +	}
> +
> +	ret = btrfs_rm_dev_item(trans, device);
> +	if (ret) {
> +		/* Any error in dev item removal is critical */
> +		btrfs_crit(fs_info,
> +			   "failed to remove device item for devid %llu: %d",
> +			   device->devid, ret);
> +		btrfs_abort_transaction(trans, ret);
> +		btrfs_end_transaction(trans);
> +		return ret;

  Missed error_undo part of the undo here.

Thanks, Anand

> +	}
>   
>   	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
>   	btrfs_scrub_cancel_dev(device);
> @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   		free_fs_devices(cur_devices);
>   	}
>   
> -out:
> +	ret = btrfs_commit_transaction(trans);
> +
>   	return ret;
>   
>   error_undo:
> @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
>   		device->fs_devices->rw_devices++;
>   		mutex_unlock(&fs_info->chunk_mutex);
>   	}
> -	goto out;
> +	return ret;
>   }
>   
>   void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)


  reply	other threads:[~2022-03-09  2:13 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-08  5:36 [PATCH] btrfs: make device item removal and super block num devices update happen in the same transaction Qu Wenruo
2022-03-09  2:12 ` Anand Jain [this message]
2022-03-09  2:58   ` Qu Wenruo
2022-03-09  7:31     ` Anand Jain
2022-03-14 20:03 ` David Sterba

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=760da013-f15b-805f-ceaa-c8be838eb9ff@oracle.com \
    --to=anand.jain@oracle.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=luca.bela.palkovics@gmail.com \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox