Re: [PATCH v3 1/3] btrfs: Introduce per-profile available space facility

public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed

From: David Sterba <dsterba@suse.cz>
To: Qu Wenruo <wqu@suse.com>
Cc: linux-btrfs@vger.kernel.org, Josef Bacik <josef@toxicpanda.com>
Subject: Re: [PATCH v3 1/3] btrfs: Introduce per-profile available space facility
Date: Mon, 6 Jan 2020 15:32:43 +0100	[thread overview]
Message-ID: <20200106143242.GG3929@twin.jikos.cz> (raw)
In-Reply-To: <20200106061343.18772-2-wqu@suse.com>

On Mon, Jan 06, 2020 at 02:13:41PM +0800, Qu Wenruo wrote:
> +/*
> + * Return 0 if we allocated any virtual(*) chunk, and restore the size to
> + * @allocated_size
> + * Return -ENOSPC if we have no more space to allocate virtual chunk
> + *
> + * *: virtual chunk is a space holder for per-profile available space
> + *    calculator.
> + *    Such virtual chunks won't take on-disk space, thus called virtual, and
> + *    only affects per-profile available space calulation.
> + */
> +static int alloc_virtual_chunk(struct btrfs_fs_info *fs_info,
> +			       struct btrfs_device_info *devices_info,
> +			       enum btrfs_raid_types type,
> +			       u64 to_alloc, u64 *allocated)
> +{
> +	const struct btrfs_raid_attr *raid_attr = &btrfs_raid_array[type];
> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +	struct btrfs_device *device;
> +	u64 stripe_size;
> +	int i;
> +	int ndevs = 0;
> +
> +	lockdep_assert_held(&fs_info->chunk_mutex);
> +
> +	/* Go through devices to collect their unallocated space */
> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
> +		u64 avail;
> +		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
> +					&device->dev_state) ||
> +		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
> +			continue;
> +
> +		if (device->total_bytes > device->bytes_used +
> +				device->virtual_allocated)
> +			avail = device->total_bytes - device->bytes_used -
> +				device->virtual_allocated;
> +		else
> +			avail = 0;
> +
> +		/* And exclude the [0, 1M) reserved space */
> +		if (avail > SZ_1M)
> +			avail -= SZ_1M;
> +		else
> +			avail = 0;
> +
> +		if (avail == 0)
> +			continue;
> +		/*
> +		 * Unlike chunk allocator, we don't care about stripe or hole
> +		 * size, so here we use @avail directly
> +		 */
> +		devices_info[ndevs].dev_offset = 0;
> +		devices_info[ndevs].total_avail = avail;
> +		devices_info[ndevs].max_avail = avail;
> +		devices_info[ndevs].dev = device;
> +		++ndevs;
> +	}
> +	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
> +	     btrfs_cmp_device_info, NULL);
> +	ndevs -= ndevs % raid_attr->devs_increment;
> +	if (ndevs < raid_attr->devs_min)
> +		return -ENOSPC;
> +	if (raid_attr->devs_max)
> +		ndevs = min(ndevs, (int)raid_attr->devs_max);
> +	else
> +		ndevs = min(ndevs, (int)BTRFS_MAX_DEVS(fs_info));
> +
> +	/*
> +	 * Now allocate a virtual chunk using the unallocate space of the
> +	 * device with the least unallocated space.
> +	 */
> +	stripe_size = round_down(devices_info[ndevs - 1].total_avail,
> +				 fs_info->sectorsize);
> +
> +	/* We can't directly do round_up for (u64)-1 as that would result 0 */
> +	if (to_alloc != (u64)-1)
> +		stripe_size = min_t(u64, stripe_size,
> +				    round_up(div_u64(to_alloc, ndevs),
> +					     fs_info->sectorsize));
> +	if (stripe_size == 0)
> +		return -ENOSPC;
> +
> +	for (i = 0; i < ndevs; i++)
> +		devices_info[i].dev->virtual_allocated += stripe_size;
> +	*allocated = stripe_size * (ndevs - raid_attr->nparity) /
> +		     raid_attr->ncopies;
> +	return 0;
> +}
> +
> +static int calc_one_profile_avail(struct btrfs_fs_info *fs_info,
> +				  enum btrfs_raid_types type)
> +{
> +	struct btrfs_device_info *devices_info = NULL;
> +	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> +	struct btrfs_device *device;
> +	u64 allocated;
> +	u64 result = 0;
> +	int ret = 0;
> +
> +	ASSERT(type >= 0 && type < BTRFS_NR_RAID_TYPES);
> +
> +	/* Not enough devices, quick exit, just update the result */
> +	if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min)
> +		goto out;
> +
> +	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
> +			       GFP_NOFS);

Calling kcalloc is another potential slowdown, for the statfs
considerations.

> +	if (!devices_info) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	/* Clear virtual chunk used space for each device */
> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list)
> +		device->virtual_allocated = 0;
> +	while (ret == 0) {
> +		ret = alloc_virtual_chunk(fs_info, devices_info, type,
> +					  (u64)-1, &allocated);
> +		if (ret == 0)
> +			result += allocated;
> +	}
> +	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list)
> +		device->virtual_allocated = 0;
> +out:
> +	kfree(devices_info);
> +	if (ret < 0 && ret != -ENOSPC)
> +		return ret;
> +	spin_lock(&fs_devices->per_profile_lock);
> +	fs_devices->per_profile_avail[type] = result;
> +	spin_unlock(&fs_devices->per_profile_lock);
> +	return 0;
> +}
> +
> +/*
> + * Calculate the per-profile available space array.
> + *
> + * Return 0 if we succeeded updating the array.
> + * Return <0 if something went wrong. (ENOMEM)
> + */
> +static int calc_per_profile_avail(struct btrfs_fs_info *fs_info)
> +{
> +	int i;
> +	int ret;
> +
> +	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
> +		ret = calc_one_profile_avail(fs_info, i);
> +		if (ret < 0)
> +			break;
> +	}
> +	return ret;
> +}
> +
>  int btrfs_grow_device(struct btrfs_trans_handle *trans,
>  		      struct btrfs_device *device, u64 new_size)
>  {
> @@ -2635,6 +2806,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
>  	struct btrfs_super_block *super_copy = fs_info->super_copy;
>  	u64 old_total;
>  	u64 diff;
> +	int ret;
>  
>  	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
>  		return -EACCES;
> @@ -2661,7 +2833,12 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
>  	if (list_empty(&device->post_commit_list))
>  		list_add_tail(&device->post_commit_list,
>  			      &trans->transaction->dev_update_list);
> +	ret = calc_per_profile_avail(fs_info);
>  	mutex_unlock(&fs_info->chunk_mutex);
> +	if (ret < 0) {
> +		btrfs_abort_transaction(trans, ret);
> +		return ret;
> +	}
>  
>  	return btrfs_update_device(trans, device);
>  }
> @@ -2831,7 +3008,13 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>  					device->bytes_used - dev_extent_len);
>  			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
>  			btrfs_clear_space_info_full(fs_info);
> +			ret = calc_per_profile_avail(fs_info);

Adding new failure modes

>  			mutex_unlock(&fs_info->chunk_mutex);
> +			if (ret < 0) {
> +				mutex_unlock(&fs_devices->device_list_mutex);
> +				btrfs_abort_transaction(trans, ret);
> +				goto out;
> +			}
>  		}
>  
>  		ret = btrfs_update_device(trans, device);
> @@ -4526,6 +4709,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
>  		atomic64_sub(diff, &fs_info->free_chunk_space);
>  	}
>  
> +	ret = calc_per_profile_avail(fs_info);
> +	if (ret < 0) {
> +		btrfs_abort_transaction(trans, ret);
> +		btrfs_end_transaction(trans);
> +		goto done;
> +	}
>  	/*
>  	 * Once the device's size has been set to the new size, ensure all
>  	 * in-memory chunks are synced to disk so that the loop below sees them
> @@ -4690,25 +4879,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
>  	return 0;
>  }
>  
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -138,6 +138,13 @@ struct btrfs_device {
>  	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
>  
>  	struct extent_io_tree alloc_state;
> +
> +	/*
> +	 * the "virtual" allocated space by virtual chunk allocator, which
> +	 * is used to do accurate estimation on available space.
> +	 * Doesn't affect real chunk allocator.
> +	 */
> +	u64 virtual_allocated;

I find it a bit confusing to use 'virtual', though I get what you mean.
As this is per-device it accounts overall space, so the name should
reflect somthing like that. I maybe have a more concrete suggestion once
I read through the whole series.

next prev parent reply	other threads:[~2020-01-06 14:32 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-06  6:13 [PATCH v3 0/3] Introduce per-profile available space array to avoid over-confident can_overcommit() Qu Wenruo
2020-01-06  6:13 ` [PATCH v3 1/3] btrfs: Introduce per-profile available space facility Qu Wenruo
2020-01-06 14:32   ` David Sterba [this message]
2020-01-07  2:13     ` Qu Wenruo
2020-01-08 15:04       ` David Sterba
2020-01-08 23:53         ` Qu WenRuo
2020-01-09  6:26           ` Qu Wenruo
2020-01-06 23:45   ` kbuild test robot
2020-01-06  6:13 ` [PATCH v3 2/3] btrfs: space-info: Use per-profile available space in can_overcommit() Qu Wenruo
2020-01-06  6:13 ` [PATCH v3 3/3] btrfs: statfs: Use virtual chunk allocation to calculation available data space Qu Wenruo
2020-01-06 14:44   ` Josef Bacik
2020-01-07  1:03     ` Qu Wenruo
2020-01-06 14:06 ` [PATCH v3 0/3] Introduce per-profile available space array to avoid over-confident can_overcommit() David Sterba
2020-01-07  2:04   ` Qu Wenruo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200106143242.GG3929@twin.jikos.cz \
    --to=dsterba@suse.cz \
    --cc=josef@toxicpanda.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=wqu@suse.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox