linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: David Sterba <dsterba@suse.cz>
To: jbacik@fb.com
Cc: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>, linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v2] btrfs: introduce priority based delalloc shrink mechanism
Date: Mon, 24 Oct 2016 17:43:52 +0200	[thread overview]
Message-ID: <20161024154352.GD22935@twin.jikos.cz> (raw)
In-Reply-To: <1476351085-26821-1-git-send-email-wangxg.fnst@cn.fujitsu.com>

Hi Josef,

are you fine with V2?

On Thu, Oct 13, 2016 at 05:31:25PM +0800, Wang Xiaoguang wrote:
> Since commit b02441999efcc6152b87cd58e7970bb7843f76cf, we don't wait all
> ordered extents, but I run into some enospc errors when doing large file
> create and delete tests, it's because shrink_delalloc() does not write
> enough delalloc bytes and wait them finished:
>     From: Miao Xie <miaox@cn.fujitsu.com>
>     Date: Mon, 4 Nov 2013 23:13:25 +0800
>     Subject: [PATCH] Btrfs: don't wait for the completion of all the ordered extents
> 
>     It is very likely that there are lots of ordered extents in the filesytem,
>     if we wait for the completion of all of them when we want to reclaim some
>     space for the metadata space reservation, we would be blocked for a long
>     time. The performance would drop down suddenly for a long time.
> 
> Here we introduce a simple reclaim_priority variable, the lower the
> value, the higher the priority, 0 is the maximum priority. The core
> idea is:
>     delalloc_bytes = percpu_counter_sum_positive(&root->fs_info->delalloc_bytes);
>     if (reclaim_priority)
>         to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY - reclaim_priority));
>     else
>         to_reclaim = delalloc_bytes;
> 
> Here 'orig' is the number of metadata we want to reserve, and as the priority
> increases, we will try wo write more delalloc bytes, meanwhile if
> "reclaim_priority == 0" returns true, we'll also wait all current ordered
> extents to finish.
> 
> Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
> ---
>  fs/btrfs/extent-tree.c       | 63 ++++++++++++++++++++++++++------------------
>  include/trace/events/btrfs.h | 11 +++-----
>  2 files changed, 42 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index e08791d..7477c25 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4686,16 +4686,18 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
>  }
>  
>  #define EXTENT_SIZE_PER_ITEM	SZ_256K
> +#define	BTRFS_DEFAULT_FLUSH_PRIORITY	3
>  
>  /*
>   * shrink metadata reservation for delalloc
>   */
> -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
> -			    bool wait_ordered)
> +static void shrink_delalloc(struct btrfs_root *root, u64 orig,
> +			    bool wait_ordered, int reclaim_priority)
>  {
>  	struct btrfs_block_rsv *block_rsv;
>  	struct btrfs_space_info *space_info;
>  	struct btrfs_trans_handle *trans;
> +	u64 to_reclaim;
>  	u64 delalloc_bytes;
>  	u64 max_reclaim;
>  	long time_left;
> @@ -4703,22 +4705,36 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
>  	int loops;
>  	int items;
>  	enum btrfs_reserve_flush_enum flush;
> +	int items_to_wait;
> +
> +	delalloc_bytes = percpu_counter_sum_positive(
> +				&root->fs_info->delalloc_bytes);
> +	if (reclaim_priority < 0)
> +		reclaim_priority = 0;
> +
> +	if (reclaim_priority)
> +		to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY -
> +					reclaim_priority));
> +	else
> +		to_reclaim = delalloc_bytes;
>  
>  	/* Calc the number of the pages we need flush for space reservation */
>  	items = calc_reclaim_items_nr(root, to_reclaim);
>  	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
> +	if (reclaim_priority)
> +		items_to_wait = items;
> +	else
> +		items_to_wait = -1;
>  
>  	trans = (struct btrfs_trans_handle *)current->journal_info;
>  	block_rsv = &root->fs_info->delalloc_block_rsv;
>  	space_info = block_rsv->space_info;
>  
> -	delalloc_bytes = percpu_counter_sum_positive(
> -						&root->fs_info->delalloc_bytes);
>  	if (delalloc_bytes == 0) {
>  		if (trans)
>  			return;
>  		if (wait_ordered)
> -			btrfs_wait_ordered_roots(root->fs_info, items,
> +			btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
>  						 0, (u64)-1);
>  		return;
>  	}
> @@ -4763,7 +4779,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
>  
>  		loops++;
>  		if (wait_ordered && !trans) {
> -			btrfs_wait_ordered_roots(root->fs_info, items,
> +			btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
>  						 0, (u64)-1);
>  		} else {
>  			time_left = schedule_timeout_killable(1);
> @@ -4836,7 +4852,7 @@ struct reserve_ticket {
>  
>  static int flush_space(struct btrfs_root *root,
>  		       struct btrfs_space_info *space_info, u64 num_bytes,
> -		       u64 orig_bytes, int state)
> +		       int state, int reclaim_priority)
>  {
>  	struct btrfs_trans_handle *trans;
>  	int nr;
> @@ -4860,8 +4876,8 @@ static int flush_space(struct btrfs_root *root,
>  		break;
>  	case FLUSH_DELALLOC:
>  	case FLUSH_DELALLOC_WAIT:
> -		shrink_delalloc(root, num_bytes * 2, orig_bytes,
> -				state == FLUSH_DELALLOC_WAIT);
> +		shrink_delalloc(root, num_bytes, state == FLUSH_DELALLOC_WAIT,
> +				reclaim_priority);
>  		break;
>  	case ALLOC_CHUNK:
>  		trans = btrfs_join_transaction(root);
> @@ -4877,7 +4893,7 @@ static int flush_space(struct btrfs_root *root,
>  			ret = 0;
>  		break;
>  	case COMMIT_TRANS:
> -		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
> +		ret = may_commit_transaction(root, space_info, num_bytes, 0);
>  		break;
>  	default:
>  		ret = -ENOSPC;
> @@ -4885,7 +4901,7 @@ static int flush_space(struct btrfs_root *root,
>  	}
>  
>  	trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
> -				orig_bytes, state, ret);
> +				state, ret);
>  	return ret;
>  }
>  
> @@ -5008,8 +5024,8 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  	struct btrfs_space_info *space_info;
>  	u64 to_reclaim;
>  	int flush_state;
> -	int commit_cycles = 0;
>  	u64 last_tickets_id;
> +	int reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
>  
>  	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
>  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> @@ -5030,8 +5046,11 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  		struct reserve_ticket *ticket;
>  		int ret;
>  
> +		if (flush_state > COMMIT_TRANS)
> +			flush_state = FLUSH_DELAYED_ITEMS_NR;
>  		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
> -			    to_reclaim, flush_state);
> +				  flush_state, reclaim_priority);
> +
>  		spin_lock(&space_info->lock);
>  		if (!ret)
>  			try_to_wake_tickets(fs_info->fs_root, space_info);
> @@ -5049,21 +5068,15 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  		} else {
>  			last_tickets_id = space_info->tickets_id;
>  			flush_state = FLUSH_DELAYED_ITEMS_NR;
> -			if (commit_cycles)
> -				commit_cycles--;
> +			reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
>  		}
>  
> -		if (flush_state > COMMIT_TRANS) {
> -			commit_cycles++;
> -			if (commit_cycles > 2) {
> -				wake_all_tickets(&space_info->tickets);
> -				space_info->flush = 0;
> -			} else {
> -				flush_state = FLUSH_DELAYED_ITEMS_NR;
> -			}
> +		if (flush_state > COMMIT_TRANS && reclaim_priority == 0) {
> +			wake_all_tickets(&space_info->tickets);
> +			space_info->flush = 0;
>  		}
>  		spin_unlock(&space_info->lock);
> -	} while (flush_state <= COMMIT_TRANS);
> +	} while ((flush_state <= COMMIT_TRANS) || (--reclaim_priority >= 0));
>  }
>  
>  void btrfs_init_async_reclaim_work(struct work_struct *work)
> @@ -5089,7 +5102,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
>  
>  	do {
>  		flush_space(fs_info->fs_root, space_info, to_reclaim,
> -			    to_reclaim, flush_state);
> +			    flush_state, 1);
>  		flush_state++;
>  		spin_lock(&space_info->lock);
>  		if (ticket->bytes == 0) {
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index e030d6f..7d953a6 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -857,15 +857,14 @@ TRACE_EVENT(btrfs_trigger_flush,
>  TRACE_EVENT(btrfs_flush_space,
>  
>  	TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
> -		 u64 orig_bytes, int state, int ret),
> +		 int state, int ret),
>  
> -	TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret),
> +	TP_ARGS(fs_info, flags, num_bytes, state, ret),
>  
>  	TP_STRUCT__entry(
>  		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
>  		__field(	u64,	flags			)
>  		__field(	u64,	num_bytes		)
> -		__field(	u64,	orig_bytes		)
>  		__field(	int,	state			)
>  		__field(	int,	ret			)
>  	),
> @@ -874,19 +873,17 @@ TRACE_EVENT(btrfs_flush_space,
>  		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
>  		__entry->flags		=	flags;
>  		__entry->num_bytes	=	num_bytes;
> -		__entry->orig_bytes	=	orig_bytes;
>  		__entry->state		=	state;
>  		__entry->ret		=	ret;
>  	),
>  
>  	TP_printk("%pU: state = %d(%s), flags = %llu(%s), num_bytes = %llu, "
> -		  "orig_bytes = %llu, ret = %d", __entry->fsid, __entry->state,
> +		  "ret = %d", __entry->fsid, __entry->state,
>  		  show_flush_state(__entry->state),
>  		  (unsigned long long)__entry->flags,
>  		  __print_flags((unsigned long)__entry->flags, "|",
>  				BTRFS_GROUP_FLAGS),
> -		  (unsigned long long)__entry->num_bytes,
> -		  (unsigned long long)__entry->orig_bytes, __entry->ret)
> +		  (unsigned long long)__entry->num_bytes, __entry->ret)
>  );
>  
>  DECLARE_EVENT_CLASS(btrfs__reserved_extent,
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

  reply	other threads:[~2016-10-24 15:43 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-12  9:03 [PATCH 1/2] btrfs: introduce priority based delalloc shrink mechanism Wang Xiaoguang
2016-10-12  9:03 ` [PATCH 2/2] btrfs: try to satisfy metadata requests if wen can overcommit Wang Xiaoguang
2016-10-12 17:21   ` Josef Bacik
2016-10-12 17:20 ` [PATCH 1/2] btrfs: introduce priority based delalloc shrink mechanism Josef Bacik
2016-10-13  5:40   ` Wang Xiaoguang
2016-10-13  9:31     ` [PATCH v2] " Wang Xiaoguang
2016-10-24 15:43       ` David Sterba [this message]
2016-11-22 19:35         ` Chris Mason

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161024154352.GD22935@twin.jikos.cz \
    --to=dsterba@suse.cz \
    --cc=jbacik@fb.com \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=wangxg.fnst@cn.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).