From: Liu Bo <bo.li.liu@oracle.com>
To: Josef Bacik <jbacik@fb.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH 07/14] Btrfs: introduce ticketed enospc infrastructure
Date: Mon, 9 May 2016 14:29:14 -0700
Message-ID: <20160509212914.GB4954@localhost.localdomain>
In-Reply-To: <1458926760-17563-8-git-send-email-jbacik@fb.com>

On Fri, Mar 25, 2016 at 01:25:53PM -0400, Josef Bacik wrote:
> Our enospc flushing sucks.  It is born from a time when we were early
> enospc'ing constantly because multiple threads would race in for the same
> reservation and randomly starve other ones out.  So I came up with this solution
> to block any other reservations from happening while one guy tried to flush
> stuff to satisfy his reservation.  This gives us pretty good correctness, but
> completely crap latency.
> 
> The solution I've come up with is ticketed reservations.  Basically we try to
> make our reservation, and if we can't we put a ticket on a list in order and
> kick off an async flusher thread.  This async flusher thread does the same old
> flushing we always did, just asynchronously.  As space is freed and added back
> to the space_info it checks and sees if we have any tickets that need
> satisfying, and adds space to the tickets and wakes up anything we've satisfied.
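
If I read this correctly, the reservation side boils down to roughly the
following (simplified pseudo-C, abridged from the patch below; locking and
the partial-refund handling omitted):

        if (used + orig_bytes <= space_info->total_bytes ||
            can_overcommit(root, space_info, orig_bytes, flush)) {
                /* fast path: account the reservation and we're done */
                space_info->bytes_may_use += orig_bytes;
                return 0;
        }

        /* slow path: queue a ticket and kick the async flusher */
        ticket.bytes = orig_bytes;
        list_add_tail(&ticket.list, &space_info->tickets);
        if (!space_info->flush) {
                space_info->flush = 1;
                queue_work(system_unbound_wq, &fs_info->async_reclaim_work);
        }

        /* sleep until ticket.bytes hits 0 (satisfied) or the flusher
         * gives up and sets ticket.error = -ENOSPC */
        return wait_reserve_ticket(fs_info, space_info, &ticket, orig_bytes);
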
> 
> Once the flusher thread stops making progress it wakes up all the current
> tickets and tells them to take a hike.
> 
> There is a priority list for things that can't flush, since the async flusher
> could do anything and we need to avoid deadlocks.  These guys get priority for
> having their reservation made, and will still do manual flushing themselves in
> case the async flusher isn't running.
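
And for the priority side, if I follow: such callers put their ticket on
->priority_tickets instead and flush synchronously, skipping the delalloc
states, roughly (again simplified from the patch):

        list_add_tail(&ticket.list, &space_info->priority_tickets);
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        if (ticket.bytes) {
                /* nobody satisfied us while we flushed, give up */
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
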
> 
> This patch gives us significantly better latencies.  Thanks,
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/ctree.h       |   2 +
>  fs/btrfs/extent-tree.c | 524 +++++++++++++++++++++++++++++++++++--------------
>  2 files changed, 375 insertions(+), 151 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b675066..7437c8a 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1229,6 +1229,8 @@ struct btrfs_space_info {
>  	struct list_head list;
>  	/* Protected by the spinlock 'lock'. */
>  	struct list_head ro_bgs;
> +	struct list_head priority_tickets;
> +	struct list_head tickets;
>  
>  	struct rw_semaphore groups_sem;
>  	/* for block groups in our same type */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 0db4319..1673365 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
>  			       u64 num_bytes);
>  int btrfs_pin_extent(struct btrfs_root *root,
>  		     u64 bytenr, u64 num_bytes, int reserved);
> +static int __reserve_metadata_bytes(struct btrfs_root *root,
> +				    struct btrfs_space_info *space_info,
> +				    u64 orig_bytes,
> +				    enum btrfs_reserve_flush_enum flush);
> +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_space_info *space_info,
> +				     u64 num_bytes);
> +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_space_info *space_info,
> +				     u64 num_bytes);
>  
>  static noinline int
>  block_group_cache_done(struct btrfs_block_group_cache *cache)
> @@ -3867,6 +3877,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  		found->bytes_readonly += bytes_readonly;
>  		if (total_bytes > 0)
>  			found->full = 0;
> +		space_info_add_new_bytes(info, found, total_bytes -
> +					 bytes_used - bytes_readonly);
>  		spin_unlock(&found->lock);
>  		*space_info = found;
>  		return 0;
> @@ -3901,6 +3913,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  	found->flush = 0;
>  	init_waitqueue_head(&found->wait);
>  	INIT_LIST_HEAD(&found->ro_bgs);
> +	INIT_LIST_HEAD(&found->tickets);
> +	INIT_LIST_HEAD(&found->priority_tickets);
>  
>  	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
>  				    info->space_info_kobj, "%s",
> @@ -4514,12 +4528,19 @@ static int can_overcommit(struct btrfs_root *root,
>  			  struct btrfs_space_info *space_info, u64 bytes,
>  			  enum btrfs_reserve_flush_enum flush)
>  {
> -	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
> -	u64 profile = btrfs_get_alloc_profile(root, 0);
> +	struct btrfs_block_rsv *global_rsv;
> +	u64 profile;
>  	u64 space_size;
>  	u64 avail;
>  	u64 used;
>  
> +	/* Don't overcommit when in mixed mode. */
> +	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
> +		return 0;
> +
> +	BUG_ON(root->fs_info == NULL);
> +	global_rsv = &root->fs_info->global_block_rsv;
> +	profile = btrfs_get_alloc_profile(root, 0);
>  	used = space_info->bytes_used + space_info->bytes_reserved +
>  		space_info->bytes_pinned + space_info->bytes_readonly;
>  
> @@ -4669,6 +4690,11 @@ skip_async:
>  			spin_unlock(&space_info->lock);
>  			break;
>  		}
> +		if (list_empty(&space_info->tickets) &&
> +		    list_empty(&space_info->priority_tickets)) {
> +			spin_unlock(&space_info->lock);
> +			break;
> +		}
>  		spin_unlock(&space_info->lock);
>  
>  		loops++;
> @@ -4745,6 +4771,13 @@ enum flush_state {
>  	COMMIT_TRANS		=	6,
>  };
>  
> +struct reserve_ticket {
> +	u64 bytes;
> +	int error;
> +	struct list_head list;
> +	wait_queue_head_t wait;
> +};
> +
>  static int flush_space(struct btrfs_root *root,
>  		       struct btrfs_space_info *space_info, u64 num_bytes,
>  		       u64 orig_bytes, int state)
> @@ -4802,17 +4835,22 @@ static inline u64
>  btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
>  				 struct btrfs_space_info *space_info)
>  {
> +	struct reserve_ticket *ticket;
>  	u64 used;
>  	u64 expected;
> -	u64 to_reclaim;
> +	u64 to_reclaim = 0;
>  
>  	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
> -	spin_lock(&space_info->lock);
>  	if (can_overcommit(root, space_info, to_reclaim,
> -			   BTRFS_RESERVE_FLUSH_ALL)) {
> -		to_reclaim = 0;
> -		goto out;
> -	}
> +			   BTRFS_RESERVE_FLUSH_ALL))
> +		return 0;
> +
> +	list_for_each_entry(ticket, &space_info->tickets, list)
> +		to_reclaim += ticket->bytes;
> +	list_for_each_entry(ticket, &space_info->priority_tickets, list)
> +		to_reclaim += ticket->bytes;
> +	if (to_reclaim)
> +		return to_reclaim;
>  
>  	used = space_info->bytes_used + space_info->bytes_reserved +
>  	       space_info->bytes_pinned + space_info->bytes_readonly +
> @@ -4828,9 +4866,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
>  		to_reclaim = 0;
>  	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
>  				     space_info->bytes_reserved);
> -out:
> -	spin_unlock(&space_info->lock);
> -
>  	return to_reclaim;
>  }
>  
> @@ -4847,69 +4882,169 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
>  		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
>  }
>  
> -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
> -				       struct btrfs_fs_info *fs_info,
> -				       int flush_state)
> +static void wake_all_tickets(struct list_head *head)
>  {
> -	u64 used;
> +	struct reserve_ticket *ticket;
>  
> -	spin_lock(&space_info->lock);
> -	/*
> -	 * We run out of space and have not got any free space via flush_space,
> -	 * so don't bother doing async reclaim.
> -	 */
> -	if (flush_state > COMMIT_TRANS && space_info->full) {
> -		spin_unlock(&space_info->lock);
> -		return 0;
> +	while (!list_empty(head)) {
> +		ticket = list_first_entry(head, struct reserve_ticket, list);
> +		list_del_init(&ticket->list);
> +		ticket->error = -ENOSPC;
> +		wake_up(&ticket->wait);
>  	}
> -
> -	used = space_info->bytes_used + space_info->bytes_reserved +
> -	       space_info->bytes_pinned + space_info->bytes_readonly +
> -	       space_info->bytes_may_use;
> -	if (need_do_async_reclaim(space_info, fs_info, used)) {
> -		spin_unlock(&space_info->lock);
> -		return 1;
> -	}
> -	spin_unlock(&space_info->lock);
> -
> -	return 0;
>  }
>  
> +/*
> + * This is for normal flushers, we can wait all goddamned day if we want to.  We
> + * will loop and continuously try to flush as long as we are making progress.
> + * We count progress as clearing off tickets each time we have to loop.
> + */
>  static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
>  {
> +	struct reserve_ticket *last_ticket = NULL;
>  	struct btrfs_fs_info *fs_info;
>  	struct btrfs_space_info *space_info;
>  	u64 to_reclaim;
>  	int flush_state;
> +	int commit_cycles = 0;
>  
>  	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
>  	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
>  
> +	spin_lock(&space_info->lock);
>  	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
>  						      space_info);
> -	if (!to_reclaim)
> +	if (!to_reclaim) {
> +		space_info->flush = 0;
> +		spin_unlock(&space_info->lock);
>  		return;
> +	}
> +	last_ticket = list_first_entry(&space_info->tickets,
> +				       struct reserve_ticket, list);
> +	spin_unlock(&space_info->lock);
>  
>  	flush_state = FLUSH_DELAYED_ITEMS_NR;
>  	do {
> +		struct reserve_ticket *ticket;
> +		int ret;
> +
> +		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
> +			    to_reclaim, flush_state);
> +		spin_lock(&space_info->lock);
> +		if (list_empty(&space_info->tickets)) {
> +			space_info->flush = 0;
> +			spin_unlock(&space_info->lock);
> +			return;
> +		}
> +		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
> +							      space_info);
> +		ticket = list_first_entry(&space_info->tickets,
> +					  struct reserve_ticket, list);
> +		if (last_ticket == ticket) {
> +			flush_state++;
> +		} else {
> +			last_ticket = ticket;
> +			flush_state = FLUSH_DELAYED_ITEMS_NR;
> +			if (commit_cycles)
> +				commit_cycles--;
> +		}
> +
> +		if (flush_state > COMMIT_TRANS) {
> +			commit_cycles++;
> +			if (commit_cycles > 2) {
> +				wake_all_tickets(&space_info->tickets);
> +				space_info->flush = 0;
> +			} else {
> +				flush_state = FLUSH_DELAYED_ITEMS_NR;
> +			}
> +		}
> +		spin_unlock(&space_info->lock);
> +	} while (flush_state <= COMMIT_TRANS);
> +}
> +
> +void btrfs_init_async_reclaim_work(struct work_struct *work)
> +{
> +	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
> +}
> +
> +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
> +					    struct btrfs_space_info *space_info,
> +					    struct reserve_ticket *ticket)
> +{
> +	u64 to_reclaim;
> +	int flush_state = FLUSH_DELAYED_ITEMS_NR;
> +
> +	spin_lock(&space_info->lock);
> +	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
> +						      space_info);
> +	if (!to_reclaim) {
> +		spin_unlock(&space_info->lock);
> +		return;
> +	}
> +	spin_unlock(&space_info->lock);
> +
> +	do {
>  		flush_space(fs_info->fs_root, space_info, to_reclaim,
>  			    to_reclaim, flush_state);
>  		flush_state++;
> -		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
> -						 flush_state))
> +		spin_lock(&space_info->lock);
> +		if (ticket->bytes == 0) {
> +			spin_unlock(&space_info->lock);
>  			return;
> +		}
> +		spin_unlock(&space_info->lock);
> +
> +		/*
> +		 * Priority flushers can't wait on delalloc without
> +		 * deadlocking.
> +		 */
> +		if (flush_state == FLUSH_DELALLOC ||
> +		    flush_state == FLUSH_DELALLOC_WAIT)
> +			flush_state = ALLOC_CHUNK;
>  	} while (flush_state < COMMIT_TRANS);
>  }
>  
> -void btrfs_init_async_reclaim_work(struct work_struct *work)
> +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
> +			       struct btrfs_space_info *space_info,
> +			       struct reserve_ticket *ticket, u64 orig_bytes)
> +
>  {
> -	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
> +	DEFINE_WAIT(wait);
> +	int ret = 0;
> +
> +	spin_lock(&space_info->lock);
> +	while (ticket->bytes > 0 && ticket->error == 0) {
> +		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
> +		if (ret) {
> +			ret = -EINTR;
> +			break;
> +		}
> +		spin_unlock(&space_info->lock);
> +
> +		schedule();
> +
> +		finish_wait(&ticket->wait, &wait);
> +		spin_lock(&space_info->lock);
> +	}
> +	if (!ret)
> +		ret = ticket->error;
> +	if (!list_empty(&ticket->list))
> +		list_del_init(&ticket->list);
> +	if (ticket->bytes && ticket->bytes < orig_bytes) {
> +		u64 num_bytes = orig_bytes - ticket->bytes;
> +		space_info->bytes_may_use -= num_bytes;
> +		trace_btrfs_space_reservation(fs_info, "space_info",
> +					      space_info->flags, num_bytes, 0);
> +	}
> +	spin_unlock(&space_info->lock);
> +
> +	return ret;
>  }
>  
>  /**
>   * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
>   * @root - the root we're allocating for
> - * @block_rsv - the block_rsv we're allocating for
> + * @space_info - the space info we want to allocate from
>   * @orig_bytes - the number of bytes we want
>   * @flush - whether or not we can flush to make our reservation
>   *
> @@ -4920,81 +5055,34 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
>   * regain reservations will be made and this will fail if there is not enough
>   * space already.
>   */
> -static int reserve_metadata_bytes(struct btrfs_root *root,
> -				  struct btrfs_block_rsv *block_rsv,
> -				  u64 orig_bytes,
> -				  enum btrfs_reserve_flush_enum flush)
> +static int __reserve_metadata_bytes(struct btrfs_root *root,
> +				    struct btrfs_space_info *space_info,
> +				    u64 orig_bytes,
> +				    enum btrfs_reserve_flush_enum flush)
>  {
> -	struct btrfs_space_info *space_info = block_rsv->space_info;
> +	struct reserve_ticket ticket;
>  	u64 used;
> -	u64 num_bytes = orig_bytes;
> -	int flush_state = FLUSH_DELAYED_ITEMS_NR;
>  	int ret = 0;
> -	bool flushing = false;
>  
> -again:
> -	ret = 0;
> +	ASSERT(orig_bytes);
>  	spin_lock(&space_info->lock);
> -	/*
> -	 * We only want to wait if somebody other than us is flushing and we
> -	 * are actually allowed to flush all things.
> -	 */
> -	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
> -	       space_info->flush) {
> -		spin_unlock(&space_info->lock);
> -		/*
> -		 * If we have a trans handle we can't wait because the flusher
> -		 * may have to commit the transaction, which would mean we would
> -		 * deadlock since we are waiting for the flusher to finish, but
> -		 * hold the current transaction open.
> -		 */
> -		if (current->journal_info)
> -			return -EAGAIN;
> -		ret = wait_event_killable(space_info->wait, !space_info->flush);
> -		/* Must have been killed, return */
> -		if (ret)
> -			return -EINTR;
> -
> -		spin_lock(&space_info->lock);
> -	}
> -
>  	ret = -ENOSPC;
>  	used = space_info->bytes_used + space_info->bytes_reserved +
>  		space_info->bytes_pinned + space_info->bytes_readonly +
>  		space_info->bytes_may_use;
>  
>  	/*
> -	 * The idea here is that we've not already over-reserved the block group
> -	 * then we can go ahead and save our reservation first and then start
> -	 * flushing if we need to.  Otherwise if we've already overcommitted
> -	 * lets start flushing stuff first and then come back and try to make
> -	 * our reservation.
> +	 * If we have enough space then hooray, make our reservation and carry
> +	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
> +	 * If not things get more complicated.
>  	 */
> -	if (used <= space_info->total_bytes) {
> -		if (used + orig_bytes <= space_info->total_bytes) {
> -			space_info->bytes_may_use += orig_bytes;
> -			trace_btrfs_space_reservation(root->fs_info,
> -				"space_info", space_info->flags, orig_bytes, 1);
> -			ret = 0;
> -		} else {
> -			/*
> -			 * Ok set num_bytes to orig_bytes since we aren't
> -			 * overocmmitted, this way we only try and reclaim what
> -			 * we need.
> -			 */
> -			num_bytes = orig_bytes;
> -		}
> -	} else {
> -		/*
> -		 * Ok we're over committed, set num_bytes to the overcommitted
> -		 * amount plus the amount of bytes that we need for this
> -		 * reservation.
> -		 */
> -		num_bytes = used - space_info->total_bytes +
> -			(orig_bytes * 2);
> -	}
> -
> -	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
> +	if (used + orig_bytes <= space_info->total_bytes) {
> +		space_info->bytes_may_use += orig_bytes;
> +		trace_btrfs_space_reservation(root->fs_info, "space_info",
> +					      space_info->flags, orig_bytes,
> +					      1);
> +		ret = 0;
> +	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
>  		space_info->bytes_may_use += orig_bytes;
>  		trace_btrfs_space_reservation(root->fs_info, "space_info",
>  					      space_info->flags, orig_bytes,
> @@ -5003,16 +5091,27 @@ again:
>  	}
>  
>  	/*
> -	 * Couldn't make our reservation, save our place so while we're trying
> -	 * to reclaim space we can actually use it instead of somebody else
> -	 * stealing it from us.
> +	 * If we couldn't make a reservation then setup our reservation ticket
> +	 * and kick the async worker if it's not already running.
>  	 *
> -	 * We make the other tasks wait for the flush only when we can flush
> -	 * all things.
> +	 * If we are a priority flusher then we just need to add our ticket to
> +	 * the list and we will do our own flushing further down.
>  	 */
>  	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
> -		flushing = true;
> -		space_info->flush = 1;
> +		ticket.bytes = orig_bytes;
> +		ticket.error = 0;
> +		init_waitqueue_head(&ticket.wait);
> +		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
> +			list_add_tail(&ticket.list, &space_info->tickets);
> +			if (!space_info->flush) {
> +				space_info->flush = 1;
> +				queue_work(system_unbound_wq,
> +					   &root->fs_info->async_reclaim_work);
> +			}
> +		} else {
> +			list_add_tail(&ticket.list,
> +				      &space_info->priority_tickets);
> +		}
>  	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
>  		used += orig_bytes;
>  		/*
> @@ -5027,33 +5126,56 @@ again:
>  				   &root->fs_info->async_reclaim_work);
>  	}
>  	spin_unlock(&space_info->lock);
> -
>  	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
> -		goto out;
> +		return ret;
>  
> -	ret = flush_space(root, space_info, num_bytes, orig_bytes,
> -			  flush_state);
> -	flush_state++;
> +	if (flush == BTRFS_RESERVE_FLUSH_ALL)
> +		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
> +					   orig_bytes);
>  
> -	/*
> -	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
> -	 * would happen. So skip delalloc flush.
> -	 */
> -	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
> -	    (flush_state == FLUSH_DELALLOC ||
> -	     flush_state == FLUSH_DELALLOC_WAIT))
> -		flush_state = ALLOC_CHUNK;
> +	ret = 0;
> +	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
> +	spin_lock(&space_info->lock);
> +	if (ticket.bytes) {
> +		if (ticket.bytes < orig_bytes) {
> +			u64 num_bytes = orig_bytes - ticket.bytes;
> +			space_info->bytes_may_use -= num_bytes;
> +			trace_btrfs_space_reservation(root->fs_info,
> +					"space_info", space_info->flags,
> +					num_bytes, 0);
>  
> -	if (!ret)
> -		goto again;
> -	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
> -		 flush_state < COMMIT_TRANS)
> -		goto again;
> -	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
> -		 flush_state <= COMMIT_TRANS)
> -		goto again;
> +		}
> +		list_del_init(&ticket.list);
> +		ret = -ENOSPC;
> +	}
> +	spin_unlock(&space_info->lock);
> +	ASSERT(list_empty(&ticket.list));
> +	return ret;
> +}
>  
> -out:
> +/**
> + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
> + * @root - the root we're allocating for
> + * @block_rsv - the block_rsv we're allocating for
> + * @orig_bytes - the number of bytes we want
> + * @flush - whether or not we can flush to make our reservation
> + *
> + * This will reserve orig_bytes number of bytes from the space info associated
> + * with the block_rsv.  If there is not enough space it will make an attempt to
> + * flush out space to make room.  It will do this by flushing delalloc if
> + * possible or committing the transaction.  If flush is 0 then no attempts to
> + * regain reservations will be made and this will fail if there is not enough
> + * space already.
> + */
> +static int reserve_metadata_bytes(struct btrfs_root *root,
> +				  struct btrfs_block_rsv *block_rsv,
> +				  u64 orig_bytes,
> +				  enum btrfs_reserve_flush_enum flush)
> +{
> +	int ret;
> +
> +	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
> +				       flush);
>  	if (ret == -ENOSPC &&
>  	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
>  		struct btrfs_block_rsv *global_rsv =
> @@ -5066,13 +5188,8 @@ out:
>  	if (ret == -ENOSPC)
>  		trace_btrfs_space_reservation(root->fs_info,
>  					      "space_info:enospc",
> -					      space_info->flags, orig_bytes, 1);
> -	if (flushing) {
> -		spin_lock(&space_info->lock);
> -		space_info->flush = 0;
> -		wake_up_all(&space_info->wait);
> -		spin_unlock(&space_info->lock);
> -	}
> +					      block_rsv->space_info->flags,
> +					      orig_bytes, 1);
>  	return ret;
>  }
>  
> @@ -5148,6 +5265,103 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
>  	return 0;
>  }
>  
> +/*
> + * This is for space we already have accounted in space_info->bytes_may_use, so
> + * basically when we're returning space from block_rsv's.
> + */
> +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_space_info *space_info,
> +				     u64 num_bytes)
> +{
> +	struct reserve_ticket *ticket;
> +	struct list_head *head;
> +	u64 used;
> +	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
> +	bool check_overcommit = false;
> +
> +	spin_lock(&space_info->lock);
> +	head = &space_info->priority_tickets;
> +
> +	/*
> +	 * First we want to see if we're over our limit, because if we are then
> +	 * we need to make sure we are still ok overcommitting before we satisfy
> +	 * another reservation.
> +	 */
> +	used = space_info->bytes_used + space_info->bytes_reserved +
> +		space_info->bytes_pinned + space_info->bytes_readonly;
> +	if (used - num_bytes >= space_info->total_bytes)
> +		check_overcommit = true;

'used' is computed here without bytes_may_use, so it should normally stay
below ->total_bytes and this check can hardly trigger.  Did you mean to
check if (used + num_bytes >= space_info->total_bytes)?
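
Something like this, just to illustrate what I mean (not tested):

        used = space_info->bytes_used + space_info->bytes_reserved +
                space_info->bytes_pinned + space_info->bytes_readonly;
        /* require can_overcommit() once usage plus the bytes being
         * returned is at or over the limit */
        if (used + num_bytes >= space_info->total_bytes)
                check_overcommit = true;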

The rest looks sane to me.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>

Thanks,

-liubo

> +again:
> +	while (!list_empty(head) && num_bytes) {
> +		ticket = list_first_entry(head, struct reserve_ticket,
> +					  list);
> +		if (check_overcommit &&
> +		    !can_overcommit(fs_info->extent_root, space_info,
> +				    ticket->bytes, flush))
> +			break;
> +		if (num_bytes >= ticket->bytes) {
> +			list_del_init(&ticket->list);
> +			num_bytes -= ticket->bytes;
> +			ticket->bytes = 0;
> +			wake_up(&ticket->wait);
> +		} else {
> +			ticket->bytes -= num_bytes;
> +			num_bytes = 0;
> +		}
> +	}
> +
> +	if (num_bytes && head == &space_info->priority_tickets) {
> +		head = &space_info->tickets;
> +		flush = BTRFS_RESERVE_FLUSH_ALL;
> +		goto again;
> +	}
> +	space_info->bytes_may_use -= num_bytes;
> +	trace_btrfs_space_reservation(fs_info, "space_info",
> +				      space_info->flags, num_bytes, 0);
> +	spin_unlock(&space_info->lock);
> +}
> +
> +/*
> + * This is for newly allocated space that isn't accounted in
> + * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
> + * we use this helper.
> + */
> +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_space_info *space_info,
> +				     u64 num_bytes)
> +{
> +	struct reserve_ticket *ticket;
> +	struct list_head *head = &space_info->priority_tickets;
> +
> +again:
> +	while (!list_empty(head) && num_bytes) {
> +		ticket = list_first_entry(head, struct reserve_ticket,
> +					  list);
> +		if (num_bytes >= ticket->bytes) {
> +			trace_btrfs_space_reservation(fs_info, "space_info",
> +						      space_info->flags,
> +						      ticket->bytes, 1);
> +			list_del_init(&ticket->list);
> +			num_bytes -= ticket->bytes;
> +			space_info->bytes_may_use += ticket->bytes;
> +			ticket->bytes = 0;
> +			wake_up(&ticket->wait);
> +		} else {
> +			trace_btrfs_space_reservation(fs_info, "space_info",
> +						      space_info->flags,
> +						      num_bytes, 1);
> +			space_info->bytes_may_use += num_bytes;
> +			ticket->bytes -= num_bytes;
> +			num_bytes = 0;
> +		}
> +	}
> +
> +	if (num_bytes && head == &space_info->priority_tickets) {
> +		head = &space_info->tickets;
> +		goto again;
> +	}
> +}
> +
>  static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
>  				    struct btrfs_block_rsv *block_rsv,
>  				    struct btrfs_block_rsv *dest, u64 num_bytes)
> @@ -5182,13 +5396,9 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
>  			}
>  			spin_unlock(&dest->lock);
>  		}
> -		if (num_bytes) {
> -			spin_lock(&space_info->lock);
> -			space_info->bytes_may_use -= num_bytes;
> -			trace_btrfs_space_reservation(fs_info, "space_info",
> -					space_info->flags, num_bytes, 0);
> -			spin_unlock(&space_info->lock);
> -		}
> +		if (num_bytes)
> +			space_info_add_old_bytes(fs_info, space_info,
> +						 num_bytes);
>  	}
>  }
>  
> @@ -6346,17 +6556,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
>  			readonly = true;
>  		}
>  		spin_unlock(&cache->lock);
> -		if (!readonly && global_rsv->space_info == space_info) {
> +		if (!readonly && return_free_space &&
> +		    global_rsv->space_info == space_info) {
> +			u64 to_add = len;
> +			WARN_ON(!return_free_space);
>  			spin_lock(&global_rsv->lock);
>  			if (!global_rsv->full) {
> -				len = min(len, global_rsv->size -
> -					  global_rsv->reserved);
> -				global_rsv->reserved += len;
> -				space_info->bytes_may_use += len;
> +				to_add = min(len, global_rsv->size -
> +					     global_rsv->reserved);
> +				global_rsv->reserved += to_add;
> +				space_info->bytes_may_use += to_add;
>  				if (global_rsv->reserved >= global_rsv->size)
>  					global_rsv->full = 1;
> +				trace_btrfs_space_reservation(fs_info,
> +							      "space_info",
> +							      space_info->flags,
> +							      to_add, 1);
> +				len -= to_add;
>  			}
>  			spin_unlock(&global_rsv->lock);
> +			/* Add to any tickets we may have */
> +			if (len)
> +				space_info_add_new_bytes(fs_info, space_info,
> +							 len);
>  		}
>  		spin_unlock(&space_info->lock);
>  	}
> -- 
> 2.5.0
> 

Thread overview: 26+ messages
2016-03-25 17:25 [PATCH 00/14] Enospc rework Josef Bacik
2016-03-25 17:25 ` [PATCH 01/14] Btrfs: add bytes_readonly to the spaceinfo at once Josef Bacik
2016-03-25 17:25 ` [PATCH 02/14] Btrfs: fix callers of btrfs_block_rsv_migrate Josef Bacik
2016-03-25 17:25 ` [PATCH 03/14] Btrfs: always reserve metadata for delalloc extents Josef Bacik
2016-03-25 18:04   ` Liu Bo
2016-03-25 17:25 ` [PATCH 04/14] Btrfs: change delayed reservation fallback behavior Josef Bacik
2016-03-25 17:25 ` [PATCH 05/14] Btrfs: warn_on for unaccounted spaces Josef Bacik
2016-06-27  4:47   ` Qu Wenruo
2016-06-27 13:03     ` Chris Mason
2016-06-28  0:16       ` Qu Wenruo
2016-03-25 17:25 ` [PATCH 06/14] Btrfs: add tracepoint for adding block groups Josef Bacik
2016-03-25 17:25 ` [PATCH 07/14] Btrfs: introduce ticketed enospc infrastructure Josef Bacik
2016-05-09 21:29   ` Liu Bo [this message]
2016-05-17 17:30   ` [PATCH V2] " Josef Bacik
2016-05-18 11:24     ` Austin S. Hemmelgarn
2016-05-19 12:47       ` Austin S. Hemmelgarn
2016-05-18 22:46     ` David Sterba
2016-03-25 17:25 ` [PATCH 08/14] Btrfs: trace pinned extents Josef Bacik
2016-03-25 17:25 ` [PATCH 09/14] Btrfs: fix delalloc reservation amount tracepoint Josef Bacik
2016-03-25 17:25 ` [PATCH 10/14] Btrfs: add tracepoints for flush events Josef Bacik
2016-03-25 17:25 ` [PATCH 11/14] Btrfs: add fsid to some tracepoints Josef Bacik
2016-03-25 17:25 ` [PATCH 12/14] Btrfs: fix release reserved extents trace points Josef Bacik
2016-05-09 21:33   ` Liu Bo
2016-03-25 17:25 ` [PATCH 13/14] Btrfs: don't bother kicking async if there's nothing to reclaim Josef Bacik
2016-03-25 17:26 ` [PATCH 14/14] Btrfs: don't do nocow check unless we have to Josef Bacik
2016-03-25 17:50   ` Liu Bo
