From: Wu Fengguang <fengguang.wu@intel.com>
To: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>,
Chris Mason <chris.mason@oracle.com>,
Artem Bityutskiy <dedekind1@gmail.com>,
Jens Axboe <jens.axboe@oracle.com>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
"david@fromorbit.com" <david@fromorbit.com>,
"hch@infradead.org" <hch@infradead.org>,
"akpm@linux-foundation.org" <akpm@linux-foundation.org>,
Theodore Ts'o <tytso@mit.edu>
Subject: Re: [PATCH 8/8] vm: Add an tuning knob for vm.max_writeback_mb
Date: Thu, 1 Oct 2009 21:36:10 +0800
Message-ID: <20091001133610.GA7228@localhost>
In-Reply-To: <20090930115539.GA16074@duck.suse.cz>
On Wed, Sep 30, 2009 at 07:55:39PM +0800, Jan Kara wrote:
> > writeback: let balance_dirty_pages() wait on background writeback
> >
> > CC: Chris Mason <chris.mason@oracle.com>
> > CC: Dave Chinner <david@fromorbit.com>
> > CC: Jan Kara <jack@suse.cz>
> > CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > CC: Jens Axboe <jens.axboe@oracle.com>
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> > ---
> > fs/fs-writeback.c | 89 ++++++++++++++++++++++++++++++++--
> > include/linux/backing-dev.h | 15 +++++
> > mm/backing-dev.c | 4 +
> > mm/page-writeback.c | 43 ++--------------
> > 4 files changed, 109 insertions(+), 42 deletions(-)
> >
> > --- linux.orig/mm/page-writeback.c 2009-09-28 19:01:40.000000000 +0800
> > +++ linux/mm/page-writeback.c 2009-09-28 19:02:48.000000000 +0800
> > @@ -218,6 +218,10 @@ static inline void __bdi_writeout_inc(st
> > {
> > __prop_inc_percpu_max(&vm_completions, &bdi->completions,
> > bdi->max_prop_frac);
> > +
> > + if (atomic_read(&bdi->throttle_pages) < DIRTY_THROTTLE_PAGES_STOP &&
> > + atomic_dec_and_test(&bdi->throttle_pages))
> > + bdi_writeback_wakeup(bdi);
> > }
> >
> > void bdi_writeout_inc(struct backing_dev_info *bdi)
> > @@ -458,20 +462,10 @@ static void balance_dirty_pages(struct a
> > unsigned long background_thresh;
> > unsigned long dirty_thresh;
> > unsigned long bdi_thresh;
> > - unsigned long pages_written = 0;
> > - unsigned long pause = 1;
> > int dirty_exceeded;
> > struct backing_dev_info *bdi = mapping->backing_dev_info;
> >
> > for (;;) {
> > - struct writeback_control wbc = {
> > - .bdi = bdi,
> > - .sync_mode = WB_SYNC_NONE,
> > - .older_than_this = NULL,
> > - .nr_to_write = write_chunk,
> > - .range_cyclic = 1,
> > - };
> > -
> > nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > global_page_state(NR_UNSTABLE_NFS);
> > nr_writeback = global_page_state(NR_WRITEBACK) +
> > @@ -518,31 +512,7 @@ static void balance_dirty_pages(struct a
> > if (!bdi->dirty_exceeded)
> > bdi->dirty_exceeded = 1;
> >
> > - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > - * Unstable writes are a feature of certain networked
> > - * filesystems (i.e. NFS) in which data may have been
> > - * written to the server's write cache, but has not yet
> > - * been flushed to permanent storage.
> > - * Only move pages to writeback if this bdi is over its
> > - * threshold otherwise wait until the disk writes catch
> > - * up.
> > - */
> > - if (bdi_nr_reclaimable > bdi_thresh) {
> > - writeback_inodes_wbc(&wbc);
> > - pages_written += write_chunk - wbc.nr_to_write;
> > - /* don't wait if we've done enough */
> > - if (pages_written >= write_chunk)
> > - break;
> > - }
> > - schedule_timeout_interruptible(pause);
> > -
> > - /*
> > - * Increase the delay for each loop, up to our previous
> > - * default of taking a 100ms nap.
> > - */
> > - pause <<= 1;
> > - if (pause > HZ / 10)
> > - pause = HZ / 10;
> > + bdi_writeback_wait(bdi, write_chunk);
Added a "break;" line here: we can remove the loop now :)
> > }
> >
> > if (!dirty_exceeded && bdi->dirty_exceeded)
> > @@ -559,8 +529,7 @@ static void balance_dirty_pages(struct a
> > * In normal mode, we start background writeout at the lower
> > * background_thresh, to keep the amount of dirty memory low.
> > */
> > - if ((laptop_mode && pages_written) ||
> > - (!laptop_mode && (nr_reclaimable > background_thresh)))
> > + if (!laptop_mode && (nr_reclaimable > background_thresh))
> > bdi_start_writeback(bdi, NULL, 0);
> > }
> >
> > --- linux.orig/include/linux/backing-dev.h 2009-09-28 18:52:51.000000000 +0800
> > +++ linux/include/linux/backing-dev.h 2009-09-28 19:02:45.000000000 +0800
> > @@ -86,6 +86,13 @@ struct backing_dev_info {
> >
> > struct list_head work_list;
> >
> > + /*
> > + * dirtier process throttling
> > + */
> > + spinlock_t throttle_lock;
> > + struct list_head throttle_list; /* nr to sync for each task */
> > + atomic_t throttle_pages; /* nr to sync for head task */
> > +
> > struct device *dev;
> >
> > #ifdef CONFIG_DEBUG_FS
> > @@ -94,6 +101,12 @@ struct backing_dev_info {
> > #endif
> > };
> >
> > +/*
> > + * when no task is throttled, set throttle_pages to a value larger than
> > + * this, to avoid unnecessary atomic decrements.
> > + */
> > +#define DIRTY_THROTTLE_PAGES_STOP (1 << 22)
> > +
> > int bdi_init(struct backing_dev_info *bdi);
> > void bdi_destroy(struct backing_dev_info *bdi);
> >
> > @@ -105,6 +118,8 @@ void bdi_start_writeback(struct backing_
> > long nr_pages);
> > int bdi_writeback_task(struct bdi_writeback *wb);
> > int bdi_has_dirty_io(struct backing_dev_info *bdi);
> > +int bdi_writeback_wakeup(struct backing_dev_info *bdi);
> > +void bdi_writeback_wait(struct backing_dev_info *bdi, long nr_pages);
> >
> > extern spinlock_t bdi_lock;
> > extern struct list_head bdi_list;
> > --- linux.orig/fs/fs-writeback.c 2009-09-28 18:57:51.000000000 +0800
> > +++ linux/fs/fs-writeback.c 2009-09-28 19:02:45.000000000 +0800
> > @@ -25,6 +25,7 @@
> > #include <linux/blkdev.h>
> > #include <linux/backing-dev.h>
> > #include <linux/buffer_head.h>
> > +#include <linux/completion.h>
> > #include "internal.h"
> >
> > #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
> > @@ -136,14 +137,14 @@ static void wb_work_complete(struct bdi_
> > call_rcu(&work->rcu_head, bdi_work_free);
> > }
> >
> > -static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
> > +static void wb_clear_pending(struct backing_dev_info *bdi,
> > + struct bdi_work *work)
> > {
> > /*
> > * The caller has retrieved the work arguments from this work,
> > * drop our reference. If this is the last ref, delete and free it
> > */
> > if (atomic_dec_and_test(&work->pending)) {
> > - struct backing_dev_info *bdi = wb->bdi;
> >
> > spin_lock(&bdi->wb_lock);
> > list_del_rcu(&work->list);
> > @@ -275,6 +276,81 @@ void bdi_start_writeback(struct backing_
> > bdi_alloc_queue_work(bdi, &args);
> > }
> >
> > +struct dirty_throttle_task {
> > + long nr_pages;
> > + struct list_head list;
> > + struct completion complete;
> > +};
> > +
> > +void bdi_writeback_wait(struct backing_dev_info *bdi, long nr_pages)
> > +{
> > + struct dirty_throttle_task tt = {
> > + .nr_pages = nr_pages,
> > + .complete = COMPLETION_INITIALIZER_ONSTACK(tt.complete),
> > + };
> > + struct wb_writeback_args args = {
> > + .sync_mode = WB_SYNC_NONE,
> > + .nr_pages = LONG_MAX,
> > + .range_cyclic = 1,
> > + .for_background = 1,
> > + };
> > + struct bdi_work work;
> > +
> > + bdi_work_init(&work, &args);
> > + work.state |= WS_ONSTACK;
> > +
> > + /*
> > + * make sure we will be woken up by someone
> > + */
> > + bdi_queue_work(bdi, &work);
> This is wrong, you shouldn't submit the work like this because you'll
> have to wait for completion (wb_clear_pending below is just bogus). You
> should rather do bdi_start_writeback(bdi, NULL, 0).
No, I don't intend to wait for the completion of this work (that would
take too long). This bdi work is there to ensure that writeback IO
submission is in progress, so that __bdi_writeout_inc() will be called
to decrease bdi->throttle_pages, and when that counts down to 0, this
process is woken up.

The alternative way is to do

	if (no background work queued)
		bdi_start_writeback(bdi, NULL, 0)

It looks like a saner solution, thanks for the suggestion :)
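A rough sketch of what I have in mind (note that
bdi_has_background_work() is a hypothetical helper here, not an
existing function):

	/*
	 * Make sure some writeback is in flight, so that
	 * __bdi_writeout_inc() keeps getting called, but avoid
	 * queueing duplicate background work items.
	 */
	if (!bdi_has_background_work(bdi))
		bdi_start_writeback(bdi, NULL, 0);

That would avoid both the on-stack work item and the bogus
wb_clear_pending() call below.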
> > +
> > + /*
> > + * register throttle pages
> > + */
> > + spin_lock(&bdi->throttle_lock);
> > + if (list_empty(&bdi->throttle_list))
> > + atomic_set(&bdi->throttle_pages, nr_pages);
> > + list_add(&tt.list, &bdi->throttle_list);
> > + spin_unlock(&bdi->throttle_lock);
> > +
> > + wait_for_completion(&tt.complete);
> > + wb_clear_pending(bdi, &work); /* XXX */
For the above reason, I remove the work here and don't care whether it
has been executed, is still running, or was never seen at all -- we
have already been woken up. Sorry, I definitely "misused"
wb_clear_pending() for a slightly different purpose..

It doesn't really cancel the work if that work is already running. So
bdi_writeback_wait() achieves another goal: starting background
writeback if bdi-flush was previously idle.
> > +}
> > +
> > +/*
> > + * return 1 if there are more waiting tasks.
> > + */
> > +int bdi_writeback_wakeup(struct backing_dev_info *bdi)
> > +{
> > + struct dirty_throttle_task *tt;
> > +
> > + spin_lock(&bdi->throttle_lock);
> > + /*
> > + * remove and wakeup head task
> > + */
> > + if (!list_empty(&bdi->throttle_list)) {
> > + tt = list_entry(bdi->throttle_list.prev,
> > + struct dirty_throttle_task, list);
> > + list_del(&tt->list);
> > + complete(&tt->complete);
> > + }
> > + /*
> > + * update throttle pages
> > + */
> > + if (!list_empty(&bdi->throttle_list)) {
> > + tt = list_entry(bdi->throttle_list.prev,
> > + struct dirty_throttle_task, list);
> > + atomic_set(&bdi->throttle_pages, tt->nr_pages);
> > + } else {
> > + tt = NULL;
> > + atomic_set(&bdi->throttle_pages, DIRTY_THROTTLE_PAGES_STOP * 2);
> Why is the * 2 here?
Because we do a racy test in another place:
+ if (atomic_read(&bdi->throttle_pages) < DIRTY_THROTTLE_PAGES_STOP &&
+ atomic_dec_and_test(&bdi->throttle_pages))
+ bdi_writeback_wakeup(bdi);
The * 2 reduces the chance of hitting that race: parking the counter at
twice the stop value leaves DIRTY_THROTTLE_PAGES_STOP worth of headroom,
so stray decrements from racing __bdi_writeout_inc() calls cannot
quickly drag it back below the atomic_read() test threshold. It might
still be racy, but that's OK, because it's merely an optimization. It
would be perfectly correct to simply do
+ if (atomic_dec_and_test(&bdi->throttle_pages))
+ bdi_writeback_wakeup(bdi);
> > + }
> > + spin_unlock(&bdi->throttle_lock);
> > +
> > + return tt != NULL;
> > +}
> > +
> > /*
> > * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
> > * furthest end of its superblock's dirty-inode list.
> > @@ -788,8 +864,11 @@ static long wb_writeback(struct bdi_writ
> > * For background writeout, stop when we are below the
> > * background dirty threshold
> > */
> > - if (args->for_background && !over_bground_thresh())
> > + if (args->for_background && !over_bground_thresh()) {
> > + while (bdi_writeback_wakeup(wb->bdi))
> > + ; /* unthrottle all tasks */
> > break;
> > + }
> You probably didn't understand my comment in the previous email. This is
> too late to wake up all the tasks. There are two limits - background_limit
> (set to 5%) and dirty_limit (set to 10%). When the amount of dirty data is
> above background_limit, we start the writeback but we don't throttle tasks
> yet. We start throttling tasks only when the amount of dirty data on the
> bdi exceeds the part of the dirty limit belonging to the bdi. In the case
> of a single bdi, this means we start throttling threads only when 10% of
> memory is dirty. To keep this behavior, we have to wake up waiting threads
> as soon as their BDI gets below the dirty limit or when the global number
> of dirty pages gets below (background_limit + dirty_limit) / 2.
Sure, but the design goal is to wake up the throttled tasks in the
__bdi_writeout_inc() path instead of here. As long as some (background)
writeback is running, __bdi_writeout_inc() will be called and will wake
up the tasks. This "unthrottle all on exit of background writeback" is
merely a safeguard: once background writeback (which could have been
queued by the throttled task itself, in bdi_writeback_wait) exits, the
calls to __bdi_writeout_inc() are likely to stop.
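To recap the intended flow (just summarizing this patch, no new code):

1) balance_dirty_pages() calls bdi_writeback_wait(bdi, write_chunk),
   which queues a background work item, registers write_chunk pages in
   bdi->throttle_list, and sleeps on a completion.
2) As the flusher makes progress, __bdi_writeout_inc() decrements
   bdi->throttle_pages once per completed writeout.
3) When the counter reaches zero, bdi_writeback_wakeup() completes the
   head task and loads the next task's nr_pages into the counter.
4) If background writeback exits early (dirty pages fell below
   background_thresh), the "while (bdi_writeback_wakeup(wb->bdi))" loop
   in wb_writeback() unthrottles all remaining tasks as a safeguard.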
> >
> > wbc.more_io = 0;
> > wbc.encountered_congestion = 0;
> > @@ -911,7 +990,7 @@ long wb_do_writeback(struct bdi_writebac
> > * that we have seen this work and we are now starting it.
> > */
> > if (args.sync_mode == WB_SYNC_NONE)
> > - wb_clear_pending(wb, work);
> > + wb_clear_pending(bdi, work);
> >
> > wrote += wb_writeback(wb, &args);
> >
> > @@ -920,7 +999,7 @@ long wb_do_writeback(struct bdi_writebac
> > * notification when we have completed the work.
> > */
> > if (args.sync_mode == WB_SYNC_ALL)
> > - wb_clear_pending(wb, work);
> > + wb_clear_pending(bdi, work);
> > }
> >
> > /*
> > --- linux.orig/mm/backing-dev.c 2009-09-28 18:52:18.000000000 +0800
> > +++ linux/mm/backing-dev.c 2009-09-28 19:02:45.000000000 +0800
> > @@ -645,6 +645,10 @@ int bdi_init(struct backing_dev_info *bd
> > bdi->wb_mask = 1;
> > bdi->wb_cnt = 1;
> >
> > + spin_lock_init(&bdi->throttle_lock);
> > + INIT_LIST_HEAD(&bdi->throttle_list);
> > + atomic_set(&bdi->throttle_pages, DIRTY_THROTTLE_PAGES_STOP * 2);
> > +
> Again, why is * 2 here? I'd just set DIRTY_THROTTLE_PAGES_STOP to some
> magic value (like ~0) and use it directly...
See above. ~0 is not used because atomic_t only guarantees 24 bits of
data space (a historical limitation inherited from sparc32), and the
larger value also reduces a small race :)
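(A quick sanity check on the constant: DIRTY_THROTTLE_PAGES_STOP * 2 =
(1 << 22) * 2 = 1 << 23, which still fits within the 24 bits of data
space that atomic_t has historically been guaranteed to provide,
whereas ~0 would not.)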
Thanks,
Fengguang
> > for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
> > err = percpu_counter_init(&bdi->bdi_stat[i], 0);
> > if (err)
>
> Honza
> --
> Jan Kara <jack@suse.cz>
> SUSE Labs, CR