Re: + mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-c ache-references.patch added to -mm tree

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Wu Fengguang <fengguang.wu@intel.com>
To: Richard Kennedy <richard@rsk.demon.co.uk>
Cc: "akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	"mm-commits@vger.kernel.org" <mm-commits@vger.kernel.org>,
	"a.p.zijlstra@chello.nl" <a.p.zijlstra@chello.nl>,
	"chris.mason@oracle.com" <chris.mason@oracle.com>,
	"jens.axboe@oracle.com" <jens.axboe@oracle.com>,
	"mbligh@mbligh.org" <mbligh@mbligh.org>,
	"miklos@szeredi.hu" <miklos@szeredi.hu>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>
Subject: Re: + mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-c ache-references.patch added to -mm tree
Date: Sun, 23 Aug 2009 21:00:56 +0800	[thread overview]
Message-ID: <20090823130056.GA10596@localhost> (raw)
In-Reply-To: <1251020013.2270.24.camel@castor>

On Sun, Aug 23, 2009 at 05:33:33PM +0800, Richard Kennedy wrote:
> On Sat, 2009-08-22 at 10:51 +0800, Wu Fengguang wrote:
> 
> > > 
> > >  mm/page-writeback.c |  116 +++++++++++++++---------------------------
> > >  1 file changed, 43 insertions(+), 73 deletions(-)
> > > 
> > > diff -puN mm/page-writeback.c~mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-cache-references mm/page-writeback.c
> > > --- a/mm/page-writeback.c~mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-cache-references
> > > +++ a/mm/page-writeback.c
> > > @@ -249,32 +249,6 @@ static void bdi_writeout_fraction(struct
> > >  	}
> > >  }
> > >  
> > > -/*
> > > - * Clip the earned share of dirty pages to that which is actually available.
> > > - * This avoids exceeding the total dirty_limit when the floating averages
> > > - * fluctuate too quickly.
> > > - */
> > > -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
> > > -		unsigned long dirty, unsigned long *pbdi_dirty)
> > > -{
> > > -	unsigned long avail_dirty;
> > > -
> > > -	avail_dirty = global_page_state(NR_FILE_DIRTY) +
> > > -		 global_page_state(NR_WRITEBACK) +
> > > -		 global_page_state(NR_UNSTABLE_NFS) +
> > > -		 global_page_state(NR_WRITEBACK_TEMP);
> > > -
> > > -	if (avail_dirty < dirty)
> > > -		avail_dirty = dirty - avail_dirty;
> > > -	else
> > > -		avail_dirty = 0;
> > > -
> > > -	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
> > > -		bdi_stat(bdi, BDI_WRITEBACK);
> > > -
> > > -	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
> > > -}
> > > -
> > >  static inline void task_dirties_fraction(struct task_struct *tsk,
> > >  		long *numerator, long *denominator)
> > >  {
> > > @@ -465,7 +439,6 @@ get_dirty_limits(unsigned long *pbackgro
> > >  			bdi_dirty = dirty * bdi->max_ratio / 100;
> > >  
> > >  		*pbdi_dirty = bdi_dirty;
> > > -		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
> > >  		task_dirty_limit(current, pbdi_dirty);
> > >  	}
> > >  }
> > > @@ -499,45 +472,12 @@ static void balance_dirty_pages(struct a
> > >  		};
> > >  
> > >  		get_dirty_limits(&background_thresh, &dirty_thresh,
> > > -				&bdi_thresh, bdi);
> > > +				 &bdi_thresh, bdi);
> > >  
> > >  		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > > -					global_page_state(NR_UNSTABLE_NFS);
> > > -		nr_writeback = global_page_state(NR_WRITEBACK);
> > > -
> > > -		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > > -		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > > -
> > > -		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > -			break;
> > > -
> > > -		/*
> > > -		 * Throttle it only when the background writeback cannot
> > > -		 * catch-up. This avoids (excessively) small writeouts
> > > -		 * when the bdi limits are ramping up.
> > > -		 */
> > > -		if (nr_reclaimable + nr_writeback <
> > > -				(background_thresh + dirty_thresh) / 2)
> > > -			break;
> > > -
> > > -		if (!bdi->dirty_exceeded)
> > > -			bdi->dirty_exceeded = 1;
> > > -
> > > -		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > > -		 * Unstable writes are a feature of certain networked
> > > -		 * filesystems (i.e. NFS) in which data may have been
> > > -		 * written to the server's write cache, but has not yet
> > > -		 * been flushed to permanent storage.
> > > -		 * Only move pages to writeback if this bdi is over its
> > > -		 * threshold otherwise wait until the disk writes catch
> > > -		 * up.
> > > -		 */
> > > -		if (bdi_nr_reclaimable > bdi_thresh) {
> > > -			generic_sync_bdi_inodes(NULL, &wbc);
> > > -			pages_written += write_chunk - wbc.nr_to_write;
> > > -			get_dirty_limits(&background_thresh, &dirty_thresh,
> > > -				       &bdi_thresh, bdi);
> > > -		}
> > > +			global_page_state(NR_UNSTABLE_NFS);
> > > +		nr_writeback = global_page_state(NR_WRITEBACK) +
> > > +			global_page_state(NR_WRITEBACK_TEMP);
> > >  
> > >  		/*
> > >  		 * In order to avoid the stacked BDI deadlock we need
> > > @@ -557,16 +497,48 @@ static void balance_dirty_pages(struct a
> > >  			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > >  		}
> > >  
> > > -		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > -			break;
> > > -		if (pages_written >= write_chunk)
> > > -			break;		/* We've done our duty */
> > 
> > > +		/* always throttle if over threshold */
> > > +		if (nr_reclaimable + nr_writeback < dirty_thresh) {
> > 
> > That 'if' is a big behavior change. It effectively blocks every one
> > and canceled Peter's proportional throttling work: the less a process
> > dirtied, the less it should be throttled.
> > 
> I don't think it does. the code ends up looking like
> 
> FOR
> 	IF less than dirty_thresh THEN
> 		check bdi limits etc	
> 	ENDIF
> 
> 	thottle
> ENDFOR
> 
> Therefore we always throttle when over the threshold otherwise we apply
> the per bdi limits to decide if we throttle.
> 
> In the existing code clip_bdi_dirty_limit modified the bdi_thresh so
> that it would not let a bdi dirty enough pages to go over the
> dirty_threshold. All I've done is to bring the check of dirty_thresh up
> into balance_dirty_pages.
> 
> So isn't this effectively the same ?

Yes and no. For the bdi_thresh part it somehow makes the
clip_bdi_dirty_limit() logic more simple and obvious. Which I tend to
agree with you and Peter on doing something like this:

        if (nr_reclaimable + nr_writeback < dirty_thresh) {
                /* compute bdi_* */
                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
         		break;
        }

For other two 'if's..
 
> > I'd propose to remove the above 'if' and liberate the following three 'if's.
> > 
> > > +
> > > +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > +				break;
> > > +
> > > +			/*
> > > +			 * Throttle it only when the background writeback cannot
> > > +			 * catch-up. This avoids (excessively) small writeouts
> > > +			 * when the bdi limits are ramping up.
> > > +			 */
> > > +			if (nr_reclaimable + nr_writeback <
> > > +			    (background_thresh + dirty_thresh) / 2)
> > > +				break;

That 'if' can be trivially moved out.

> > > +
> > > +			/* done enough? */
> > > +			if (pages_written >= write_chunk)
> > > +				break;

That 'if' must be moved out, otherwise it can block a light writer
for ever, as long as there is another heavy dirtier keeps the dirty
numbers high.

> > > +		}
> > > +		if (!bdi->dirty_exceeded)
> > > +			bdi->dirty_exceeded = 1;
> > >  
> > > +		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > > +		 * Unstable writes are a feature of certain networked
> > > +		 * filesystems (i.e. NFS) in which data may have been
> > > +		 * written to the server's write cache, but has not yet
> > > +		 * been flushed to permanent storage.

> > > +		 * Only move pages to writeback if this bdi is over its
> > > +		 * threshold otherwise wait until the disk writes catch
> > > +		 * up.
> > > +		 */
> > > +		if (bdi_nr_reclaimable > bdi_thresh) {

I'd much prefer its original form

                if (bdi_nr_reclaimable) {

Let's push dirty pages to disk ASAP :)

> > > +			writeback_inodes(&wbc);
> > > +			pages_written += write_chunk - wbc.nr_to_write;
> > 
> > > +			if (wbc.nr_to_write == 0)
> > > +				continue;
> > 
> > What's the purpose of the above 2 lines?
> 
> This is to try to replicate the existing code as closely as possible.
> 
> If writeback_inodes wrote write_chunk pages in one pass then skip to the
> top of the loop to recheck the limits and decide if we can let the
> application continue. Otherwise it's not making enough forward progress
> due to congestion so do the congestion_wait & loop. 

It makes sense. We have wbc.encountered_congestion for that purpose.
However it may not able to write enough pages for other reasons like
lock contention. So I'd suggest to test (wbc.nr_to_write <= 0).

Thanks,
Fengguang

WARNING: multiple messages have this Message-ID (diff)

From: Wu Fengguang <fengguang.wu@intel.com>
To: Richard Kennedy <richard@rsk.demon.co.uk>
Cc: "akpm@linux-foundation.org" <akpm@linux-foundation.org>,
	"mm-commits@vger.kernel.org" <mm-commits@vger.kernel.org>,
	"a.p.zijlstra@chello.nl" <a.p.zijlstra@chello.nl>,
	"chris.mason@oracle.com" <chris.mason@oracle.com>,
	"jens.axboe@oracle.com" <jens.axboe@oracle.com>,
	"mbligh@mbligh.org" <mbligh@mbligh.org>,
	"miklos@szeredi.hu" <miklos@szeredi.hu>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>
Subject: Re: + mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-c ache-references.patch added to -mm tree
Date: Sun, 23 Aug 2009 21:00:56 +0800	[thread overview]
Message-ID: <20090823130056.GA10596@localhost> (raw)
In-Reply-To: <1251020013.2270.24.camel@castor>

On Sun, Aug 23, 2009 at 05:33:33PM +0800, Richard Kennedy wrote:
> On Sat, 2009-08-22 at 10:51 +0800, Wu Fengguang wrote:
> 
> > > 
> > >  mm/page-writeback.c |  116 +++++++++++++++---------------------------
> > >  1 file changed, 43 insertions(+), 73 deletions(-)
> > > 
> > > diff -puN mm/page-writeback.c~mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-cache-references mm/page-writeback.c
> > > --- a/mm/page-writeback.c~mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-cache-references
> > > +++ a/mm/page-writeback.c
> > > @@ -249,32 +249,6 @@ static void bdi_writeout_fraction(struct
> > >  	}
> > >  }
> > >  
> > > -/*
> > > - * Clip the earned share of dirty pages to that which is actually available.
> > > - * This avoids exceeding the total dirty_limit when the floating averages
> > > - * fluctuate too quickly.
> > > - */
> > > -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
> > > -		unsigned long dirty, unsigned long *pbdi_dirty)
> > > -{
> > > -	unsigned long avail_dirty;
> > > -
> > > -	avail_dirty = global_page_state(NR_FILE_DIRTY) +
> > > -		 global_page_state(NR_WRITEBACK) +
> > > -		 global_page_state(NR_UNSTABLE_NFS) +
> > > -		 global_page_state(NR_WRITEBACK_TEMP);
> > > -
> > > -	if (avail_dirty < dirty)
> > > -		avail_dirty = dirty - avail_dirty;
> > > -	else
> > > -		avail_dirty = 0;
> > > -
> > > -	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
> > > -		bdi_stat(bdi, BDI_WRITEBACK);
> > > -
> > > -	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
> > > -}
> > > -
> > >  static inline void task_dirties_fraction(struct task_struct *tsk,
> > >  		long *numerator, long *denominator)
> > >  {
> > > @@ -465,7 +439,6 @@ get_dirty_limits(unsigned long *pbackgro
> > >  			bdi_dirty = dirty * bdi->max_ratio / 100;
> > >  
> > >  		*pbdi_dirty = bdi_dirty;
> > > -		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
> > >  		task_dirty_limit(current, pbdi_dirty);
> > >  	}
> > >  }
> > > @@ -499,45 +472,12 @@ static void balance_dirty_pages(struct a
> > >  		};
> > >  
> > >  		get_dirty_limits(&background_thresh, &dirty_thresh,
> > > -				&bdi_thresh, bdi);
> > > +				 &bdi_thresh, bdi);
> > >  
> > >  		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
> > > -					global_page_state(NR_UNSTABLE_NFS);
> > > -		nr_writeback = global_page_state(NR_WRITEBACK);
> > > -
> > > -		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > > -		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > > -
> > > -		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > -			break;
> > > -
> > > -		/*
> > > -		 * Throttle it only when the background writeback cannot
> > > -		 * catch-up. This avoids (excessively) small writeouts
> > > -		 * when the bdi limits are ramping up.
> > > -		 */
> > > -		if (nr_reclaimable + nr_writeback <
> > > -				(background_thresh + dirty_thresh) / 2)
> > > -			break;
> > > -
> > > -		if (!bdi->dirty_exceeded)
> > > -			bdi->dirty_exceeded = 1;
> > > -
> > > -		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > > -		 * Unstable writes are a feature of certain networked
> > > -		 * filesystems (i.e. NFS) in which data may have been
> > > -		 * written to the server's write cache, but has not yet
> > > -		 * been flushed to permanent storage.
> > > -		 * Only move pages to writeback if this bdi is over its
> > > -		 * threshold otherwise wait until the disk writes catch
> > > -		 * up.
> > > -		 */
> > > -		if (bdi_nr_reclaimable > bdi_thresh) {
> > > -			generic_sync_bdi_inodes(NULL, &wbc);
> > > -			pages_written += write_chunk - wbc.nr_to_write;
> > > -			get_dirty_limits(&background_thresh, &dirty_thresh,
> > > -				       &bdi_thresh, bdi);
> > > -		}
> > > +			global_page_state(NR_UNSTABLE_NFS);
> > > +		nr_writeback = global_page_state(NR_WRITEBACK) +
> > > +			global_page_state(NR_WRITEBACK_TEMP);
> > >  
> > >  		/*
> > >  		 * In order to avoid the stacked BDI deadlock we need
> > > @@ -557,16 +497,48 @@ static void balance_dirty_pages(struct a
> > >  			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
> > >  		}
> > >  
> > > -		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > -			break;
> > > -		if (pages_written >= write_chunk)
> > > -			break;		/* We've done our duty */
> > 
> > > +		/* always throttle if over threshold */
> > > +		if (nr_reclaimable + nr_writeback < dirty_thresh) {
> > 
> > That 'if' is a big behavior change. It effectively blocks every one
> > and canceled Peter's proportional throttling work: the less a process
> > dirtied, the less it should be throttled.
> > 
> I don't think it does. the code ends up looking like
> 
> FOR
> 	IF less than dirty_thresh THEN
> 		check bdi limits etc	
> 	ENDIF
> 
> 	thottle
> ENDFOR
> 
> Therefore we always throttle when over the threshold otherwise we apply
> the per bdi limits to decide if we throttle.
> 
> In the existing code clip_bdi_dirty_limit modified the bdi_thresh so
> that it would not let a bdi dirty enough pages to go over the
> dirty_threshold. All I've done is to bring the check of dirty_thresh up
> into balance_dirty_pages.
> 
> So isn't this effectively the same ?

Yes and no. For the bdi_thresh part it somehow makes the
clip_bdi_dirty_limit() logic more simple and obvious. Which I tend to
agree with you and Peter on doing something like this:

        if (nr_reclaimable + nr_writeback < dirty_thresh) {
                /* compute bdi_* */
                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
         		break;
        }

For other two 'if's..
 
> > I'd propose to remove the above 'if' and liberate the following three 'if's.
> > 
> > > +
> > > +			if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
> > > +				break;
> > > +
> > > +			/*
> > > +			 * Throttle it only when the background writeback cannot
> > > +			 * catch-up. This avoids (excessively) small writeouts
> > > +			 * when the bdi limits are ramping up.
> > > +			 */
> > > +			if (nr_reclaimable + nr_writeback <
> > > +			    (background_thresh + dirty_thresh) / 2)
> > > +				break;

That 'if' can be trivially moved out.

> > > +
> > > +			/* done enough? */
> > > +			if (pages_written >= write_chunk)
> > > +				break;

That 'if' must be moved out, otherwise it can block a light writer
for ever, as long as there is another heavy dirtier keeps the dirty
numbers high.

> > > +		}
> > > +		if (!bdi->dirty_exceeded)
> > > +			bdi->dirty_exceeded = 1;
> > >  
> > > +		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
> > > +		 * Unstable writes are a feature of certain networked
> > > +		 * filesystems (i.e. NFS) in which data may have been
> > > +		 * written to the server's write cache, but has not yet
> > > +		 * been flushed to permanent storage.

> > > +		 * Only move pages to writeback if this bdi is over its
> > > +		 * threshold otherwise wait until the disk writes catch
> > > +		 * up.
> > > +		 */
> > > +		if (bdi_nr_reclaimable > bdi_thresh) {

I'd much prefer its original form

                if (bdi_nr_reclaimable) {

Let's push dirty pages to disk ASAP :)

> > > +			writeback_inodes(&wbc);
> > > +			pages_written += write_chunk - wbc.nr_to_write;
> > 
> > > +			if (wbc.nr_to_write == 0)
> > > +				continue;
> > 
> > What's the purpose of the above 2 lines?
> 
> This is to try to replicate the existing code as closely as possible.
> 
> If writeback_inodes wrote write_chunk pages in one pass then skip to the
> top of the loop to recheck the limits and decide if we can let the
> application continue. Otherwise it's not making enough forward progress
> due to congestion so do the congestion_wait & loop. 

It makes sense. We have wbc.encountered_congestion for that purpose.
However it may not able to write enough pages for other reasons like
lock contention. So I'd suggest to test (wbc.nr_to_write <= 0).

Thanks,
Fengguang

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2009-08-23 13:01 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-08-21 22:50 + mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-cache-references.patch added to -mm tree akpm
2009-08-22  2:51 ` + mm-balance_dirty_pages-reduce-calls-to-global_page_state-to-reduce-c ache-references.patch " Wu Fengguang
2009-08-22  2:51   ` Wu Fengguang
2009-08-22 18:11   ` Peter Zijlstra
2009-08-23  1:32     ` Wu Fengguang
2009-08-23  5:31       ` Peter Zijlstra
2009-08-23  7:27         ` Wu Fengguang
2009-08-23  7:45           ` Peter Zijlstra
2009-09-02  8:31     ` Peter Zijlstra
2009-09-02  9:57       ` Wu Fengguang
2009-09-02 10:45         ` Peter Zijlstra
2009-09-02 13:53           ` Richard Kennedy
2009-09-03  2:22             ` Wu Fengguang
2009-09-03  3:09               ` Wu Fengguang
2009-09-03  9:48               ` Richard Kennedy
2009-09-03 11:05                 ` Wu Fengguang
2009-09-03 12:26                   ` Richard Kennedy
2009-09-03  4:53           ` Wu Fengguang
2009-08-23  9:33   ` Richard Kennedy
2009-08-23  9:33     ` Richard Kennedy
2009-08-23 13:00     ` Wu Fengguang [this message]
2009-08-23 13:00       ` Wu Fengguang
2009-08-23 13:46       ` Richard Kennedy
2009-08-23 13:46         ` Richard Kennedy
2009-08-24  1:41         ` Wu Fengguang
2009-08-24  1:41           ` Wu Fengguang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090823130056.GA10596@localhost \
    --to=fengguang.wu@intel.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=chris.mason@oracle.com \
    --cc=jens.axboe@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mbligh@mbligh.org \
    --cc=miklos@szeredi.hu \
    --cc=mm-commits@vger.kernel.org \
    --cc=richard@rsk.demon.co.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.