All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Cc: akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	yumiko.sugita.yf@hitachi.com, masami.hiramatsu.pt@hitachi.com,
	hidehiro.kawai.ez@hitachi.com, yuji.kakutani.uw@hitachi.com,
	soshima@redhat.com, haoki@redhat.com,
	kamezawa.hiroyu@jp.fujitsu.com, nikita@clusterfs.com,
	leroy.vanlogchem@wldelft.nl, Dave Chinner <dgc@sgi.com>
Subject: Re: [PATCH 0/3] VM throttling: avoid blocking occasional writers
Date: Wed, 14 Mar 2007 14:18:22 +0100	[thread overview]
Message-ID: <1173878302.25356.15.camel@twins> (raw)
In-Reply-To: <45F7EDC6.5090303@hitachi.com>

Hi,

I've been working on an alternative solution (see patch below). However
I haven't posted yet because I'm not quite satisfied and haven't done a
lot of testing.

The patch relies on the per backing dev dirty/writeback counts currently
in -mm to which David Chinner objected. I plan to rework those as percpu
counters.

I think my solution might behave better because it fully decouples the
device throttling.

---

Scale writeback cache per backing device, proportional to its writeout speed.

akpm sayeth:
> Which problem are we trying to solve here?  afaik our two uppermost
> problems are:
> 
> a) Heavy write to queue A causes light writer to queue B to block for a long
> time in balance_dirty_pages().  Even if the devices have the same speed.  

This one; esp when not the same speed. The - my usb stick makes my
computer suck - problem. But even on similar speed, the separation of
devices should avoid blocking dev B when dev A is being throttled.

The writeout speed is measured dynamically, so when it doesn't have
anything to write out for a while its writeback cache size goes to 0.

Conversely, when starting up it will in the beginning act almost
synchronously but will quickly build up a 'fair' share of the writeback
cache.

> b) heavy write to device A causes light write to device A to block for a
> long time in balance_dirty_pages(), occasionally.  Harder to fix.

This will indeed take more. I've thought about it though. But one
quickly ends up with per task state.


How it all works:

We pick a 2^n value based on the vm_dirty_ratio and total vm size to act as a
period - vm_cycle_shift. This period measures 'time' in writeout events.

Each writeout increases time and adds to a per bdi counter. This counter is 
halved when a period expires. So per bdi speed is:

  0.5 * (previous cycle speed) + this cycle's events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 block/ll_rw_blk.c           |    3 +
 include/linux/backing-dev.h |    7 +++
 include/linux/writeback.h   |   10 ++++
 kernel/sysctl.c             |   10 +++-
 mm/page-writeback.c         |  102 ++++++++++++++++++++++++++++++++++++++------
 5 files changed, 119 insertions(+), 13 deletions(-)

Index: linux-2.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.orig/include/linux/backing-dev.h
+++ linux-2.6/include/linux/backing-dev.h
@@ -34,6 +34,13 @@ struct backing_dev_info {
 	void *congested_data;	/* Pointer to aux data for congested func */
 	void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
 	void *unplug_io_data;
+
+	/*
+	 * data used for scaling the writeback cache
+	 */
+	spinlock_t lock;		/* protect the cycle count */
+	atomic_long_t nr_writeout;	/* writeout scale */
+	unsigned long cycles;		/* writeout cycles */
 };
 
 
Index: linux-2.6/include/linux/writeback.h
===================================================================
--- linux-2.6.orig/include/linux/writeback.h
+++ linux-2.6/include/linux/writeback.h
@@ -4,6 +4,8 @@
 #ifndef WRITEBACK_H
 #define WRITEBACK_H
 
+#include <linux/log2.h>
+
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
@@ -89,11 +91,19 @@ void throttle_vm_writeout(gfp_t gfp_mask
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
+extern int vm_cycle_shift;
 extern int dirty_writeback_interval;
 extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
+extern long vm_total_pages; /* reduce dependancy stuff */
+static inline void update_cycle_shift(void)
+{
+	unsigned long dirty_pages = (vm_dirty_ratio * vm_total_pages) / 100;
+	vm_cycle_shift = 2 + ilog2_up(int_sqrt(dirty_pages));
+}
+
 struct ctl_table;
 struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -612,6 +612,14 @@ static ctl_table kern_table[] = {
 static int zero;
 static int one_hundred = 100;
 
+static int proc_dointvec_vm_dirty_ratio(ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	update_cycle_shift();
+	return ret;
+}
 
 static ctl_table vm_table[] = {
 	{
@@ -663,7 +671,7 @@ static ctl_table vm_table[] = {
 		.data		= &vm_dirty_ratio,
 		.maxlen		= sizeof(vm_dirty_ratio),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_dointvec_vm_dirty_ratio,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c
+++ linux-2.6/mm/page-writeback.c
@@ -73,6 +73,9 @@ int dirty_background_ratio = 10;
  * The generator of dirty data starts writeback at this percentage
  */
 int vm_dirty_ratio = 40;
+int vm_cycle_shift;
+
+static DEFINE_PER_CPU(unsigned long, vm_writeout) = {0};
 
 /*
  * The interval between `kupdate'-style writebacks, in jiffies
@@ -102,6 +105,55 @@ EXPORT_SYMBOL(laptop_mode);
 
 static void background_writeout(unsigned long _min_pages);
 
+static unsigned long bdi_total_writeout(void)
+{
+	int cpu;
+	unsigned long sum = 0;
+	for_each_possible_cpu(cpu)
+		sum += per_cpu(vm_writeout, cpu);
+	return sum;
+}
+
+static void bdi_writeout_norm(struct backing_dev_info *bdi)
+{
+	int bits = vm_cycle_shift;
+	unsigned long cycle = 1UL << bits;
+	unsigned long mask = ~(cycle - 1);
+	unsigned long total = bdi_total_writeout() << 1;
+
+	if ((bdi->cycles & mask) == (total & mask))
+		return;
+
+	spin_lock(&bdi->lock);
+	while ((bdi->cycles & mask) != (total & mask)) {
+		atomic_long_sub(atomic_long_read(&bdi->nr_writeout) / 2,
+				&bdi->nr_writeout);
+		bdi->cycles += cycle;
+	}
+	spin_unlock(&bdi->lock);
+}
+
+static void bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	get_cpu_var(vm_writeout)++;
+	put_cpu();
+
+	if (!(atomic_long_inc_return(&bdi->nr_writeout) & 0x7))
+		bdi_writeout_norm(bdi);
+}
+
+static void
+get_writeout_scale(struct address_space *mapping, int *scale, int *div)
+{
+	int bits = vm_cycle_shift - 1;
+	unsigned long total = bdi_total_writeout();
+	unsigned long cycle = 1UL << bits;
+	unsigned long mask = cycle - 1;
+
+	*scale = atomic_long_read(&mapping->backing_dev_info->nr_writeout);
+	*div = cycle + (total & mask);
+}
+
 /*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
@@ -120,7 +172,7 @@ static void background_writeout(unsigned
  * clamping level.
  */
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
 					struct address_space *mapping)
 {
 	int background_ratio;		/* Percentages */
@@ -163,6 +215,21 @@ get_dirty_limits(long *pbackground, long
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (mapping) {
+		long long tmp = dirty;
+		int scale, div;
+
+		get_writeout_scale(mapping, &scale, &div);
+
+		if (scale > div)
+			scale = div;
+
+		tmp = (tmp * 122) >> 7; /* take ~95% of total dirty value */
+		tmp *= scale;
+		do_div(tmp, div);
+		*pbdi_dirty = (long)tmp;
+	}
 }
 
 /*
@@ -177,6 +244,7 @@ static void balance_dirty_pages(struct a
 	long nr_reclaimable;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -191,11 +259,15 @@ static void balance_dirty_pages(struct a
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, mapping);
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		if ((nr_reclaimable + global_page_state(NR_WRITEBACK) <=
+			dirty_thresh) &&
+		    (atomic_long_read(&bdi->nr_dirty) +
+		     atomic_long_read(&bdi->nr_writeback) <=
+		     	bdi_thresh))
 				break;
 
 		if (!dirty_exceeded)
@@ -209,14 +281,18 @@ static void balance_dirty_pages(struct a
 		 */
 		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					 	&dirty_thresh, mapping);
+
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+				       &bdi_thresh, mapping);
 			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-						break;
+			if ((nr_reclaimable + global_page_state(NR_WRITEBACK) <=
+				dirty_thresh) &&
+			    (atomic_long_read(&bdi->nr_dirty) +
+			     atomic_long_read(&bdi->nr_writeback) <=
+				 bdi_thresh))
+				break;
+
 			pages_written += write_chunk - wbc.nr_to_write;
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
@@ -312,7 +388,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
 	}
 
         for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -347,7 +423,7 @@ static void background_writeout(unsigned
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
@@ -555,6 +631,7 @@ void __init page_writeback_init(void)
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+	update_cycle_shift();
 }
 
 /**
@@ -935,6 +1012,7 @@ int test_clear_page_writeback(struct pag
 						PAGECACHE_TAG_WRITEBACK);
 			atomic_long_dec(&mapping->backing_dev_info->
 					nr_writeback);
+			bdi_writeout_inc(mapping->backing_dev_info);
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
Index: linux-2.6/block/ll_rw_blk.c
===================================================================
--- linux-2.6.orig/block/ll_rw_blk.c
+++ linux-2.6/block/ll_rw_blk.c
@@ -215,6 +215,9 @@ void blk_queue_make_request(request_queu
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	atomic_long_set(&bdi->nr_dirty, 0);
 	atomic_long_set(&bdi->nr_writeback, 0);
+	spin_lock_init(&bdi->lock);
+	atomic_long_set(&bdi->nr_writeout, 0);
+	bdi->cycles = 0;
 	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
 	blk_queue_hardsect_size(q, 512);
 	blk_queue_dma_alignment(q, 511);




  reply	other threads:[~2007-03-14 13:18 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-03-14 12:42 [PATCH 0/3] VM throttling: avoid blocking occasional writers Tomoki Sekiyama
2007-03-14 13:18 ` Peter Zijlstra [this message]
2007-03-15 19:07 ` Andrew Morton
2007-03-18 14:59   ` Bill Davidsen
2007-03-22  5:49     ` Tomoki Sekiyama
2007-03-22 11:41       ` Bill Davidsen
2007-03-26 10:27         ` Tomoki Sekiyama
2007-03-26 17:11           ` Bill Davidsen
2007-04-03 10:42             ` Tomoki Sekiyama
2007-04-03 10:46             ` [PATCH 1/2] VM throttling: Start writeback at dirty_writeback_start_ratio Tomoki Sekiyama
2007-04-06  0:31               ` Andrew Morton
2007-04-10  3:04                 ` Tomoki Sekiyama
2007-04-10  3:46                   ` Andrew Morton
2007-04-03 10:47             ` [PATCH 2/2] VM throttling: Add vm.dirty_start_writeback_ratio to sysctl Tomoki Sekiyama

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1173878302.25356.15.camel@twins \
    --to=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=dgc@sgi.com \
    --cc=haoki@redhat.com \
    --cc=hidehiro.kawai.ez@hitachi.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=leroy.vanlogchem@wldelft.nl \
    --cc=linux-kernel@vger.kernel.org \
    --cc=masami.hiramatsu.pt@hitachi.com \
    --cc=nikita@clusterfs.com \
    --cc=soshima@redhat.com \
    --cc=tomoki.sekiyama.qu@hitachi.com \
    --cc=yuji.kakutani.uw@hitachi.com \
    --cc=yumiko.sugita.yf@hitachi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.