* [PATCH 00/13] IO-less dirty throttling v2
@ 2010-11-17 4:27 Wu Fengguang
From: Wu Fengguang @ 2010-11-17 4:27 UTC
To: Andrew Morton
Cc: Jan Kara, Christoph Hellwig, Dave Chinner, Theodore Ts'o,
Chris Mason, Peter Zijlstra, Mel Gorman, Rik van Riel,
KOSAKI Motohiro, Wu Fengguang, linux-mm, linux-fsdevel, LKML
Andrew,
This is a revised subset of "[RFC] soft and dynamic dirty throttling limits"
<http://thread.gmane.org/gmane.linux.kernel.mm/52966>.
The basic idea is to introduce a small region under the bdi dirty threshold.
The task will be throttled gently when stepping into the bottom of the region,
and throttled more and more aggressively as the bdi dirty+writeback page
count climbs towards the top of the region. At some point the application
will be throttled at just the right bandwidth to balance against the device
write bandwidth. (The first patch and the documentation have more details.)
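As a rough illustration only (a made-up linear curve with hypothetical
names; the actual formula lives in the first patch):

	/* Sketch: gentle at the bottom of the region, hard at the top. */
	static unsigned long pause_for(unsigned long bdi_dirty,
				       unsigned long region_bottom,
				       unsigned long region_top,
				       unsigned long max_pause)
	{
		if (bdi_dirty <= region_bottom)
			return 0;		/* below the region: no throttling */
		if (bdi_dirty >= region_top)
			return max_pause;	/* at the top: maximum pause */
		return max_pause * (bdi_dirty - region_bottom) /
		       (region_top - region_bottom);
	}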
Changes from initial RFC:
- adaptive rate limiting, to reduce overheads when under throttle threshold
- prevent overrunning dirty limit on lots of concurrent dirtiers
- add Documentation/filesystems/writeback-throttling-design.txt
- lower max pause time from 200ms to 100ms; min pause time from 10ms to 1 jiffy
- don't drop the laptop mode code
- update and comment the trace event
- benchmarks on concurrent dd and fs_mark covering both large and tiny files
- bdi->write_bandwidth updates should be rate limited on concurrent dirtiers,
otherwise it will drift fast and fluctuate
- don't call balance_dirty_pages_ratelimited() when writing to already
dirtied pages, otherwise the task will be throttled too much
The patches are based on 2.6.37-rc2 and Jan's sync livelock patches. For easier
access I put them in
git://git.kernel.org/pub/scm/linux/kernel/git/wfg/writeback.git dirty-throttling-v2
Wu Fengguang (12):
writeback: IO-less balance_dirty_pages()
writeback: consolidate variable names in balance_dirty_pages()
writeback: per-task rate limit on balance_dirty_pages()
writeback: prevent duplicate balance_dirty_pages_ratelimited() calls
writeback: bdi write bandwidth estimation
writeback: show bdi write bandwidth in debugfs
writeback: quit throttling when bdi dirty pages dropped
writeback: reduce per-bdi dirty threshold ramp up time
writeback: make reasonable gap between the dirty/background thresholds
writeback: scale down max throttle bandwidth on concurrent dirtiers
writeback: add trace event for balance_dirty_pages()
writeback: make nr_to_write a per-file limit
Jan Kara (1):
writeback: account per-bdi accumulated written pages
.../filesystems/writeback-throttling-design.txt | 210 +++++++++++++
fs/fs-writeback.c | 16 +
include/linux/backing-dev.h | 3 +
include/linux/sched.h | 7 +
include/linux/writeback.h | 14 +
include/trace/events/writeback.h | 61 ++++-
mm/backing-dev.c | 29 +-
mm/filemap.c | 5 +-
mm/memory_hotplug.c | 3 -
mm/page-writeback.c | 320 +++++++++++---------
10 files changed, 511 insertions(+), 157 deletions(-)
It runs smoothly on typical configurations. On small-memory systems the pause
time will fluctuate much more, due to the limited range for soft throttling.
The soft dirty threshold is now lowered to (background + dirty)/2 = 15%. So
the applications will be throttled a bit earlier, which may be perceived by
end users as a performance slowdown if their application happens to dirty a
bit more than 15% of memory. Note that the vanilla kernel also has this limit
at fresh boot: it starts checking the bdi limits once the global 15% is
exceeded, and since the bdi limit ramps up pretty slowly in common
configurations, the task is throttled immediately.
The task's think time is not yet considered when computing the pause time.
So it will throttle an "scp" over the network much harder than a local "cp".
Once the user-space think time is taken into account to ensure accurate
throttle bandwidth, we will effectively have created a simple write I/O
bandwidth controller.
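In sketch form (hypothetical names, assuming jiffies-based timekeeping),
taking think time into account would mean something like:

	/* Pause for the time the task should need to dirty these pages at
	 * the target bandwidth, minus the think time it already spent in
	 * user space since the last pause. */
	static long think_time_pause(long pages_dirtied, long target_bw,
				     unsigned long now, unsigned long t_last)
	{
		long period = pages_dirtied * HZ / target_bw;	/* jiffies */
		long pause = period - (long)(now - t_last);

		return pause > 0 ? pause : 0;
	}

so a task that spends most of each period thinking (like scp waiting on the
network) ends up barely paused, while a tight dirtier (like a local cp)
absorbs the full pause.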
On a simple test of 100 dd, it reduces the CPU %system time from 30% to 3%, and
improves IO throughput from 38MB/s to 42MB/s.
The fs_mark benchmark is interesting: the CPU overheads are almost cut in
half. Before the patch the benchmark is actually CPU bound; after the patch
it is IO bound, but strangely the throughput becomes slightly lower.
# ./fs_mark -D 10000 -S0 -n 100000 -s 1 -L 63 -d /mnt/scratch/0 -d /mnt/scratch/1 -d /mnt/scratch/2 -d /mnt/scratch/3 -d /mnt/scratch/4 -d /mnt/scratch/5 -d /mnt/scratch/6 -d /mnt/scratch/7 -d /mnt/scratch/8 -d /mnt/scratch/9 -d /mnt/scratch/10 -d /mnt/scratch/11
# Version 3.3, 12 thread(s) starting at Thu Nov 11 21:01:36 2010
# Sync method: NO SYNC: Test does not issue sync() or fsync() calls.
# Directories: Time based hash between directories across 10000 subdirectories with 180 seconds per subdirectory.
# File names: 40 bytes long, (16 initial bytes of time stamp with 24 random bytes at end of name)
# Files info: size 1 bytes, written with an IO size of 16384 bytes per write
# App overhead is time in microseconds spent in the test not doing file writing related system calls.
#
2.6.36
FSUse% Count Size Files/sec App Overhead
0 1200000 1 1261.7 524762513
0 2400000 1 1195.3 537844546
0 3600000 1 1231.9 496441566
1 4800000 1 1175.8 552421522
1 6000000 1 1191.6 558529735
1 7200000 1 1165.3 551178395
2 8400000 1 1175.0 533209632
2 9600000 1 1200.6 534862246
2 10800000 1 1181.2 540616486
2 12000000 1 1137.4 554551797
3 13200000 1 1143.7 563319651
3 14400000 1 1169.0 519527533
3 15600000 1 1184.0 533550370
4 16800000 1 1161.3 534358727
4 18000000 1 1193.4 521610050
4 19200000 1 1177.6 524117437
5 20400000 1 1172.6 506166634
5 21600000 1 1172.3 515725633
avg 1182.761 533488581.833
2.6.36+
FSUse% Count Size Files/sec App Overhead
0 1200000 1 1125.0 357885976
0 2400000 1 1155.6 288103795
0 3600000 1 1172.4 296521755
1 4800000 1 1136.0 301718887
1 6000000 1 1156.7 303605077
1 7200000 1 1102.9 288852150
2 8400000 1 1140.9 294894485
2 9600000 1 1148.0 314394450
2 10800000 1 1099.7 296365560
2 12000000 1 1153.6 316283083
3 13200000 1 1087.9 339988006
3 14400000 1 1183.9 270836344
3 15600000 1 1122.7 276400918
4 16800000 1 1132.1 285272223
4 18000000 1 1154.8 283424055
4 19200000 1 1202.5 294558877
5 20400000 1 1158.1 293971332
5 21600000 1 1159.4 287720335
5 22800000 1 1150.1 282987509
5 24000000 1 1150.7 283870613
6 25200000 1 1123.8 288094185
6 26400000 1 1152.1 296984323
6 27600000 1 1190.7 282403174
7 28800000 1 1088.6 290493643
7 30000000 1 1144.1 290311419
7 31200000 1 1186.0 290021271
7 32400000 1 1213.9 279465138
8 33600000 1 1117.3 275745401
avg 1146.768 294684785.143
I noticed that
1) BdiWriteback can grow very large. For example, bdi 8:16 has 72960kB of
writeback pages, however the disk IO queue can hold at most
nr_requests * max_sectors_kb = 128 * 512kB = 64MB of writeback pages. Maybe
xfs manages to create perfectly sequential layouts and writes, and the other
8MB of writeback pages are in flight inside the disk?
root@wfg-ne02 /cc/fs_mark-3.3/ne02-2.6.36+# g BdiWriteback /debug/bdi/8:*/*
/debug/bdi/8:0/stats:BdiWriteback: 0 kB
/debug/bdi/8:112/stats:BdiWriteback: 68352 kB
/debug/bdi/8:128/stats:BdiWriteback: 62336 kB
/debug/bdi/8:144/stats:BdiWriteback: 61824 kB
/debug/bdi/8:160/stats:BdiWriteback: 67328 kB
/debug/bdi/8:16/stats:BdiWriteback: 72960 kB
/debug/bdi/8:176/stats:BdiWriteback: 57984 kB
/debug/bdi/8:192/stats:BdiWriteback: 71936 kB
/debug/bdi/8:32/stats:BdiWriteback: 68352 kB
/debug/bdi/8:48/stats:BdiWriteback: 56704 kB
/debug/bdi/8:64/stats:BdiWriteback: 50304 kB
/debug/bdi/8:80/stats:BdiWriteback: 68864 kB
/debug/bdi/8:96/stats:BdiWriteback: 2816 kB
2) the 12 disks are not all 100% utilized. Not even close: sdd, sdf, sdh and
sdj are almost idle at the moment. Dozens of seconds later, some other disks
become idle. This happens both before and after the patch. There may be some
hidden bugs (unrelated to this patchset).
avg-cpu: %user %nice %system %iowait %steal %idle
0.17 0.00 97.87 1.08 0.00 0.88
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util
sda 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sdc 0.00 63.00 0.00 125.00 0.00 1909.33 30.55 3.88 31.65 6.57 82.13
sdd 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sde 0.00 19.00 0.00 112.00 0.00 1517.17 27.09 3.95 35.33 8.00 89.60
sdg 0.00 92.67 0.33 126.00 2.67 1773.33 28.12 14.83 120.78 7.73 97.60
sdf 0.00 32.33 0.00 91.67 0.00 1408.17 30.72 4.84 52.97 7.72 70.80
sdh 0.00 17.67 0.00 5.00 0.00 124.00 49.60 0.07 13.33 9.60 4.80
sdi 0.00 44.67 0.00 5.00 0.00 253.33 101.33 0.15 29.33 10.93 5.47
sdl 0.00 168.00 0.00 135.67 0.00 2216.33 32.67 6.41 45.42 5.75 78.00
sdk 0.00 225.00 0.00 123.00 0.00 2355.83 38.31 9.50 73.03 6.94 85.33
sdj 0.00 1.00 0.00 2.33 0.00 26.67 22.86 0.01 2.29 1.71 0.40
sdb 0.00 14.33 0.00 101.67 0.00 1278.00 25.14 2.02 19.95 7.16 72.80
sdm 0.00 150.33 0.00 144.33 0.00 2344.50 32.49 5.43 33.94 5.39 77.73
avg-cpu: %user %nice %system %iowait %steal %idle
0.12 0.00 98.63 0.83 0.00 0.42
Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util
sda 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sdc 0.00 105.67 0.00 127.33 0.00 1810.17 28.43 4.39 32.43 6.67 84.93
sdd 0.00 5.33 0.00 10.67 0.00 128.00 24.00 0.03 2.50 1.25 1.33
sde 0.00 180.33 0.33 107.67 2.67 2109.33 39.11 8.11 73.93 8.99 97.07
sdg 0.00 7.67 0.00 63.67 0.00 1387.50 43.59 1.45 24.29 11.08 70.53
sdf 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sdh 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sdi 0.00 62.67 0.00 94.67 0.00 1743.50 36.83 3.28 34.68 8.52 80.67
sdl 0.00 162.00 0.00 141.67 0.00 2295.83 32.41 7.09 51.79 6.14 86.93
sdk 0.00 34.33 0.00 143.67 0.00 1910.17 26.59 5.07 38.90 6.26 90.00
sdj 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
sdb 0.00 195.00 0.00 96.67 0.00 1949.50 40.33 5.54 57.23 8.39 81.07
sdm 0.00 155.00 0.00 143.00 0.00 2357.50 32.97 5.21 39.98 5.71 81.60
Thanks,
Fengguang
* [PATCH 03/13] writeback: per-task rate limit on balance_dirty_pages()
From: Wu Fengguang @ 2010-11-17 4:27 UTC
To: Andrew Morton
Cc: Jan Kara, Wu Fengguang, Christoph Hellwig, Dave Chinner,
Theodore Ts'o, Chris Mason, Peter Zijlstra, Mel Gorman,
Rik van Riel, KOSAKI Motohiro, linux-mm, linux-fsdevel, LKML
Try to limit the dirty throttle pause time to the range [1 jiffy, 100 ms],
by controlling how many pages may be dirtied before inserting a pause.
The dirty count will be billed directly to the task struct. Slow start
and quick back off are employed, so that the stable range will be biased
towards pauses below 50ms. Another goal is fine timing control for slow
devices, which may need to do a full 100ms pause for every single page.
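The adaptation rules can be sketched as follows (illustration only; the
names mirror the real code in the diff below, and the -1 return stands in
for recomputing the quota via ratelimit_pages()):

	/*
	 * Sketch: grow the per-task dirty quota slowly while pauses stay
	 * at 1 jiffy, back it off quickly once pauses reach 100ms (HZ/10).
	 */
	static int adapt_quota(int quota, unsigned long pause, int below_bg)
	{
		if (pause == 0 && below_bg)
			return -1;			/* recompute from scratch */
		if (pause == 1)
			return quota + (quota >> 5);	/* slow start: ~+3% */
		if (pause >= HZ / 10)
			return quota - ((quota + 2) >> 2); /* back off: ~-25% */
		return quota;
	}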
The switch from a per-cpu to a per-task rate limit makes it easier to exceed
the global dirty limit with a fork bomb, where each new task dirties 1 page,
sleeps for 10 minutes and then continues to dirty 1000 more pages. The caveat
is, when such a task dirties its first page, it may be granted a large
nr_dirtied_pause because nr_dirty is still low at that time. In this way lots
of tasks get free tickets to dirty more pages than allowed. The solution is
to disable rate limiting (i.e. to ignore nr_dirtied_pause) entirely once the
bdi becomes dirty exceeded.
Note that some filesystems will dirty a batch of pages before calling
balance_dirty_pages_ratelimited_nr(). This saves a little CPU overhead at
the cost of possibly overrunning the dirty limits a bit and/or, in the case
of very slow devices, pausing the application for much more than 100ms at a
time. This is a tradeoff, and seems a reasonable optimization as long as the
batch size is kept within a dozen pages.
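For example, a batching caller would look roughly like this (a hypothetical
helper, not taken from any particular filesystem):

	static void dirty_batch_of_pages(struct address_space *mapping,
					 struct page **pages, int nr)
	{
		int i;

		/* dirty the whole batch first, without throttling */
		for (i = 0; i < nr; i++)
			set_page_dirty(pages[i]);

		/* bill the batch in one go; keep nr within ~a dozen pages */
		balance_dirty_pages_ratelimited_nr(mapping, nr);
	}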
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
include/linux/sched.h | 7 ++
mm/memory_hotplug.c | 3 -
mm/page-writeback.c | 126 ++++++++++++++++++----------------------
3 files changed, 65 insertions(+), 71 deletions(-)
--- linux-next.orig/include/linux/sched.h 2010-11-15 21:43:52.000000000 +0800
+++ linux-next/include/linux/sched.h 2010-11-15 21:43:54.000000000 +0800
@@ -1473,6 +1473,13 @@ struct task_struct {
int make_it_fail;
#endif
struct prop_local_single dirties;
+ /*
+ * when (nr_dirtied >= nr_dirtied_pause), it's time to call
+ * balance_dirty_pages() for some dirty throttling pause
+ */
+ int nr_dirtied;
+ int nr_dirtied_pause;
+
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
--- linux-next.orig/mm/page-writeback.c 2010-11-15 21:43:52.000000000 +0800
+++ linux-next/mm/page-writeback.c 2010-11-16 14:13:14.000000000 +0800
@@ -36,12 +36,6 @@
#include <linux/pagevec.h>
#include <trace/events/writeback.h>
-/*
- * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
- * will look to see if it needs to force writeback or throttling.
- */
-static long ratelimit_pages = 32;
-
/* The following parameters are exported via /proc/sys/vm */
/*
@@ -452,6 +446,40 @@ unsigned long bdi_dirty_limit(struct bac
}
/*
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * will look to see if it needs to start dirty throttling.
+ *
+ * If ratelimit_pages is too low then big NUMA machines will call the expensive
+ * global_page_state() too often. So scale it adaptively to the safety margin
+ * (the number of pages we may dirty without exceeding the dirty limits).
+ */
+static unsigned long ratelimit_pages(struct backing_dev_info *bdi)
+{
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ unsigned long dirty_pages;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ dirty_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_WRITEBACK) +
+ global_page_state(NR_UNSTABLE_NFS);
+
+ if (dirty_pages <= (dirty_thresh + background_thresh) / 2)
+ goto out;
+
+ dirty_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ dirty_pages = bdi_stat(bdi, BDI_RECLAIMABLE) +
+ bdi_stat(bdi, BDI_WRITEBACK);
+
+ if (dirty_pages < dirty_thresh)
+ goto out;
+
+ return 1;
+out:
+ return 1 + int_sqrt(dirty_thresh - dirty_pages);
+}
+
+/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
* the caller to perform writeback if the system is over `vm_dirty_ratio'.
@@ -467,7 +495,7 @@ static void balance_dirty_pages(struct a
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long bw;
- unsigned long pause;
+ unsigned long pause = 0;
bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -549,6 +577,17 @@ pause:
if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
+ if (pause == 0 && nr_dirty < background_thresh)
+ current->nr_dirtied_pause = ratelimit_pages(bdi);
+ else if (pause == 1)
+ current->nr_dirtied_pause += current->nr_dirtied_pause >> 5;
+ else if (pause >= HZ/10)
+ /*
+ * when repeatedly writing 1 page per 100ms on slow devices,
+ * i -= (i+2)/4 can reach 1 but never drops to 0.
+ */
+ current->nr_dirtied_pause -= (current->nr_dirtied_pause+2) >> 2;
+
if (writeback_in_progress(bdi))
return;
@@ -575,8 +614,6 @@ void set_page_dirty_balance(struct page
}
}
-static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
-
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
@@ -586,36 +623,30 @@ static DEFINE_PER_CPU(unsigned long, bdp
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
- * On really big machines, get_writeback_state is expensive, so try to avoid
+ * On really big machines, global_page_state() is expensive, so try to avoid
* calling it too often (ratelimiting). But once we're over the dirty memory
- * limit we decrease the ratelimiting by a lot, to prevent individual processes
- * from overshooting the limit by (ratelimit_pages) each.
+ * limit we disable the ratelimiting, to prevent individual processes from
+ * overshooting the limit by (ratelimit_pages) each.
*/
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- unsigned long ratelimit;
- unsigned long *p;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+ current->nr_dirtied += nr_pages_dirtied;
- ratelimit = ratelimit_pages;
- if (mapping->backing_dev_info->dirty_exceeded)
- ratelimit = 8;
+ if (unlikely(!current->nr_dirtied_pause))
+ current->nr_dirtied_pause = ratelimit_pages(bdi);
/*
* Check the rate limiting. Also, we do not want to throttle real-time
* tasks in balance_dirty_pages(). Period.
*/
- preempt_disable();
- p = &__get_cpu_var(bdp_ratelimits);
- *p += nr_pages_dirtied;
- if (unlikely(*p >= ratelimit)) {
- ratelimit = *p;
- *p = 0;
- preempt_enable();
- balance_dirty_pages(mapping, ratelimit);
- return;
+ if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause ||
+ bdi->dirty_exceeded)) {
+ balance_dirty_pages(mapping, current->nr_dirtied);
+ current->nr_dirtied = 0;
}
- preempt_enable();
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
@@ -703,44 +734,6 @@ void laptop_sync_completion(void)
#endif
/*
- * If ratelimit_pages is too high then we can get into dirty-data overload
- * if a large number of processes all perform writes at the same time.
- * If it is too low then SMP machines will call the (expensive)
- * get_writeback_state too often.
- *
- * Here we set ratelimit_pages to a level which ensures that when all CPUs are
- * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
- * thresholds before writeback cuts in.
- *
- * But the limit should not be set too high. Because it also controls the
- * amount of memory which the balance_dirty_pages() caller has to write back.
- * If this is too large then the caller will block on the IO queue all the
- * time. So limit it to four megabytes - the balance_dirty_pages() caller
- * will write six megabyte chunks, max.
- */
-
-void writeback_set_ratelimit(void)
-{
- ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
- if (ratelimit_pages < 16)
- ratelimit_pages = 16;
- if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
- ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
-}
-
-static int __cpuinit
-ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
-{
- writeback_set_ratelimit();
- return NOTIFY_DONE;
-}
-
-static struct notifier_block __cpuinitdata ratelimit_nb = {
- .notifier_call = ratelimit_handler,
- .next = NULL,
-};
-
-/*
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
@@ -762,9 +755,6 @@ void __init page_writeback_init(void)
{
int shift;
- writeback_set_ratelimit();
- register_cpu_notifier(&ratelimit_nb);
-
shift = calc_period_shift();
prop_descriptor_init(&vm_completions, shift);
prop_descriptor_init(&vm_dirties, shift);
--- linux-next.orig/mm/memory_hotplug.c 2010-11-15 21:43:52.000000000 +0800
+++ linux-next/mm/memory_hotplug.c 2010-11-15 21:43:54.000000000 +0800
@@ -446,8 +446,6 @@ int online_pages(unsigned long pfn, unsi
vm_total_pages = nr_free_pagecache_pages();
- writeback_set_ratelimit();
-
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
@@ -877,7 +875,6 @@ repeat:
}
vm_total_pages = nr_free_pagecache_pages();
- writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
unlock_system_sleep();
--
* Re: [PATCH 03/13] writeback: per-task rate limit on balance_dirty_pages()
From: Wu Fengguang @ 2010-11-17 14:39 UTC
To: Andrew Morton
Cc: Jan Kara, Christoph Hellwig, Dave Chinner, Theodore Ts'o,
Chris Mason, Peter Zijlstra, Mel Gorman, Rik van Riel,
KOSAKI Motohiro, linux-mm, linux-fsdevel@vger.kernel.org, LKML
> + if (pause == 0 && nr_dirty < background_thresh)
> + current->nr_dirtied_pause = ratelimit_pages(bdi);
> + else if (pause == 1)
> + current->nr_dirtied_pause += current->nr_dirtied_pause >> 5;
Sorry, here is a bug fix for the above line; it's also pushed to the
git tree.
Thanks,
Fengguang
---
Subject: writeback: fix increment of nr_dirtied_pause
Date: Wed Nov 17 22:31:26 CST 2010
Fix a bug where
	current->nr_dirtied_pause += current->nr_dirtied_pause >> 5;
does not increase nr_dirtied_pause at all when it is below 32 (the right
shift truncates to 0, e.g. 31 >> 5 == 0), so nr_dirtied_pause may never
grow.
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- linux-next.orig/mm/page-writeback.c 2010-11-17 22:31:09.000000000 +0800
+++ linux-next/mm/page-writeback.c 2010-11-17 22:31:23.000000000 +0800
@@ -662,7 +662,7 @@ pause:
if (pause == 0 && nr_dirty < background_thresh)
current->nr_dirtied_pause = ratelimit_pages(bdi);
else if (pause == 1)
- current->nr_dirtied_pause += current->nr_dirtied_pause >> 5;
+ current->nr_dirtied_pause += current->nr_dirtied_pause / 32 + 1;
else if (pause >= HZ/10)
/*
* when repeatedly writing 1 page per 100ms on slow devices,
--
* Re: [PATCH 03/13] writeback: per-task rate limit on balance_dirty_pages()
From: Peter Zijlstra @ 2010-11-24 10:23 UTC
To: Wu Fengguang
Cc: Andrew Morton, Jan Kara, Christoph Hellwig, Dave Chinner,
Theodore Ts'o, Chris Mason, Mel Gorman, Rik van Riel,
KOSAKI Motohiro, linux-mm, linux-fsdevel, LKML
On Wed, 2010-11-17 at 12:27 +0800, Wu Fengguang wrote:
> + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause ||
> + bdi->dirty_exceeded)) {
> + balance_dirty_pages(mapping, current->nr_dirtied);
> + current->nr_dirtied = 0;
> }
Was it a conscious choice to use
current->nr_dirtied = 0
over
current->nr_dirtied -= current->nr_dirtied_pause
?
The former will cause a drift in pause times due to truncation of the
excess.
* Re: [PATCH 03/13] writeback: per-task rate limit on balance_dirty_pages()
From: Wu Fengguang @ 2010-11-24 10:43 UTC
To: Peter Zijlstra
Cc: Andrew Morton, Jan Kara, Christoph Hellwig, Dave Chinner,
Theodore Ts'o, Chris Mason, Mel Gorman, Rik van Riel,
KOSAKI Motohiro, linux-mm, linux-fsdevel@vger.kernel.org, LKML
On Wed, Nov 24, 2010 at 06:23:07PM +0800, Peter Zijlstra wrote:
> On Wed, 2010-11-17 at 12:27 +0800, Wu Fengguang wrote:
> > + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause ||
> > + bdi->dirty_exceeded)) {
> > + balance_dirty_pages(mapping, current->nr_dirtied);
> > + current->nr_dirtied = 0;
> > }
>
> Was it a conscious choice to use
> current->nr_dirtied = 0
> over
> current->nr_dirtied -= current->nr_dirtied_pause
> ?
>
> The former will cause a drift in pause times due to truncation of the
> excess.
It should be fine either way, as long as the "truncated" number is
passed to balance_dirty_pages():
+ balance_dirty_pages(mapping, current->nr_dirtied);
+ current->nr_dirtied = 0;
or
+ balance_dirty_pages(mapping, current->nr_dirtied_pause);
+ current->nr_dirtied -= current->nr_dirtied_pause;
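For instance (illustrative numbers): with nr_dirtied_pause = 32 and 40
pages dirtied since the last call, the first variant bills 40 and resets
the counter to 0, while the second bills 32 and carries 8 over to the next
round. Either way every dirtied page is billed exactly once, so no excess
is silently dropped.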
Thanks,
Fengguang
* Re: [PATCH 03/13] writeback: per-task rate limit on balance_dirty_pages()
From: Peter Zijlstra @ 2010-11-24 10:49 UTC
To: Wu Fengguang
Cc: Andrew Morton, Jan Kara, Christoph Hellwig, Dave Chinner,
Theodore Ts'o, Chris Mason, Mel Gorman, Rik van Riel,
KOSAKI Motohiro, linux-mm, linux-fsdevel@vger.kernel.org, LKML
On Wed, 2010-11-24 at 18:43 +0800, Wu Fengguang wrote:
> On Wed, Nov 24, 2010 at 06:23:07PM +0800, Peter Zijlstra wrote:
> > On Wed, 2010-11-17 at 12:27 +0800, Wu Fengguang wrote:
> > > + if (unlikely(current->nr_dirtied >= current->nr_dirtied_pause ||
> > > + bdi->dirty_exceeded)) {
> > > + balance_dirty_pages(mapping, current->nr_dirtied);
> > > + current->nr_dirtied = 0;
> > > }
> >
> > Was it a conscious choice to use
> > current->nr_dirtied = 0
> > over
> > current->nr_dirtied -= current->nr_dirtied_pause
> > ?
> >
> > The former will cause a drift in pause times due to truncation of the
> > excess.
>
> It should be fine in either way, as long as the "truncated" number is
> passed to balance_dirty_pages():
>
> + balance_dirty_pages(mapping, current->nr_dirtied);
> + current->nr_dirtied = 0;
>
> or
>
> + balance_dirty_pages(mapping, current->nr_dirtied_pause);
> + current->nr_dirtied -= current->nr_dirtied_pause;
ok, just wanted to make sure you'd considered it.