The Linux Kernel Mailing List
 help / color / mirror / Atom feed
* [patch 3/21] scaled writeback throttling levels
@ 2002-08-11  7:38 Andrew Morton
  2002-08-14  8:40 ` William Lee Irwin III
  0 siblings, 1 reply; 2+ messages in thread
From: Andrew Morton @ 2002-08-11  7:38 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml

(resend)

get_page_state() is showing up on profiles on some big machines.  It is
quite an expensive function and it is being called too often.

The patch replaces the hardwired RATELIMIT_PAGES with a calculated
amount based on the amount of memory in the machine and the number of
CPUs.



 page-writeback.c |   65 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 55 insertions(+), 10 deletions(-)

--- 2.5.31/mm/page-writeback.c~ratelimit-scaling	Sat Aug 10 23:29:36 2002
+++ 2.5.31-akpm/mm/page-writeback.c	Sat Aug 10 23:29:36 2002
@@ -22,6 +22,8 @@
 #include <linux/sysrq.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -34,10 +36,9 @@
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
- * will look to see if it needs to force writeback or throttling.  Probably
- * should be scaled by memory size.
+ * will look to see if it needs to force writeback or throttling.
  */
-#define RATELIMIT_PAGES		((512 * 1024) / PAGE_SIZE)
+static int ratelimit_pages = 32;
 
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
@@ -45,8 +46,10 @@
  * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
  * large amounts of I/O are submitted.
  */
-#define SYNC_WRITEBACK_PAGES	((RATELIMIT_PAGES * 3) / 2)
-
+static inline int sync_writeback_pages(void)
+{
+	return ratelimit_pages + ratelimit_pages / 2;
+}
 
 /* The following parameters are exported via /proc/sys/vm */
 
@@ -119,12 +122,12 @@ void balance_dirty_pages(struct address_
 	bdi = mapping->backing_dev_info;
 
 	if (dirty_and_writeback > sync_thresh) {
-		int nr_to_write = SYNC_WRITEBACK_PAGES;
+		int nr_to_write = sync_writeback_pages();
 
 		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
 		get_page_state(&ps);
 	} else if (dirty_and_writeback > async_thresh) {
-		int nr_to_write = SYNC_WRITEBACK_PAGES;
+		int nr_to_write = sync_writeback_pages();
 
 		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
 		get_page_state(&ps);
@@ -153,7 +156,7 @@ void balance_dirty_pages_ratelimited(str
 	int cpu;
 
 	cpu = get_cpu();
-	if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) {
+	if (ratelimits[cpu].count++ >= ratelimit_pages) {
 		ratelimits[cpu].count = 0;
 		put_cpu();
 		balance_dirty_pages(mapping);
@@ -247,16 +250,56 @@ static void wb_timer_fn(unsigned long un
 
 }
 
-static int __init wb_timer_init(void)
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive) get_page_state
+ * too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high, because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time.  So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ */
+
+static void set_ratelimit(void)
+{
+	ratelimit_pages = nr_free_pagecache_pages() / (num_online_cpus() * 32);
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
+		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+{
+	set_ratelimit();
+	return 0;
+}
+
+static struct notifier_block ratelimit_nb = {
+	.notifier_call	= ratelimit_handler,
+	.next		= NULL,
+};
+
+static int __init page_writeback_init(void)
 {
 	init_timer(&wb_timer);
 	wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
 	wb_timer.data = 0;
 	wb_timer.function = wb_timer_fn;
 	add_timer(&wb_timer);
+	set_ratelimit();
+	register_cpu_notifier(&ratelimit_nb);
 	return 0;
 }
-module_init(wb_timer_init);
+module_init(page_writeback_init);
 
 /*
  * A library function, which implements the vm_writeback a_op.  It's fairly
@@ -481,3 +524,5 @@ int __set_page_dirty_nobuffers(struct pa
 	return ret;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+

.

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [patch 3/21] scaled writeback throttling levels
  2002-08-11  7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
@ 2002-08-14  8:40 ` William Lee Irwin III
  0 siblings, 0 replies; 2+ messages in thread
From: William Lee Irwin III @ 2002-08-14  8:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, lkml

On Sun, Aug 11, 2002 at 12:38:32AM -0700, Andrew Morton wrote:
> (resend)
> get_page_state() is showing up on profiles on some big machines.  It is
> a quite expensive function and it is being called too often.
> The patch replaces the hardwired RATELIMIT_PAGES with a calculated
> amount based on the amount of memory in the machine and the number of
> CPUs.

dbench 256 on 16x/16G numaq:

Throughput 50.5397 MB/sec (NB=63.1747 MB/sec  505.397 MBit/sec)  256 procs


c013bf74 5827289  74.4428     .text.lock.highmem
c013b7d0 797024   10.1819     kunmap_high
c013b5dc 482436   6.16306     kmap_high
c012e53c 87883    1.12269     file_read_actor
c0114820 65764    0.840126    scheduler_tick
c013bcbc 32857    0.419744    blk_queue_bounce
c013564c 31874    0.407186    rmqueue
c012f260 29442    0.376118    generic_file_write
c01113b8 28706    0.366715    smp_apic_timer_interrupt
c0143d1c 26503    0.338572    block_prepare_write
c0105394 20555    0.262587    default_idle
c012dec0 19920    0.254475    unlock_page
c014333c 17401    0.222295    __block_prepare_write
c013b558 16545    0.21136     flush_all_zero_pkmaps
c0135d28 14959    0.191099    page_cache_release
c013fb30 11923    0.152315    generic_file_llseek
c013429c 11059    0.141277    lru_cache_add
c0135b10 10277    0.131287    __alloc_pages
c0140124 9841     0.125717    vfs_write
c0143dc8 8732     0.11155     generic_commit_write
c012dcb4 8051     0.10285     add_to_page_cache
c016d620 7884     0.100717    ext2_get_block

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2002-08-14  8:38 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2002-08-11  7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
2002-08-14  8:40 ` William Lee Irwin III

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox