All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 3/21] scaled writeback throttling levels
@ 2002-08-11  7:38 Andrew Morton
  2002-08-14  8:40 ` William Lee Irwin III
  0 siblings, 1 reply; 2+ messages in thread
From: Andrew Morton @ 2002-08-11  7:38 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: lkml

(resend)

get_page_state() is showing up on profiles on some big machines.  It is
a quite expensive function and it is being called too often.

The patch replaces the hardwired RATELIMIT_PAGES with a calculated
amount based on the amount of memory in the machine and the number of
CPUs.



 page-writeback.c |   65 ++++++++++++++++++++++++++++++++++++++++++++++---------
 1 files changed, 55 insertions(+), 10 deletions(-)

--- 2.5.31/mm/page-writeback.c~ratelimit-scaling	Sat Aug 10 23:29:36 2002
+++ 2.5.31-akpm/mm/page-writeback.c	Sat Aug 10 23:29:36 2002
@@ -22,6 +22,8 @@
 #include <linux/sysrq.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -34,10 +36,9 @@
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
- * will look to see if it needs to force writeback or throttling.  Probably
- * should be scaled by memory size.
+ * will look to see if it needs to force writeback or throttling.
  */
-#define RATELIMIT_PAGES		((512 * 1024) / PAGE_SIZE)
+static int ratelimit_pages = 32;
 
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
@@ -45,8 +46,10 @@
  * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
  * large amounts of I/O are submitted.
  */
-#define SYNC_WRITEBACK_PAGES	((RATELIMIT_PAGES * 3) / 2)
-
+static inline int sync_writeback_pages(void)
+{
+	return ratelimit_pages + ratelimit_pages / 2;
+}
 
 /* The following parameters are exported via /proc/sys/vm */
 
@@ -119,12 +122,12 @@ void balance_dirty_pages(struct address_
 	bdi = mapping->backing_dev_info;
 
 	if (dirty_and_writeback > sync_thresh) {
-		int nr_to_write = SYNC_WRITEBACK_PAGES;
+		int nr_to_write = sync_writeback_pages();
 
 		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
 		get_page_state(&ps);
 	} else if (dirty_and_writeback > async_thresh) {
-		int nr_to_write = SYNC_WRITEBACK_PAGES;
+		int nr_to_write = sync_writeback_pages();
 
 		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
 		get_page_state(&ps);
@@ -153,7 +156,7 @@ void balance_dirty_pages_ratelimited(str
 	int cpu;
 
 	cpu = get_cpu();
-	if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) {
+	if (ratelimits[cpu].count++ >= ratelimit_pages) {
 		ratelimits[cpu].count = 0;
 		put_cpu();
 		balance_dirty_pages(mapping);
@@ -247,16 +250,56 @@ static void wb_timer_fn(unsigned long un
 
 }
 
-static int __init wb_timer_init(void)
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive) get_page_state
+ * too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high.  Because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time.  So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ */
+
+static void set_ratelimit(void)
+{
+	ratelimit_pages = nr_free_pagecache_pages() / (num_online_cpus() * 32);
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
+		ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+{
+	set_ratelimit();
+	return 0;
+}
+
+static struct notifier_block ratelimit_nb = {
+	.notifier_call	= ratelimit_handler,
+	.next		= NULL,
+};
+
+static int __init page_writeback_init(void)
 {
 	init_timer(&wb_timer);
 	wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
 	wb_timer.data = 0;
 	wb_timer.function = wb_timer_fn;
 	add_timer(&wb_timer);
+	set_ratelimit();
+	register_cpu_notifier(&ratelimit_nb);
 	return 0;
 }
-module_init(wb_timer_init);
+module_init(page_writeback_init);
 
 /*
  * A library function, which implements the vm_writeback a_op.  It's fairly
@@ -481,3 +524,5 @@ int __set_page_dirty_nobuffers(struct pa
 	return ret;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+

.

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [patch 3/21] scaled writeback throttling levels
  2002-08-11  7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
@ 2002-08-14  8:40 ` William Lee Irwin III
  0 siblings, 0 replies; 2+ messages in thread
From: William Lee Irwin III @ 2002-08-14  8:40 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Linus Torvalds, lkml

On Sun, Aug 11, 2002 at 12:38:32AM -0700, Andrew Morton wrote:
> (resend)
> get_page_state() is showing up on profiles on some big machines.  It is
> a quite expensive function and it is being called too often.
> The patch replaces the hardwired RATELIMIT_PAGES with a calculated
> amount based on the amount of memory in the machine and the number of
> CPUs.

dbench 256 on 16x/16G numaq:

Throughput 50.5397 MB/sec (NB=63.1747 MB/sec  505.397 MBit/sec)  256 procs


c013bf74 5827289  74.4428     .text.lock.highmem
c013b7d0 797024   10.1819     kunmap_high
c013b5dc 482436   6.16306     kmap_high
c012e53c 87883    1.12269     file_read_actor
c0114820 65764    0.840126    scheduler_tick
c013bcbc 32857    0.419744    blk_queue_bounce
c013564c 31874    0.407186    rmqueue
c012f260 29442    0.376118    generic_file_write
c01113b8 28706    0.366715    smp_apic_timer_interrupt
c0143d1c 26503    0.338572    block_prepare_write
c0105394 20555    0.262587    default_idle
c012dec0 19920    0.254475    unlock_page
c014333c 17401    0.222295    __block_prepare_write
c013b558 16545    0.21136     flush_all_zero_pkmaps
c0135d28 14959    0.191099    page_cache_release
c013fb30 11923    0.152315    generic_file_llseek
c013429c 11059    0.141277    lru_cache_add
c0135b10 10277    0.131287    __alloc_pages
c0140124 9841     0.125717    vfs_write
c0143dc8 8732     0.11155     generic_commit_write
c012dcb4 8051     0.10285     add_to_page_cache
c016d620 7884     0.100717    ext2_get_block

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2002-08-14  8:38 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-08-11  7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
2002-08-14  8:40 ` William Lee Irwin III

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.