* [patch 3/21] scaled writeback throttling levels
@ 2002-08-11 7:38 Andrew Morton
2002-08-14 8:40 ` William Lee Irwin III
0 siblings, 1 reply; 2+ messages in thread
From: Andrew Morton @ 2002-08-11 7:38 UTC (permalink / raw)
To: Linus Torvalds; +Cc: lkml
(resend)
get_page_state() is showing up on profiles on some big machines. It is
quite an expensive function and it is being called too often.
The patch replaces the hardwired RATELIMIT_PAGES with a calculated
amount based on the amount of memory in the machine and the number of
CPUs.
page-writeback.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++---------
1 files changed, 55 insertions(+), 10 deletions(-)
--- 2.5.31/mm/page-writeback.c~ratelimit-scaling Sat Aug 10 23:29:36 2002
+++ 2.5.31-akpm/mm/page-writeback.c Sat Aug 10 23:29:36 2002
@@ -22,6 +22,8 @@
#include <linux/sysrq.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
@@ -34,10 +36,9 @@
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
- * will look to see if it needs to force writeback or throttling. Probably
- * should be scaled by memory size.
+ * will look to see if it needs to force writeback or throttling.
*/
-#define RATELIMIT_PAGES ((512 * 1024) / PAGE_SIZE)
+static int ratelimit_pages = 32;
/*
* When balance_dirty_pages decides that the caller needs to perform some
@@ -45,8 +46,10 @@
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
-#define SYNC_WRITEBACK_PAGES ((RATELIMIT_PAGES * 3) / 2)
-
+static inline int sync_writeback_pages(void)
+{
+ return ratelimit_pages + ratelimit_pages / 2;
+}
/* The following parameters are exported via /proc/sys/vm */
@@ -119,12 +122,12 @@ void balance_dirty_pages(struct address_
bdi = mapping->backing_dev_info;
if (dirty_and_writeback > sync_thresh) {
- int nr_to_write = SYNC_WRITEBACK_PAGES;
+ int nr_to_write = sync_writeback_pages();
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
} else if (dirty_and_writeback > async_thresh) {
- int nr_to_write = SYNC_WRITEBACK_PAGES;
+ int nr_to_write = sync_writeback_pages();
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
get_page_state(&ps);
@@ -153,7 +156,7 @@ void balance_dirty_pages_ratelimited(str
int cpu;
cpu = get_cpu();
- if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) {
+ if (ratelimits[cpu].count++ >= ratelimit_pages) {
ratelimits[cpu].count = 0;
put_cpu();
balance_dirty_pages(mapping);
@@ -247,16 +250,56 @@ static void wb_timer_fn(unsigned long un
}
-static int __init wb_timer_init(void)
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ * If it is too low then SMP machines will call the (expensive) get_page_state
+ * too often.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds before writeback cuts in.
+ *
+ * But the limit should not be set too high, because it also controls the
+ * amount of memory which the balance_dirty_pages() caller has to write back.
+ * If this is too large then the caller will block on the IO queue all the
+ * time. So limit it to four megabytes - the balance_dirty_pages() caller
+ * will write six megabyte chunks, max.
+ */
+
+static void set_ratelimit(void)
+{
+ ratelimit_pages = nr_free_pagecache_pages() / (num_online_cpus() * 32);
+ if (ratelimit_pages < 16)
+ ratelimit_pages = 16;
+ if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
+ ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
+}
+
+static int
+ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+{
+ set_ratelimit();
+ return 0;
+}
+
+static struct notifier_block ratelimit_nb = {
+ .notifier_call = ratelimit_handler,
+ .next = NULL,
+};
+
+static int __init page_writeback_init(void)
{
init_timer(&wb_timer);
wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
wb_timer.data = 0;
wb_timer.function = wb_timer_fn;
add_timer(&wb_timer);
+ set_ratelimit();
+ register_cpu_notifier(&ratelimit_nb);
return 0;
}
-module_init(wb_timer_init);
+module_init(page_writeback_init);
/*
* A library function, which implements the vm_writeback a_op. It's fairly
@@ -481,3 +524,5 @@ int __set_page_dirty_nobuffers(struct pa
return ret;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+
.
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [patch 3/21] scaled writeback throttling levels
2002-08-11 7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
@ 2002-08-14 8:40 ` William Lee Irwin III
0 siblings, 0 replies; 2+ messages in thread
From: William Lee Irwin III @ 2002-08-14 8:40 UTC (permalink / raw)
To: Andrew Morton; +Cc: Linus Torvalds, lkml
On Sun, Aug 11, 2002 at 12:38:32AM -0700, Andrew Morton wrote:
> (resend)
> get_page_state() is showing up on profiles on some big machines. It is
> a quite expensive function and it is being called too often.
> The patch replaces the hardwired RATELIMIT_PAGES with a calculated
> amount based on the amount of memory in the machine and the number of
> CPUs.
dbench 256 on 16x/16G numaq:
Throughput 50.5397 MB/sec (NB=63.1747 MB/sec 505.397 MBit/sec) 256 procs
c013bf74 5827289 74.4428 .text.lock.highmem
c013b7d0 797024 10.1819 kunmap_high
c013b5dc 482436 6.16306 kmap_high
c012e53c 87883 1.12269 file_read_actor
c0114820 65764 0.840126 scheduler_tick
c013bcbc 32857 0.419744 blk_queue_bounce
c013564c 31874 0.407186 rmqueue
c012f260 29442 0.376118 generic_file_write
c01113b8 28706 0.366715 smp_apic_timer_interrupt
c0143d1c 26503 0.338572 block_prepare_write
c0105394 20555 0.262587 default_idle
c012dec0 19920 0.254475 unlock_page
c014333c 17401 0.222295 __block_prepare_write
c013b558 16545 0.21136 flush_all_zero_pkmaps
c0135d28 14959 0.191099 page_cache_release
c013fb30 11923 0.152315 generic_file_llseek
c013429c 11059 0.141277 lru_cache_add
c0135b10 10277 0.131287 __alloc_pages
c0140124 9841 0.125717 vfs_write
c0143dc8 8732 0.11155 generic_commit_write
c012dcb4 8051 0.10285 add_to_page_cache
c016d620 7884 0.100717 ext2_get_block
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2002-08-14 8:38 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-08-11 7:38 [patch 3/21] scaled writeback throttling levels Andrew Morton
2002-08-14 8:40 ` William Lee Irwin III
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.