diff for duplicates of <20160225200721.GB3370@cmpxchg.org> diff --git a/a/1.txt b/N1/1.txt index 0f7dd4b..9494f73 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -43,3 +43,182 @@ patch for these two instances in place. > > +node/system. The maximum value is 1000, or 10% of memory. > > Ditto for 0.001%. + +>From 8e97efd64ef8491a7a4e7326f7c0d3cb7a5eb264 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner <hannes@cmpxchg.org> +Date: Thu, 18 Feb 2016 10:04:21 -0500 +Subject: [PATCH V3] mm: scale kswapd watermarks in proportion to memory + +In machines with 140G of memory and enterprise flash storage, we have +seen read and write bursts routinely exceed the kswapd watermarks and +cause thundering herds in direct reclaim. Unfortunately, the only way +to tune kswapd aggressiveness is through adjusting min_free_kbytes - +the system's emergency reserves - which is entirely unrelated to the +system's latency requirements. In order to get kswapd to maintain a +250M buffer of free memory, the emergency reserves need to be set to +1G. That is a lot of memory wasted for no good reason. + +On the other hand, it's reasonable to assume that allocation bursts +and overall allocation concurrency scale with memory capacity, so it +makes sense to make kswapd aggressiveness a function of that as well. + +Change the kswapd watermark scale factor from the currently fixed 25% +of the tunable emergency reserve to a tunable 0.1% of memory. + +Beyond 1G of memory, this will produce bigger watermark steps than the +current formula in default settings. Ensure that the new formula never +chooses steps smaller than that, i.e. 25% of the emergency reserve. + +On a 140G machine, this raises the default watermark steps - the +distance between min and low, and low and high - from 16M to 143M. 
+ +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +Acked-by: Mel Gorman <mgorman@suse.de> +Acked-by: Rik van Riel <riel@redhat.com> +Acked-by: David Rientjes <rientjes@google.com> +--- + Documentation/sysctl/vm.txt | 18 ++++++++++++++++++ + include/linux/mm.h | 1 + + include/linux/mmzone.h | 2 ++ + kernel/sysctl.c | 10 ++++++++++ + mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- + 5 files changed, 58 insertions(+), 2 deletions(-) + +diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt +index 89a887c..cb03684 100644 +--- a/Documentation/sysctl/vm.txt ++++ b/Documentation/sysctl/vm.txt +@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable + directory and inode objects. With vfs_cache_pressure=1000, it will look for + ten times more freeable objects than there are. + ++============================================================= ++ ++watermark_scale_factor: ++ ++This factor controls the aggressiveness of kswapd. It defines the ++amount of memory left in a node/system before kswapd is woken up and ++how much memory needs to be free before kswapd goes back to sleep. ++ ++The unit is in fractions of 10,000. The default value of 10 means the ++distances between watermarks are 0.1% of the available memory in the ++node/system. The maximum value is 1000, or 10% of memory. ++ ++A high rate of threads entering direct reclaim (allocstall) or kswapd ++going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate ++that the number of free pages kswapd maintains for latency reasons is ++too small for the allocation bursts occurring in the system. This knob ++can then be used to tune kswapd aggressiveness accordingly. 
++ + ============================================================== + + zone_reclaim_mode: +diff --git a/include/linux/mm.h b/include/linux/mm.h +index fe4d988..f2f4a6c 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1896,6 +1896,7 @@ extern void zone_pcp_reset(struct zone *zone); + + /* page_alloc.c */ + extern int min_free_kbytes; ++extern int watermark_scale_factor; + + /* nommu.c */ + extern atomic_long_t mmap_pages_allocated; +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index bdd9a27..c60df92 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone) + struct ctl_table; + int min_free_kbytes_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); ++int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, ++ void __user *, size_t *, loff_t *); + extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; + int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index f930ec2..96ec234 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -126,6 +126,7 @@ static int __maybe_unused two = 2; + static int __maybe_unused four = 4; + static unsigned long one_ul = 1; + static int one_hundred = 100; ++static int one_thousand = 1000; + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = { + .extra1 = &zero, + }, + { ++ .procname = "watermark_scale_factor", ++ .data = &watermark_scale_factor, ++ .maxlen = sizeof(watermark_scale_factor), ++ .mode = 0644, ++ .proc_handler = watermark_scale_factor_sysctl_handler, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 9cd427c..9bede0b 
100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { + + int min_free_kbytes = 1024; + int user_min_free_kbytes = -1; ++int watermark_scale_factor = 10; + + static unsigned long __meminitdata nr_kernel_pages; + static unsigned long __meminitdata nr_all_pages; +@@ -6453,8 +6454,17 @@ static void __setup_per_zone_wmarks(void) + zone->watermark[WMARK_MIN] = tmp; + } + +- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); +- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); ++ /* ++ * Set the kswapd watermarks distance according to the ++ * scale factor in proportion to available memory, but ++ * ensure a minimum size on small systems. ++ */ ++ tmp = max_t(u64, tmp >> 2, ++ mult_frac(zone->managed_pages, ++ watermark_scale_factor, 10000)); ++ ++ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; ++ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + + __mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - +@@ -6551,6 +6561,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, + return 0; + } + ++int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, ++ void __user *buffer, size_t *length, loff_t *ppos) ++{ ++ int rc; ++ ++ rc = proc_dointvec_minmax(table, write, buffer, length, ppos); ++ if (rc) ++ return rc; ++ ++ if (write) ++ setup_per_zone_wmarks(); ++ ++ return 0; ++} ++ + #ifdef CONFIG_NUMA + int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +-- +2.7.1 diff --git a/a/content_digest b/N1/content_digest index fa1659b..51e1295 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -56,6 +56,185 @@ "> > +distances between watermarks are 0.001% of the available memory in the\n" "> > +node/system. The maximum value is 1000, or 10% of memory.\n" "> \n" - > Ditto for 0.001%. 
+ "> Ditto for 0.001%.\n" + "\n" + ">From 8e97efd64ef8491a7a4e7326f7c0d3cb7a5eb264 Mon Sep 17 00:00:00 2001\n" + "From: Johannes Weiner <hannes@cmpxchg.org>\n" + "Date: Thu, 18 Feb 2016 10:04:21 -0500\n" + "Subject: [PATCH V3] mm: scale kswapd watermarks in proportion to memory\n" + "\n" + "In machines with 140G of memory and enterprise flash storage, we have\n" + "seen read and write bursts routinely exceed the kswapd watermarks and\n" + "cause thundering herds in direct reclaim. Unfortunately, the only way\n" + "to tune kswapd aggressiveness is through adjusting min_free_kbytes -\n" + "the system's emergency reserves - which is entirely unrelated to the\n" + "system's latency requirements. In order to get kswapd to maintain a\n" + "250M buffer of free memory, the emergency reserves need to be set to\n" + "1G. That is a lot of memory wasted for no good reason.\n" + "\n" + "On the other hand, it's reasonable to assume that allocation bursts\n" + "and overall allocation concurrency scale with memory capacity, so it\n" + "makes sense to make kswapd aggressiveness a function of that as well.\n" + "\n" + "Change the kswapd watermark scale factor from the currently fixed 25%\n" + "of the tunable emergency reserve to a tunable 0.1% of memory.\n" + "\n" + "Beyond 1G of memory, this will produce bigger watermark steps than the\n" + "current formula in default settings. Ensure that the new formula never\n" + "chooses steps smaller than that, i.e. 
25% of the emergency reserve.\n" + "\n" + "On a 140G machine, this raises the default watermark steps - the\n" + "distance between min and low, and low and high - from 16M to 143M.\n" + "\n" + "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n" + "Acked-by: Mel Gorman <mgorman@suse.de>\n" + "Acked-by: Rik van Riel <riel@redhat.com>\n" + "Acked-by: David Rientjes <rientjes@google.com>\n" + "---\n" + " Documentation/sysctl/vm.txt | 18 ++++++++++++++++++\n" + " include/linux/mm.h | 1 +\n" + " include/linux/mmzone.h | 2 ++\n" + " kernel/sysctl.c | 10 ++++++++++\n" + " mm/page_alloc.c | 29 +++++++++++++++++++++++++++--\n" + " 5 files changed, 58 insertions(+), 2 deletions(-)\n" + "\n" + "diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt\n" + "index 89a887c..cb03684 100644\n" + "--- a/Documentation/sysctl/vm.txt\n" + "+++ b/Documentation/sysctl/vm.txt\n" + "@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable\n" + " directory and inode objects. With vfs_cache_pressure=1000, it will look for\n" + " ten times more freeable objects than there are.\n" + " \n" + "+=============================================================\n" + "+\n" + "+watermark_scale_factor:\n" + "+\n" + "+This factor controls the aggressiveness of kswapd. It defines the\n" + "+amount of memory left in a node/system before kswapd is woken up and\n" + "+how much memory needs to be free before kswapd goes back to sleep.\n" + "+\n" + "+The unit is in fractions of 10,000. The default value of 10 means the\n" + "+distances between watermarks are 0.1% of the available memory in the\n" + "+node/system. 
The maximum value is 1000, or 10% of memory.\n" + "+\n" + "+A high rate of threads entering direct reclaim (allocstall) or kswapd\n" + "+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate\n" + "+that the number of free pages kswapd maintains for latency reasons is\n" + "+too small for the allocation bursts occurring in the system. This knob\n" + "+can then be used to tune kswapd aggressiveness accordingly.\n" + "+\n" + " ==============================================================\n" + " \n" + " zone_reclaim_mode:\n" + "diff --git a/include/linux/mm.h b/include/linux/mm.h\n" + "index fe4d988..f2f4a6c 100644\n" + "--- a/include/linux/mm.h\n" + "+++ b/include/linux/mm.h\n" + "@@ -1896,6 +1896,7 @@ extern void zone_pcp_reset(struct zone *zone);\n" + " \n" + " /* page_alloc.c */\n" + " extern int min_free_kbytes;\n" + "+extern int watermark_scale_factor;\n" + " \n" + " /* nommu.c */\n" + " extern atomic_long_t mmap_pages_allocated;\n" + "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n" + "index bdd9a27..c60df92 100644\n" + "--- a/include/linux/mmzone.h\n" + "+++ b/include/linux/mmzone.h\n" + "@@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone)\n" + " struct ctl_table;\n" + " int min_free_kbytes_sysctl_handler(struct ctl_table *, int,\n" + " \t\t\t\t\tvoid __user *, size_t *, loff_t *);\n" + "+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,\n" + "+\t\t\t\t\tvoid __user *, size_t *, loff_t *);\n" + " extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];\n" + " int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,\n" + " \t\t\t\t\tvoid __user *, size_t *, loff_t *);\n" + "diff --git a/kernel/sysctl.c b/kernel/sysctl.c\n" + "index f930ec2..96ec234 100644\n" + "--- a/kernel/sysctl.c\n" + "+++ b/kernel/sysctl.c\n" + "@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;\n" + " static int __maybe_unused four = 4;\n" + " static unsigned long one_ul = 1;\n" + " static int one_hundred = 100;\n" 
+ "+static int one_thousand = 1000;\n" + " #ifdef CONFIG_PRINTK\n" + " static int ten_thousand = 10000;\n" + " #endif\n" + "@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = {\n" + " \t\t.extra1\t\t= &zero,\n" + " \t},\n" + " \t{\n" + "+\t\t.procname\t= \"watermark_scale_factor\",\n" + "+\t\t.data\t\t= &watermark_scale_factor,\n" + "+\t\t.maxlen\t\t= sizeof(watermark_scale_factor),\n" + "+\t\t.mode\t\t= 0644,\n" + "+\t\t.proc_handler\t= watermark_scale_factor_sysctl_handler,\n" + "+\t\t.extra1\t\t= &one,\n" + "+\t\t.extra2\t\t= &one_thousand,\n" + "+\t},\n" + "+\t{\n" + " \t\t.procname\t= \"percpu_pagelist_fraction\",\n" + " \t\t.data\t\t= &percpu_pagelist_fraction,\n" + " \t\t.maxlen\t\t= sizeof(percpu_pagelist_fraction),\n" + "diff --git a/mm/page_alloc.c b/mm/page_alloc.c\n" + "index 9cd427c..9bede0b 100644\n" + "--- a/mm/page_alloc.c\n" + "+++ b/mm/page_alloc.c\n" + "@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {\n" + " \n" + " int min_free_kbytes = 1024;\n" + " int user_min_free_kbytes = -1;\n" + "+int watermark_scale_factor = 10;\n" + " \n" + " static unsigned long __meminitdata nr_kernel_pages;\n" + " static unsigned long __meminitdata nr_all_pages;\n" + "@@ -6453,8 +6454,17 @@ static void __setup_per_zone_wmarks(void)\n" + " \t\t\tzone->watermark[WMARK_MIN] = tmp;\n" + " \t\t}\n" + " \n" + "-\t\tzone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);\n" + "-\t\tzone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);\n" + "+\t\t/*\n" + "+\t\t * Set the kswapd watermarks distance according to the\n" + "+\t\t * scale factor in proportion to available memory, but\n" + "+\t\t * ensure a minimum size on small systems.\n" + "+\t\t */\n" + "+\t\ttmp = max_t(u64, tmp >> 2,\n" + "+\t\t\t mult_frac(zone->managed_pages,\n" + "+\t\t\t\t watermark_scale_factor, 10000));\n" + "+\n" + "+\t\tzone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;\n" + "+\t\tzone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 
2;\n" + " \n" + " \t\t__mod_zone_page_state(zone, NR_ALLOC_BATCH,\n" + " \t\t\thigh_wmark_pages(zone) - low_wmark_pages(zone) -\n" + "@@ -6551,6 +6561,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,\n" + " \treturn 0;\n" + " }\n" + " \n" + "+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,\n" + "+\tvoid __user *buffer, size_t *length, loff_t *ppos)\n" + "+{\n" + "+\tint rc;\n" + "+\n" + "+\trc = proc_dointvec_minmax(table, write, buffer, length, ppos);\n" + "+\tif (rc)\n" + "+\t\treturn rc;\n" + "+\n" + "+\tif (write)\n" + "+\t\tsetup_per_zone_wmarks();\n" + "+\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + " #ifdef CONFIG_NUMA\n" + " int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,\n" + " \tvoid __user *buffer, size_t *length, loff_t *ppos)\n" + "-- \n" + 2.7.1 -d67900df9e3377c6ac1e77cf0d305a6f6af4bd6a542029d9a3b35d307ec112b7 +e7db8031a3b3f30b80b2aad3fb1f9d59a2e467412b40e146e4c76019760ee10e
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.