All of lore.kernel.org
 help / color / mirror / Atom feed
diff for duplicates of <20160225200721.GB3370@cmpxchg.org>

diff --git a/a/1.txt b/N1/1.txt
index 0f7dd4b..9494f73 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -43,3 +43,182 @@ patch for these two instances in place.
 > > +node/system. The maximum value is 1000, or 10% of memory.
 > 
 > Ditto for 0.001%.
+
+>From 8e97efd64ef8491a7a4e7326f7c0d3cb7a5eb264 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 18 Feb 2016 10:04:21 -0500
+Subject: [PATCH V3] mm: scale kswapd watermarks in proportion to memory
+
+In machines with 140G of memory and enterprise flash storage, we have
+seen read and write bursts routinely exceed the kswapd watermarks and
+cause thundering herds in direct reclaim. Unfortunately, the only way
+to tune kswapd aggressiveness is through adjusting min_free_kbytes -
+the system's emergency reserves - which is entirely unrelated to the
+system's latency requirements. In order to get kswapd to maintain a
+250M buffer of free memory, the emergency reserves need to be set to
+1G. That is a lot of memory wasted for no good reason.
+
+On the other hand, it's reasonable to assume that allocation bursts
+and overall allocation concurrency scale with memory capacity, so it
+makes sense to make kswapd aggressiveness a function of that as well.
+
+Change the kswapd watermark scale factor from the currently fixed 25%
+of the tunable emergency reserve to a tunable 0.1% of memory.
+
+Beyond 1G of memory, this will produce bigger watermark steps than the
+current formula in default settings. Ensure that the new formula never
+chooses steps smaller than the current one's, i.e. 25% of the emergency reserve.
+
+On a 140G machine, this raises the default watermark steps - the
+distance between min and low, and low and high - from 16M to 143M.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Acked-by: David Rientjes <rientjes@google.com>
+---
+ Documentation/sysctl/vm.txt | 18 ++++++++++++++++++
+ include/linux/mm.h          |  1 +
+ include/linux/mmzone.h      |  2 ++
+ kernel/sysctl.c             | 10 ++++++++++
+ mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--
+ 5 files changed, 58 insertions(+), 2 deletions(-)
+
+diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
+index 89a887c..cb03684 100644
+--- a/Documentation/sysctl/vm.txt
++++ b/Documentation/sysctl/vm.txt
+@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable
+ directory and inode objects. With vfs_cache_pressure=1000, it will look for
+ ten times more freeable objects than there are.
+ 
++=============================================================
++
++watermark_scale_factor:
++
++This factor controls the aggressiveness of kswapd. It defines the
++amount of memory left in a node/system before kswapd is woken up and
++how much memory needs to be free before kswapd goes back to sleep.
++
++The unit is in fractions of 10,000. The default value of 10 means the
++distances between watermarks are 0.1% of the available memory in the
++node/system. The maximum value is 1000, or 10% of memory.
++
++A high rate of threads entering direct reclaim (allocstall) or kswapd
++going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
++that the number of free pages kswapd maintains for latency reasons is
++too small for the allocation bursts occurring in the system. This knob
++can then be used to tune kswapd aggressiveness accordingly.
++
+ ==============================================================
+ 
+ zone_reclaim_mode:
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index fe4d988..f2f4a6c 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1896,6 +1896,7 @@ extern void zone_pcp_reset(struct zone *zone);
+ 
+ /* page_alloc.c */
+ extern int min_free_kbytes;
++extern int watermark_scale_factor;
+ 
+ /* nommu.c */
+ extern atomic_long_t mmap_pages_allocated;
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index bdd9a27..c60df92 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone)
+ struct ctl_table;
+ int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+ 					void __user *, size_t *, loff_t *);
++int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
++					void __user *, size_t *, loff_t *);
+ extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
+ 					void __user *, size_t *, loff_t *);
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index f930ec2..96ec234 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;
+ static int __maybe_unused four = 4;
+ static unsigned long one_ul = 1;
+ static int one_hundred = 100;
++static int one_thousand = 1000;
+ #ifdef CONFIG_PRINTK
+ static int ten_thousand = 10000;
+ #endif
+@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = {
+ 		.extra1		= &zero,
+ 	},
+ 	{
++		.procname	= "watermark_scale_factor",
++		.data		= &watermark_scale_factor,
++		.maxlen		= sizeof(watermark_scale_factor),
++		.mode		= 0644,
++		.proc_handler	= watermark_scale_factor_sysctl_handler,
++		.extra1		= &one,
++		.extra2		= &one_thousand,
++	},
++	{
+ 		.procname	= "percpu_pagelist_fraction",
+ 		.data		= &percpu_pagelist_fraction,
+ 		.maxlen		= sizeof(percpu_pagelist_fraction),
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 9cd427c..9bede0b 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
+ 
+ int min_free_kbytes = 1024;
+ int user_min_free_kbytes = -1;
++int watermark_scale_factor = 10;
+ 
+ static unsigned long __meminitdata nr_kernel_pages;
+ static unsigned long __meminitdata nr_all_pages;
+@@ -6453,8 +6454,17 @@ static void __setup_per_zone_wmarks(void)
+ 			zone->watermark[WMARK_MIN] = tmp;
+ 		}
+ 
+-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
+-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
++		/*
+		 * Set the distance between the kswapd watermarks according
+		 * to the scale factor, in proportion to available memory,
+		 * but ensure a minimum distance on small systems.
++		 */
++		tmp = max_t(u64, tmp >> 2,
++			    mult_frac(zone->managed_pages,
++				      watermark_scale_factor, 10000));
++
++		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
++		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ 
+ 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
+ 			high_wmark_pages(zone) - low_wmark_pages(zone) -
+@@ -6551,6 +6561,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
+ 	return 0;
+ }
+ 
++int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
++	void __user *buffer, size_t *length, loff_t *ppos)
++{
++	int rc;
++
++	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
++	if (rc)
++		return rc;
++
++	if (write)
++		setup_per_zone_wmarks();
++
++	return 0;
++}
++
+ #ifdef CONFIG_NUMA
+ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
+ 	void __user *buffer, size_t *length, loff_t *ppos)
+-- 
+2.7.1
diff --git a/a/content_digest b/N1/content_digest
index fa1659b..51e1295 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -56,6 +56,185 @@
  "> > +distances between watermarks are 0.001% of the available memory in the\n"
  "> > +node/system. The maximum value is 1000, or 10% of memory.\n"
  "> \n"
- > Ditto for 0.001%.
+ "> Ditto for 0.001%.\n"
+ "\n"
+ ">From 8e97efd64ef8491a7a4e7326f7c0d3cb7a5eb264 Mon Sep 17 00:00:00 2001\n"
+ "From: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "Date: Thu, 18 Feb 2016 10:04:21 -0500\n"
+ "Subject: [PATCH V3] mm: scale kswapd watermarks in proportion to memory\n"
+ "\n"
+ "In machines with 140G of memory and enterprise flash storage, we have\n"
+ "seen read and write bursts routinely exceed the kswapd watermarks and\n"
+ "cause thundering herds in direct reclaim. Unfortunately, the only way\n"
+ "to tune kswapd aggressiveness is through adjusting min_free_kbytes -\n"
+ "the system's emergency reserves - which is entirely unrelated to the\n"
+ "system's latency requirements. In order to get kswapd to maintain a\n"
+ "250M buffer of free memory, the emergency reserves need to be set to\n"
+ "1G. That is a lot of memory wasted for no good reason.\n"
+ "\n"
+ "On the other hand, it's reasonable to assume that allocation bursts\n"
+ "and overall allocation concurrency scale with memory capacity, so it\n"
+ "makes sense to make kswapd aggressiveness a function of that as well.\n"
+ "\n"
+ "Change the kswapd watermark scale factor from the currently fixed 25%\n"
+ "of the tunable emergency reserve to a tunable 0.1% of memory.\n"
+ "\n"
+ "Beyond 1G of memory, this will produce bigger watermark steps than the\n"
+ "current formula in default settings. Ensure that the new formula never\n"
+ "chooses steps smaller than that, i.e. 25% of the emergency reserve.\n"
+ "\n"
+ "On a 140G machine, this raises the default watermark steps - the\n"
+ "distance between min and low, and low and high - from 16M to 143M.\n"
+ "\n"
+ "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "Acked-by: Mel Gorman <mgorman@suse.de>\n"
+ "Acked-by: Rik van Riel <riel@redhat.com>\n"
+ "Acked-by: David Rientjes <rientjes@google.com>\n"
+ "---\n"
+ " Documentation/sysctl/vm.txt | 18 ++++++++++++++++++\n"
+ " include/linux/mm.h          |  1 +\n"
+ " include/linux/mmzone.h      |  2 ++\n"
+ " kernel/sysctl.c             | 10 ++++++++++\n"
+ " mm/page_alloc.c             | 29 +++++++++++++++++++++++++++--\n"
+ " 5 files changed, 58 insertions(+), 2 deletions(-)\n"
+ "\n"
+ "diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt\n"
+ "index 89a887c..cb03684 100644\n"
+ "--- a/Documentation/sysctl/vm.txt\n"
+ "+++ b/Documentation/sysctl/vm.txt\n"
+ "@@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable\n"
+ " directory and inode objects. With vfs_cache_pressure=1000, it will look for\n"
+ " ten times more freeable objects than there are.\n"
+ " \n"
+ "+=============================================================\n"
+ "+\n"
+ "+watermark_scale_factor:\n"
+ "+\n"
+ "+This factor controls the aggressiveness of kswapd. It defines the\n"
+ "+amount of memory left in a node/system before kswapd is woken up and\n"
+ "+how much memory needs to be free before kswapd goes back to sleep.\n"
+ "+\n"
+ "+The unit is in fractions of 10,000. The default value of 10 means the\n"
+ "+distances between watermarks are 0.1% of the available memory in the\n"
+ "+node/system. The maximum value is 1000, or 10% of memory.\n"
+ "+\n"
+ "+A high rate of threads entering direct reclaim (allocstall) or kswapd\n"
+ "+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate\n"
+ "+that the number of free pages kswapd maintains for latency reasons is\n"
+ "+too small for the allocation bursts occurring in the system. This knob\n"
+ "+can then be used to tune kswapd aggressiveness accordingly.\n"
+ "+\n"
+ " ==============================================================\n"
+ " \n"
+ " zone_reclaim_mode:\n"
+ "diff --git a/include/linux/mm.h b/include/linux/mm.h\n"
+ "index fe4d988..f2f4a6c 100644\n"
+ "--- a/include/linux/mm.h\n"
+ "+++ b/include/linux/mm.h\n"
+ "@@ -1896,6 +1896,7 @@ extern void zone_pcp_reset(struct zone *zone);\n"
+ " \n"
+ " /* page_alloc.c */\n"
+ " extern int min_free_kbytes;\n"
+ "+extern int watermark_scale_factor;\n"
+ " \n"
+ " /* nommu.c */\n"
+ " extern atomic_long_t mmap_pages_allocated;\n"
+ "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n"
+ "index bdd9a27..c60df92 100644\n"
+ "--- a/include/linux/mmzone.h\n"
+ "+++ b/include/linux/mmzone.h\n"
+ "@@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone)\n"
+ " struct ctl_table;\n"
+ " int min_free_kbytes_sysctl_handler(struct ctl_table *, int,\n"
+ " \t\t\t\t\tvoid __user *, size_t *, loff_t *);\n"
+ "+int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,\n"
+ "+\t\t\t\t\tvoid __user *, size_t *, loff_t *);\n"
+ " extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];\n"
+ " int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,\n"
+ " \t\t\t\t\tvoid __user *, size_t *, loff_t *);\n"
+ "diff --git a/kernel/sysctl.c b/kernel/sysctl.c\n"
+ "index f930ec2..96ec234 100644\n"
+ "--- a/kernel/sysctl.c\n"
+ "+++ b/kernel/sysctl.c\n"
+ "@@ -126,6 +126,7 @@ static int __maybe_unused two = 2;\n"
+ " static int __maybe_unused four = 4;\n"
+ " static unsigned long one_ul = 1;\n"
+ " static int one_hundred = 100;\n"
+ "+static int one_thousand = 1000;\n"
+ " #ifdef CONFIG_PRINTK\n"
+ " static int ten_thousand = 10000;\n"
+ " #endif\n"
+ "@@ -1404,6 +1405,15 @@ static struct ctl_table vm_table[] = {\n"
+ " \t\t.extra1\t\t= &zero,\n"
+ " \t},\n"
+ " \t{\n"
+ "+\t\t.procname\t= \"watermark_scale_factor\",\n"
+ "+\t\t.data\t\t= &watermark_scale_factor,\n"
+ "+\t\t.maxlen\t\t= sizeof(watermark_scale_factor),\n"
+ "+\t\t.mode\t\t= 0644,\n"
+ "+\t\t.proc_handler\t= watermark_scale_factor_sysctl_handler,\n"
+ "+\t\t.extra1\t\t= &one,\n"
+ "+\t\t.extra2\t\t= &one_thousand,\n"
+ "+\t},\n"
+ "+\t{\n"
+ " \t\t.procname\t= \"percpu_pagelist_fraction\",\n"
+ " \t\t.data\t\t= &percpu_pagelist_fraction,\n"
+ " \t\t.maxlen\t\t= sizeof(percpu_pagelist_fraction),\n"
+ "diff --git a/mm/page_alloc.c b/mm/page_alloc.c\n"
+ "index 9cd427c..9bede0b 100644\n"
+ "--- a/mm/page_alloc.c\n"
+ "+++ b/mm/page_alloc.c\n"
+ "@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {\n"
+ " \n"
+ " int min_free_kbytes = 1024;\n"
+ " int user_min_free_kbytes = -1;\n"
+ "+int watermark_scale_factor = 10;\n"
+ " \n"
+ " static unsigned long __meminitdata nr_kernel_pages;\n"
+ " static unsigned long __meminitdata nr_all_pages;\n"
+ "@@ -6453,8 +6454,17 @@ static void __setup_per_zone_wmarks(void)\n"
+ " \t\t\tzone->watermark[WMARK_MIN] = tmp;\n"
+ " \t\t}\n"
+ " \n"
+ "-\t\tzone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);\n"
+ "-\t\tzone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);\n"
+ "+\t\t/*\n"
+ "+\t\t * Set the kswapd watermarks distance according to the\n"
+ "+\t\t * scale factor in proportion to available memory, but\n"
+ "+\t\t * ensure a minimum size on small systems.\n"
+ "+\t\t */\n"
+ "+\t\ttmp = max_t(u64, tmp >> 2,\n"
+ "+\t\t\t    mult_frac(zone->managed_pages,\n"
+ "+\t\t\t\t      watermark_scale_factor, 10000));\n"
+ "+\n"
+ "+\t\tzone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;\n"
+ "+\t\tzone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;\n"
+ " \n"
+ " \t\t__mod_zone_page_state(zone, NR_ALLOC_BATCH,\n"
+ " \t\t\thigh_wmark_pages(zone) - low_wmark_pages(zone) -\n"
+ "@@ -6551,6 +6561,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,\n"
+ " \treturn 0;\n"
+ " }\n"
+ " \n"
+ "+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,\n"
+ "+\tvoid __user *buffer, size_t *length, loff_t *ppos)\n"
+ "+{\n"
+ "+\tint rc;\n"
+ "+\n"
+ "+\trc = proc_dointvec_minmax(table, write, buffer, length, ppos);\n"
+ "+\tif (rc)\n"
+ "+\t\treturn rc;\n"
+ "+\n"
+ "+\tif (write)\n"
+ "+\t\tsetup_per_zone_wmarks();\n"
+ "+\n"
+ "+\treturn 0;\n"
+ "+}\n"
+ "+\n"
+ " #ifdef CONFIG_NUMA\n"
+ " int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,\n"
+ " \tvoid __user *buffer, size_t *length, loff_t *ppos)\n"
+ "-- \n"
+ 2.7.1
 
-d67900df9e3377c6ac1e77cf0d305a6f6af4bd6a542029d9a3b35d307ec112b7
+e7db8031a3b3f30b80b2aad3fb1f9d59a2e467412b40e146e4c76019760ee10e

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.