[patch 2/3] MM: allow per-cpu vmstat_threshold configuration

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Marcelo Tosatti <mtosatti@redhat.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Luiz Capitulino <lcapitulino@redhat.com>,
	Rik van Riel <riel@redhat.com>,
	Linux RT Users <linux-rt-users@vger.kernel.org>,
	Marcelo Tosatti <mtosatti@redhat.com>
Subject: [patch 2/3] MM: allow per-cpu vmstat_threshold configuration
Date: Wed, 03 May 2017 15:40:09 -0300	[thread overview]
Message-ID: <20170503184039.818107646@redhat.com> (raw)
In-Reply-To: 20170503184007.174707977@redhat.com

[-- Attachment #1: vmstat-configurable-thresh --]
[-- Type: text/plain, Size: 10837 bytes --]

The per-CPU vmstat worker is a problem on -RT workloads (because
ideally the CPU is entirely reserved for the -RT app, without
interference). The worker transfers accumulated per-CPU 
vmstat counters to global counters.

To resolve the problem, create a userspace-configurable per-CPU
vmstat threshold. By default the VM code calculates the per-CPU
vmstat thresholds itself; this tunable allows userspace to
configure the vmstat threshold value for each CPU.

The patch below contains documentation which describes the tunables
in more detail.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 Documentation/vm/vmstat_thresholds.txt |   78 +++++++++++++
 mm/vmstat.c                            |  188 +++++++++++++++++++++++++++++----
 2 files changed, 247 insertions(+), 19 deletions(-)

Index: linux-2.6-git-disable-vmstat-worker/mm/vmstat.c
===================================================================
--- linux-2.6-git-disable-vmstat-worker.orig/mm/vmstat.c	2017-04-25 07:39:13.941019853 -0300
+++ linux-2.6-git-disable-vmstat-worker/mm/vmstat.c	2017-05-03 10:59:43.495714336 -0300
@@ -91,8 +91,16 @@
 EXPORT_SYMBOL(vm_zone_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
+struct vmstat_uparam {
+	atomic_t user_stat_thresh;
+};
+
+static DEFINE_PER_CPU(struct vmstat_uparam, vmstat_uparam);
+
 #ifdef CONFIG_SMP
 
+#define MAX_THRESHOLD 125
+
 int calculate_pressure_threshold(struct zone *zone)
 {
 	int threshold;
@@ -110,9 +118,9 @@
 	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 
 	/*
-	 * Maximum threshold is 125
+	 * Maximum threshold is MAX_THRESHOLD == 125
 	 */
-	threshold = min(125, threshold);
+	threshold = min(MAX_THRESHOLD, threshold);
 
 	return threshold;
 }
@@ -188,15 +196,31 @@
 		threshold = calculate_normal_threshold(zone);
 
 		for_each_online_cpu(cpu) {
-			int pgdat_threshold;
+			int pgdat_threshold, ustat_thresh;
+			struct vmstat_uparam *vup;
 
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
+			struct per_cpu_nodestat __percpu *pcp;
+			struct per_cpu_pageset *p;
+
+			p = per_cpu_ptr(zone->pageset, cpu);
+
+			vup = &per_cpu(vmstat_uparam, cpu);
+			ustat_thresh = atomic_read(&vup->user_stat_thresh);
+
+			if (ustat_thresh)
+				p->stat_threshold = ustat_thresh;
+			else
+				p->stat_threshold = threshold;
+
+			pcp = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 
 			/* Base nodestat threshold on the largest populated zone. */
-			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
-			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
-				= max(threshold, pgdat_threshold);
+			pgdat_threshold = pcp->stat_threshold;
+			if (ustat_thresh)
+				pcp->stat_threshold = ustat_thresh;
+			else
+				pcp->stat_threshold = max(threshold,
+							  pgdat_threshold);
 		}
 
 		/*
@@ -226,9 +250,24 @@
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
+			int t, ustat_thresh;
+			struct vmstat_uparam *vup;
+
+			vup = &per_cpu(vmstat_uparam, cpu);
+			ustat_thresh = atomic_read(&vup->user_stat_thresh);
+			t = threshold;
+
+			/*
+			 * min because pressure could cause
+			 * calculate_pressure'ed value to be smaller.
+			 */
+			if (ustat_thresh)
+				t = min(threshold, ustat_thresh);
+
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
+							= t;
+		}
 	}
 }
 
@@ -249,7 +288,7 @@
 
 	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > t || x < -t)) {
+	if (unlikely(x >= t || x <= -t)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
@@ -269,7 +308,7 @@
 
 	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > t || x < -t)) {
+	if (unlikely(x >= t || x <= -t)) {
 		node_page_state_add(x, pgdat, item);
 		x = 0;
 	}
@@ -308,7 +347,7 @@
 
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v > t)) {
+	if (unlikely(v >= t)) {
 		s8 overstep = t >> 1;
 
 		zone_page_state_add(v + overstep, zone, item);
@@ -324,7 +363,7 @@
 
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v > t)) {
+	if (unlikely(v >= t)) {
 		s8 overstep = t >> 1;
 
 		node_page_state_add(v + overstep, pgdat, item);
@@ -352,7 +391,7 @@
 
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v < - t)) {
+	if (unlikely(v <= - t)) {
 		s8 overstep = t >> 1;
 
 		zone_page_state_add(v - overstep, zone, item);
@@ -368,7 +407,7 @@
 
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v < - t)) {
+	if (unlikely(v <= - t)) {
 		s8 overstep = t >> 1;
 
 		node_page_state_add(v - overstep, pgdat, item);
@@ -426,7 +465,7 @@
 		o = this_cpu_read(*p);
 		n = delta + o;
 
-		if (n > t || n < -t) {
+		if (n >= t || n <= -t) {
 			int os = overstep_mode * (t >> 1) ;
 
 			/* Overflow must be added to zone counters */
@@ -483,7 +522,7 @@
 		o = this_cpu_read(*p);
 		n = delta + o;
 
-		if (n > t || n < -t) {
+		if (n >= t || n <= -t) {
 			int os = overstep_mode * (t >> 1) ;
 
 			/* Overflow must be added to node counters */
@@ -1696,6 +1735,96 @@
 		round_jiffies_relative(sysctl_stat_interval));
 }
 
+#ifdef CONFIG_SYSFS
+
+static ssize_t vmstat_thresh_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int ret;
+	struct vmstat_uparam *vup;
+	unsigned int cpu = dev->id;
+
+	preempt_disable();
+
+	vup = &per_cpu(vmstat_uparam, cpu);
+	ret = sprintf(buf, "%d\n", atomic_read(&vup->user_stat_thresh));
+
+	preempt_enable();
+
+	return ret;
+}
+
+static ssize_t vmstat_thresh_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int ret, val;
+	unsigned int cpu = dev->id;
+	struct vmstat_uparam *vup;
+
+	ret = sscanf(buf, "%d", &val);
+	if (ret != 1 || val < 1 || val > MAX_THRESHOLD)
+		return -EINVAL;
+
+	preempt_disable();
+
+	if (cpu_online(cpu)) {
+		vup = &per_cpu(vmstat_uparam, cpu);
+		atomic_set(&vup->user_stat_thresh, val);
+	} else
+		count = -EINVAL;
+
+	preempt_enable();
+
+	return count;
+}
+
+struct device_attribute vmstat_threshold_attr =
+	__ATTR(vmstat_threshold, 0644, vmstat_thresh_show, vmstat_thresh_store);
+
+static struct attribute *vmstat_attrs[] = {
+	&vmstat_threshold_attr.attr,
+	NULL
+};
+
+static struct attribute_group vmstat_attr_group = {
+	.attrs  =  vmstat_attrs,
+	.name   = "vmstat"
+};
+
+static int vmstat_thresh_cpu_online(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+	int ret;
+
+	ret = sysfs_create_group(&dev->kobj, &vmstat_attr_group);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int vmstat_thresh_cpu_down_prep(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	sysfs_remove_group(&dev->kobj, &vmstat_attr_group);
+	return 0;
+}
+
+static void init_vmstat_sysfs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+		atomic_set(&vup->user_stat_thresh, 0);
+	}
+}
+
+#endif /* CONFIG_SYSFS */
+
 static void __init init_cpu_node_state(void)
 {
 	int node;
@@ -1723,9 +1852,12 @@
 {
 	const struct cpumask *node_cpus;
 	int node;
+	struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
 
 	node = cpu_to_node(cpu);
 
+	atomic_set(&vup->user_stat_thresh, 0);
+
 	refresh_zone_stat_thresholds();
 	node_cpus = cpumask_of_node(node);
 	if (cpumask_weight(node_cpus) > 0)
@@ -1735,7 +1867,7 @@
 	return 0;
 }
 
-#endif
+#endif /* CONFIG_SMP */
 
 struct workqueue_struct *mm_percpu_wq;
 
@@ -1772,6 +1904,24 @@
 #endif
 }
 
+static int __init init_mm_internals_late(void)
+{
+#ifdef CONFIG_SYSFS
+	int ret;
+
+	init_vmstat_sysfs();
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/vmstat_thresh:online",
+					vmstat_thresh_cpu_online,
+					vmstat_thresh_cpu_down_prep);
+	if (ret < 0)
+		pr_err("vmstat_thresh: failed to register 'online' hotplug state\n");
+#endif
+	return 0;
+}
+
+late_initcall(init_mm_internals_late);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
 
 /*
Index: linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt	2017-05-02 13:48:45.946840708 -0300
@@ -0,0 +1,78 @@
+Userspace configurable vmstat thresholds
+========================================
+
+This document describes the tunables that control the
+per-CPU vmstat threshold and the per-CPU vmstat worker
+thread.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_threshold:
+
+This file contains the per-CPU vmstat threshold.
+This value is the maximum that a single per-CPU vmstat statistic
+can accumulate before transferring to the global counters.
+
+A value of 0 indicates that the threshold is set
+by the in-kernel algorithm.
+
+A non-zero value indicates that that particular
+value is used for vmstat_threshold.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_worker:
+
+Enable/disable the per-CPU vmstat worker.
+
+What does the vmstat_threshold value mean? What are the implications
+of changing this value? What's the difference in choosing 1, 2, 3
+or 500?
+====================================================================
+
+It's the maximum value for a vmstat statistics counter to hold. Once
+that value is reached, the statistics are transferred to the global counter:
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                long delta)
+{
+        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+        s8 __percpu *p = pcp->vm_node_stat_diff + item;
+        long x;
+        long t;
+
+        x = delta + __this_cpu_read(*p);
+
+        t = __this_cpu_read(pcp->stat_threshold);
+
+        if (unlikely(x >= t || x <= -t)) {
+                node_page_state_add(x, pgdat, item);
+                x = 0;
+        }
+        __this_cpu_write(*p, x);
+}
+
+Increasing the threshold value does two things:
+        1) It decreases the number of inter-processor accesses.
+        2) It increases how much the global counters stay out of
+           sync relative to actual current values.
+
+
+Usage example:
+=============
+
+In a realtime system, the worker thread waking up and executing
+vmstat_update can be an undesired source of latencies.
+
+To prevent the worker thread from waking up and executing vmstat_update
+on cpu 1, for example, perform the following steps:
+
+
+$ cd /sys/devices/system/cpu/cpu1/vmstat/
+
+# Set vmstat threshold to 1 for cpu1, so that no
+# vmstat statistics are collected in cpu1's per-cpu
+# stats, instead they are immediately transferred
+# to the global counter.
+
+$ echo 1 > vmstat_threshold
+
+# Disable vmstat_update worker for cpu1:
+$ echo 0 > vmstat_worker
+

WARNING: multiple messages have this Message-ID (diff)

From: Marcelo Tosatti <mtosatti@redhat.com>
To: linux-kernel@vger.kernel.org, linux-mm@kvack.org
Cc: Luiz Capitulino <lcapitulino@redhat.com>,
	Rik van Riel <riel@redhat.com>,
	Linux RT Users <linux-rt-users@vger.kernel.org>,
	Marcelo Tosatti <mtosatti@redhat.com>
Subject: [patch 2/3] MM: allow per-cpu vmstat_threshold configuration
Date: Wed, 03 May 2017 15:40:09 -0300	[thread overview]
Message-ID: <20170503184039.818107646@redhat.com> (raw)
In-Reply-To: 20170503184007.174707977@redhat.com

[-- Attachment #1: vmstat-configurable-thresh --]
[-- Type: text/plain, Size: 11062 bytes --]

The per-CPU vmstat worker is a problem on -RT workloads (because
ideally the CPU is entirely reserved for the -RT app, without
interference). The worker transfers accumulated per-CPU 
vmstat counters to global counters.

To resolve the problem, create a userspace-configurable per-CPU
vmstat threshold. By default the VM code calculates the per-CPU
vmstat thresholds itself; this tunable allows userspace to
configure the vmstat threshold value for each CPU.

The patch below contains documentation which describes the tunables
in more detail.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 Documentation/vm/vmstat_thresholds.txt |   78 +++++++++++++
 mm/vmstat.c                            |  188 +++++++++++++++++++++++++++++----
 2 files changed, 247 insertions(+), 19 deletions(-)

Index: linux-2.6-git-disable-vmstat-worker/mm/vmstat.c
===================================================================
--- linux-2.6-git-disable-vmstat-worker.orig/mm/vmstat.c	2017-04-25 07:39:13.941019853 -0300
+++ linux-2.6-git-disable-vmstat-worker/mm/vmstat.c	2017-05-03 10:59:43.495714336 -0300
@@ -91,8 +91,16 @@
 EXPORT_SYMBOL(vm_zone_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
+struct vmstat_uparam {
+	atomic_t user_stat_thresh;
+};
+
+static DEFINE_PER_CPU(struct vmstat_uparam, vmstat_uparam);
+
 #ifdef CONFIG_SMP
 
+#define MAX_THRESHOLD 125
+
 int calculate_pressure_threshold(struct zone *zone)
 {
 	int threshold;
@@ -110,9 +118,9 @@
 	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 
 	/*
-	 * Maximum threshold is 125
+	 * Maximum threshold is MAX_THRESHOLD == 125
 	 */
-	threshold = min(125, threshold);
+	threshold = min(MAX_THRESHOLD, threshold);
 
 	return threshold;
 }
@@ -188,15 +196,31 @@
 		threshold = calculate_normal_threshold(zone);
 
 		for_each_online_cpu(cpu) {
-			int pgdat_threshold;
+			int pgdat_threshold, ustat_thresh;
+			struct vmstat_uparam *vup;
 
-			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
+			struct per_cpu_nodestat __percpu *pcp;
+			struct per_cpu_pageset *p;
+
+			p = per_cpu_ptr(zone->pageset, cpu);
+
+			vup = &per_cpu(vmstat_uparam, cpu);
+			ustat_thresh = atomic_read(&vup->user_stat_thresh);
+
+			if (ustat_thresh)
+				p->stat_threshold = ustat_thresh;
+			else
+				p->stat_threshold = threshold;
+
+			pcp = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 
 			/* Base nodestat threshold on the largest populated zone. */
-			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
-			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
-				= max(threshold, pgdat_threshold);
+			pgdat_threshold = pcp->stat_threshold;
+			if (ustat_thresh)
+				pcp->stat_threshold = ustat_thresh;
+			else
+				pcp->stat_threshold = max(threshold,
+							  pgdat_threshold);
 		}
 
 		/*
@@ -226,9 +250,24 @@
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
+			int t, ustat_thresh;
+			struct vmstat_uparam *vup;
+
+			vup = &per_cpu(vmstat_uparam, cpu);
+			ustat_thresh = atomic_read(&vup->user_stat_thresh);
+			t = threshold;
+
+			/*
+			 * min because pressure could cause
+			 * calculate_pressure'ed value to be smaller.
+			 */
+			if (ustat_thresh)
+				t = min(threshold, ustat_thresh);
+
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-							= threshold;
+							= t;
+		}
 	}
 }
 
@@ -249,7 +288,7 @@
 
 	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > t || x < -t)) {
+	if (unlikely(x >= t || x <= -t)) {
 		zone_page_state_add(x, zone, item);
 		x = 0;
 	}
@@ -269,7 +308,7 @@
 
 	t = __this_cpu_read(pcp->stat_threshold);
 
-	if (unlikely(x > t || x < -t)) {
+	if (unlikely(x >= t || x <= -t)) {
 		node_page_state_add(x, pgdat, item);
 		x = 0;
 	}
@@ -308,7 +347,7 @@
 
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v > t)) {
+	if (unlikely(v >= t)) {
 		s8 overstep = t >> 1;
 
 		zone_page_state_add(v + overstep, zone, item);
@@ -324,7 +363,7 @@
 
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v > t)) {
+	if (unlikely(v >= t)) {
 		s8 overstep = t >> 1;
 
 		node_page_state_add(v + overstep, pgdat, item);
@@ -352,7 +391,7 @@
 
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v < - t)) {
+	if (unlikely(v <= - t)) {
 		s8 overstep = t >> 1;
 
 		zone_page_state_add(v - overstep, zone, item);
@@ -368,7 +407,7 @@
 
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
-	if (unlikely(v < - t)) {
+	if (unlikely(v <= - t)) {
 		s8 overstep = t >> 1;
 
 		node_page_state_add(v - overstep, pgdat, item);
@@ -426,7 +465,7 @@
 		o = this_cpu_read(*p);
 		n = delta + o;
 
-		if (n > t || n < -t) {
+		if (n >= t || n <= -t) {
 			int os = overstep_mode * (t >> 1) ;
 
 			/* Overflow must be added to zone counters */
@@ -483,7 +522,7 @@
 		o = this_cpu_read(*p);
 		n = delta + o;
 
-		if (n > t || n < -t) {
+		if (n >= t || n <= -t) {
 			int os = overstep_mode * (t >> 1) ;
 
 			/* Overflow must be added to node counters */
@@ -1696,6 +1735,96 @@
 		round_jiffies_relative(sysctl_stat_interval));
 }
 
+#ifdef CONFIG_SYSFS
+
+static ssize_t vmstat_thresh_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int ret;
+	struct vmstat_uparam *vup;
+	unsigned int cpu = dev->id;
+
+	preempt_disable();
+
+	vup = &per_cpu(vmstat_uparam, cpu);
+	ret = sprintf(buf, "%d\n", atomic_read(&vup->user_stat_thresh));
+
+	preempt_enable();
+
+	return ret;
+}
+
+static ssize_t vmstat_thresh_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int ret, val;
+	unsigned int cpu = dev->id;
+	struct vmstat_uparam *vup;
+
+	ret = sscanf(buf, "%d", &val);
+	if (ret != 1 || val < 1 || val > MAX_THRESHOLD)
+		return -EINVAL;
+
+	preempt_disable();
+
+	if (cpu_online(cpu)) {
+		vup = &per_cpu(vmstat_uparam, cpu);
+		atomic_set(&vup->user_stat_thresh, val);
+	} else
+		count = -EINVAL;
+
+	preempt_enable();
+
+	return count;
+}
+
+struct device_attribute vmstat_threshold_attr =
+	__ATTR(vmstat_threshold, 0644, vmstat_thresh_show, vmstat_thresh_store);
+
+static struct attribute *vmstat_attrs[] = {
+	&vmstat_threshold_attr.attr,
+	NULL
+};
+
+static struct attribute_group vmstat_attr_group = {
+	.attrs  =  vmstat_attrs,
+	.name   = "vmstat"
+};
+
+static int vmstat_thresh_cpu_online(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+	int ret;
+
+	ret = sysfs_create_group(&dev->kobj, &vmstat_attr_group);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int vmstat_thresh_cpu_down_prep(unsigned int cpu)
+{
+	struct device *dev = get_cpu_device(cpu);
+
+	sysfs_remove_group(&dev->kobj, &vmstat_attr_group);
+	return 0;
+}
+
+static void init_vmstat_sysfs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+		atomic_set(&vup->user_stat_thresh, 0);
+	}
+}
+
+#endif /* CONFIG_SYSFS */
+
 static void __init init_cpu_node_state(void)
 {
 	int node;
@@ -1723,9 +1852,12 @@
 {
 	const struct cpumask *node_cpus;
 	int node;
+	struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
 
 	node = cpu_to_node(cpu);
 
+	atomic_set(&vup->user_stat_thresh, 0);
+
 	refresh_zone_stat_thresholds();
 	node_cpus = cpumask_of_node(node);
 	if (cpumask_weight(node_cpus) > 0)
@@ -1735,7 +1867,7 @@
 	return 0;
 }
 
-#endif
+#endif /* CONFIG_SMP */
 
 struct workqueue_struct *mm_percpu_wq;
 
@@ -1772,6 +1904,24 @@
 #endif
 }
 
+static int __init init_mm_internals_late(void)
+{
+#ifdef CONFIG_SYSFS
+	int ret;
+
+	init_vmstat_sysfs();
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/vmstat_thresh:online",
+					vmstat_thresh_cpu_online,
+					vmstat_thresh_cpu_down_prep);
+	if (ret < 0)
+		pr_err("vmstat_thresh: failed to register 'online' hotplug state\n");
+#endif
+	return 0;
+}
+
+late_initcall(init_mm_internals_late);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
 
 /*
Index: linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt	2017-05-02 13:48:45.946840708 -0300
@@ -0,0 +1,78 @@
+Userspace configurable vmstat thresholds
+========================================
+
+This document describes the tunables that control the
+per-CPU vmstat threshold and the per-CPU vmstat worker
+thread.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_threshold:
+
+This file contains the per-CPU vmstat threshold.
+This value is the maximum that a single per-CPU vmstat statistic
+can accumulate before transferring to the global counters.
+
+A value of 0 indicates that the threshold is set
+by the in-kernel algorithm.
+
+A non-zero value indicates that that particular
+value is used for vmstat_threshold.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_worker:
+
+Enable/disable the per-CPU vmstat worker.
+
+What does the vmstat_threshold value mean? What are the implications
+of changing this value? What's the difference in choosing 1, 2, 3
+or 500?
+====================================================================
+
+It's the maximum value for a vmstat statistics counter to hold. Once
+that value is reached, the statistics are transferred to the global counter:
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                long delta)
+{
+        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+        s8 __percpu *p = pcp->vm_node_stat_diff + item;
+        long x;
+        long t;
+
+        x = delta + __this_cpu_read(*p);
+
+        t = __this_cpu_read(pcp->stat_threshold);
+
+        if (unlikely(x >= t || x <= -t)) {
+                node_page_state_add(x, pgdat, item);
+                x = 0;
+        }
+        __this_cpu_write(*p, x);
+}
+
+Increasing the threshold value does two things:
+        1) It decreases the number of inter-processor accesses.
+        2) It increases how much the global counters stay out of
+           sync relative to actual current values.
+
+
+Usage example:
+=============
+
+In a realtime system, the worker thread waking up and executing
+vmstat_update can be an undesired source of latencies.
+
+To prevent the worker thread from waking up and executing vmstat_update
+on cpu 1, for example, perform the following steps:
+
+
+$ cd /sys/devices/system/cpu/cpu1/vmstat/
+
+# Set vmstat threshold to 1 for cpu1, so that no
+# vmstat statistics are collected in cpu1's per-cpu
+# stats, instead they are immediately transferred
+# to the global counter.
+
+$ echo 1 > vmstat_threshold
+
+# Disable vmstat_update worker for cpu1:
+$ echo 0 > vmstat_worker
+


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2017-05-03 18:45 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-03 18:40 [patch 0/3] per-CPU vmstat thresholds and vmstat worker disablement (v2) Marcelo Tosatti
2017-05-03 18:40 ` Marcelo Tosatti
2017-05-03 18:40 ` [patch 1/3] MM: remove unused quiet_vmstat function Marcelo Tosatti
2017-05-03 18:40   ` Marcelo Tosatti
2017-05-10 13:36   ` Rik van Riel
2017-05-10 13:36     ` Rik van Riel
2017-05-03 18:40 ` Marcelo Tosatti [this message]
2017-05-03 18:40   ` [patch 2/3] MM: allow per-cpu vmstat_threshold configuration Marcelo Tosatti
2017-05-03 18:40 ` [patch 3/3] MM: allow per-cpu vmstat_worker configuration Marcelo Tosatti
2017-05-03 18:40   ` Marcelo Tosatti
2017-05-10 15:34   ` Rik van Riel
2017-05-10 15:34     ` Rik van Riel
2017-05-11 15:33     ` Marcelo Tosatti
2017-05-11 15:33       ` Marcelo Tosatti
2017-05-16  1:31   ` [lkp-robot] [MM] 3e38a07a66: ltp.proc01.fail kernel test robot
2017-05-16  1:31     ` kernel test robot
2017-05-16  1:31     ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170503184039.818107646@redhat.com \
    --to=mtosatti@redhat.com \
    --cc=lcapitulino@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.