From: Marcelo Tosatti <mtosatti@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: Christoph Lameter <cl@linux.com>,
Thomas Gleixner <tglx@linutronix.de>,
Frederic Weisbecker <frederic@kernel.org>,
Juri Lelli <juri.lelli@redhat.com>, Nitesh Lal <nilal@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Nicolas Saenz <nsaenzju@redhat.com>,
Marcelo Tosatti <mtosatti@redhat.com>
Subject: [patch 3/5] mm: vmstat: optionally flush per-CPU vmstat counters on return to userspace
Date: Wed, 14 Jul 2021 17:42:08 -0300 [thread overview]
Message-ID: <20210714204233.710628753@fuller.cnet> (raw)
In-Reply-To: 20210714204205.245522189@fuller.cnet
The logic that disables the vmstat worker thread when entering
nohz_full does not cover all scenarios. For example, the following
sequence is possible:
1) enter nohz_full, which calls refresh_cpu_vm_stats, syncing the stats.
2) app runs mlock, which increases counters for mlock'ed pages.
3) start -RT loop
Since refresh_cpu_vm_stats from nohz_full logic can happen _before_
the mlock, vmstat shepherd can restart vmstat worker thread on
the CPU in question.
To fix this, optionally sync the vmstat counters when returning
to userspace, controlled by a new "quiesce_on_exit_to_usermode" isolcpus
flag (default off).
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Index: linux-2.6-vmstat-update/kernel/sched/isolation.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/sched/isolation.c
+++ linux-2.6-vmstat-update/kernel/sched/isolation.c
@@ -8,6 +8,7 @@
*
*/
#include "sched.h"
+#include <linux/vmstat.h>
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
@@ -129,6 +130,11 @@ static int __init housekeeping_setup(cha
}
}
+#ifdef CONFIG_SMP
+ if (flags & HK_FLAG_QUIESCE_URET)
+ static_branch_enable(&vmstat_sync_enabled);
+#endif
+
housekeeping_flags |= flags;
free_bootmem_cpumask_var(non_housekeeping_mask);
Index: linux-2.6-vmstat-update/include/linux/vmstat.h
===================================================================
--- linux-2.6-vmstat-update.orig/include/linux/vmstat.h
+++ linux-2.6-vmstat-update/include/linux/vmstat.h
@@ -21,6 +21,23 @@ int sysctl_vm_numa_stat_handler(struct c
void *buffer, size_t *length, loff_t *ppos);
#endif
+#ifdef CONFIG_SMP
+/* Turned on by housekeeping_setup() when HK_FLAG_QUIESCE_URET is requested. */
+DECLARE_STATIC_KEY_FALSE(vmstat_sync_enabled);
+
+/* Out-of-line slow path, implemented in mm/vmstat.c. */
+extern void __sync_vmstat(void);
+
+/*
+ * sync_vmstat - hook for flushing per-CPU vmstat counters.
+ *
+ * Costs only a static-branch test while vmstat_sync_enabled is off;
+ * otherwise hands over to __sync_vmstat().
+ */
+static inline void sync_vmstat(void)
+{
+	if (!static_branch_unlikely(&vmstat_sync_enabled))
+		return;
+
+	__sync_vmstat();
+}
+#else /* !CONFIG_SMP */
+/* Without CONFIG_SMP the hook compiles to nothing. */
+static inline void sync_vmstat(void) { }
+#endif /* CONFIG_SMP */
+
struct reclaim_stat {
unsigned nr_dirty;
unsigned nr_unqueued_dirty;
Index: linux-2.6-vmstat-update/mm/vmstat.c
===================================================================
--- linux-2.6-vmstat-update.orig/mm/vmstat.c
+++ linux-2.6-vmstat-update/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -308,6 +309,17 @@ void set_pgdat_percpu_threshold(pg_data_
}
}
+DEFINE_STATIC_KEY_FALSE(vmstat_sync_enabled);
+static DEFINE_PER_CPU_ALIGNED(bool, vmstat_dirty);
+
+/*
+ * Note that the current CPU has touched one of its per-CPU vmstat
+ * counters, so that __sync_vmstat() knows there is something to flush.
+ * Does nothing unless the vmstat_sync_enabled static key is on.
+ */
+static inline void mark_vmstat_dirty(void)
+{
+	if (static_branch_unlikely(&vmstat_sync_enabled))
+		raw_cpu_write(vmstat_dirty, true);
+}
+
/*
* For use when we know that interrupts are disabled,
* or when we know that preemption is disabled and that
@@ -330,6 +342,7 @@ void __mod_zone_page_state(struct zone *
x = 0;
}
__this_cpu_write(*p, x);
+ mark_vmstat_dirty();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -361,6 +374,7 @@ void __mod_node_page_state(struct pglist
x = 0;
}
__this_cpu_write(*p, x);
+ mark_vmstat_dirty();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -401,6 +415,7 @@ void __inc_zone_state(struct zone *zone,
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+ mark_vmstat_dirty();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -419,6 +434,7 @@ void __inc_node_state(struct pglist_data
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+ mark_vmstat_dirty();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -447,6 +463,7 @@ void __dec_zone_state(struct zone *zone,
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+ mark_vmstat_dirty();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -465,6 +482,7 @@ void __dec_node_state(struct pglist_data
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+ mark_vmstat_dirty();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -528,6 +546,7 @@ static inline void mod_zone_state(struct
if (z)
zone_page_state_add(z, zone, item);
+ mark_vmstat_dirty();
}
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
@@ -596,6 +615,7 @@ static inline void mod_node_state(struct
if (z)
node_page_state_add(z, pgdat, item);
+ mark_vmstat_dirty();
}
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
@@ -2006,6 +2026,37 @@ static void vmstat_shepherd(struct work_
round_jiffies_relative(sysctl_stat_interval));
}
+/*
+ * Flush this CPU's per-CPU vmstat counters and cancel its pending
+ * vmstat_update work. Reached through sync_vmstat() once the
+ * vmstat_sync_enabled static key has been turned on.
+ *
+ * Returns without doing anything when the current CPU is a housekeeping
+ * CPU for HK_FLAG_QUIESCE_URET, or when no counter has been marked dirty
+ * on this CPU since the previous flush.
+ */
+void __sync_vmstat(void)
+{
+	int cpu;
+
+	cpu = get_cpu();
+	if (housekeeping_cpu(cpu, HK_FLAG_QUIESCE_URET) ||
+	    !raw_cpu_read(vmstat_dirty)) {
+		put_cpu();
+		return;
+	}
+
+	/*
+	 * Clear the dirty flag _before_ folding the counters. get_cpu()
+	 * only disables preemption, so an interrupt may still update a
+	 * counter (and mark this CPU dirty) while the fold is running.
+	 * With the flag cleared first, such an update leaves the flag set
+	 * and is picked up by the next call; clearing it after the fold
+	 * would overwrite that mark and leave a stale counter behind.
+	 */
+	raw_cpu_write(vmstat_dirty, false);
+	refresh_cpu_vm_stats(false);
+	put_cpu();
+
+	/*
+	 * If task is migrated to another CPU between put_cpu
+	 * and cancel_delayed_work_sync, the code below might
+	 * cancel vmstat_update work for a different cpu
+	 * (than the one from which the vmstats were flushed).
+	 *
+	 * However, vmstat shepherd will re-enable it later,
+	 * so it's harmless.
+	 */
+	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+}
+
static void __init start_shepherd_timer(void)
{
int cpu;
Index: linux-2.6-vmstat-update/kernel/entry/common.c
===================================================================
--- linux-2.6-vmstat-update.orig/kernel/entry/common.c
+++ linux-2.6-vmstat-update/kernel/entry/common.c
@@ -6,6 +6,7 @@
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
+#include <linux/vmstat.h>
#include "common.h"
@@ -290,6 +291,7 @@ static void syscall_exit_to_user_mode_pr
*/
static void isolation_exit_to_user_mode_prepare(void)
{
+ sync_vmstat();
}
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
next prev parent reply other threads:[~2021-07-14 20:43 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-14 20:42 [patch 0/5] optionally perform deferred actions on return to userspace (v3) Marcelo Tosatti
2021-07-14 20:42 ` [patch 1/5] sched: isolation: introduce quiesce_on_exit_to_usermode isolcpu flags Marcelo Tosatti
2021-07-19 14:14 ` Frederic Weisbecker
2021-07-14 20:42 ` [patch 2/5] common entry: add hook for isolation to __syscall_exit_to_user_mode_work Marcelo Tosatti
2021-07-14 20:42 ` Marcelo Tosatti [this message]
2021-07-14 20:42 ` [patch 4/5] mm: vmstat: move need_update Marcelo Tosatti
2021-07-14 20:42 ` [patch 5/5] mm: vmstat_refresh: avoid queueing work item if cpu stats are clean Marcelo Tosatti
-- strict thread matches above, loose matches on Subject: below --
2021-07-09 17:37 [patch 0/5] optionally perform deferred actions on return to userspace Marcelo Tosatti
2021-07-09 17:37 ` [patch 3/5] mm: vmstat: optionally flush per-CPU vmstat counters " Marcelo Tosatti
2021-07-12 9:05 ` Christoph Lameter
2021-07-12 10:30 ` Marcelo Tosatti
2021-07-13 19:30 ` Marcelo Tosatti
2021-07-03 4:54 kernel test robot
2021-07-01 21:03 [patch 0/5] optionally sync per-CPU vmstats counter " Marcelo Tosatti
2021-07-01 21:03 ` [patch 3/5] mm: vmstat: optionally flush per-CPU vmstat counters " Marcelo Tosatti
2021-07-01 23:11 ` kernel test robot
2021-07-01 23:11 ` kernel test robot
2021-07-02 6:50 ` kernel test robot
2021-07-02 6:50 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210714204233.710628753@fuller.cnet \
--to=mtosatti@redhat.com \
--cc=cl@linux.com \
--cc=frederic@kernel.org \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=nilal@redhat.com \
--cc=nsaenzju@redhat.com \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.