From: Marcelo Tosatti <mtosatti@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: Nitesh Lal <nilal@redhat.com>,
Nicolas Saenz Julienne <nsaenzju@redhat.com>,
Frederic Weisbecker <frederic@kernel.org>,
Christoph Lameter <cl@linux.com>,
Juri Lelli <juri.lelli@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Alex Belits <abelits@belits.com>, Peter Xu <peterx@redhat.com>,
Thomas Gleixner <tglx@linutronix.de>,
Daniel Bristot de Oliveira <bristot@redhat.com>,
Marcelo Tosatti <mtosatti@redhat.com>
Subject: [patch v7 03/10] task isolation: sync vmstats on return to userspace
Date: Fri, 12 Nov 2021 09:35:34 -0300 [thread overview]
Message-ID: <20211112123750.722168190@fuller.cnet> (raw)
In-Reply-To: 20211112123531.497831890@fuller.cnet
The logic to disable the vmstat worker thread when entering
nohz_full does not cover all scenarios. For example, it is possible
for the following sequence to happen:
1) enter nohz_full, which calls refresh_cpu_vm_stats, syncing the stats.
2) app runs mlock, which increases counters for mlock'ed pages.
3) start -RT loop
Since the refresh_cpu_vm_stats call from the nohz_full logic can happen
_before_ the mlock, the vmstat shepherd can restart the vmstat worker
thread on the CPU in question.
To fix this, use the task isolation prctl interface to quiesce
deferred actions when returning to userspace.
Keep task_isol_has_work returning 0 until all elements
are in place.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
v6: modify exit_to_user_mode_loop to cover exceptions and interrupts
v5: no changes
v4: add oneshot mode support
include/linux/task_isolation.h | 16 ++++++++++++++++
include/linux/vmstat.h | 8 ++++++++
kernel/entry/common.c | 15 +++++++++++----
kernel/task_isolation.c | 21 +++++++++++++++++++++
mm/vmstat.c | 21 +++++++++++++++++++++
5 files changed, 77 insertions(+), 4 deletions(-)
Index: linux-2.6/include/linux/task_isolation.h
===================================================================
--- linux-2.6.orig/include/linux/task_isolation.h
+++ linux-2.6/include/linux/task_isolation.h
@@ -40,8 +40,19 @@ int prctl_task_isolation_activate_set(un
int __copy_task_isolation(struct task_struct *tsk);
+void isolation_exit_to_user_mode(void);
+
+static inline int task_isol_has_work(void)
+{
+ return 0; /* placeholder: reports no pending isolation work at this point in the series */
+}
+
#else
+static inline void isolation_exit_to_user_mode(void)
+{ /* !CONFIG_CPU_ISOLATION stub; inline so includers do not each emit an unused static function */
+}
+
static inline void tsk_isol_free(struct task_struct *tsk)
{
}
@@ -86,6 +97,11 @@ static inline int prctl_task_isolation_a
return -EOPNOTSUPP;
}
+static inline int task_isol_has_work(void)
+{
+ return 0; /* !CONFIG_CPU_ISOLATION stub: always reports no work */
+}
+
#endif /* CONFIG_CPU_ISOLATION */
#endif /* __LINUX_TASK_ISOL_H */
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -21,6 +21,14 @@ int sysctl_vm_numa_stat_handler(struct c
void *buffer, size_t *length, loff_t *ppos);
#endif
+#ifdef CONFIG_SMP
+void sync_vmstat(void);
+#else
+static inline void sync_vmstat(void)
+{ /* !CONFIG_SMP stub: intentionally empty */
+}
+#endif
+
struct reclaim_stat {
unsigned nr_dirty;
unsigned nr_unqueued_dirty;
Index: linux-2.6/kernel/entry/common.c
===================================================================
--- linux-2.6.orig/kernel/entry/common.c
+++ linux-2.6/kernel/entry/common.c
@@ -6,6 +6,7 @@
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
+#include <linux/task_isolation.h>
#include "common.h"
@@ -149,13 +150,14 @@ static void handle_signal_work(struct pt
}
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+ unsigned long ti_work,
+ unsigned long tsk_isol_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
- while (ti_work & EXIT_TO_USER_MODE_WORK) {
+ while ((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work) {
local_irq_enable_exit_to_user(ti_work);
@@ -177,6 +179,9 @@ static unsigned long exit_to_user_mode_l
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
+ if (tsk_isol_work)
+ isolation_exit_to_user_mode();
+
/*
* Disable interrupts and reevaluate the work flags as they
* might have changed while interrupts and preemption was
@@ -188,6 +193,7 @@ static unsigned long exit_to_user_mode_l
tick_nohz_user_enter_prepare();
ti_work = READ_ONCE(current_thread_info()->flags);
+ tsk_isol_work = task_isol_has_work();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@@ -197,14 +203,15 @@ static unsigned long exit_to_user_mode_l
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long tsk_isol_work = task_isol_has_work();
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
- ti_work = exit_to_user_mode_loop(regs, ti_work);
+ if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work))
+ ti_work = exit_to_user_mode_loop(regs, ti_work, tsk_isol_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
Index: linux-2.6/kernel/task_isolation.c
===================================================================
--- linux-2.6.orig/kernel/task_isolation.c
+++ linux-2.6/kernel/task_isolation.c
@@ -18,6 +18,8 @@
#include <linux/sysfs.h>
#include <linux/init.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
void __tsk_isol_free(struct task_struct *tsk)
{
@@ -348,3 +350,22 @@ int prctl_task_isolation_activate_get(un
return 0;
}
+
+void isolation_exit_to_user_mode(void) /* run from exit_to_user_mode_loop() when isolation work is flagged */
+{
+ struct isol_info *i;
+
+ i = current->isol_info;
+ if (!i) /* current task has no isol_info: nothing to do */
+ return;
+
+ if (i->active_mask != ISOL_F_QUIESCE) /* proceed only when active_mask is exactly ISOL_F_QUIESCE */
+ return;
+
+ if (i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS) {
+ sync_vmstat(); /* refresh this CPU's vmstats and cancel its vmstat_work */
+ if (i->oneshot_mask & ISOL_F_QUIESCE_VMSTATS)
+ i->active_mask &= ~ISOL_F_QUIESCE_VMSTATS; /* NOTE(review): bit is tested in quiesce_mask/oneshot_mask but cleared from active_mask - confirm intended mask */
+ }
+}
+EXPORT_SYMBOL_GPL(isolation_exit_to_user_mode);
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -2003,6 +2003,27 @@ static void vmstat_shepherd(struct work_
round_jiffies_relative(sysctl_stat_interval));
}
+void sync_vmstat(void)
+{
+ int cpu;
+
+ cpu = get_cpu();
+
+ refresh_cpu_vm_stats(false);
+ put_cpu();
+
+ /*
+ * If the task is migrated to another CPU between put_cpu()
+ * and cancel_delayed_work_sync(), the call below cancels
+ * vmstat_update work for the CPU captured above, which is
+ * then no longer the CPU this task is running on.
+ *
+ * However, the vmstat shepherd will re-enable it later,
+ * so it's harmless.
+ */
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+}
+
static void __init start_shepherd_timer(void)
{
int cpu;
next prev parent reply other threads:[~2021-11-12 12:44 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-12 12:35 [patch v7 00/10] extensible prctl task isolation interface and vmstat sync Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 01/10] add basic task isolation prctl interface Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 02/10] add prctl task isolation prctl docs and samples Marcelo Tosatti
2021-11-23 12:36 ` Frederic Weisbecker
2021-11-29 15:13 ` Marcelo Tosatti
2021-12-02 17:13 ` Frederic Weisbecker
2021-12-02 18:29 ` Marcelo Tosatti
2021-12-07 17:05 ` Marcelo Tosatti
2021-11-23 14:37 ` Frederic Weisbecker
2021-11-29 15:19 ` Marcelo Tosatti
2021-12-02 17:44 ` Frederic Weisbecker
2021-11-12 12:35 ` Marcelo Tosatti [this message]
2021-11-12 12:35 ` [patch v7 04/10] procfs: add per-pid task isolation state Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 05/10] task isolation: add hook to task exit Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 06/10] task isolation: sync vmstats conditional on changes Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 07/10] task isolation: enable return to userspace processing Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 08/10] KVM: x86: process isolation work from VM-entry code path Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 09/10] mm: vmstat: move need_update Marcelo Tosatti
2021-11-12 12:35 ` [patch v7 10/10] mm: vmstat_refresh: avoid queueing work item if cpu stats are clean Marcelo Tosatti
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211112123750.722168190@fuller.cnet \
--to=mtosatti@redhat.com \
--cc=abelits@belits.com \
--cc=bristot@redhat.com \
--cc=cl@linux.com \
--cc=frederic@kernel.org \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=nilal@redhat.com \
--cc=nsaenzju@redhat.com \
--cc=peterx@redhat.com \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox