From: Andy Lutomirski <luto@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>, Linux-MM <linux-mm@kvack.org>
Cc: Nicholas Piggin <npiggin@gmail.com>,
Anton Blanchard <anton@ozlabs.org>,
Benjamin Herrenschmidt <benh@kernel.crashing.org>,
Paul Mackerras <paulus@ozlabs.org>,
Randy Dunlap <rdunlap@infradead.org>,
linux-arch <linux-arch@vger.kernel.org>,
x86@kernel.org, Rik van Riel <riel@surriel.com>,
Dave Hansen <dave.hansen@intel.com>,
Peter Zijlstra <peterz@infradead.org>,
Nadav Amit <nadav.amit@gmail.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Andy Lutomirski <luto@kernel.org>
Subject: [PATCH 14/23] sched, exec: Factor current mm changes out from exec
Date: Sat, 8 Jan 2022 08:43:59 -0800
Message-ID: <60eb8a98061100f95e53e7868841fbb9a68237c8.1641659630.git.luto@kernel.org>
In-Reply-To: <cover.1641659630.git.luto@kernel.org>
Currently, exec_mmap() open-codes an mm change. Create new core
helpers, __change_current_mm() and __change_current_mm_to_kernel(),
and use the former from exec_mmap(). This moves the nasty scheduler
details out of exec.c and prepares the code for reuse elsewhere.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
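Note (not part of the commit): patch 15 in this series converts
kthread_use_mm()/kthread_unuse_mm() to these helpers. As a rough,
hypothetical sketch of the calling convention -- the caller keeps
ownership of the mm reference, per the comments added to sched/mm.h
below -- that conversion could look something like this (simplified;
the real conversion is in the next patch):

	/* in kernel/kthread.c */
	void kthread_use_mm(struct mm_struct *mm)
	{
		WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
		WARN_ON_ONCE(current->mm);

		mmgrab(mm);			/* caller's reference; the helper won't mmget() */
		__change_current_mm(mm, false);	/* existing mm: switch_mm_irqs_off() path */
	}

	void kthread_unuse_mm(struct mm_struct *mm)
	{
		WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
		WARN_ON_ONCE(current->mm != mm);

		__change_current_mm_to_kernel();
		mmdrop(mm);			/* drop the reference taken in kthread_use_mm() */
	}

mm_update_next_owner() is not needed here because a kthread is not a
user task.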
fs/exec.c | 32 +----------
include/linux/sched/mm.h | 20 +++++++
kernel/sched/core.c | 119 +++++++++++++++++++++++++++++++++++++++
3 files changed, 141 insertions(+), 30 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 2afa7b0c75f2..9e1c2ee7c986 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -971,15 +971,13 @@ EXPORT_SYMBOL(read_code);
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
- struct mm_struct *old_mm, *active_mm;
+ struct mm_struct *old_mm;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
- if (old_mm)
- sync_mm_rss(old_mm);
ret = down_write_killable(&tsk->signal->exec_update_lock);
if (ret)
@@ -1000,41 +998,15 @@ static int exec_mmap(struct mm_struct *mm)
}
}
- task_lock(tsk);
- /*
- * membarrier() requires a full barrier before switching mm.
- */
- smp_mb__after_spinlock();
+ __change_current_mm(mm, true);
- local_irq_disable();
- active_mm = tsk->active_mm;
- tsk->active_mm = mm;
- WRITE_ONCE(tsk->mm, mm); /* membarrier reads this without locks */
- membarrier_update_current_mm(mm);
- /*
- * This prevents preemption while active_mm is being loaded and
- * it and mm are being updated, which could cause problems for
- * lazy tlb mm refcounting when these are updated by context
- * switches. Not all architectures can handle irqs off over
- * activate_mm yet.
- */
- if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- activate_mm(active_mm, mm);
- if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
- local_irq_enable();
- membarrier_finish_switch_mm(mm);
- vmacache_flush(tsk);
- task_unlock(tsk);
if (old_mm) {
mmap_read_unlock(old_mm);
- BUG_ON(active_mm != old_mm);
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
}
- mmdrop(active_mm);
return 0;
}
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index f1d2beac464c..7509b2b2e99d 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -83,6 +83,26 @@ extern void mmput(struct mm_struct *);
void mmput_async(struct mm_struct *);
#endif
+/*
+ * Switch the mm for current. This does not mmget() mm, nor does it mmput()
+ * the previous mm, if any. The caller is responsible for reference counting,
+ * although __change_current_mm() handles all details related to lazy mm
+ * refcounting.
+ *
+ * If the caller is a user task, the caller must call mm_update_next_owner().
+ */
+void __change_current_mm(struct mm_struct *mm, bool mm_is_brand_new);
+
+/*
+ * Switch the mm for current to the kernel mm. This does not mmdrop()
+ * -- the caller is responsible for reference counting, although
+ * __change_current_mm_to_kernel() handles all details related to lazy
+ * mm refcounting.
+ *
+ * If the caller is a user task, the caller must call mm_update_next_owner().
+ */
+void __change_current_mm_to_kernel(void);
+
/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 32275b4ff141..95eb0e78f74c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -14,6 +14,7 @@
#include <linux/nospec.h>
+#include <linux/vmacache.h>
#include <linux/kcov.h>
#include <linux/scs.h>
@@ -4934,6 +4935,124 @@ context_switch(struct rq *rq, struct task_struct *prev,
return finish_task_switch(prev);
}
+void __change_current_mm(struct mm_struct *mm, bool mm_is_brand_new)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *old_active_mm, *mm_to_drop = NULL;
+
+ BUG_ON(!mm); /* likely to cause corruption if we continue */
+
+ /*
+ * We do not want to schedule, nor should procfs peek at current->mm
+ * while we're modifying it. task_lock() disables preemption and
+ * locks against procfs.
+ */
+ task_lock(tsk);
+ /*
+ * membarrier() requires a full barrier before switching mm.
+ */
+ smp_mb__after_spinlock();
+
+ local_irq_disable();
+
+ if (tsk->mm) {
+ /* We're detaching from an old mm. Sync stats. */
+ sync_mm_rss(tsk->mm);
+ } else {
+ /*
+ * Switching from kernel mm to user. Drop the old lazy
+ * mm reference.
+ */
+ mm_to_drop = tsk->active_mm;
+ }
+
+ old_active_mm = tsk->active_mm;
+ tsk->active_mm = mm;
+ WRITE_ONCE(tsk->mm, mm); /* membarrier reads this without locks */
+ membarrier_update_current_mm(mm);
+
+ if (mm_is_brand_new) {
+ /*
+ * For historical reasons, some architectures want IRQs on
+ * when activate_mm() is called. If we're going to call
+ * activate_mm(), turn on IRQs but leave preemption
+ * disabled.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ activate_mm(old_active_mm, mm);
+ if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+ local_irq_enable();
+ } else {
+ switch_mm_irqs_off(old_active_mm, mm, tsk);
+ local_irq_enable();
+ }
+
+ /* IRQs are on now. Preemption is still disabled by task_lock(). */
+
+ membarrier_finish_switch_mm(mm);
+ vmacache_flush(tsk);
+ task_unlock(tsk);
+
+#ifdef finish_arch_post_lock_switch
+ if (!mm_is_brand_new) {
+ /*
+ * Some architectures want a callback after
+ * switch_mm_irqs_off() once locks are dropped. Callers of
+ * activate_mm() historically did not do this, so skip it if
+ * we did activate_mm(). On arm, this is because
+ * activate_mm() switches mm with IRQs on, which uses a
+ * different code path.
+ *
+ * Yes, this is extremely fragile and should be cleaned up.
+ */
+ finish_arch_post_lock_switch();
+ }
+#endif
+
+ if (mm_to_drop)
+ mmdrop(mm_to_drop);
+}
+
+void __change_current_mm_to_kernel(void)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *old_mm = tsk->mm;
+
+ if (!old_mm)
+ return; /* nothing to do */
+
+ /*
+ * We do not want to schedule, nor should procfs peek at current->mm
+ * while we're modifying it. task_lock() disables preemption and
+ * locks against procfs.
+ */
+ task_lock(tsk);
+ /*
+ * membarrier() requires a full barrier before switching mm.
+ */
+ smp_mb__after_spinlock();
+
+ /* current has a real mm, so it must be active */
+ WARN_ON_ONCE(tsk->active_mm != tsk->mm);
+
+ local_irq_disable();
+
+ sync_mm_rss(old_mm);
+
+ WRITE_ONCE(tsk->mm, NULL); /* membarrier reads this without locks */
+ membarrier_update_current_mm(NULL);
+ vmacache_flush(tsk);
+
+ /* active_mm is still 'old_mm' */
+ mmgrab(old_mm);
+ enter_lazy_tlb(old_mm, tsk);
+
+ local_irq_enable();
+
+ task_unlock(tsk);
+}
+
/*
* nr_running and nr_context_switches:
*
--
2.33.1