* Re: [PATCH] powerpc/kup: Mark the kuap/keup function non __init
From: Christophe Leroy @ 2020-12-14 7:22 UTC (permalink / raw)
To: Aneesh Kumar K.V, linuxppc-dev, mpe
In-Reply-To: <20201214071306.346399-1-aneesh.kumar@linux.ibm.com>
Le 14/12/2020 à 08:13, Aneesh Kumar K.V a écrit :
> Kernel call these functions on cpu online and hence they should
> not be marked __init.
This is PPC64 only.
See commit
https://github.com/linuxppc/linux/commit/67d53f30e23ec66aa7bbdd1592d5e64d46876190#diff-9799ddc8e77e666295031a560afc2a754d2f5fa0ddfb96335495b26a07511ad4
Christophe
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
> arch/powerpc/mm/book3s32/mmu.c | 4 ++--
> arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
> arch/powerpc/mm/init-common.c | 2 +-
> arch/powerpc/mm/nohash/8xx.c | 4 ++--
> 4 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index 23f60e97196e..8d9e90a99b51 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
> }
>
> #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
> {
> pr_info("Activating Kernel Userspace Execution Prevention\n");
>
> @@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
> #endif
>
> #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
> {
> pr_info("Activating Kernel Userspace Access Protection\n");
>
> diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
> index 2b7ded396db4..f1c6f264ed91 100644
> --- a/arch/powerpc/mm/book3s64/pkeys.c
> +++ b/arch/powerpc/mm/book3s64/pkeys.c
> @@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
> }
>
> #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
> {
> if (disabled)
> return;
> @@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
> #endif
>
> #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
> {
> if (disabled)
> return;
> diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
> index afdebb95bcae..c71af3978496 100644
> --- a/arch/powerpc/mm/init-common.c
> +++ b/arch/powerpc/mm/init-common.c
> @@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
> }
> early_param("nosmap", parse_nosmap);
>
> -void __ref setup_kup(void)
> +void setup_kup(void)
> {
> setup_kuep(disable_kuep);
> setup_kuap(disable_kuap);
> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
> index 231ca95f9ffb..9fba29b95b5a 100644
> --- a/arch/powerpc/mm/nohash/8xx.c
> +++ b/arch/powerpc/mm/nohash/8xx.c
> @@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
> }
>
> #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
> {
> if (disabled)
> return;
> @@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
> #endif
>
> #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
> {
> pr_info("Activating Kernel Userspace Access Protection\n");
>
>
^ permalink raw reply
* [PATCH] powerpc/kup: Mark the kuap/keup function non __init
From: Aneesh Kumar K.V @ 2020-12-14 7:13 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V
The kernel calls these functions on CPU online and hence they should
not be marked __init.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/mm/book3s32/mmu.c | 4 ++--
arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
arch/powerpc/mm/init-common.c | 2 +-
arch/powerpc/mm/nohash/8xx.c | 4 ++--
4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 23f60e97196e..8d9e90a99b51 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
}
#ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
{
pr_info("Activating Kernel Userspace Execution Prevention\n");
@@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
#endif
#ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
{
pr_info("Activating Kernel Userspace Access Protection\n");
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 2b7ded396db4..f1c6f264ed91 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
}
#ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
{
if (disabled)
return;
@@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
#endif
#ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
{
if (disabled)
return;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index afdebb95bcae..c71af3978496 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
}
early_param("nosmap", parse_nosmap);
-void __ref setup_kup(void)
+void setup_kup(void)
{
setup_kuep(disable_kuep);
setup_kuap(disable_kuap);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 231ca95f9ffb..9fba29b95b5a 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
}
#ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
{
if (disabled)
return;
@@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
#endif
#ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
{
pr_info("Activating Kernel Userspace Access Protection\n");
--
2.28.0
^ permalink raw reply related
* Re: [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Randy Dunlap @ 2020-12-14 7:04 UTC (permalink / raw)
To: Nicholas Piggin, linux-kernel
Cc: linux-arch, linux-mm, linuxppc-dev, Andy Lutomirski
In-Reply-To: <20201214065312.270062-4-npiggin@gmail.com>
On 12/13/20 10:53 PM, Nicholas Piggin wrote:
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 84faaba66364..e69c974369cc 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -443,9 +443,22 @@ config MMU_LAZY_TLB
> config MMU_LAZY_TLB_REFCOUNT
> def_bool y
> depends on MMU_LAZY_TLB
> + depends on !MMU_LAZY_TLB_SHOOTDOWN
> help
> - This must be enabled if MMU_LAZY_TLB is enabled until the next
> - patch.
> + This refcounts the mm that is used as the lazy TLB mm when switching
> + switching to a kernel thread.
duplicate "switching".
> +
> +config MMU_LAZY_TLB_SHOOTDOWN
> + bool
> + depends on MMU_LAZY_TLB
> + help
> + Instead of refcounting the "lazy tlb" mm struct, which can cause
> + contention with multi-threaded apps on large multiprocessor systems,
> + this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
> + switch to init_mm if they were using the to-be-freed mm as the lazy
> + tlb. To implement this, architectures must use _lazy_tlb variants of
> + mm refcounting, and mm_cpumask must include at least all possible
> + CPUs in which mm might be lazy.
>
> config ARCH_HAVE_NMI_SAFE_CMPXCHG
> bool
--
^ permalink raw reply
* [PATCH v2 5/5] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>
On a 16-socket 192-core POWER8 system, a context switching benchmark
with as many software threads as CPUs (so each switch will go in and
out of idle), upstream can achieve a rate of about 1 million context
switches per second. After this patch it goes up to 118 million.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5181872f9452..356138bdb5bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
select HAVE_PERF_USER_STACK_DUMP
select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_PAGE_SIZE
+ select MMU_LAZY_TLB_SHOOTDOWN if PPC_BOOK3S_64
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
--
2.23.0
^ permalink raw reply related
* [PATCH v2 4/5] powerpc: use lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>
Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/kernel/smp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
{
unsigned int cpu = raw_smp_processor_id();
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
smp_store_cpu_info(cpu);
--
2.23.0
^ permalink raw reply related
* [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>
On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).
Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.
Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/Kconfig | 17 +++++++++++++++--
kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 67 insertions(+), 2 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 84faaba66364..e69c974369cc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -443,9 +443,22 @@ config MMU_LAZY_TLB
config MMU_LAZY_TLB_REFCOUNT
def_bool y
depends on MMU_LAZY_TLB
+ depends on !MMU_LAZY_TLB_SHOOTDOWN
help
- This must be enabled if MMU_LAZY_TLB is enabled until the next
- patch.
+ This refcounts the mm that is used as the lazy TLB mm when
+ switching to a kernel thread.
+
+config MMU_LAZY_TLB_SHOOTDOWN
+ bool
+ depends on MMU_LAZY_TLB
+ help
+ Instead of refcounting the "lazy tlb" mm struct, which can cause
+ contention with multi-threaded apps on large multiprocessor systems,
+ this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+ switch to init_mm if they were using the to-be-freed mm as the lazy
+ tlb. To implement this, architectures must use _lazy_tlb variants of
+ mm refcounting, and mm_cpumask must include at least all possible
+ CPUs in which mm might be lazy.
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..74b972d2d8a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,53 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * IPI overheads have not been found to be expensive, but they could
+ * be reduced in a number of possible ways, for example (in
+ * roughly increasing order of complexity):
+ * - A batch of mms requiring IPIs could be gathered and freed
+ * at once.
+ * - CPUs could store their active mm somewhere that can be
+ * remotely checked without a lock, to filter out
+ * false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away
+ * from the mm could clear mm_cpumask to reduce some IPIs
+ * (some batching or delaying would help).
+ * - A delayed freeing and RCU-like quiescing sequence based on
+ * mm switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+ } else {
+ /*
+ * In this case, lazy tlb mms are refcounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ }
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -678,7 +725,12 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ shoot_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
+
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
--
2.23.0
^ permalink raw reply related
* [PATCH v2 2/5] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>
Add CONFIG_MMU_LAZY_TLB which can be configured out to disable
the lazy tlb mechanism entirely, and switches to init_mm when
switching to a kernel thread.
NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. They
have not been switched over by default because the arch code needs
to be audited and tested for lazy tlb mm refcounting and converted
to _lazy_tlb refcounting if necessary.
CONFIG_MMU_LAZY_TLB_REFCOUNT is also added, but it must always
be enabled if CONFIG_MMU_LAZY_TLB is enabled until the next patch
which provides an alternate scheme.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/Kconfig | 17 +++++++++
include/linux/sched/mm.h | 13 +++++--
kernel/sched/core.c | 75 ++++++++++++++++++++++++++++++----------
kernel/sched/sched.h | 4 ++-
4 files changed, 87 insertions(+), 22 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..84faaba66364 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,23 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
irqs disabled over activate_mm. Architectures that do IPI based TLB
shootdowns should enable this.
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+ def_bool y
+ help
+ Enable "lazy TLB" mmu context switching for kernel threads.
+ If this is disabled then switching to a kernel thread always
+ switches to init_mm. If mm switches are inexpensive or free
+ (in the case of NOMMU) then this could be disabled.
+
+config MMU_LAZY_TLB_REFCOUNT
+ def_bool y
+ depends on MMU_LAZY_TLB
+ help
+ This must be enabled if MMU_LAZY_TLB is enabled until the next
+ patch.
+
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 94a117160083..5edf8e942c84 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
/* Helpers for lazy TLB mm refcounting */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
- mmgrab(mm);
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
}
static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
- mmdrop(mm);
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier, see the
+ * membarrier comment in finish_task_switch(), which relies on this.
+ */
+ smp_mb();
+ }
}
/**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2f8ea43d29b..9c1dc9406e4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
- struct mm_struct *mm = rq->prev_mm;
+ struct mm_struct *mm = NULL;
long prev_state;
/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
- rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ mm = rq->prev_lazy_mm;
+ rq->prev_lazy_mm = NULL;
+#endif
/*
* A task struct has one reference for the use as "current".
@@ -3722,22 +3725,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
calculate_sigpending();
}
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
- prepare_task_switch(rq, prev, next);
-
- /*
- * For paravirt, this is coupled with an exit in switch_to to
- * combine the page table reload and the switch backend into
- * one hypercall.
- */
- arch_start_context_switch(prev);
-
/*
* kernel -> kernel lazy + transfer active
* user -> kernel lazy + mmgrab_lazy_tlb() active
@@ -3766,11 +3757,57 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop_lazy_tlb() in finish_task_switch(). */
- rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ /* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+ rq->prev_lazy_mm = prev->active_mm;
prev->active_mm = NULL;
+#else
+ /*
+ * Without MMU_LAZY_TLB_REFCOUNT there is no lazy
+ * tracking (because no rq->prev_lazy_mm) in
+ * finish_task_switch, so no mmdrop_lazy_tlb(),
+ * so no memory barrier for membarrier (see the
+ * membarrier comment in finish_task_switch()).
+ * Do it here.
+ */
+ smp_mb();
+#endif
}
}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
+{
+ if (!next->mm)
+ next->active_mm = &init_mm;
+ membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+ switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+ if (!prev->mm)
+ prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next, struct rq_flags *rf)
+{
+ prepare_task_switch(rq, prev, next);
+
+ /*
+ * For paravirt, this is coupled with an exit in switch_to to
+ * combine the page table reload and the switch backend into
+ * one hypercall.
+ */
+ arch_start_context_switch(prev);
+
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+ context_switch_mm(rq, prev, next);
+ else
+ context_switch_mm_nolazy(rq, prev, next);
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
- struct mm_struct *prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+ struct mm_struct *prev_lazy_mm;
+#endif
unsigned int clock_update_flags;
u64 clock;
--
2.23.0
^ permalink raw reply related
* [PATCH v2 1/5] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>
Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes things a bit more explicit, and allows explicit refcounting
to be removed if it is not used.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/arm/mach-rpc/ecard.c | 2 +-
arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++--
fs/exec.c | 4 ++--
include/linux/sched/mm.h | 11 +++++++++++
kernel/cpu.c | 2 +-
kernel/exit.c | 2 +-
kernel/kthread.c | 11 +++++++----
kernel/sched/core.c | 15 ++++++++-------
8 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b487b489d4b6..74708aef333e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,10 +658,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..56fc23dcbe4d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,9 +1028,9 @@ static int exec_mmap(struct mm_struct *mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
- return 0;
+ } else {
+ mmdrop_lazy_tlb(active_mm);
}
- mmdrop(active_mm);
return 0;
}
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..94a117160083 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ mmdrop(mm);
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..a54cdfa08d71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -576,7 +576,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..da189e0d26ed 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
+ if (active_mm != mm)
tsk->active_mm = mm;
- }
tsk->mm = mm;
switch_mm_irqs_off(active_mm, mm, tsk);
local_irq_enable();
@@ -1257,7 +1257,7 @@ void kthread_use_mm(struct mm_struct *mm)
#endif
if (active_mm != mm)
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@@ -1280,10 +1280,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..c2f8ea43d29b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3629,13 +3629,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -3739,9 +3740,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
@@ -3749,7 +3750,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -3765,7 +3766,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
--
2.23.0
^ permalink raw reply related
* [PATCH v2 0/5] shoot lazy tlbs
From: Nicholas Piggin @ 2020-12-14 6:53 UTC (permalink / raw)
To: linux-kernel
Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
linuxppc-dev
This is another rebase, on top of mainline now (don't need the
asm-generic tree), and without any x86 or membarrier changes.
This makes the series far smaller and more manageable and
without the controversial bits.
Thanks,
Nick
Nicholas Piggin (5):
lazy tlb: introduce lazy mm refcount helper functions
lazy tlb: allow lazy tlb mm switching to be configurable
lazy tlb: shoot lazies, a non-refcounting lazy tlb option
powerpc: use lazy mm refcount helper functions
powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
arch/Kconfig | 30 ++++++++++
arch/arm/mach-rpc/ecard.c | 2 +-
arch/powerpc/Kconfig | 1 +
arch/powerpc/kernel/smp.c | 2 +-
arch/powerpc/mm/book3s64/radix_tlb.c | 4 +-
fs/exec.c | 4 +-
include/linux/sched/mm.h | 20 +++++++
kernel/cpu.c | 2 +-
kernel/exit.c | 2 +-
kernel/fork.c | 52 ++++++++++++++++
kernel/kthread.c | 11 ++--
kernel/sched/core.c | 88 ++++++++++++++++++++--------
kernel/sched/sched.h | 4 +-
13 files changed, 184 insertions(+), 38 deletions(-)
--
2.23.0
^ permalink raw reply
* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14 5:53 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <1607918323.6muyu2l982.astroid@bobo.none>
Excerpts from Nicholas Piggin's message of December 14, 2020 2:07 pm:
> Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>>
>>> I'm still going to pursue shoot-lazies for the merge window. As you
>>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>>> Your change is common code, but a significant complexity (which
>>> affects all archs) so needs a lot more review and testing at this
>>> point.
>>
>> I don't think it's ready for this merge window.
>
> Yes next one I meant (aka this one for development perspective :)).
>
>> I read the early
>> patches again, and I think they make the membarrier code worse, not
>> better.
>
> Mathieu and I disagree, so we are at an impasse.
Well actually not really, I went and cut out the exit_lazy_tlb stuff
from the patch series, those are better to be untangled anyway. I think
an earlier version had something in exit_lazy_tlb for the mm refcounting
change but it's not required now anyway.
I'll split them out and just work on the shoot lazies series for now, I
might revisit exit_lazy_tlb after the dust settles from that and the
current membarrier changes. I'll test and repost shortly.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH 2/2] powerpc/64s: Trim offlined CPUs from mm_cpumasks
From: Nicholas Piggin @ 2020-12-14 4:15 UTC (permalink / raw)
To: Geert Uytterhoeven
Cc: Peter Zijlstra, Aneesh Kumar K.V, Linux Kernel Mailing List,
Anton Vorontsov, Thomas Gleixner, linuxppc-dev
In-Reply-To: <CAMuHMdUdorW03=mipgm92SXNPBZO5owW1Wp6_SacRDZ7fOe9gw@mail.gmail.com>
Excerpts from Geert Uytterhoeven's message of December 10, 2020 7:06 pm:
> Hi Nicholas,
>
> On Fri, Nov 20, 2020 at 4:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> When offlining a CPU, powerpc/64s does not flush TLBs, rather it just
>> leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs
>> to manage its TLBs.
>>
>> However the exit_flush_lazy_tlbs() function expects that after
>> returning, all CPUs (except self) have flushed TLBs for that mm, in
>> which case TLBIEL can be used for this flush. This breaks for offline
>> CPUs because they don't get the IPI to flush their TLB. This can lead
>> to stale translations.
>>
>> Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs
>> before going offline.
>>
>> These offlined CPU bits stuck in the cpumask also prevent the cpumask
>> from being trimmed back to local mode, which means continual broadcast
>> IPIs or TLBIEs are needed for TLB flushing. This patch prevents that
>> situation too.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>
> Thanks for your patch!
>
>> --- a/arch/powerpc/platforms/powermac/smp.c
>> +++ b/arch/powerpc/platforms/powermac/smp.c
>> @@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void)
>>
>> mpic_cpu_set_priority(0xf);
>>
>> + cleanup_cpu_mmu_context();
>> +
>
> I guess this change broke pmac32_defconfig+SMP in v5.10-rc7?
>
> arch/powerpc/platforms/powermac/smp.c: error: implicit
> declaration of function 'cleanup_cpu_mmu_context'
> [-Werror=implicit-function-declaration]: => 914:2
>
> http://kisskb.ellerman.id.au/kisskb/buildresult/14423174/
Hey, yeah, it does; thanks for catching it. This patch fixes it for me:
---
From a9b5ec92ffac975e81c6d7db6ff2b1486b2723f7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 14 Dec 2020 13:52:39 +1000
Subject: [PATCH] powerpc/32s: Fix cleanup_cpu_mmu_context() compile bug
32s has no tlbiel_all() defined, so just disable the cleanup with a
comment.
Fixes: 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from mm_cpumasks")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/platforms/powermac/smp.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index adae2a6712e1..66ef5f8f4445 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -911,7 +911,16 @@ static int smp_core99_cpu_disable(void)
mpic_cpu_set_priority(0xf);
+ /*
+ * Would be nice for consistency if all platforms clear mm_cpumask and
+ * flush TLBs on unplug, but the TLB invalidation bug described in
+ * commit 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from
+ * mm_cpumasks") only applies to 64s and for now we only have the TLB
+ * flush code for that platform.
+ */
+#ifdef CONFIG_PPC64
cleanup_cpu_mmu_context();
+#endif
return 0;
}
--
2.23.0
^ permalink raw reply related
* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14 4:07 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrV5BzXuUYm5YAoEKPZZPfLrbHckvwBHzWKrxZS8hqzHEg@mail.gmail.com>
Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>
>> I'm still going to pursue shoot-lazies for the merge window. As you
>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>> Your change is common code, but a significant complexity (which
>> affects all archs) so needs a lot more review and testing at this
>> point.
>
> I don't think it's ready for this merge window.
Yes next one I meant (aka this one for development perspective :)).
> I read the early
> patches again, and I think they make the membarrier code worse, not
> better.
Mathieu and I disagree, so we are at an impasse. I addressed your
comment about not being able to do the additional core sync avoidance
from the exit tlb call (you can indeed do so in your arch code) and
about exit_lazy_tlb being a call into the scheduler (it's not) and
about the arch code not being able to reconcile lazy tlb mm with the
core scheduler code (you can).
I fundamentally think the core sync is an issue with what the membarrier
/ arch specifics are doing with lazy tlb mm switching, and not something
the core scheduler needs to know about at all. I don't see the big
problem with essentially moving it from an explicit call to
exit_lazy_tlb (which from scheduler POV describes better what it is
doing, not how).
> I'm not fundamentally opposed to the shoot-lazies concept,
> but it needs more thought and it needs a cleaner foundation.
Well shoot lazies actually doesn't really rely on that membarrier
change at all, it just came as a nice looking cleanup so that part
can be dropped from the series. It's not really foundational.
Thanks,
Nick
^ permalink raw reply
* [powerpc:next] BUILD SUCCESS WITH WARNING dddc4ef92d1ce92987da1d6926cdfa99e8acb622
From: kernel test robot @ 2020-12-14 2:54 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
branch HEAD: dddc4ef92d1ce92987da1d6926cdfa99e8acb622 KVM: PPC: Book3S HV: XIVE: Add a comment regarding VP numbering
Warning reports:
https://lore.kernel.org/linuxppc-dev/202012042220.zO7hSFT2-lkp@intel.com
Warning in current branch:
arch/powerpc/kernel/vdso32/vgettimeofday.c:13:5: warning: no previous prototype for function '__c_kernel_clock_gettime64' [-Wmissing-prototypes]
Warning ids grouped by kconfigs:
clang_recent_errors
`-- powerpc64-randconfig-r025-20201213
`-- arch-powerpc-kernel-vdso32-vgettimeofday.c:warning:no-previous-prototype-for-function-__c_kernel_clock_gettime64
elapsed time: 1767m
configs tested: 138
configs skipped: 48
gcc tested configs:
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
nios2 alldefconfig
sh polaris_defconfig
m68k mvme16x_defconfig
openrisc defconfig
mips qi_lb60_defconfig
arm tegra_defconfig
mips ip27_defconfig
s390 alldefconfig
mips bmips_be_defconfig
arm exynos_defconfig
arm collie_defconfig
arc axs101_defconfig
sh urquell_defconfig
sh dreamcast_defconfig
mips rm200_defconfig
sh j2_defconfig
powerpc pseries_defconfig
mips bcm63xx_defconfig
mips nlm_xlp_defconfig
arm shmobile_defconfig
powerpc linkstation_defconfig
sh apsh4ad0a_defconfig
arc tb10x_defconfig
riscv allnoconfig
arm multi_v5_defconfig
sh defconfig
powerpc pcm030_defconfig
m68k atari_defconfig
mips loongson3_defconfig
arm efm32_defconfig
arm footbridge_defconfig
powerpc ppc6xx_defconfig
powerpc amigaone_defconfig
powerpc ebony_defconfig
x86_64 defconfig
alpha allyesconfig
mips maltasmvp_defconfig
mips malta_kvm_guest_defconfig
c6x evmc6678_defconfig
powerpc rainier_defconfig
powerpc g5_defconfig
mips e55_defconfig
powerpc pmac32_defconfig
sh lboxre2_defconfig
sh se7206_defconfig
i386 alldefconfig
xtensa audio_kc705_defconfig
arc nsimosci_hs_smp_defconfig
powerpc mpc832x_mds_defconfig
powerpc currituck_defconfig
arm pxa910_defconfig
arm imote2_defconfig
h8300 h8s-sim_defconfig
microblaze mmu_defconfig
sh se7724_defconfig
m68k apollo_defconfig
mips capcella_defconfig
powerpc mpc7448_hpc2_defconfig
m68k hp300_defconfig
powerpc klondike_defconfig
xtensa smp_lx200_defconfig
sparc64 defconfig
ia64 bigsur_defconfig
parisc generic-32bit_defconfig
csky defconfig
arm multi_v4t_defconfig
sh ecovec24_defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
nds32 defconfig
nios2 allyesconfig
alpha defconfig
xtensa allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
i386 allyesconfig
sparc allyesconfig
sparc defconfig
i386 tinyconfig
i386 defconfig
mips allyesconfig
mips allmodconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
x86_64 randconfig-a003-20201213
x86_64 randconfig-a006-20201213
x86_64 randconfig-a002-20201213
x86_64 randconfig-a005-20201213
x86_64 randconfig-a004-20201213
x86_64 randconfig-a001-20201213
i386 randconfig-a001-20201213
i386 randconfig-a004-20201213
i386 randconfig-a003-20201213
i386 randconfig-a002-20201213
i386 randconfig-a005-20201213
i386 randconfig-a006-20201213
i386 randconfig-a014-20201213
i386 randconfig-a013-20201213
i386 randconfig-a012-20201213
i386 randconfig-a011-20201213
i386 randconfig-a016-20201213
i386 randconfig-a015-20201213
riscv nommu_k210_defconfig
riscv allyesconfig
riscv nommu_virt_defconfig
riscv defconfig
riscv rv32_defconfig
riscv allmodconfig
x86_64 rhel
x86_64 allyesconfig
x86_64 rhel-7.6-kselftests
x86_64 rhel-8.3
x86_64 kexec
clang tested configs:
x86_64 randconfig-a016-20201213
x86_64 randconfig-a012-20201213
x86_64 randconfig-a013-20201213
x86_64 randconfig-a015-20201213
x86_64 randconfig-a014-20201213
x86_64 randconfig-a011-20201213
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* Re: [PATCH v12 00/31] Speculative page faults
From: Joel Fernandes @ 2020-12-14 2:03 UTC (permalink / raw)
To: Chinwen Chang
Cc: jack, sergey.senozhatsky.work, peterz, Will Deacon, mhocko,
linux-mm, paulus, Punit Agrawal, hpa, Michel Lespinasse,
Alexei Starovoitov, Andrea Arcangeli, ak, Minchan Kim,
aneesh.kumar, x86, Matthew Wilcox, Daniel Jordan, Ingo Molnar,
zhong jiang, David Rientjes, paulmck, npiggin, sj38.park,
Jerome Glisse, dave, kemi.wang, kirill, Thomas Gleixner,
Laurent Dufour, Haiyan Song, Ganesh Mahendran, Yang Shi,
Mike Rapoport, linuxppc-dev, linux-kernel, Sergey Senozhatsky,
miles.chen, vinayak menon, akpm, Tim Chen, haren
In-Reply-To: <1594099897.30360.58.camel@mtkswgap22>
On Tue, Jul 07, 2020 at 01:31:37PM +0800, Chinwen Chang wrote:
[..]
> > > Hi Laurent,
> > >
> > > We merged SPF v11 and some patches from v12 into our platforms. After
> > > several experiments, we observed SPF has obvious improvements on the
> > > launch time of applications, especially for those high-TLP ones,
> > >
> > > # launch time of applications(s):
> > >
> > > package version w/ SPF w/o SPF improve(%)
> > > ------------------------------------------------------------------
> > > Baidu maps 10.13.3 0.887 0.98 9.49
> > > Taobao 8.4.0.35 1.227 1.293 5.10
> > > Meituan 9.12.401 1.107 1.543 28.26
> > > WeChat 7.0.3 2.353 2.68 12.20
> > > Honor of Kings 1.43.1.6 6.63 6.713 1.24
> >
> > That's great news, thanks for reporting this!
> >
> > >
> > > By the way, we have verified our platforms with those patches and
> > > achieved the goal of mass production.
> >
> > Another good news!
> > For my information, what is your targeted hardware?
> >
> > Cheers,
> > Laurent.
>
> Hi Laurent,
>
> Our targeted hardware belongs to ARM64 multi-core series.
Hello!
I was trying to develop an intuition about why SPF gives an improvement for
you on small CPU systems. This is just a high-level theory but:
1. Assume the improvement is because of elimination of "blocking" on
mmap_sem.
Could it be that the mmap_sem is acquired in write-mode unnecessarily in some
places, thus causing blocking on mmap_sem in other paths? If so, is it
feasible to convert such usages to acquiring them in read-mode?
2. Assume the improvement is because of lesser read-side contention on
mmap_sem.
On small CPU systems, I would not expect reducing cache-line bouncing to give
such a dramatic improvement in performance as you are seeing.
Thanks for any insight on this!
- Joel
^ permalink raw reply
* [PATCH] powerpc/ps3: use dma_mapping_error()
From: Vincent Stehlé @ 2020-12-13 18:26 UTC (permalink / raw)
To: linuxppc-dev, linux-kernel
Cc: Geoff Levand, Geert Uytterhoeven, Vincent Stehlé
The DMA address returned by dma_map_single() should be checked with
dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
---
drivers/ps3/ps3stor_lib.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/ps3/ps3stor_lib.c b/drivers/ps3/ps3stor_lib.c
index 333ba83006e48..a12a1ad9b5fe3 100644
--- a/drivers/ps3/ps3stor_lib.c
+++ b/drivers/ps3/ps3stor_lib.c
@@ -189,7 +189,7 @@ int ps3stor_setup(struct ps3_storage_device *dev, irq_handler_t handler)
dev->bounce_lpar = ps3_mm_phys_to_lpar(__pa(dev->bounce_buf));
dev->bounce_dma = dma_map_single(&dev->sbd.core, dev->bounce_buf,
dev->bounce_size, DMA_BIDIRECTIONAL);
- if (!dev->bounce_dma) {
+ if (dma_mapping_error(&dev->sbd.core, dev->bounce_dma)) {
dev_err(&dev->sbd.core, "%s:%u: map DMA region failed\n",
__func__, __LINE__);
error = -ENODEV;
--
2.29.2
^ permalink raw reply related
* [PATCH] powerpc: fix alignment bug within the init sections
From: Ariel Marcovitch @ 2020-12-13 18:35 UTC (permalink / raw)
To: mpe; +Cc: paulus, linuxppc-dev, Ariel Marcovitch, linux-kernel
This is a bug that can cause early crashes in configurations with a
.exit.text section smaller than a page and a .init.text section that
ends in the beginning of a physical page (this is kinda random, which
might explain why this wasn't really encountered before).
The init sections are ordered like this:
.init.text
.exit.text
.init.data
Currently, these sections aren't page aligned.
Because the init code is mapped read-only at runtime and because the
.init.text section can potentially reside on the same physical page as
.init.data, the beginning of .init.data might be mapped read-only along
with .init.text.
Then when the kernel tries to modify a variable in .init.data (like
kthreadd_done, used in kernel_init()) the kernel panics.
To avoid this, I made these sections page aligned.
Fixes: 060ef9d89d18 ("powerpc32: PAGE_EXEC required for inittext")
Signed-off-by: Ariel Marcovitch <ariel.marcovitch@gmail.com>
---
arch/powerpc/kernel/vmlinux.lds.S | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 326e113d2e45..e3a7c90c03f4 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -179,6 +179,11 @@ SECTIONS
#endif
} :text
+ /* .init.text is made RO and .exit.text is not, so we must
+ * ensure these sections reside in separate physical pages.
+ */
+ . = ALIGN(PAGE_SIZE);
+
/* .exit.text is discarded at runtime, not link time,
* to deal with references from __bug_table
*/
@@ -186,6 +191,8 @@ SECTIONS
EXIT_TEXT
}
+ . = ALIGN(PAGE_SIZE);
+
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
INIT_DATA
}
base-commit: 1398820fee515873379809a6415930ad0764b2f6
--
2.17.1
^ permalink raw reply related
* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geert Uytterhoeven @ 2020-12-13 19:39 UTC (permalink / raw)
To: Vincent Stehlé
Cc: Geoff Levand, Geert Uytterhoeven, linuxppc-dev,
Linux Kernel Mailing List
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>
On Sun, Dec 13, 2020 at 8:06 PM Vincent Stehlé
<vincent.stehle@laposte.net> wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
>
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
^ permalink raw reply
* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geoff Levand @ 2020-12-13 19:38 UTC (permalink / raw)
To: Vincent Stehlé, linuxppc-dev, linux-kernel; +Cc: Geert Uytterhoeven
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>
On 12/13/20 10:26 AM, Vincent Stehlé wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
>
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
> Cc: Geoff Levand <geoff@infradead.org>
> Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
> ---
> drivers/ps3/ps3stor_lib.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
Looks good. Thanks for submitting.
Acked-by: Geoff Levand <geoff@infradead.org>
^ permalink raw reply
* Re: [PATCH AUTOSEL 5.9 27/39] sched/idle: Fix arch_cpu_idle() vs tracing
From: Sasha Levin @ 2020-12-13 14:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Mark Rutland, uclinux-h8-devel, linux-ia64, linux-parisc,
linux-s390, linux-hexagon, Heiko Carstens, linux-sh, linux-um,
linux-kernel, stable, linux-mips, openrisc, linux-csky,
Sven Schnelle, linux-alpha, sparclinux, linux-riscv, linuxppc-dev,
linux-arm-kernel
In-Reply-To: <20201203171015.GN2414@hirez.programming.kicks-ass.net>
On Thu, Dec 03, 2020 at 06:10:15PM +0100, Peter Zijlstra wrote:
>On Thu, Dec 03, 2020 at 03:54:42PM +0100, Heiko Carstens wrote:
>> On Thu, Dec 03, 2020 at 08:28:21AM -0500, Sasha Levin wrote:
>> > From: Peter Zijlstra <peterz@infradead.org>
>> >
>> > [ Upstream commit 58c644ba512cfbc2e39b758dd979edd1d6d00e27 ]
>> >
>> > We call arch_cpu_idle() with RCU disabled, but then use
>> > local_irq_{en,dis}able(), which invokes tracing, which relies on RCU.
>> >
>> > Switch all arch_cpu_idle() implementations to use
>> > raw_local_irq_{en,dis}able() and carefully manage the
>> > lockdep,rcu,tracing state like we do in entry.
>> >
>> > (XXX: we really should change arch_cpu_idle() to not return with
>> > interrupts enabled)
>> >
>> > Reported-by: Sven Schnelle <svens@linux.ibm.com>
>> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> > Reviewed-by: Mark Rutland <mark.rutland@arm.com>
>> > Tested-by: Mark Rutland <mark.rutland@arm.com>
>> > Link: https://lkml.kernel.org/r/20201120114925.594122626@infradead.org
>> > Signed-off-by: Sasha Levin <sashal@kernel.org>
>>
>> This patch broke s390 irq state tracing. A patch to fix this is
>> scheduled to be merged upstream today (hopefully).
>> Therefore I think this patch should not yet go into 5.9 stable.
>
>Agreed.
I'll also grab b1cae1f84a0f ("s390: fix irq state tracing"). Thanks!
--
Thanks,
Sasha
^ permalink raw reply
* [PATCH 2/3] kbuild: LD_VERSION redenomination
From: Masahiro Yamada @ 2020-12-12 16:54 UTC (permalink / raw)
To: linux-kbuild
Cc: Thomas Bogendoerfer, Dominique Martinet, linuxppc-dev,
linux-kernel, Jiaxun Yang, linux-mips, Paul Mackerras,
Catalin Marinas, Huacai Chen, Will Deacon, Masahiro Yamada,
linux-arm-kernel
In-Reply-To: <20201212165431.150750-1-masahiroy@kernel.org>
Commit ccbef1674a15 ("Kbuild, lto: add ld-version and ld-ifversion
macros") introduced scripts/ld-version.sh for GCC LTO.
At that time, this script handled 5 version fields because GCC LTO
needed the downstream binutils. (https://lkml.org/lkml/2014/4/8/272)
The code snippet from the submitted patch was as follows:
# We need HJ Lu's Linux binutils because mainline binutils does not
# support mixing assembler and LTO code in the same ld -r object.
# XXX check if the gcc plugin ld is the expected one too
# XXX some Fedora binutils should also support it. How to check for that?
ifeq ($(call ld-ifversion,-ge,22710001,y),y)
...
However, GCC LTO was not merged into the mainline after all.
(https://lkml.org/lkml/2014/4/8/272)
So, the 4th and 5th fields were never used, and finally removed by
commit 0d61ed17dd30 ("ld-version: Drop the 4th and 5th version
components").
Since then, the last 4 digits returned by this script are always zeros.
Remove the meaningless last 4 digits. This makes the version format
consistent with GCC_VERSION, CLANG_VERSION, LLD_VERSION.
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
arch/arm64/Kconfig | 2 +-
arch/mips/loongson64/Platform | 2 +-
arch/mips/vdso/Kconfig | 2 +-
arch/powerpc/Makefile | 2 +-
arch/powerpc/lib/Makefile | 2 +-
scripts/ld-version.sh | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a6b5b7ef40ae..69d56b21a6ec 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1499,7 +1499,7 @@ config ARM64_PTR_AUTH
depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
# Modern compilers insert a .note.gnu.property section note for PAC
# which is only understood by binutils starting with version 2.33.1.
- depends on LD_IS_LLD || LD_VERSION >= 233010000 || (CC_IS_GCC && GCC_VERSION < 90100)
+ depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
help
diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform
index ec42c5085905..cc0b9c87f9ad 100644
--- a/arch/mips/loongson64/Platform
+++ b/arch/mips/loongson64/Platform
@@ -35,7 +35,7 @@ cflags-$(CONFIG_CPU_LOONGSON64) += $(call as-option,-Wa$(comma)-mno-fix-loongson
# can't easily be used safely within the kbuild framework.
#
ifeq ($(call cc-ifversion, -ge, 0409, y), y)
- ifeq ($(call ld-ifversion, -ge, 225000000, y), y)
+ ifeq ($(call ld-ifversion, -ge, 22500, y), y)
cflags-$(CONFIG_CPU_LOONGSON64) += \
$(call cc-option,-march=loongson3a -U_MIPS_ISA -D_MIPS_ISA=_MIPS_ISA_MIPS64)
else
diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig
index 7aec721398d5..a665f6108cb5 100644
--- a/arch/mips/vdso/Kconfig
+++ b/arch/mips/vdso/Kconfig
@@ -12,7 +12,7 @@
# the lack of relocations. As such, we disable the VDSO for microMIPS builds.
config MIPS_LD_CAN_LINK_VDSO
- def_bool LD_VERSION >= 225000000 || LD_IS_LLD
+ def_bool LD_VERSION >= 22500 || LD_IS_LLD
config MIPS_DISABLE_VDSO
def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 5c8c06215dd4..6a9a852c3d56 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -65,7 +65,7 @@ UTS_MACHINE := $(subst $(space),,$(machine-y))
ifdef CONFIG_PPC32
KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
else
-ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
+ifeq ($(call ld-ifversion, -ge, 22500, y),y)
# Have the linker provide sfpr if possible.
# There is a corresponding test in arch/powerpc/lib/Makefile
KBUILD_LDFLAGS_MODULE += --save-restore-funcs
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 69a91b571845..d4efc182662a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
# 64-bit linker creates .sfpr on demand for final link (vmlinux),
# so it is only needed for modules, and only for older linkers which
# do not support --save-restore-funcs
-ifeq ($(call ld-ifversion, -lt, 225000000, y),y)
+ifeq ($(call ld-ifversion, -lt, 22500, y),y)
extra-$(CONFIG_PPC64) += crtsavres.o
endif
diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh
index f2be0ff9a738..0f8a2c0f9502 100755
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -6,6 +6,6 @@
gsub(".*version ", "");
gsub("-.*", "");
split($1,a, ".");
- print a[1]*100000000 + a[2]*1000000 + a[3]*10000;
+ print a[1]*10000 + a[2]*100 + a[3];
exit
}
--
2.27.0
^ permalink raw reply related
* [PATCH 2/2] dmaengine: fsldma: Fix a resource leak in an error handling path of the probe function
From: Christophe JAILLET @ 2020-12-12 16:06 UTC (permalink / raw)
To: leoyang.li, zw, vkoul, dan.j.williams, iws
Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
linux-kernel
In case of error, the previous 'fsl_dma_chan_probe()' calls must be undone
by some 'fsl_dma_chan_remove()', as already done in the remove function.
It was added in the remove function in commit 77cd62e8082b ("fsldma: allow
Freescale Elo DMA driver to be compiled as a module")
Fixes: d3f620b2c4fe ("fsldma: simplify IRQ probing and handling")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
drivers/dma/fsldma.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 554f70a0c18c..f8459cc5315d 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1214,6 +1214,7 @@ static int fsldma_of_probe(struct platform_device *op)
{
struct fsldma_device *fdev;
struct device_node *child;
+ unsigned int i;
int err;
fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
@@ -1292,6 +1293,10 @@ static int fsldma_of_probe(struct platform_device *op)
return 0;
out_free_fdev:
+ for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+ if (fdev->chan[i])
+ fsl_dma_chan_remove(fdev->chan[i]);
+ }
irq_dispose_mapping(fdev->irq);
iounmap(fdev->regs);
out_free:
--
2.27.0
^ permalink raw reply related
* [PATCH 1/2] dmaengine: fsldma: Fix a resource leak in the remove function
From: Christophe JAILLET @ 2020-12-12 16:05 UTC (permalink / raw)
To: leoyang.li, zw, vkoul, dan.j.williams, timur
Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
linux-kernel
A 'irq_dispose_mapping()' call is missing in the remove function.
Add it.
This is needed to undo the 'irq_of_parse_and_map()' call from the probe
function and is already part of the error handling path of the probe function.
It was added in the probe function only in commit d3f620b2c4fe ("fsldma:
simplify IRQ probing and handling")
Fixes: 77cd62e8082b ("fsldma: allow Freescale Elo DMA driver to be compiled as a module")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
drivers/dma/fsldma.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 0feb323bae1e..554f70a0c18c 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1314,6 +1314,7 @@ static int fsldma_of_remove(struct platform_device *op)
if (fdev->chan[i])
fsl_dma_chan_remove(fdev->chan[i]);
}
+ irq_dispose_mapping(fdev->irq);
iounmap(fdev->regs);
kfree(fdev);
--
2.27.0
^ permalink raw reply related
* [PATCH] powerpc/vas: Fix IRQ name allocation
From: Cédric Le Goater @ 2020-12-12 14:27 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Sukadev Bhattiprolu, Haren Myneni, Cédric Le Goater
The VAS device allocates a generic interrupt to handle page faults but
the IRQ name doesn't show under /proc. This is because it's on the
stack. Allocate the name.
Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
I didn't understand this part in init_vas_instance() :
if (vinst->virq) {
rc = vas_irq_fault_window_setup(vinst);
/*
* Fault window is used only for user space send windows.
* So if vinst->virq is NULL, tx_win_open returns -ENODEV
* for user space.
*/
if (rc)
vinst->virq = 0;
}
If the IRQ cannot be requested, the device probing should fail but
it doesn't today. The use of 'vinst->virq' is suspicious.
arch/powerpc/platforms/powernv/vas.h | 1 +
arch/powerpc/platforms/powernv/vas.c | 11 ++++++++---
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 70f793e8f6cc..c7db3190baca 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -340,6 +340,7 @@ struct vas_instance {
struct vas_window *rxwin[VAS_COP_TYPE_MAX];
struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+ char *name;
char *dbgname;
struct dentry *dbgdir;
};
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 598e4cd563fb..b65256a63e87 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -28,12 +28,10 @@ static DEFINE_PER_CPU(int, cpu_vas_id);
static int vas_irq_fault_window_setup(struct vas_instance *vinst)
{
- char devname[64];
int rc = 0;
- snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
rc = request_threaded_irq(vinst->virq, vas_fault_handler,
- vas_fault_thread_fn, 0, devname, vinst);
+ vas_fault_thread_fn, 0, vinst->name, vinst);
if (rc) {
pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
@@ -80,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
if (!vinst)
return -ENOMEM;
+ vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+ if (!vinst->name) {
+ kfree(vinst);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&vinst->node);
ida_init(&vinst->ida);
mutex_init(&vinst->mutex);
@@ -162,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
return 0;
free_vinst:
+ kfree(vinst->name);
kfree(vinst);
return -ENODEV;
--
2.26.2
^ permalink raw reply related
* [PATCH] powerpc/mm: Fix hugetlb_free_pmd_range() and hugetlb_free_pud_range()
From: Christophe Leroy @ 2020-12-12 13:41 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, qcai
Cc: linuxppc-dev, linux-kernel
Commit 7bfe54b5f165 ("powerpc/mm: Refactor the floor/ceiling check in
hugetlb range freeing functions") inadvertently removed the mask
applied to start parameter in those two functions, leading to the
following crash on power9.
[ 7703.114640][T58070] LTP: starting hugemmap05_1 (hugemmap05 -m)
[ 7703.157792][ C99] ------------[ cut here ]------------
[ 7703.158279][ C99] kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:387!
[ 7703.158306][ C99] Oops: Exception in kernel mode, sig: 5 [#1]
[ 7703.158330][ C99] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=256 NUMA PowerNV
[ 7703.158343][ C99] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_spapr_tce vfio vfio_spapr_eeh loop kvm_hv kvm ip_tables x_tables sd_mod ahci libahci tg3 libata firmware_class libphy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod]
[ 7703.158435][ C99] CPU: 99 PID: 308 Comm: ksoftirqd/99 Tainted: G O 5.10.0-rc7-next-20201211 #1
[ 7703.158464][ C99] NIP: c00000000005dbec LR: c0000000003352f4 CTR: 0000000000000000
[ 7703.158489][ C99] REGS: c00020000bb6f830 TRAP: 0700 Tainted: G O (5.10.0-rc7-next-20201211)
[ 7703.158528][ C99] MSR: 900000000282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24002284 XER: 20040000
[ 7703.158570][ C99] GPR00: c0000000003352f4 c00020000bb6fad0 c000000007f70b00 c0002000385b3ff0
[ 7703.158570][ C99] GPR04: 0000000000000000 0000000000000003 c00020000bb6f8b4 0000000000000001
[ 7703.158570][ C99] GPR08: 0000000000000001 0000000000000009 0000000000000008 0000000000000002
[ 7703.158570][ C99] GPR12: 0000000024002488 c000201fff649c00 c000000007f2a20c 0000000000000000
[ 7703.158570][ C99] GPR16: 0000000000000007 0000000000000000 c000000000194d10 c000000000194d10
[ 7703.158570][ C99] GPR24: 0000000000000014 0000000000000015 c000201cc6e72398 c000000007fac4b4
[ 7703.158570][ C99] GPR28: c000000007f2bf80 c000000007fac2f8 0000000000000008 c000200033870000
[ 7703.158766][ C99] NIP [c00000000005dbec] __tlb_remove_table+0x1dc/0x1e0
pgtable_free at arch/powerpc/mm/book3s64/pgtable.c:387
(inlined by) __tlb_remove_table at arch/powerpc/mm/book3s64/pgtable.c:405
[ 7703.158805][ C99] LR [c0000000003352f4] tlb_remove_table_rcu+0x54/0xa0
[ 7703.158853][ C99] Call Trace:
[ 7703.158872][ C99] [c00020000bb6fad0] [c00000000005db4c] __tlb_remove_table+0x13c/0x1e0 (unreliable)
[ 7703.158890][ C99] [c00020000bb6fb00] [c0000000003352f4] tlb_remove_table_rcu+0x54/0xa0
__tlb_remove_table_free at mm/mmu_gather.c:101
(inlined by) tlb_remove_table_rcu at mm/mmu_gather.c:156
[ 7703.158927][ C99] [c00020000bb6fb30] [c000000000194d7c] rcu_core+0x35c/0xbb0
rcu_do_batch at kernel/rcu/tree.c:2502
(inlined by) rcu_core at kernel/rcu/tree.c:2737
[ 7703.158966][ C99] [c00020000bb6fbf0] [c00000000095a3d0] __do_softirq+0x480/0x704
[ 7703.159006][ C99] [c00020000bb6fd10] [c0000000000cc1f4] run_ksoftirqd+0x74/0xd0
run_ksoftirqd at kernel/softirq.c:651
(inlined by) run_ksoftirqd at kernel/softirq.c:642
[ 7703.159046][ C99] [c00020000bb6fd30] [c0000000001040c8] smpboot_thread_fn+0x278/0x320
[ 7703.159096][ C99] [c00020000bb6fda0] [c0000000000fc8a4] kthread+0x1c4/0x1d0
[ 7703.159145][ C99] [c00020000bb6fe10] [c00000000000d9fc] ret_from_kernel_thread+0x5c/0x80
[ 7703.159183][ C99] Instruction dump:
[ 7703.159204][ C99] 60000000 7c0802a6 3c82f8b4 7fe3fb78 38847470 f8010040 482b4fc5 60000000
[ 7703.159248][ C99] 0fe00000 7c0802a6 fbe10028 f8010040 <0fe00000> 3c4c07f1 38422f10 7c0802a6
[ 7703.159293][ C99] ---[ end trace 1d92a5231ba6a0d5 ]---
Properly apply the masks before calling pmd_free_tlb() and
pud_free_tlb() respectively.
Reported-by: Qian Cai <qcai@redhat.com>
Fixes: 7bfe54b5f165 ("powerpc/mm: Refactor the floor/ceiling check in hugetlb range freeing functions")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/mm/hugetlbpage.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f8d8a4988e15..8b3cc4d688e8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -396,9 +396,9 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
if (range_is_outside_limits(start, end, floor, ceiling, PUD_MASK))
return;
- pmd = pmd_offset(pud, start);
+ pmd = pmd_offset(pud, start & PUD_MASK);
pud_clear(pud);
- pmd_free_tlb(tlb, pmd, start);
+ pmd_free_tlb(tlb, pmd, start & PUD_MASK);
mm_dec_nr_pmds(tlb->mm);
}
@@ -439,9 +439,9 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
if (range_is_outside_limits(start, end, floor, ceiling, PGDIR_MASK))
return;
- pud = pud_offset(p4d, start);
+ pud = pud_offset(p4d, start & PGDIR_MASK);
p4d_clear(p4d);
- pud_free_tlb(tlb, pud, start);
+ pud_free_tlb(tlb, pud, start & PGDIR_MASK);
mm_dec_nr_puds(tlb->mm);
}
--
2.25.0
^ permalink raw reply related
* Re: [powerpc/merge] System crash during cpu offline/online operation
From: Sachin Sant @ 2020-12-12 10:07 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <F4809152-C7F5-4ED8-B071-85A9115BD29D@linux.vnet.ibm.com>
> On 11-Dec-2020, at 3:47 PM, Sachin Sant <sachinp@linux.vnet.ibm.com> wrote:
>
> I am observing a system crash during a cpu offline/online operation
> with the latest merge branch code running in a PowerVM LPAR (P8 onwards)
>
> # uname -r
> 5.10.0-rc7-01792-g244569c777ca
> # ppc64_cpu --smt=1
> [ 244.205194] cpu 1 (hwid 1) Ready to die...
> ………
> ……...
> [ 247.015113] cpu 30 (hwid 30) Ready to die...
> [ 247.104973] cpu 31 (hwid 31) Ready to die...
> # ppc64_cpu --smt=8
>
> At this point the LPAR reboots instantly without any trace message.
Git bisect leads me to the following commit:
3b47b7549ead0719e94022c6742199333c7c8d9f is the first bad commit
commit 3b47b7549ead0719e94022c6742199333c7c8d9f
Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Date: Fri Nov 27 10:14:07 2020 +0530
powerpc/book3s64/kuap: Move KUAP related function outside radix
Here is the bisect log:
# git bisect log
git bisect start
# bad: [244569c777ca638b08c75db88fe035bdec52ef80] Automatic merge of 'next' into merge (2020-12-10 00:34)
git bisect bad 244569c777ca638b08c75db88fe035bdec52ef80
# good: [9acd775e4579bde0a6d937d72f9669e418aa87ad] Automatic merge of 'master' into merge (2020-12-05 22:54)
git bisect good 9acd775e4579bde0a6d937d72f9669e418aa87ad
# good: [ab91292cb3e9f43d9c6839d7572d17b35bc21710] Merge tag 'char-misc-5.10-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc
git bisect good ab91292cb3e9f43d9c6839d7572d17b35bc21710
# bad: [19b311ca51e108b6d8d679496af8635fdc1984a8] ocxl: Initiate a TLB invalidate command
git bisect bad 19b311ca51e108b6d8d679496af8635fdc1984a8
# bad: [d94b827e89dc3f92cd871d10f4992a6bd3c861e5] powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation
git bisect bad d94b827e89dc3f92cd871d10f4992a6bd3c861e5
# good: [1d15ffdfc94127d75e04a88344ee1ce8c79f05fd] KVM: PPC: Book3S HV: Ratelimit machine check messages coming from guests
git bisect good 1d15ffdfc94127d75e04a88344ee1ce8c79f05fd
# good: [9f378b9f007cc94beadea40df83cc62a76975c6f] KVM: PPC: BOOK3S: PR: Ignore UAMOR SPR
git bisect good 9f378b9f007cc94beadea40df83cc62a76975c6f
# bad: [3b47b7549ead0719e94022c6742199333c7c8d9f] powerpc/book3s64/kuap: Move KUAP related function outside radix
git bisect bad 3b47b7549ead0719e94022c6742199333c7c8d9f
# good: [39df17bc20059c84ddc6f91831fce2e2cc79a6f3] powerpc/book3s64/kuap/kuep: Move uamor setup to pkey init
git bisect good 39df17bc20059c84ddc6f91831fce2e2cc79a6f3
# first bad commit: [3b47b7549ead0719e94022c6742199333c7c8d9f] powerpc/book3s64/kuap: Move KUAP related function outside radix
Thanks
-Sachin
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox