LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [RFC v2 2/2] [MOCKUP] sched/mm: Lightweight lazy mm refcounting
From: Andy Lutomirski @ 2020-12-04  5:26 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Nadav Amit, Arnd Bergmann, Rik van Riel, Will Deacon,
	X86 ML, Dave Hansen, LKML, Linux-MM, Mathieu Desnoyers,
	Andy Lutomirski, Catalin Marinas, Jann Horn, linuxppc-dev
In-Reply-To: <cover.1607059162.git.luto@kernel.org>

This is a mockup.  It's designed to illustrate the algorithm and how the
code might be structured.  There are several things blatantly wrong with
it:

The coding stype is not up to kernel standards.  I have prototypes in the
wrong places and other hacks.

There's a problem with mm_cpumask() not being reliable.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 kernel/fork.c        |   4 ++
 kernel/sched/core.c  | 128 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |  11 +++-
 3 files changed, 126 insertions(+), 17 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index da8d360fb032..0887a33cf84f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1066,6 +1066,8 @@ struct mm_struct *mm_alloc(void)
 	return mm_init(mm, current, current_user_ns());
 }
 
+extern void mm_fixup_lazy_refs(struct mm_struct *mm);
+
 static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));
@@ -1084,6 +1086,8 @@ static inline void __mmput(struct mm_struct *mm)
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+
+	mm_fixup_lazy_refs(mm);
 	mmdrop(mm);
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6c4b76147166..69dfdfe0e5b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3555,6 +3555,75 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	prepare_arch_switch(next);
 }
 
+static void drop_extra_mm_refs(struct rq *rq)
+{
+	unsigned long old_mm;
+
+	if (likely(!atomic_long_read(&rq->mm_to_mmdrop)))
+		return;
+
+	/*
+	 * Slow path.  This only happens when we recently stopped using
+	 * an mm that is exiting.
+	 */
+	old_mm = atomic_long_xchg_relaxed(&rq->mm_to_mmdrop, 0);
+	if (old_mm)
+		mmdrop((struct mm_struct *)old_mm);
+}
+
+/*
+ * This ensures that all lazy_mm refs to mm are converted to mm_count
+ * refcounts.  Our caller holds an mm_count reference, so we don't need
+ * to worry about mm being freed out from under us.
+ */
+void mm_fixup_lazy_refs(struct mm_struct *mm)
+{
+	int cpu;
+
+	/*
+	 * mm_users is zero, so no new lazy refs will be taken.
+	 */
+	WARN_ON_ONCE(atomic_read(&mm->mm_users) != 0);
+
+	/*
+	 * XXX: this is wrong on arm64 and possibly on other architectures.
+	 * Maybe we need a config option for this?  Or a
+	 * for_each_possible_lazy_cpu(cpu, mm) helper?
+	 */
+	for_each_cpu(cpu, mm_cpumask(mm)) {
+		struct rq *rq = cpu_rq(cpu);
+		unsigned long old;
+
+		if (READ_ONCE(rq->lazy_mm) != mm)
+			continue;
+
+		// XXX: we could optimize this by doing a big addition to
+		// mm_count up front instead of incrementing it separately
+		// for each CPU.
+		mmgrab(mm);
+
+		// XXX: could this be relaxed instead?
+		old = atomic_long_xchg(&rq->mm_to_mmdrop, (unsigned long)mm);
+
+		// At this point, mm can be mmdrop()ed at any time, probably
+		// by the target cpu.
+
+		if (!old)
+			continue;  // All done!
+
+		WARN_ON_ONCE(old == (unsigned long)mm);
+
+		// Uh oh!  We just stole an mm reference from the target CPU.
+		// Fortunately, we just observed the target's lazy_mm pointing
+		// to something other than old, and we observed this after
+		// bringing mm_users down to 0.  This means that the remote
+		// cpu is definitely done with old.  So we can drop it on the
+		// remote CPU's behalf.
+
+		mmdrop((struct mm_struct *)old);
+	}
+}
+
 /**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
@@ -3578,7 +3647,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
 
 	/*
@@ -3597,8 +3665,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
-
 	/*
 	 * A task struct has one reference for the use as "current".
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
@@ -3629,11 +3695,28 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, and mmdrop() provides
 	 * this barrier.
 	 *
-	 * XXX: I don't think mmdrop() actually does this.  There's no
-	 * smp_mb__before/after_atomic() in there.
+	 * XXX: I don't think mmdrop() actually did this.  There's no
+	 * smp_mb__before/after_atomic() in there.  But mmdrop()
+	 * certainly doesn't do this now, since we don't call mmdrop().
 	 */
-	if (mm)
-		mmdrop(mm);
+	if (current->mm && rq->lazy_mm) {
+		/*
+		 * We are unlazying.  Any remote CPU that observes our
+		 * store to lazy_mm is permitted to free the mm if mm_users
+		 * and mm_count are both zero.
+		 */
+		WRITE_ONCE(rq->lazy_mm, NULL);
+	}
+
+	// Do this unconditionally.  There's a race in which a remote CPU
+	// sees rq->lazy_mm != NULL and gives us an extra mm ref while we
+	// are executing this code and we don't notice.  Instead of letting
+	// that ref sit around until the next time we unlazy, do it on every
+	// context switch.
+	//
+	// XXX: maybe we should do this at the beginning of a context switch
+	// instead?
+	drop_extra_mm_refs(rq);
 
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
@@ -3737,20 +3820,31 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	arch_start_context_switch(prev);
 
 	/*
-	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 * TODO: write a new comment!
 	 *
-	 * kernel ->   user   switch + mmdrop() active
-	 *   user ->   user   switch
+	 * NB: none of this is kept in sync with the arch code.
+	 * In particular, active_mm can point to an mm that is no longer
+	 * in use by the arch mm code, and this condition can persist
+	 * across multiple context switches.  This isn't a problem
+	 * per se, but it does mean that using active_mm for anything
+	 * other than keeping an mm from being freed is a bit dubious.
 	 */
 	if (!next->mm) {                                // to kernel
 		enter_lazy_tlb(prev->active_mm, next);
 
 		next->active_mm = prev->active_mm;
-		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
-		else
+		if (prev->mm) {                         // from user
+			WARN_ON_ONCE(rq->lazy_mm);
+			WRITE_ONCE(rq->lazy_mm, next->active_mm);
+			/*
+			 * barrier here?  this needs to be visible to any
+			 * remote CPU that starts executing __mmput().  That
+			 * can't happen until either we call mmput() or until
+			 * prev migrates elsewhere.
+			 */
+		} else {
 			prev->active_mm = NULL;
+		}
 	} else {                                        // to user
 		membarrier_switch_mm(rq, prev->active_mm, next->mm);
 		/*
@@ -3760,12 +3854,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		 * The below provides this either through switch_mm(), or in
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
+		 *
+		 * XXX: mmdrop() didn't do this before, and the new
+		 * code doesn't even call mmdrop().
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+			/* will release lazy_mm in finish_task_switch(). */
 			prev->active_mm = NULL;
 		}
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..e0caee5f158e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,16 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+
+	/*
+	 * Hazard pointer for an mm that we might be using lazily.
+	 */
+	struct mm_struct	*lazy_mm;
+
+	/*
+	 * An mm that needs mmdrop()ing.
+	 */
+	atomic_long_t		mm_to_mmdrop;
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH 3/7] powerpc/64s: flush L1D after user accesses
From: Christophe Leroy @ 2020-12-04  6:21 UTC (permalink / raw)
  To: Qian Cai
  Cc: cmr, spoorts2, npiggin, linuxppc-dev, Christoph Hellwig,
	Daniel Axtens
In-Reply-To: <da02e10d6b5a63dc10159d4420def15aa0bc4c19.camel@redhat.com>


Quoting Qian Cai <qcai@redhat.com>:

> On Thu, 2020-12-03 at 12:17 -0500, Qian Cai wrote:
>> []
>> > +static inline bool
>> > +bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool  
>> is_write)
>> > +{
>> > +	return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) &&
>> > +		    (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE :  
>> AMR_KUAP_BLOCK_READ)),
>> > +		    "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read");
>> > +}
>>
>> A simple "echo t > /proc/sysrq-trigger" will trigger this warning almost
>> endlessly on POWER9 NV.
>
> I have just realized the patch just moved this warning around, so  
> the issue was
> pre-existent. Since I have not tested sysrq-t regularly, I am not  
> sure when it
> started to break. So far, I have reverted some of those for testing which did
> not help, i.e., the sysrq-t issue remains.
>
> 16852975f0f  Revert "powerpc/64s: Use early_mmu_has_feature() in set_kuap()"
> 129e240ead32 Revert "powerpc: Implement user_access_save() and  
> user_access_restore()"
> edb0046c842c Revert "powerpc/64s/kuap: Add missing isync to KUAP  
> restore paths"
> 2d46ee87ce44 Revert "powerpc/64/kuap: Conditionally restore AMR in  
> interrupt exit"
> c1e0e805fc57 Revert "powerpc/64s/kuap: Conditionally restore AMR in  
> kuap_restore_amr asm"
> 7f30b7aaf23a Revert "selftests/powerpc: rfi_flush: disable entry  
> flush if present"
> bc9b9967a100 Revert "powerpc/64s: flush L1D on kernel entry"
> b77e7b54f5eb Revert "powerpc/64s: flush L1D after user accesses"
> 22dddf532c64 Revert "powerpc: Only include kup-radix.h for 64-bit Book3S"
> 2679d155c46a Revert "selftests/powerpc: entry flush test"
> 87954b9b4243 Revert "selftests/powerpc: refactor entry and rfi_flush tests"
> 342d82bd4c5d Revert "powerpc/64s: rename pnv|pseries_setup_rfi_flush  
> to _setup_security_mitigations"

I also hit that WARNING in the same way earlier this week.

I think it has been broken by commit c33165253492 ("powerpc: use  
non-set_fs based maccess routines")

IIUC we should provide copy_from_kernel_nofault_allowed() to avoid that.

Christophe

^ permalink raw reply

* Re: [PATCH] powerpc/mm: Don't see NULL pointer dereference as a KUAP fault
From: Christophe Leroy @ 2020-12-04  6:30 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <87mtyvumrn.fsf@mpe.ellerman.id.au>



Le 03/12/2020 à 12:55, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Sometimes, NULL pointer dereferences are expected. Even when they
>> are accidental they are unlikely an exploit attempt because the
>> first page is never mapped.
> 
> The first page can be mapped if mmap_min_addr is 0.
> 
> Blocking all faults to the first page would potentially break any
> program that does that.
> 
> Also if there is something mapped at 0 it's a good chance it is an
> exploit attempt :)

Ok, I see.

In fact, we hit this warning because we don't provide copy_from_kernel_nofault_allowed()

I'll cook a patch for that.

Christophe

^ permalink raw reply

* Re: [RFC v2 1/2] [NEEDS HELP] x86/mm: Handle unlazying membarrier core sync in the arch code
From: Nicholas Piggin @ 2020-12-04  7:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Nadav Amit, X86 ML, Arnd Bergmann, Jann Horn,
	Catalin Marinas, Rik van Riel, LKML, Linux-MM, Dave Hansen,
	Will Deacon, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <203d39d11562575fd8bd6a094d97a3a332d8b265.1607059162.git.luto@kernel.org>

Excerpts from Andy Lutomirski's message of December 4, 2020 3:26 pm:
> The core scheduler isn't a great place for
> membarrier_mm_sync_core_before_usermode() -- the core scheduler doesn't
> actually know whether we are lazy.  With the old code, if a CPU is
> running a membarrier-registered task, goes idle, gets unlazied via a TLB
> shootdown IPI, and switches back to the membarrier-registered task, it
> will do an unnecessary core sync.
> 
> Conveniently, x86 is the only architecture that does anything in this
> hook, so we can just move the code.

This should go on top of my series that adds the exit_lazy_mm call
and switches x86 over, at least.

> XXX: there are some comments in swich_mm_irqs_off() that seem to be
> trying to document what barriers are expected, and it's not clear to me
> that these barriers are actually present in all paths through the
> code.  So I think this change makes the code more comprehensible and
> has no effect on the code's correctness, but I'm not at all convinced
> that the code is correct.
> 
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
> ---
>  arch/x86/mm/tlb.c   | 17 ++++++++++++++++-
>  kernel/sched/core.c | 14 +++++++-------
>  2 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 3338a1feccf9..23df035b80e8 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -8,6 +8,7 @@
>  #include <linux/export.h>
>  #include <linux/cpu.h>
>  #include <linux/debugfs.h>
> +#include <linux/sched/mm.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/mmu_context.h>
> @@ -496,6 +497,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  		 * from one thread in a process to another thread in the same
>  		 * process. No TLB flush required.
>  		 */
> +
> +		// XXX: why is this okay wrt membarrier?
>  		if (!was_lazy)
>  			return;
>  
> @@ -508,12 +511,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
>  		smp_mb();
>  		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
>  		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
> -				next_tlb_gen)
> +		    next_tlb_gen) {
> +			/*
> +			 * We're reactivating an mm, and membarrier might
> +			 * need to serialize.  Tell membarrier.
> +			 */
> +
> +			// XXX: I can't understand the logic in
> +			// membarrier_mm_sync_core_before_usermode().  What's
> +			// the mm check for?

Writing CR3 is serializing, apparently. Another x86ism that gets 
commented and moved into arch/x86 with my patch.


> +			membarrier_mm_sync_core_before_usermode(next);
>  			return;
> +		}
>  
>  		/*
>  		 * TLB contents went out of date while we were in lazy
>  		 * mode. Fall through to the TLB switching code below.
> +		 * No need for an explicit membarrier invocation -- the CR3
> +		 * write will serialize.
>  		 */
>  		new_asid = prev_asid;
>  		need_flush = true;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2d95dc3f4644..6c4b76147166 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3619,22 +3619,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>  	kcov_finish_switch(current);
>  
>  	fire_sched_in_preempt_notifiers(current);
> +
>  	/*
>  	 * When switching through a kernel thread, the loop in
>  	 * membarrier_{private,global}_expedited() may have observed that
>  	 * kernel thread and not issued an IPI. It is therefore possible to
>  	 * schedule between user->kernel->user threads without passing though
>  	 * switch_mm(). Membarrier requires a barrier after storing to
> -	 * rq->curr, before returning to userspace, so provide them here:
> +	 * rq->curr, before returning to userspace, and mmdrop() provides
> +	 * this barrier.
>  	 *
> -	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
> -	 *   provided by mmdrop(),
> -	 * - a sync_core for SYNC_CORE.
> +	 * XXX: I don't think mmdrop() actually does this.  There's no
> +	 * smp_mb__before/after_atomic() in there.

mmdrop definitely does provide a full barrier.

>  	 */
> -	if (mm) {
> -		membarrier_mm_sync_core_before_usermode(mm);
> +	if (mm)
>  		mmdrop(mm);
> -	}
> +
>  	if (unlikely(prev_state == TASK_DEAD)) {
>  		if (prev->sched_class->task_dead)
>  			prev->sched_class->task_dead(prev);
> -- 
> 2.28.0
> 
> 

^ permalink raw reply

* Re: [RFC v2 2/2] [MOCKUP] sched/mm: Lightweight lazy mm refcounting
From: Nicholas Piggin @ 2020-12-04  7:54 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Nadav Amit, X86 ML, Arnd Bergmann, Jann Horn,
	Catalin Marinas, Rik van Riel, LKML, Linux-MM, Dave Hansen,
	Will Deacon, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <e69827244c2f1e534aa83a40334ffa00bafe54c2.1607059162.git.luto@kernel.org>

Excerpts from Andy Lutomirski's message of December 4, 2020 3:26 pm:
> This is a mockup.  It's designed to illustrate the algorithm and how the
> code might be structured.  There are several things blatantly wrong with
> it:
> 
> The coding stype is not up to kernel standards.  I have prototypes in the
> wrong places and other hacks.
> 
> There's a problem with mm_cpumask() not being reliable.

Interesting, this might be a way to reduce those IPIs with fairly 
minimal fast path cost. Would be interesting to see how much performance 
advantage it has over my dumb simple shoot-lazies.

For powerpc I don't think we'd be inclined to go that way, so don't feel 
the need to add this complexity for us alone -- we'd be more inclined to 
move the exit lazy to the final TLB shootdown path, which we're slowly 
getting more infrastructure in place to do.

(The powerpc hash MMU code which we're slowly moving away from might 
never get that capability because it's complex there and hard to do with
that virtualisation model so current big systems (and radix MMU until we
finish the TLB flushing stuff) want something here, but for those the
shoot-lazies could quite likely be sufficient)

But if core code was moved over to something like this for the benefit of
others archs we'd probably just as happily do that.

There's a few nits but I don't think I can see a fundamental problem 
yet.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] powerpc/hotplug: assign hot added LMB to the right node
From: Laurent Dufour @ 2020-12-04  8:12 UTC (permalink / raw)
  To: Greg KH
  Cc: Nathan Lynch, Scott Cheloha, linux-kernel, stable, Paul Mackerras,
	linuxppc-dev
In-Reply-To: <X8ktsoAv4/h+p1/8@kroah.com>

Le 03/12/2020 à 19:25, Greg KH a écrit :
> On Thu, Dec 03, 2020 at 11:15:14AM +0100, Laurent Dufour wrote:
>> This patch applies to 5.9 and earlier kernels only.
>>
>> Since 5.10, this has been fortunately fixed by the commit
>> e5e179aa3a39 ("pseries/drmem: don't cache node id in drmem_lmb struct").
> 
> Why can't we just backport that patch instead?  It's almost always
> better to do that than to have a one-off patch, as almost always those
> have bugs in them.

That's a good option too.
I was thinking that this 5.10 patch was not matching the stable release's 
guidelines since it was targeting performance issue, but since it is also fixing 
this issue, I'm certainly wrong.

So, forget that patch.

Thanks,
Laurent.

^ permalink raw reply

* Re: [PATCH v8 11/12] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2020-12-04  8:12 UTC (permalink / raw)
  To: akpm@linux-foundation.org, linux-mm@kvack.org, Edgecombe, Rick P
  Cc: linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org,
	hch@infradead.org, lizefan@huawei.com,
	Jonathan.Cameron@Huawei.com, linuxppc-dev@lists.ozlabs.org
In-Reply-To: <e9d3a50e1b18f9ea1cdfdc221bef75db19273417.camel@intel.com>

Excerpts from Edgecombe, Rick P's message of December 1, 2020 6:21 am:
> On Sun, 2020-11-29 at 01:25 +1000, Nicholas Piggin wrote:
>> Support huge page vmalloc mappings. Config option
>> HAVE_ARCH_HUGE_VMALLOC
>> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
>> supports PMD sized vmap mappings.
>> 
>> vmalloc will attempt to allocate PMD-sized pages if allocating PMD
>> size
>> or larger, and fall back to small pages if that was unsuccessful.
>> 
>> Allocations that do not use PAGE_KERNEL prot are not permitted to use
>> huge pages, because not all callers expect this (e.g., module
>> allocations vs strict module rwx).
> 
> Several architectures (x86, arm64, others?) allocate modules initially
> with PAGE_KERNEL and so I think this test will not exclude module
> allocations in those cases.

Ah, thanks. I guess archs must additionally ensure that their
PAGE_KERNEL allocations are suitable for huge page mappings before
enabling the option.

If there is interest from those archs to support this, I have an
early (un-posted) patch that adds an explicit VM_HUGE flag that could
override the pessemistic arch default. It's not much trouble to add this 
to the large system hash allocations. It's very out of date now but I 
can at least give what I have to anyone doing an arch support that
wants it.

> 
> [snip]
> 
>> @@ -2400,6 +2453,7 @@ static inline void set_area_direct_map(const
>> struct vm_struct *area,
>>  {
>>  	int i;
>>  
>> +	/* HUGE_VMALLOC passes small pages to set_direct_map */
>>  	for (i = 0; i < area->nr_pages; i++)
>>  		if (page_address(area->pages[i]))
>>  			set_direct_map(area->pages[i]);
>> @@ -2433,11 +2487,12 @@ static void vm_remove_mappings(struct
>> vm_struct *area, int deallocate_pages)
>>  	 * map. Find the start and end range of the direct mappings to
>> make sure
>>  	 * the vm_unmap_aliases() flush includes the direct map.
>>  	 */
>> -	for (i = 0; i < area->nr_pages; i++) {
>> +	for (i = 0; i < area->nr_pages; i += 1U << area->page_order) {
>>  		unsigned long addr = (unsigned long)page_address(area-
>> >pages[i]);
>>  		if (addr) {
>> +			unsigned long page_size = PAGE_SIZE << area-
>> >page_order;
>>  			start = min(addr, start);
>> -			end = max(addr + PAGE_SIZE, end);
>> +			end = max(addr + page_size, end);
>>  			flush_dmap = 1;
>>  		}
>>  	}
> 
> The logic around this is a bit tangled. The reset of the direct map has
> to succeed, but if the set_direct_map_() functions require a split they
> could fail. For x86, set_memory_ro() calls on a vmalloc alias will
> mirror the page size and permission on the direct map and so the direct
> map will be broken to 4k pages if it's a RO vmalloc allocation.
> 
> But after this, module vmalloc()'s could have large pages which would
> result in large RO pages on the direct map. Then it could possibly fail
> when trying to reset a 4k page out of a large RO direct map mapping. 
> 
> I think either module allocations need to be actually excluded from
> having large pages (seems like you might have seen other issues as
> well?), or another option could be to use the changes here:
> https://lore.kernel.org/lkml/20201125092208.12544-4-rppt@kernel.org/
> to reset the direct map for a large page range at a time for large 
> vmalloc pages.
> 

Right, x86 would have to do something about that before enabling.
A VM_HUGE flag might be quick and easy but maybe other options are not 
too difficult.

Thanks,
Nick

^ permalink raw reply

* Re: [RFC v2 1/2] [NEEDS HELP] x86/mm: Handle unlazying membarrier core sync in the arch code
From: Nadav Amit @ 2020-12-04  8:17 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Rik van Riel, Will Deacon, X86 ML,
	Dave Hansen, LKML, Nicholas Piggin, Linux-MM, Mathieu Desnoyers,
	Catalin Marinas, Jann Horn, linuxppc-dev
In-Reply-To: <203d39d11562575fd8bd6a094d97a3a332d8b265.1607059162.git.luto@kernel.org>

I am not very familiar with membarrier, but here are my 2 cents while trying
to answer your questions.

> On Dec 3, 2020, at 9:26 PM, Andy Lutomirski <luto@kernel.org> wrote:
> @@ -496,6 +497,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> 		 * from one thread in a process to another thread in the same
> 		 * process. No TLB flush required.
> 		 */
> +
> +		// XXX: why is this okay wrt membarrier?
> 		if (!was_lazy)
> 			return;

I am confused.

On one hand, it seems that membarrier_private_expedited() would issue an IPI
to that core, as it would find that this core’s cpu_rq(cpu)->curr->mm is the
same as the one that the membarrier applies to. But… (see below)


> @@ -508,12 +511,24 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
> 		smp_mb();
> 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
> 		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
> -				next_tlb_gen)
> +		    next_tlb_gen) {
> +			/*
> +			 * We're reactivating an mm, and membarrier might
> +			 * need to serialize.  Tell membarrier.
> +			 */
> +
> +			// XXX: I can't understand the logic in
> +			// membarrier_mm_sync_core_before_usermode().  What's
> +			// the mm check for?
> +			membarrier_mm_sync_core_before_usermode(next);

On the other hand the reason for this mm check that you mention contradicts
my previous understanding as the git log says:

commit 2840cf02fae627860156737e83326df354ee4ec6
Author: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date:   Thu Sep 19 13:37:01 2019 -0400

    sched/membarrier: Call sync_core only before usermode for same mm
    
    When the prev and next task's mm change, switch_mm() provides the core
    serializing guarantees before returning to usermode. The only case
    where an explicit core serialization is needed is when the scheduler
    keeps the same mm for prev and next.

> 	/*
> 	 * When switching through a kernel thread, the loop in
> 	 * membarrier_{private,global}_expedited() may have observed that
> 	 * kernel thread and not issued an IPI. It is therefore possible to
> 	 * schedule between user->kernel->user threads without passing though
> 	 * switch_mm(). Membarrier requires a barrier after storing to
> -	 * rq->curr, before returning to userspace, so provide them here:
> +	 * rq->curr, before returning to userspace, and mmdrop() provides
> +	 * this barrier.
> 	 *
> -	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
> -	 *   provided by mmdrop(),
> -	 * - a sync_core for SYNC_CORE.
> +	 * XXX: I don't think mmdrop() actually does this.  There's no
> +	 * smp_mb__before/after_atomic() in there.

I presume that since x86 is the only one that needs
membarrier_mm_sync_core_before_usermode(), nobody noticed the missing
smp_mb__before/after_atomic(). These are anyhow a compiler barrier in x86,
and such a barrier would take place before the return to userspace.


^ permalink raw reply

* Re: [PATCH v3 05/19] powerpc: interrupt handler wrapper functions
From: Nicholas Piggin @ 2020-12-04  8:31 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <87o8jfpa5k.fsf@linux.ibm.com>

Excerpts from Aneesh Kumar K.V's message of November 30, 2020 5:37 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
> .....
>  +#endif
>> +DECLARE_INTERRUPT_HANDLER(emulation_assist_interrupt);
>> +DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault);
> 
> Can we add comments here explaining why some of these handlers need to remain RAW()?

I possibly could. My patch doesn't change the reason, of course, they 
already have these issues.

It wants to avoid reconciling interrupts and probably context tracking 
because you can take SLB faults within those subsystems, which are not 
expecting re-entrant calls into them. It's okay to avoid these things 
because the interrupts don't enable interrupts, go to process context, 
add any timers, etc.

Now that I look at it, possibly the primary do_hash_fault handler needs 
to be RAW as well for the same reason.

>> +DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault);
>> +DECLARE_INTERRUPT_HANDLER_RET(do_hash_fault);
>> +DECLARE_INTERRUPT_HANDLER_RET(do_page_fault);
>> +DECLARE_INTERRUPT_HANDLER(do_bad_page_fault);
>> +
>> +DECLARE_INTERRUPT_HANDLER_ASYNC(timer_interrupt);
>> +DECLARE_INTERRUPT_HANDLER_NMI(performance_monitor_exception_nmi);
>> +DECLARE_INTERRUPT_HANDLER_ASYNC(performance_monitor_exception_async);
>> +DECLARE_INTERRUPT_HANDLER_RAW(performance_monitor_exception);
> 
> Same for this.

That's just because nmi vs async is decided at runtime for PMIs. I can 
add a comment, although it's more obvious when looking at the body.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] macintosh/adb-iop: Send correct poll command
From: Geert Uytterhoeven @ 2020-12-04 10:03 UTC (permalink / raw)
  To: Finn Thain; +Cc: linuxppc-dev, Linux Kernel Mailing List, Joshua Thompson
In-Reply-To: <58bba4310da4c29b068345a4b36af8a531397ff7.1605847196.git.fthain@telegraphics.com.au>

Hi Finn,

On Fri, Nov 20, 2020 at 5:54 AM Finn Thain <fthain@telegraphics.com.au> wrote:
> The behaviour of the IOP firmware is not well documented but we do know
> that IOP message reply data can be used to issue new ADB commands.
> Use the message reply to better control autopoll behaviour by sending
> a Talk Register 0 command after every ADB response, not unlike the
> algorithm in the via-macii driver. This poll command is addressed to
> that device which last received a Talk command (explicit or otherwise).
>
> Cc: Joshua Thompson <funaho@jurai.org>
> Fixes: fa3b5a9929fc ("macintosh/adb-iop: Implement idle -> sending state transition")

WARNING: Unknown commit id 'fa3b5a9929fc', maybe rebased or not pulled?

32226e817043?

> Tested-by: Stan Johnson <userm57@yahoo.com>
> Signed-off-by: Finn Thain <fthain@telegraphics.com.au>

Thanks, will queue in the m68k for-v5.11 branch.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH] macintosh/adb-iop: Always wait for reply message from IOP
From: Geert Uytterhoeven @ 2020-12-04 10:08 UTC (permalink / raw)
  To: Finn Thain; +Cc: linuxppc-dev, Linux Kernel Mailing List, Joshua Thompson
In-Reply-To: <0f0a25855391e7eaa53a50f651aea0124e8525dd.1605847196.git.fthain@telegraphics.com.au>

Hi Finn,

On Fri, Nov 20, 2020 at 5:54 AM Finn Thain <fthain@telegraphics.com.au> wrote:
> A recent patch incorrectly altered the adb-iop state machine behaviour
> and introduced a regression that can appear intermittently as a
> malfunctioning ADB input device. This seems to be caused when reply
> packets from different ADB commands become mixed up, especially during
> the adb bus scan. Fix this by unconditionally entering the awaiting_reply
> state after sending an explicit command, even when the ADB command won't
> generate a reply from the ADB device.
>
> Cc: Joshua Thompson <funaho@jurai.org>
> Fixes: e2954e5f727f ("macintosh/adb-iop: Implement sending -> idle state transition")
> Tested-by: Stan Johnson <userm57@yahoo.com>
> Signed-off-by: Finn Thain <fthain@telegraphics.com.au>

Thanks for your patch!

> --- a/drivers/macintosh/adb-iop.c
> +++ b/drivers/macintosh/adb-iop.c
> @@ -84,10 +84,7 @@ static void adb_iop_complete(struct iop_msg *msg)
>
>         local_irq_save(flags);
>
> -       if (current_req->reply_expected)
> -               adb_iop_state = awaiting_reply;
> -       else
> -               adb_iop_done();
> +       adb_iop_state = awaiting_reply;
>
>         local_irq_restore(flags);
>  }
> @@ -95,8 +92,9 @@ static void adb_iop_complete(struct iop_msg *msg)
>  /*
>   * Listen for ADB messages from the IOP.
>   *
> - * This will be called when unsolicited messages (usually replies to TALK
> - * commands or autopoll packets) are received.
> + * This will be called when unsolicited IOP messages are received.
> + * These IOP messages can carry ADB autopoll responses and also occur
> + * after explicit ADB commands.
>   */
>
>  static void adb_iop_listen(struct iop_msg *msg)
> @@ -123,8 +121,10 @@ static void adb_iop_listen(struct iop_msg *msg)
>                 if (adb_iop_state == awaiting_reply) {
>                         struct adb_request *req = current_req;
>
> -                       req->reply_len = amsg->count + 1;
> -                       memcpy(req->reply, &amsg->cmd, req->reply_len);
> +                       if (req->reply_expected) {
> +                               req->reply_len = amsg->count + 1;
> +                               memcpy(req->reply, &amsg->cmd, req->reply_len);
> +                       }

So if we're not expecting a reply. It's ignored.
Just wondering: what kind of messages are being dropped?
If reply packets from different ADB commands become mixed up,
they are still (expected?) replies to messages we sent before. Why
shouldn't we depend on receiving the replies?

>
>                         req_done = true;
>                 }

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH] powerpc/8xx: Fix early debug when SMC1 is relocated
From: Christophe Leroy @ 2020-12-04 10:11 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

When SMC1 is relocated and early debug is selected, the
board hangs is ppc_md.setup_arch(). This is because ones
the microcode has been loaded and SMC1 relocated, early
debug writes in the weed.

To allow smooth continuation, the SMC1 parameter RAM set up
by the bootloader have to be copied into the new location.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Fixes: 43db76f41824 ("powerpc/8xx: Add microcode patch to move SMC parameter RAM.")
Cc: stable@vger.kernel.org
---
 arch/powerpc/include/asm/cpm1.h         |  1 +
 arch/powerpc/platforms/8xx/micropatch.c | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/cpm1.h b/arch/powerpc/include/asm/cpm1.h
index a116fe931789..3bdd74739cb8 100644
--- a/arch/powerpc/include/asm/cpm1.h
+++ b/arch/powerpc/include/asm/cpm1.h
@@ -68,6 +68,7 @@ extern void cpm_reset(void);
 #define PROFF_SPI	((uint)0x0180)
 #define PROFF_SCC3	((uint)0x0200)
 #define PROFF_SMC1	((uint)0x0280)
+#define PROFF_DSP1	((uint)0x02c0)
 #define PROFF_SCC4	((uint)0x0300)
 #define PROFF_SMC2	((uint)0x0380)
 
diff --git a/arch/powerpc/platforms/8xx/micropatch.c b/arch/powerpc/platforms/8xx/micropatch.c
index aed4bc75f352..aef179fcbd4f 100644
--- a/arch/powerpc/platforms/8xx/micropatch.c
+++ b/arch/powerpc/platforms/8xx/micropatch.c
@@ -360,6 +360,17 @@ void __init cpm_load_patch(cpm8xx_t *cp)
 	if (IS_ENABLED(CONFIG_SMC_UCODE_PATCH)) {
 		smc_uart_t *smp;
 
+		if (IS_ENABLED(CONFIG_PPC_EARLY_DEBUG_CPM)) {
+			int i;
+
+			for (i = 0; i < sizeof(*smp); i += 4) {
+				u32 __iomem *src = (u32 __iomem *)&cp->cp_dparam[PROFF_SMC1 + i];
+				u32 __iomem *dst = (u32 __iomem *)&cp->cp_dparam[PROFF_DSP1 + i];
+
+				out_be32(dst, in_be32(src));
+			}
+		}
+
 		smp = (smc_uart_t *)&cp->cp_dparam[PROFF_SMC1];
 		out_be16(&smp->smc_rpbase, 0x1ec0);
 		smp = (smc_uart_t *)&cp->cp_dparam[PROFF_SMC2];
-- 
2.25.0


^ permalink raw reply related

* [PATCH] powerpc/process: Remove target specific __set_dabr()
From: Christophe Leroy @ 2020-12-04 10:12 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

__set_dabr() are simple functions that can be inline directly
inside set_dabr() and using IS_ENABLED() instead of #ifdef

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/process.c | 37 ++++++++++++-----------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index d421a2c7f822..5ef99138b696 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -807,29 +807,6 @@ static void switch_hw_breakpoint(struct task_struct *new)
 #endif /* !CONFIG_HAVE_HW_BREAKPOINT */
 #endif	/* CONFIG_PPC_ADV_DEBUG_REGS */
 
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
-{
-	mtspr(SPRN_DAC1, dabr);
-	if (IS_ENABLED(CONFIG_PPC_47x))
-		isync();
-	return 0;
-}
-#elif defined(CONFIG_PPC_BOOK3S)
-static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
-{
-	mtspr(SPRN_DABR, dabr);
-	if (cpu_has_feature(CPU_FTR_DABRX))
-		mtspr(SPRN_DABRX, dabrx);
-	return 0;
-}
-#else
-static inline int __set_dabr(unsigned long dabr, unsigned long dabrx)
-{
-	return -EINVAL;
-}
-#endif
-
 static inline int set_dabr(struct arch_hw_breakpoint *brk)
 {
 	unsigned long dabr, dabrx;
@@ -840,7 +817,19 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk)
 	if (ppc_md.set_dabr)
 		return ppc_md.set_dabr(dabr, dabrx);
 
-	return __set_dabr(dabr, dabrx);
+	if (IS_ENABLED(CONFIG_PPC_ADV_DEBUG_REGS)) {
+		mtspr(SPRN_DAC1, dabr);
+		if (IS_ENABLED(CONFIG_PPC_47x))
+			isync();
+		return 0;
+	} else if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+		mtspr(SPRN_DABR, dabr);
+		if (cpu_has_feature(CPU_FTR_DABRX))
+			mtspr(SPRN_DABRX, dabrx);
+		return 0;
+	} else {
+		return -EINVAL;
+	}
 }
 
 static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH] powerpc/book3s_hv_uvmem: Check for failed page migration
From: Bharata B Rao @ 2020-12-04 10:18 UTC (permalink / raw)
  To: Alistair Popple; +Cc: linuxppc-dev, Ram Pai
In-Reply-To: <20201203050812.5234-1-alistair@popple.id.au>

On Thu, Dec 03, 2020 at 04:08:12PM +1100, Alistair Popple wrote:
> migrate_vma_pages() may still clear MIGRATE_PFN_MIGRATE on pages which
> are not able to be migrated. Drivers may safely copy data prior to
> calling migrate_vma_pages() however a remote mapping must not be
> established until after migrate_vma_pages() has returned as the
> migration could still fail.
> 
> UV_PAGE_IN_in both copies and maps the data page, therefore it should
> only be called after checking the results of migrate_vma_pages().
> 
> Signed-off-by: Alistair Popple <alistair@popple.id.au>
> ---
>  arch/powerpc/kvm/book3s_hv_uvmem.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 84e5a2dc8be5..08aa6a90c525 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -762,7 +762,10 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>  		goto out_finalize;
>  	}
>  
> -	if (pagein) {
> +	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> +	migrate_vma_pages(&mig);
> +
> +	if ((*mig.src & MIGRATE_PFN_MIGRATE) && pagein) {
>  		pfn = *mig.src >> MIGRATE_PFN_SHIFT;
>  		spage = migrate_pfn_to_page(*mig.src);
>  		if (spage) {
> @@ -773,8 +776,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma,
>  		}
>  	}
>  
> -	*mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> -	migrate_vma_pages(&mig);
>  out_finalize:
>  	migrate_vma_finalize(&mig);
>  	return ret;

Reviewed-by: Bharata B Rao <bharata@linux.ibm.com>

Did you actually hit this scenario with secure VMs where a UV-paged-in
page was later found to be not migratable?

Regards,
Bharata.

^ permalink raw reply

* [PATCH] powerpc/mce: Remove per cpu variables from MCE handlers
From: Ganesh Goudar @ 2020-12-04 10:23 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Ganesh Goudar, mahesh, npiggin

Access to per-cpu variables requires translation to be enabled on
pseries machine running in hash mmu mode, Since part of MCE handler
runs in realmode and part of MCE handling code is shared between ppc
architectures pseries and powernv, it becomes difficult to manage
these variables differently on different architectures, So have
these variables in paca instead of having them as per-cpu variables
to avoid complications.

Maximum recursive depth of MCE is 4, Considering the maximum depth
allowed reduce the size of event to 10 from 100.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 arch/powerpc/include/asm/mce.h  |  2 +-
 arch/powerpc/include/asm/paca.h | 12 ++++++++
 arch/powerpc/kernel/mce.c       | 54 +++++++++++++--------------------
 3 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 89aa8248a57d..feef45f2b51b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
 	bool			ignore_event;
 };
 
-#define MAX_MC_EVT	100
+#define MAX_MC_EVT	10
 
 /* Release flags for get_mce_event() */
 #define MCE_EVENT_RELEASE	true
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 9454d29ff4b4..4769954efa7d 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include <asm/hmi.h>
 #include <asm/cpuidle.h>
 #include <asm/atomic.h>
+#include <asm/mce.h>
 
 #include <asm-generic/mmiowb_types.h>
 
@@ -273,6 +274,17 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
 	struct mmiowb_state mmiowb_state;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	int mce_nest_count;
+	struct machine_check_event mce_event[MAX_MC_EVT];
+	/* Queue for delayed MCE events. */
+	int mce_queue_count;
+	struct machine_check_event mce_event_queue[MAX_MC_EVT];
+
+	/* Queue for delayed MCE UE events. */
+	int mce_ue_count;
+	struct machine_check_event  mce_ue_event_queue[MAX_MC_EVT];
+#endif /* CONFIG_PPC_BOOK3S_64 */
 } ____cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 63702c0badb9..5f53d02d6cbb 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -22,18 +22,6 @@
 #include <asm/mce.h>
 #include <asm/nmi.h>
 
-static DEFINE_PER_CPU(int, mce_nest_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
-
-/* Queue for delayed MCE events. */
-static DEFINE_PER_CPU(int, mce_queue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
-
-/* Queue for delayed MCE UE events. */
-static DEFINE_PER_CPU(int, mce_ue_count);
-static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
-					mce_ue_event_queue);
-
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
 static void machine_check_ue_event(struct machine_check_event *evt);
@@ -103,8 +91,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
 		    struct mce_error_info *mce_err,
 		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
-	int index = __this_cpu_inc_return(mce_nest_count) - 1;
-	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
+	int index = get_paca()->mce_nest_count++;
+	struct machine_check_event *mce = &get_paca()->mce_event[index];
 
 	/*
 	 * Return if we don't have enough space to log mce event.
@@ -191,7 +179,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
  */
 int get_mce_event(struct machine_check_event *mce, bool release)
 {
-	int index = __this_cpu_read(mce_nest_count) - 1;
+	int index = get_paca()->mce_nest_count - 1;
 	struct machine_check_event *mc_evt;
 	int ret = 0;
 
@@ -201,7 +189,7 @@ int get_mce_event(struct machine_check_event *mce, bool release)
 
 	/* Check if we have MCE info to process. */
 	if (index < MAX_MC_EVT) {
-		mc_evt = this_cpu_ptr(&mce_event[index]);
+		mc_evt = &get_paca()->mce_event[index];
 		/* Copy the event structure and release the original */
 		if (mce)
 			*mce = *mc_evt;
@@ -211,7 +199,7 @@ int get_mce_event(struct machine_check_event *mce, bool release)
 	}
 	/* Decrement the count to free the slot. */
 	if (release)
-		__this_cpu_dec(mce_nest_count);
+		get_paca()->mce_nest_count--;
 
 	return ret;
 }
@@ -233,13 +221,13 @@ static void machine_check_ue_event(struct machine_check_event *evt)
 {
 	int index;
 
-	index = __this_cpu_inc_return(mce_ue_count) - 1;
+	index = get_paca()->mce_ue_count++;
 	/* If queue is full, just return for now. */
 	if (index >= MAX_MC_EVT) {
-		__this_cpu_dec(mce_ue_count);
+		get_paca()->mce_ue_count--;
 		return;
 	}
-	memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+	memcpy(&get_paca()->mce_ue_event_queue[index], evt, sizeof(*evt));
 
 	/* Queue work to process this event later. */
 	irq_work_queue(&mce_ue_event_irq_work);
@@ -256,13 +244,13 @@ void machine_check_queue_event(void)
 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 		return;
 
-	index = __this_cpu_inc_return(mce_queue_count) - 1;
+	index = get_paca()->mce_queue_count++;
 	/* If queue is full, just return for now. */
 	if (index >= MAX_MC_EVT) {
-		__this_cpu_dec(mce_queue_count);
+		get_paca()->mce_queue_count--;
 		return;
 	}
-	memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
+	memcpy(&get_paca()->mce_event_queue[index], &evt, sizeof(evt));
 
 	/* Queue irq work to process this event later. */
 	irq_work_queue(&mce_event_process_work);
@@ -289,9 +277,9 @@ static void machine_process_ue_event(struct work_struct *work)
 	int index;
 	struct machine_check_event *evt;
 
-	while (__this_cpu_read(mce_ue_count) > 0) {
-		index = __this_cpu_read(mce_ue_count) - 1;
-		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+	while (get_paca()->mce_ue_count > 0) {
+		index = get_paca()->mce_ue_count - 1;
+		evt = &get_paca()->mce_ue_event_queue[index];
 		blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
 #ifdef CONFIG_MEMORY_FAILURE
 		/*
@@ -304,7 +292,7 @@ static void machine_process_ue_event(struct work_struct *work)
 		 */
 		if (evt->error_type == MCE_ERROR_TYPE_UE) {
 			if (evt->u.ue_error.ignore_event) {
-				__this_cpu_dec(mce_ue_count);
+				get_paca()->mce_ue_count--;
 				continue;
 			}
 
@@ -320,7 +308,7 @@ static void machine_process_ue_event(struct work_struct *work)
 					"was generated\n");
 		}
 #endif
-		__this_cpu_dec(mce_ue_count);
+		get_paca()->mce_ue_count--;
 	}
 }
 /*
@@ -338,17 +326,17 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	 * For now just print it to console.
 	 * TODO: log this error event to FSP or nvram.
 	 */
-	while (__this_cpu_read(mce_queue_count) > 0) {
-		index = __this_cpu_read(mce_queue_count) - 1;
-		evt = this_cpu_ptr(&mce_event_queue[index]);
+	while (get_paca()->mce_queue_count > 0) {
+		index = get_paca()->mce_queue_count - 1;
+		evt = &get_paca()->mce_event_queue[index];
 
 		if (evt->error_type == MCE_ERROR_TYPE_UE &&
 		    evt->u.ue_error.ignore_event) {
-			__this_cpu_dec(mce_queue_count);
+			get_paca()->mce_queue_count--;
 			continue;
 		}
 		machine_check_print_event_info(evt, false, false);
-		__this_cpu_dec(mce_queue_count);
+		get_paca()->mce_queue_count--;
 	}
 }
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH] powerpc/xmon: Change printk() to pr_cont()
From: Christophe Leroy @ 2020-12-04 10:35 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Since some time now, printk() adds carriage return, leading to
unusable xmon output:

[   54.288722] sysrq: Entering xmon
[   54.292209] Vector: 0  at [cace3d2c]
[   54.292274]     pc:
[   54.292331] c0023650
[   54.292468] : xmon+0x28/0x58
[   54.292519]
[   54.292574]     lr:
[   54.292630] c0023724
[   54.292749] : sysrq_handle_xmon+0xa4/0xfc
[   54.292801]
[   54.292867]     sp: cace3de8
[   54.292931]    msr: 9032
[   54.292999]   current = 0xc28d0000
[   54.293072]     pid   = 377, comm = sh
[   54.293157] Linux version 5.10.0-rc6-s3k-dev-01364-gedf13f0ccd76-dirty (root@po17688vm.idsi0.si.c-s.fr) (powerpc64-linux-gcc (GCC) 10.1.0, GNU ld (GNU Binutils) 2.34) #4211 PREEMPT Fri Dec 4 09:32:11 UTC 2020
[   54.293287] enter ? for help
[   54.293470] [cace3de8]
[   54.293532] c0023724
[   54.293654]  sysrq_handle_xmon+0xa4/0xfc
[   54.293711]  (unreliable)
[   54.293859] [cace3e18]
[   54.293918] c03885a8
[   54.294056]  __handle_sysrq+0xe4/0x1d4
[   54.294108]
[   54.294255] [cace3e48]
[   54.294314] c0388704
[   54.294441]  write_sysrq_trigger+0x34/0x74
[   54.294493]
[   54.294641] [cace3e68]
[   54.294700] c01f65d0
[   54.294822]  proc_reg_write+0xac/0x11c
[   54.294875]
[   54.295023] [cace3e88]
[   54.295082] c0179910
[   54.295198]  vfs_write+0x134/0x46c
[   54.295250]
[   54.295396] [cace3f08]
[   54.295455] c0179de8
[   54.295567]  ksys_write+0x78/0x11c
[   54.295619]
[   54.295766] [cace3f38]
[   54.295825] c00110d0
[   54.295951]  ret_from_syscall+0x0/0x34
[   54.296002]
[   54.296159] --- Exception: c01 (System Call) at
[   54.296217] 0fd4e784
[   54.296303]
[   54.296375] SP (7fca6ff0) is in userspace
[   54.296431] mon>
[   54.296484]  <no input ...>

Use pr_cont() instead.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Fixes: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines")
Cc: stable@vger.kernel.org
---
 arch/powerpc/xmon/nonstdio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/nonstdio.c b/arch/powerpc/xmon/nonstdio.c
index 5c1a50912229..9b0d85bff021 100644
--- a/arch/powerpc/xmon/nonstdio.c
+++ b/arch/powerpc/xmon/nonstdio.c
@@ -178,7 +178,7 @@ void xmon_printf(const char *format, ...)
 
 	if (n && rc == 0) {
 		/* No udbg hooks, fallback to printk() - dangerous */
-		printk("%s", xmon_outbuf);
+		pr_cont("%s", xmon_outbuf);
 	}
 }
 
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH] powerpc/xmon: Change printk() to pr_cont()
From: Michael Ellerman @ 2020-12-04 10:56 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <c8a6ec704416ecd5ff2bd26213c9bc026bdd19de.1607077340.git.christophe.leroy@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Since some time now, printk() adds carriage return, leading to
> unusable xmon output:
>
> [   54.288722] sysrq: Entering xmon
> [   54.292209] Vector: 0  at [cace3d2c]
> [   54.292274]     pc:
> [   54.292331] c0023650

...

> diff --git a/arch/powerpc/xmon/nonstdio.c b/arch/powerpc/xmon/nonstdio.c
> index 5c1a50912229..9b0d85bff021 100644
> --- a/arch/powerpc/xmon/nonstdio.c
> +++ b/arch/powerpc/xmon/nonstdio.c
> @@ -178,7 +178,7 @@ void xmon_printf(const char *format, ...)
>  
>  	if (n && rc == 0) {
>  		/* No udbg hooks, fallback to printk() - dangerous */
> -		printk("%s", xmon_outbuf);
> +		pr_cont("%s", xmon_outbuf);
>  	}

Ah OK, in the case where there's no udbg backend. We basically always
have a udbg backend on 64-bit, via hvc console. Which explains why we
haven't noticed it.

Will pick up the patch.

cheers

^ permalink raw reply

* Re: [PATCH 0/4] powerpc/64s: Fix for radix TLB invalidation bug
From: Michael Ellerman @ 2020-12-04 11:59 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev
  Cc: Milton Miller, Paul Mackerras, Aneesh Kumar K.V
In-Reply-To: <20201126102530.691335-1-npiggin@gmail.com>

On Thu, 26 Nov 2020 20:25:26 +1000, Nicholas Piggin wrote:
> This fixes a tricky bug that was noticed by TLB multi-hits in a guest
> stress testing CPU hotplug, but TLB invalidation means any kind of
> data corruption is possible.
> 
> Thanks,
> Nick
> 
> [...]

Applied to powerpc/fixes.

[1/4] powerpc/64s: Fix hash ISA v3.0 TLBIEL instruction generation
      https://git.kernel.org/powerpc/c/5844cc25fd121074de7895181a2fa1ce100a0fdd
[2/4] powerpc/64s/pseries: Fix hash tlbiel_all_isa300 for guest kernels
      https://git.kernel.org/powerpc/c/c0b27c517acf8a2b359dd373a7e7e88b01a8308e
[3/4] kernel/cpu: add arch override for clear_tasks_mm_cpumask() mm handling
      https://git.kernel.org/powerpc/c/8ff00399b153440c1c83e20c43020385b416415b
[4/4] powerpc/64s: Trim offlined CPUs from mm_cpumasks
      https://git.kernel.org/powerpc/c/01b0f0eae0812e80efeee4ee17687e5386335e08

cheers

^ permalink raw reply

* Re: [PATCH] KVM: PPC: Book3S HV: XIVE: Fix vCPU id sanity check
From: Michael Ellerman @ 2020-12-04 11:59 UTC (permalink / raw)
  To: Michael Ellerman, Paul Mackerras, Greg Kurz
  Cc: linuxppc-dev, Cédric Le Goater, kvm-ppc, linux-kernel
In-Reply-To: <160673876747.695514.1809676603724514920.stgit@bahia.lan>

On Mon, 30 Nov 2020 13:19:27 +0100, Greg Kurz wrote:
> Commit 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size
> configurable") updated kvmppc_xive_vcpu_id_valid() in a way that
> allows userspace to trigger an assertion in skiboot and crash the host:
> 
> [  696.186248988,3] XIVE[ IC 08  ] eq_blk != vp_blk (0 vs. 1) for target 0x4300008c/0
> [  696.186314757,0] Assert fail: hw/xive.c:2370:0
> [  696.186342458,0] Aborting!
> xive-kvCPU 0043 Backtrace:
>  S: 0000000031e2b8f0 R: 0000000030013840   .backtrace+0x48
>  S: 0000000031e2b990 R: 000000003001b2d0   ._abort+0x4c
>  S: 0000000031e2ba10 R: 000000003001b34c   .assert_fail+0x34
>  S: 0000000031e2ba90 R: 0000000030058984   .xive_eq_for_target.part.20+0xb0
>  S: 0000000031e2bb40 R: 0000000030059fdc   .xive_setup_silent_gather+0x2c
>  S: 0000000031e2bc20 R: 000000003005a334   .opal_xive_set_vp_info+0x124
>  S: 0000000031e2bd20 R: 00000000300051a4   opal_entry+0x134
>  --- OPAL call token: 0x8a caller R1: 0xc000001f28563850 ---
> 
> [...]

Applied to powerpc/fixes.

[1/1] KVM: PPC: Book3S HV: XIVE: Fix vCPU id sanity check
      https://git.kernel.org/powerpc/c/f54db39fbe40731c40aefdd3bc26e7d56d668c64

cheers

^ permalink raw reply

* Re: [PATCH 0/8] powerpc/64s: fix and improve machine check handling
From: Michael Ellerman @ 2020-12-04 11:59 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Mahesh Salgaonkar, kvm-ppc
In-Reply-To: <20201128070728.825934-1-npiggin@gmail.com>

On Sat, 28 Nov 2020 17:07:20 +1000, Nicholas Piggin wrote:
> First patch is a nasty memory scribble introduced by me :( That
> should go into fixes.
> 
> The next ones could wait for next merge window. They get things to the
> point where misbehaving or buggy guest isn't so painful for the host,
> and also get the guest SLB dumping code working (because the host no
> longer clears them before delivering the MCE to the guest).
> 
> [...]

Patch 1 applied to powerpc/fixes.

[1/8] powerpc/64s/powernv: Fix memory corruption when saving SLB entries on MCE
      https://git.kernel.org/powerpc/c/a1ee28117077c3bf24e5ab6324c835eaab629c45

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/numa: Fix a regression on memoryless node 0
From: Michael Ellerman @ 2020-12-04 11:59 UTC (permalink / raw)
  To: Michael Ellerman, Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Milan Mohanty, Haren Myneni,
	linuxppc-dev
In-Reply-To: <20201127053738.10085-1-srikar@linux.vnet.ibm.com>

On Fri, 27 Nov 2020 11:07:38 +0530, Srikar Dronamraju wrote:
> Commit e75130f20b1f ("powerpc/numa: Offline memoryless cpuless node 0")
> offlines node 0 and expects nodes to be subsequently onlined when CPUs
> or nodes are detected.
> 
> Commit 6398eaa26816 ("powerpc/numa: Prefer node id queried from vphn")
> skips onlining node 0 when CPUs are associated with node 0.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/numa: Fix a regression on memoryless node 0
      https://git.kernel.org/powerpc/c/10f78fd0dabbc3856ddd67b09a46abdedb045913

cheers

^ permalink raw reply

* Re: [PATCH v2 3/5] MIPS: configs: drop unused BACKLIGHT_GENERIC option
From: Thomas Bogendoerfer @ 2020-12-04 12:06 UTC (permalink / raw)
  To: Andrey Zhizhikin
  Cc: alexandre.belloni, tony, linux-kernel, James.Bottomley,
	thierry.reding, paulus, sam, daniel.thompson, linux-omap, deller,
	linux, krzk, jonathanh, ludovic.desroches, catalin.marinas,
	linux-mips, will, mripard, soc, linux-tegra, lee.jones, wens,
	linux-arm-kernel, jernej.skrabec, linux-parisc, emil.l.velikov,
	nicolas.ferre, linuxppc-dev
In-Reply-To: <20201201222922.3183-4-andrey.zhizhikin@leica-geosystems.com>

On Tue, Dec 01, 2020 at 10:29:20PM +0000, Andrey Zhizhikin wrote:
> Commit 7ecdea4a0226 ("backlight: generic_bl: Remove this driver as it is
> unused") removed geenric_bl driver from the tree, together with
> corresponding config option.
> 
> Remove BACKLIGHT_GENERIC config item from all MIPS configurations.
> 
> Fixes: 7ecdea4a0226 ("backlight: generic_bl: Remove this driver as it is unused")
> Cc: Sam Ravnborg <sam@ravnborg.org>
> Signed-off-by: Andrey Zhizhikin <andrey.zhizhikin@leica-geosystems.com>
> Reviewed-by: Krzysztof Kozlowski <krzk@kernel.org>
> Acked-by: Daniel Thompson <daniel.thompson@linaro.org>
> Acked-by: Sam Ravnborg <sam@ravnborg.org>
> ---
>  arch/mips/configs/gcw0_defconfig      | 1 -
>  arch/mips/configs/gpr_defconfig       | 1 -
>  arch/mips/configs/lemote2f_defconfig  | 1 -
>  arch/mips/configs/loongson3_defconfig | 1 -
>  arch/mips/configs/mtx1_defconfig      | 1 -
>  arch/mips/configs/rs90_defconfig      | 1 -
>  6 files changed, 6 deletions(-)

applied to mips-next.

Thomas.

-- 
Crap can work. Given enough thrust pigs will fly, but it's not necessarily a
good idea.                                                [ RFC1925, 2.3 ]

^ permalink raw reply

* Re: [PATCH 12/29] powerpc/pseries/mobility: extract VASI session polling logic
From: Michael Ellerman @ 2020-12-04 12:51 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, ajd, mmc, cforno12, drt, brking
In-Reply-To: <20201030011805.1224603-13-nathanl@linux.ibm.com>

Nathan Lynch <nathanl@linux.ibm.com> writes:
> The behavior of rtas_ibm_suspend_me_unsafe() is to return -EAGAIN to
> the caller until the specified VASI suspend session state makes the
> transition from H_VASI_ENABLED to H_VASI_SUSPENDING. In the interest
> of separating concerns to prepare for a new implementation of the
> join/suspend sequence, extract VASI session polling logic into a
> couple of local functions. Waiting for the session state to reach
> H_VASI_SUSPENDING before calling rtas_ibm_suspend_me_unsafe() ensures
> that we will never get an EAGAIN result necessitating a retry. No
> user-visible change in behavior is intended.
>
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/mobility.c | 76 +++++++++++++++++++++--
>  1 file changed, 71 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
> index dc6abf164db7..1b8ae221b98a 100644
> --- a/arch/powerpc/platforms/pseries/mobility.c
> +++ b/arch/powerpc/platforms/pseries/mobility.c
> @@ -345,6 +345,73 @@ void post_mobility_fixup(void)
...

> +
> +static int wait_for_vasi_session_suspending(u64 handle)
> +{
> +	unsigned long state;
> +	bool keep_polling;
> +	int ret;
> +
> +	/*
> +	 * Wait for transition from H_VASI_ENABLED to
> +	 * H_VASI_SUSPENDING. Treat anything else as an error.
> +	 */
> +	do {
> +		keep_polling = false;
> +		ret = poll_vasi_state(handle, &state);
> +		if (ret != 0)
> +			break;
> +
> +		switch (state) {
> +		case H_VASI_SUSPENDING:
> +			break;
> +		case H_VASI_ENABLED:
> +			keep_polling = true;
> +			ssleep(1);
> +			break;
> +		default:
> +			pr_err("unexpected H_VASI_STATE result %lu\n", state);
> +			ret = -EIO;
> +			break;
> +		}
> +	} while (keep_polling);

This seems like it could be simpler?

eg:

	while (true) {
		ret = poll_vasi_state(handle, &state);

		if (ret != 0 || state == H_VASI_SUSPENDING)
			break;
		else if (state == H_VASI_ENABLED)
			ssleep(1);
		else {
			pr_err("unexpected H_VASI_STATE result %lu\n", state);
			ret = -EIO;
			break;
		}
	}


Or did I miss something?

cheers

^ permalink raw reply

* Re: [PATCH 03/29] powerpc/rtas: complete ibm,suspend-me status codes
From: Michael Ellerman @ 2020-12-04 12:52 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, ajd, mmc, cforno12, drt, brking
In-Reply-To: <20201030011805.1224603-4-nathanl@linux.ibm.com>

Nathan Lynch <nathanl@linux.ibm.com> writes:
> We don't completely account for the possible return codes for
> ibm,suspend-me. Add definitions for these.
>
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/rtas.h | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 55f9a154c95d..f060181a0d32 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -23,11 +23,16 @@
>  #define RTAS_RMOBUF_MAX (64 * 1024)
>  
>  /* RTAS return status codes */
> -#define RTAS_NOT_SUSPENDABLE	-9004
>  #define RTAS_BUSY		-2    /* RTAS Busy */
>  #define RTAS_EXTENDED_DELAY_MIN	9900
>  #define RTAS_EXTENDED_DELAY_MAX	9905
>  
> +/* statuses specific to ibm,suspend-me */
> +#define RTAS_SUSPEND_ABORTED     9000 /* Suspension aborted */

This made me ... pause.

But it really is positive 9000, I checked PAPR.

cheers


> +#define RTAS_NOT_SUSPENDABLE    -9004 /* Partition not suspendable */
> +#define RTAS_THREADS_ACTIVE     -9005 /* Multiple processor threads active */
> +#define RTAS_OUTSTANDING_COPROC -9006 /* Outstanding coprocessor operations */
> +
>  /*
>   * In general to call RTAS use rtas_token("string") to lookup
>   * an RTAS token for the given string (e.g. "event-scan").
> -- 
> 2.25.4

^ permalink raw reply

* Re: [PATCH 13/29] powerpc/pseries/mobility: use stop_machine for join/suspend
From: Michael Ellerman @ 2020-12-04 12:52 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, ajd, mmc, cforno12, drt, brking
In-Reply-To: <20201030011805.1224603-14-nathanl@linux.ibm.com>

Hi Nathan,

Just a couple of nits ...

Nathan Lynch <nathanl@linux.ibm.com> writes:
> The partition suspend sequence as specified in the platform
> architecture requires that all active processor threads call
> H_JOIN, which:
...
> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
> index 1b8ae221b98a..44ca7d4e143d 100644
> --- a/arch/powerpc/platforms/pseries/mobility.c
> +++ b/arch/powerpc/platforms/pseries/mobility.c
> @@ -412,6 +414,128 @@ static int wait_for_vasi_session_suspending(u64 handle)
...

> +
> +static int do_join(void *arg)
> +{
> +	atomic_t *counter = arg;
> +	long hvrc;
> +	int ret;
> +
> +	/* Must ensure MSR.EE off for H_JOIN. */
> +	hard_irq_disable();

Didn't stop_machine() already do that for us?

In the state machine in multi_cpu_stop().

> +	hvrc = plpar_hcall_norets(H_JOIN);
> +
> +	switch (hvrc) {
> +	case H_CONTINUE:
> +		/*
> +		 * All other CPUs are offline or in H_JOIN. This CPU
> +		 * attempts the suspend.
> +		 */
> +		ret = do_suspend();
> +		break;
> +	case H_SUCCESS:
> +		/*
> +		 * The suspend is complete and this cpu has received a
> +		 * prod.
> +		 */
> +		ret = 0;
> +		break;
> +	case H_BAD_MODE:
> +	case H_HARDWARE:
> +	default:
> +		ret = -EIO;
> +		pr_err_ratelimited("H_JOIN error %ld on CPU %i\n",
> +				   hvrc, smp_processor_id());
> +		break;
> +	}
> +
> +	if (atomic_inc_return(counter) == 1) {
> +		pr_info("CPU %u waking all threads\n", smp_processor_id());
> +		prod_others();
> +	}

Do we even need the counter? IIUC only one CPU receives H_CONTINUE. So
couldn't we just have that CPU do the prodding of others?

> +	/*
> +	 * Execution may have been suspended for several seconds, so
> +	 * reset the watchdog.
> +	 */
> +	touch_nmi_watchdog();
> +	return ret;
> +}
> +
> +static int pseries_migrate_partition(u64 handle)
> +{
> +	atomic_t counter = ATOMIC_INIT(0);
> +	int ret;
> +
> +	ret = wait_for_vasi_session_suspending(handle);
> +	if (ret)
> +		goto out;

Direct return would be clearer IMHO.

> +
> +	ret = stop_machine(do_join, &counter, cpu_online_mask);
> +	if (ret == 0)
> +		post_mobility_fixup();
> +out:
> +	return ret;
> +}
> +

cheers

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox