LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 4/7] powerpc/64s/radix: make ptep_get_and_clear_full non-atomic for the full case
From: Nicholas Piggin @ 2018-05-20  0:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180520004347.19508-1-npiggin@gmail.com>

This matches other architectures, when we know there will be no
further accesses to the address (e.g., for teardown), page table
entries can be cleared non-atomically.

The comments about NMMU are bogus: all MMU notifiers (including NMMU)
are released at this point, with their TLBs flushed. An NMMU access at
this point would be a bug.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 705193e7192f..fcd92f9b6ec0 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -176,14 +176,8 @@ static inline pte_t radix__ptep_get_and_clear_full(struct mm_struct *mm,
 	unsigned long old_pte;
 
 	if (full) {
-		/*
-		 * If we are trying to clear the pte, we can skip
-		 * the DD1 pte update sequence and batch the tlb flush. The
-		 * tlb flush batching is done by mmu gather code. We
-		 * still keep the cmp_xchg update to make sure we get
-		 * correct R/C bit which might be updated via Nest MMU.
-		 */
-		old_pte = __radix_pte_update(ptep, ~0ul, 0);
+		old_pte = pte_val(*ptep);
+		*ptep = __pte(0);
 	} else
 		old_pte = radix__pte_update(mm, addr, ptep, ~0ul, 0, 0);
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2 3/7] powerpc/64s/radix: make single threaded mms always flush all translations from non-local CPUs
From: Nicholas Piggin @ 2018-05-20  0:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180520004347.19508-1-npiggin@gmail.com>

Go one step further, if we're going to put a tlbie on the bus at all,
make it count. Make any global invalidation from a single threaded mm
do a full PID flush so the mm_cpumask can be reset.

The tradeoff is that it will over-flush one time the local CPU's TLB
if there was a small number of pages to flush that could be done with
specific address tlbies.

If the workload is invalidate-heavy enough for this to be a concern,
this should be outweighed by the benefit that it can subsequently
avoid the global flush.

This reduces tlbies for a kernel compile workload from 0.40M to 0.18M,
tlbiels are increased from 22.5M to 23.8M because local pid flushes
take 128 tlbiels vs 1 for global pid flush.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/mm/tlb-radix.c | 45 ++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index d5593a78702a..55f93d66c8d2 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -587,10 +587,16 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 		return;
 
 	preempt_disable();
-	if (!mm_is_thread_local(mm))
-		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
-	else
+	if (mm_is_thread_local(mm)) {
 		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+	} else {
+		if (mm_is_singlethreaded(mm)) {
+			_tlbie_pid(pid, RIC_FLUSH_ALL);
+			mm_reset_thread_local(mm);
+		} else {
+			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+		}
+	}
 	preempt_enable();
 }
 
@@ -659,14 +665,14 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 				nr_pages > tlb_single_page_flush_ceiling);
 	}
 
-	if (full) {
+	if (!local && mm_is_singlethreaded(mm)) {
+		_tlbie_pid(pid, RIC_FLUSH_ALL);
+		mm_reset_thread_local(mm);
+	} else if (full) {
 		if (local) {
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		} else {
-			if (mm_is_singlethreaded(mm)) {
-				_tlbie_pid(pid, RIC_FLUSH_ALL);
-				mm_reset_thread_local(mm);
-			} else if (mm_needs_flush_escalation(mm)) {
+			if (mm_needs_flush_escalation(mm)) {
 				_tlbie_pid(pid, RIC_FLUSH_ALL);
 			} else {
 				_tlbie_pid(pid, RIC_FLUSH_TLB);
@@ -824,19 +830,17 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 				nr_pages > tlb_single_page_flush_ceiling);
 	}
 
-	if (full) {
+	if (!local && mm_is_singlethreaded(mm)) {
+		_tlbie_pid(pid, RIC_FLUSH_ALL);
+		mm_reset_thread_local(mm);
+	} else if (full) {
 		if (local) {
 			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		} else {
-			if (mm_is_singlethreaded(mm)) {
-				_tlbie_pid(pid, RIC_FLUSH_ALL);
-				mm_reset_thread_local(mm);
-			} else {
-				if (mm_needs_flush_escalation(mm))
-					also_pwc = true;
+			if (mm_needs_flush_escalation(mm))
+				also_pwc = true;
 
-				_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
-			}
+			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		}
 	} else {
 		if (local)
@@ -882,7 +886,12 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	if (mm_is_thread_local(mm)) {
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
-		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		if (mm_is_singlethreaded(mm)) {
+			_tlbie_pid(pid, RIC_FLUSH_ALL);
+			mm_reset_thread_local(mm);
+		} else {
+			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+		}
 	}
 
 	preempt_enable();
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2 2/7] powerpc/64s/radix: reset mm_cpumask for single thread process when possible
From: Nicholas Piggin @ 2018-05-20  0:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180520004347.19508-1-npiggin@gmail.com>

When a single-threaded process has a non-local mm_cpumask and requires
a full PID tlbie invalidation, use that as an opportunity to reset the
cpumask back to the current CPU we're running on.

No other thread can concurrently switch to this mm, because it must
have been given a reference to mm_users by the current thread before it
can use_mm. mm_users can be asynchronously incremented (by
mmget_not_zero), but those users must not do a use_mm, because existing
convention already disallows it (sparc64 has reset mm_cpumask using this
condition since the start of history, see arch/sparc/kernel/smp_64.c).

This reduces tlbies for a kernel compile workload from 0.90M to 0.40M,
tlbiels are increased from 20M to 22.5M because local pid flushes take
128 tlbiels vs 1 for global pid flush.

This slows down context switch benchmark ping-ponging on different
CPUs by 7.5%, because switching to the idle thread and back now
requires a PID switch. There are ways this could be avoided, but for
now it's simpler not to allow lazy PID switching.

Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Since v1:
- Fix switch_mm to be irq-safe (thanks Balbir)

 arch/powerpc/include/asm/mmu_context.h | 19 +++++++++
 arch/powerpc/include/asm/tlb.h         |  8 ++++
 arch/powerpc/mm/tlb-radix.c            | 57 +++++++++++++++++++-------
 3 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 1835ca1505d6..f26704371fdc 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <asm/mmu.h>	
 #include <asm/cputable.h>
@@ -201,6 +202,24 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 static inline void enter_lazy_tlb(struct mm_struct *mm,
 				  struct task_struct *tsk)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Under radix, we do not want to keep lazy PIDs around because
+	 * even if the CPU does not access userspace, it can still bring
+	 * in translations through speculation and prefetching.
+	 *
+	 * Switching away here allows us to trim back the mm_cpumask in
+	 * cases where we know the process is not running on some CPUs
+	 * (see mm/tlb-radix.c).
+	 */
+	if (radix_enabled() && mm != &init_mm) {
+		mmgrab(&init_mm);
+		tsk->active_mm = &init_mm;
+		switch_mm(mm, tsk->active_mm, tsk);
+		mmdrop(mm);
+	}
+#endif
+
 	/* 64-bit Book3E keeps track of current PGD in the PACA */
 #ifdef CONFIG_PPC_BOOK3E_64
 	get_paca()->pgd = NULL;
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index a7eabff27a0f..006fce98c403 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -76,6 +76,14 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
 		return false;
 	return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
 }
+static inline void mm_reset_thread_local(struct mm_struct *mm)
+{
+	WARN_ON(atomic_read(&mm->context.copros) > 0);
+	WARN_ON(!(atomic_read(&mm->mm_users) == 1 && current->mm == mm));
+	atomic_set(&mm->context.active_cpus, 1);
+	cpumask_clear(mm_cpumask(mm));
+	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
+}
 #else /* CONFIG_PPC_BOOK3S_64 */
 static inline int mm_is_thread_local(struct mm_struct *mm)
 {
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5ac3206c51cc..d5593a78702a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -504,6 +504,15 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+	if (atomic_read(&mm->context.copros) > 0)
+		return false;
+	if (atomic_read(&mm->mm_users) == 1 && current->mm == mm)
+		return true;
+	return false;
+}
+
 static bool mm_needs_flush_escalation(struct mm_struct *mm)
 {
 	/*
@@ -511,7 +520,9 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm)
 	 * caching PTEs and not flushing them properly when
 	 * RIC = 0 for a PID/LPID invalidate
 	 */
-	return atomic_read(&mm->context.copros) != 0;
+	if (atomic_read(&mm->context.copros) > 0)
+		return true;
+	return false;
 }
 
 #ifdef CONFIG_SMP
@@ -525,12 +536,17 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 
 	preempt_disable();
 	if (!mm_is_thread_local(mm)) {
-		if (mm_needs_flush_escalation(mm))
+		if (mm_is_singlethreaded(mm)) {
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
-		else
+			mm_reset_thread_local(mm);
+		} else if (mm_needs_flush_escalation(mm)) {
+			_tlbie_pid(pid, RIC_FLUSH_ALL);
+		} else {
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-	} else
+		}
+	} else {
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
+	}
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -544,10 +560,13 @@ void radix__flush_all_mm(struct mm_struct *mm)
 		return;
 
 	preempt_disable();
-	if (!mm_is_thread_local(mm))
+	if (!mm_is_thread_local(mm)) {
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
-	else
+		if (mm_is_singlethreaded(mm))
+			mm_reset_thread_local(mm);
+	} else {
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
+	}
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_all_mm);
@@ -644,10 +663,14 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		if (local) {
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		} else {
-			if (mm_needs_flush_escalation(mm))
+			if (mm_is_singlethreaded(mm)) {
+				_tlbie_pid(pid, RIC_FLUSH_ALL);
+				mm_reset_thread_local(mm);
+			} else if (mm_needs_flush_escalation(mm)) {
 				_tlbie_pid(pid, RIC_FLUSH_ALL);
-			else
+			} else {
 				_tlbie_pid(pid, RIC_FLUSH_TLB);
+			}
 		}
 	} else {
 		bool hflush = false;
@@ -802,13 +825,19 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 	}
 
 	if (full) {
-		if (!local && mm_needs_flush_escalation(mm))
-			also_pwc = true;
-
-		if (local)
+		if (local) {
 			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
-		else
-			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+		} else {
+			if (mm_is_singlethreaded(mm)) {
+				_tlbie_pid(pid, RIC_FLUSH_ALL);
+				mm_reset_thread_local(mm);
+			} else {
+				if (mm_needs_flush_escalation(mm))
+					also_pwc = true;
+
+				_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+			}
+		}
 	} else {
 		if (local)
 			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2 1/7] powerpc/64s/radix: do not flush TLB on spurious fault
From: Nicholas Piggin @ 2018-05-20  0:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180520004347.19508-1-npiggin@gmail.com>

In the case of a spurious fault (which can happen due to a race with
another thread that changes the page table), the default Linux mm code
calls flush_tlb_page for that address. This is not required because
the pte will be re-fetched. Hash does not wire this up to a hardware
TLB flush for this reason. This patch avoids the flush for radix.

>From Power ISA v3.0B, p.1090:

    Setting a Reference or Change Bit or Upgrading Access Authority
    (PTE Subject to Atomic Hardware Updates)

    If the only change being made to a valid PTE that is subject to
    atomic hardware updates is to set the Refer- ence or Change bit to
    1 or to add access authorities, a simpler sequence suffices
    because the translation hardware will refetch the PTE if an access
    is attempted for which the only problems were reference and/or
    change bits needing to be set or insufficient access authority.

The nest MMU on POWER9 does not re-fetch the PTE after such an access
attempt before faulting, so address spaces with a coprocessor
attached will continue to flush in these cases.

This reduces tlbies for a kernel compile workload from 0.95M to 0.90M.

fork --fork --exec benchmark improved 0.5% (12300->12400).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Since v1:
- Added NMMU handling

 arch/powerpc/include/asm/book3s/64/tlbflush.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 0cac17253513..ebf572ea621e 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -4,7 +4,7 @@
 
 #define MMU_NO_CONTEXT	~0UL
 
-
+#include <linux/mm_types.h>
 #include <asm/book3s/64/tlbflush-hash.h>
 #include <asm/book3s/64/tlbflush-radix.h>
 
@@ -137,6 +137,16 @@ static inline void flush_all_mm(struct mm_struct *mm)
 #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
 #define flush_all_mm(mm)		local_flush_all_mm(mm)
 #endif /* CONFIG_SMP */
+
+#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
+static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
+						unsigned long address)
+{
+	/* See ptep_set_access_flags comment */
+	if (atomic_read(&vma->vm_mm->context.copros) > 0)
+		flush_tlb_page(vma, address);
+}
+
 /*
  * flush the page walk cache for the address
  */
-- 
2.17.0

^ permalink raw reply related

* [PATCH v2 0/7] Various TLB and PTE improvements
From: Nicholas Piggin @ 2018-05-20  0:43 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

I've posted most of these separately at one time or another but I
will send them as a series, there have been some bug fixes and
changelog and comment improvements, and got some more numbers.

Most of the patches are logically independent (except 2 and 3 AFAIKS).

Thanks,
Nick

Nicholas Piggin (7):
  powerpc/64s/radix: do not flush TLB on spurious fault
  powerpc/64s/radix: reset mm_cpumask for single thread process when
    possible
  powerpc/64s/radix: make single threaded mms always flush all translations
    from non-local CPUs
  powerpc/64s/radix: make ptep_get_and_clear_full non-atomic for the
    full case
  powerpc/64s/radix: optimise pte_update
  powerpc/64s/radix: prefetch user address in update_mmu_cache
  powerpc/64s/radix: avoid ptesync after set_pte and
    ptep_set_access_flags

 arch/powerpc/include/asm/book3s/64/radix.h    | 37 ++++----
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 12 ++-
 arch/powerpc/include/asm/cacheflush.h         | 13 +++
 arch/powerpc/include/asm/mmu_context.h        | 19 ++++
 arch/powerpc/include/asm/tlb.h                |  8 ++
 arch/powerpc/mm/mem.c                         |  4 +-
 arch/powerpc/mm/mmu_context.c                 |  6 +-
 arch/powerpc/mm/pgtable-book3s64.c            |  3 +-
 arch/powerpc/mm/tlb-radix.c                   | 89 ++++++++++++++-----
 9 files changed, 143 insertions(+), 48 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Andy Lutomirski @ 2018-05-19 23:47 UTC (permalink / raw)
  To: linuxram
  Cc: Andrew Lutomirski, Florian Weimer, linuxppc-dev, Linux-MM,
	Dave Hansen
In-Reply-To: <20180519202747.GK5479@ram.oc3035372033.ibm.com>

On Sat, May 19, 2018 at 1:28 PM Ram Pai <linuxram@us.ibm.com> wrote:

> You got it mostly right. Filling in some more details below for
> completeness.

> [...]

Okay, so I guess I was correct as to what the functionality was but not as
to the encoding or the name of UAMOR.

Can you also confirm that mprotect_key() affects all threads?


> And finally the kernel reserves some subset of keys, in advance, that
> it wants for itself. It will never give away those keys to userspace
> through sys_pkey_alloc(), and the bits corresponding to those keys will
> be 0 in UAMOR register.

> >
> > Here's my question: given that disallowed AMR bits are read-as-zero,
there
> > can always be a thread that is in the middle of a sequence like:
> >
> > step1 : unsigned long old = amr;
> > step2 : amr |= whatever;
> > step3 : ...  <- thread is here
> > step4 : amr = old;
> >
> > Now another thread calls pkey_alloc(), so UAMR is asynchronously
changed,
> > and the thread will write zero to the relevant AMR bits.

> > If I understand
> > correctly, this means that the decision to mask off unallocated keys via
> > UAMR effectively forces the initial value of newly-allocated keys in
other
> > threads in the allocating process to be zero, whatever zero means.

> The initial value of the newly allocated key will be whatever the
> init_value is, that is specified in the sys_pkey_alloc().

> Remember, the UAMOR and the AMR values are thread specific. If thread T2
> allocates a new key, then that thread will enable the bit in its version
> of the UAMOR register. It will not have any effect on the UAMOR value of
> any other threads's version.

So is it possible for two threads to each call pkey_alloc() and end up with
the same key?  If so, it seems entirely broken.  If not, then how do you
intend for a multithreaded application to usefully allocate a new key?
Regardless, it seems like the current behavior on POWER is very difficult
to work with.  Can you give an example of a use case for which POWER's
behavior makes sense?

For the use cases I've imagined, POWER's behavior does not make sense.
  x86's is not ideal but is still better.  Here are my two example use cases:

1. A crypto library.  Suppose I'm writing a TLS-terminating server, and I
want it to be resistant to Heartbleed-like bugs.  I could store my private
keys protected by mprotect_key() and arrange for all threads and signal
handlers to have PKRU/AMR values that prevent any access to the memory.
When an explicit call is made to sign with the key, I would temporarily
change PKRU/AMR to allow access, compute the signature, and change PKRU/AMR
back.  On x86 right now, this works nicely.  On POWER, it doesn't, because
any thread started before my pkey_alloc() call can access the protected
memory, as can any signal handler.

2. A database using mmap() (with persistent memory or otherwise).  It would
be nice to be resistant to accidental corruption due to stray writes.  I
would do more or less the same thing as (1), except that I would want
threads that are not actively writing to the database to be able the
protected memory.  On x86, I need to manually convince threads that may
have been started before my pkey_alloc() call as well as signal handlers to
update their PKRU values.  On POWER, as in example (1), the error goes the
other direction -- if I fail to propagate the AMR bits to all threads,
writes are not blocked.

--Andy

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Ram Pai @ 2018-05-19 20:27 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Florian Weimer, linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <CALCETrWMP9kTmAFCR0WHR3YP93gLSzgxhfnb0ma_0q=PCuSdQA@mail.gmail.com>

On Fri, May 18, 2018 at 06:50:39PM -0700, Andy Lutomirski wrote:
> On Fri, May 18, 2018 at 6:19 PM Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > However the fundamental issue is still the same, as mentioned in the
> > other thread.
> 
> > "Should the permissions on a key be allowed to be changed, if the key
> > is not allocated in the first place?".
> 
> > my answer is NO. Lets debate :)
> 
> As a preface, here's my two-minute attempt to understand POWER's behavior:
> there are two registers, AMR and UAMR.  AMR contains both kernel-relevant
> state and user-relevant state.  UAMR specifies which bits of AMR are
> available for user code to directly access.  AMR bits outside UAMR are read
> as zero and are unaffected by writes.  I'm assuming that the kernel
> reserves some subset of AMR bits in advance to correspond to allocatable
> pkeys.


You got it mostly right. Filling in some more details below for
completeness.

Yes there is a AMR register which has two bits each corresponding to
each key.  One bit corresponds to read permission, and the other bit
corresponds to write permission.  If set, the corresponding permission
is denied.  Both userspace and kernel can modify the register.

Yes there is a UAMOR register which has a bit corresponding to each key.
When a bit in UAMOR register is set, that key's permission can be
changed by userspace. Only kernel can modify UAMOR register.

for example, if bit 10 is set in UAMOR register, only then key-10's read-write
permissions can be modified by userspace.

NOTE: the UAMR register you mention above, is actually UAMOR register. 

And finally the kernel reserves some subset of keys, in advance, that
it wants for itself. It will never give away those keys to userspace
through sys_pkey_alloc(), and the bits corresponding to those keys will
be 0 in UAMOR register.

> 
> Here's my question: given that disallowed AMR bits are read-as-zero, there
> can always be a thread that is in the middle of a sequence like:
> 
> step1 : unsigned long old = amr;
> step2 : amr |= whatever;
> step3 : ...  <- thread is here
> step4 : amr = old;
> 
> Now another thread calls pkey_alloc(), so UAMR is asynchronously changed,
> and the thread will write zero to the relevant AMR bits. 

> If I understand
> correctly, this means that the decision to mask off unallocated keys via
> UAMR effectively forces the initial value of newly-allocated keys in other
> threads in the allocating process to be zero, whatever zero means.

The initial value of the newly allocated key will be whatever the
init_value is, that is specified in the sys_pkey_alloc().

Remember, the UAMOR and the AMR values are thread specific. If thread T2
allocates a new key, then that thread will enable the bit in its version
of the UAMOR register. It will not have any effect on the UAMOR value of
any other threads's version.

So in the above code snippet, assuming T1 is executing the code, and T2 is
running parallely, and assume that both threads have the same AMR and
UAMOR value till the end of step2.  At step 3, when T2 allocates a new
key,  thread T2's UAMOR will enable the bit corresponding to the
allocated key and set the corresponding bits in AMR to the init_val
specified in sys_pkey_alloc(). It has no effect on thread T1's UAMOR or
AMR register.

> (I
> didn't get far enough in the POWER docs to figure out what zero means.).

ok.

> So
> I don't think you're doing anyone any favors by making UAMR dynamic.

depending on the explaination above, do you continue to hold that opinion?

> 
> IOW both x86 and POWER screwed up the ISA.

-- 
Ram Pai

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Florian Weimer @ 2018-05-19 11:11 UTC (permalink / raw)
  To: Ram Pai; +Cc: linuxppc-dev, linux-mm, Dave Hansen, Andy Lutomirski
In-Reply-To: <20180519011947.GJ5479@ram.oc3035372033.ibm.com>

On 05/19/2018 03:19 AM, Ram Pai wrote:
> The issue you may be talking about here is that  --
> 
> "when you set the AMR register to 0xffffffffffffffff, it
> just sets it to 0x0c00000000000000."
> 
> To me it looks like, exec/fork are not related to the issue.
> Or are they also somehow connected to the issue?
> 
> 
> The reason the AMR register does not get set to 0xffffffffffffffff,
> is because none of those keys; except key 2, are active. So it ignores
> all other bits and just sets the bits corresponding to key 2.

Here's a slightly different test:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <err.h>

/* Return the value of the AMR register.  */
static inline unsigned long int
pkey_read (void)
{
   unsigned long int result;
   __asm__ volatile ("mfspr %0, 13" : "=r" (result));
   return result;
}

/* Overwrite the AMR register with VALUE.  */
static inline void
pkey_write (unsigned long int value)
{
   __asm__ volatile ("mtspr 13, %0" : : "r" (value));
}

int
main (int argc, char **argv)
{
   printf ("AMR (PID %d): 0x%016lx\n", (int) getpid (), pkey_read());
   if (argc > 1 && strcmp (argv[1], "alloc") == 0)
     {
       int key = syscall (__NR_pkey_alloc, 0, 0);
       if (key < 0)
         err (1, "pkey_alloc");
       printf ("Allocated key in subprocess (PID %d): %d\n",
               (int) getpid (), key);
       return 0;
     }

   pid_t pid = fork ();
   if (pid == 0)
     {
       printf ("AMR after fork (PID %d): 0x%016lx\n",
               (int) getpid (), pkey_read());
       execl ("/proc/self/exe", argv[0], "alloc", NULL);
       _exit (1);
     }
   if (pid < 0)
     err (1, "fork");
   int status;
   if (waitpid (pid, &status, 0) < 0)
     err (1, "waitpid");

   int key = syscall (__NR_pkey_alloc, 0, 0);
   if (key < 0)
     err (1, "pkey_alloc");
   printf ("Allocated key (PID %d): %d\n", (int) getpid (), key);

   unsigned long int amr = -1;
   printf ("Setting AMR: 0x%016lx\n", amr);
   pkey_write (amr);
   printf ("New AMR value (PID %d): 0x%016lx\n",
           (int) getpid (), pkey_read());
   if (argc == 1)
     {
       printf ("About to call execl (PID %d) ...\n", (int) getpid ());
       execl ("/proc/self/exe", argv[0], "execl", NULL);
       err (1, "exec");
       return 1;
     }
   else
     return 0;
}

It produces:

AMR (PID 110163): 0x0000000000000000
AMR after fork (PID 110164): 0x0000000000000000
AMR (PID 110164): 0x0000000000000000
Allocated key in subprocess (PID 110164): 2
Allocated key (PID 110163): 2
Setting AMR: 0xffffffffffffffff
New AMR value (PID 110163): 0x0c00000000000000
About to call execl (PID 110163) ...
AMR (PID 110163): 0x0c00000000000000
AMR after fork (PID 110165): 0x0000000000000000
AMR (PID 110165): 0x0000000000000000
Allocated key in subprocess (PID 110165): 2
Allocated key (PID 110163): 2
Setting AMR: 0xffffffffffffffff
New AMR value (PID 110163): 0x0c00000000000000

A few things which are odd stand out (apart the wrong default for AMR 
and the AMR update restriction covered in the other thread):

* execve does not reset AMR (see after “About to call execl”)
* fork resets AMR (see lines with PID 110165))
* After execve, a key with non-default access rights is allocated
   (see “Allocated key (PID 110163): 2”, second time, after execl)

No matter what you think about the AMR default, I posit that each of 
those are bugs (although the last one should be fixed by resetting AMR 
on execve).

Thanks,
Florian

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Florian Weimer @ 2018-05-19  5:26 UTC (permalink / raw)
  To: Andy Lutomirski, linuxram; +Cc: linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <CALCETrWMP9kTmAFCR0WHR3YP93gLSzgxhfnb0ma_0q=PCuSdQA@mail.gmail.com>

On 05/19/2018 03:50 AM, Andy Lutomirski wrote:
> Now another thread calls pkey_alloc(), so UAMR is asynchronously changed,
> and the thread will write zero to the relevant AMR bits.  If I understand
> correctly, this means that the decision to mask off unallocated keys via
> UAMR effectively forces the initial value of newly-allocated keys in other
> threads in the allocating process to be zero, whatever zero means.  (I
> didn't get far enough in the POWER docs to figure out what zero means.)  So

(Note that this belongs on the other thread, here I originally wanted to 
talk about the lack of reset of AMR to the default value on execve.)

I don't think UAMOR is updated asynchronously.  On pkey_alloc, it is 
only changed for the current thread, and future threads launched from 
it.  Existing threads are unaffected.  This still results in a 
programming model which is substantially different from x86.

> I don't think you're doing anyone any favors by making UAMR dynamic.

This is still true, I think.

Thanks,
Florian

^ permalink raw reply

* Re: [PATCH 1/2] m68k: Set default dma mask for platform devices
From: Finn Thain @ 2018-05-19  5:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Geert Uytterhoeven, linux-m68k, linux-kernel, linuxppc-dev,
	Greg Ungerer
In-Reply-To: <20180518090434.GC24436@lst.de>

On Fri, 18 May 2018, Christoph Hellwig wrote:

> > This implementation of arch_setup_pdev_archdata() differs from the 
> > powerpc one, in that this one avoids clobbering a device dma mask 
> > which has already been initialized.
> 
> I think your implementation should move into the generic implementation 
> in drivers/base/platform.c instead of being stuck in m68k.
> 
> Also powerpc probably wants fixing, but that's something left to the
> ppc folks..

On powerpc, all platform devices get a dma mask. But they don't do that in 
drivers/base/platform.c, they use arch_setup_pdev_archdata(). Why didn't 
they take the approach you suggest?

How would I support the claim that replacing an empty platform device dma 
mask with 0xffffffff is safe on all architectures and platforms?

Is there no code conditional upon dev.coherent_dma_mask or dev.dma_mask 
that could misbehave? (Didn't I cite an example in the other thread?*)

If you can convince me that it is safe, I'd be happy to submit the patch 
you asked for.

For now, I still think that patching the platform driver was the correct 
patch*.

Maybe the real problem is your commit 205e1b7f51e4 ("dma-mapping: warn 
when there is no coherent_dma_mask"), because it assumes that all dma_ops 
implementations care about coherent_dma_mask.

The dma_map_ops implementations that do use coherent_dma_mask should 
simply fail when it is unset, right?

Would it not be better to revert your patch and fix the dma_map_ops 
failure paths, than to somehow audit all the platform drivers and patch 
drivers/base/platform.c?

Thanks.

* https://lkml.kernel.org/r/alpine.LNX.2.21.1805091804290.72%40nippy.intranet

-- 

^ permalink raw reply

* Re: pkeys on POWER: Default AMR, UAMOR values
From: Florian Weimer @ 2018-05-19  5:15 UTC (permalink / raw)
  To: Ram Pai; +Cc: Andy Lutomirski, linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <20180519005219.GI5479@ram.oc3035372033.ibm.com>

On 05/19/2018 02:52 AM, Ram Pai wrote:

>>> The POWER semantics make it very hard for a multithreaded program to
>>> meaningfully use protection keys to prevent accidental access to important
>>> memory.
>>
>> And you can change access rights for unallocated keys (unallocated
>> at thread start time, allocated later) on x86.  I have extended the
>> misc/tst-pkeys test to verify that, and it passes on x86, but not on
>> POWER, where the access rights are stuck.
> 
> This is something I do not understand. How can a thread change permissions
> on a key, that is not even allocated in the first place.

It was allocated by another thread, and there is synchronization so that 
the allocation happens before the change in access rights.

> Do you consider a key
> allocated in some other thread's context, as allocated in this threads
> context?

Yes, x86 does that.

> If not, does that mean -- On x86, you can activate a key just
> by changing its permission?

This also true on x86, but just an artifact of the implementation.  You 
are supposed to call pkey_alloc before changing the flag.

Thanks,
Florian

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Florian Weimer @ 2018-05-19  5:12 UTC (permalink / raw)
  To: Ram Pai; +Cc: linuxppc-dev, linux-mm, Dave Hansen, Andy Lutomirski
In-Reply-To: <20180519011947.GJ5479@ram.oc3035372033.ibm.com>

On 05/19/2018 03:19 AM, Ram Pai wrote:

>> New AMR value (PID 112291, before execl): 0x0c00000000000000
>> AMR (PID 112291): 0x0c00000000000000

The issue is here.  The second line is after the execl (printed from the 
start of main), and the AMR value is not reset to zero.

>> Allocated key (PID 112291): 2
>>
>> I think this is a real bug and needs to be fixed even if the
>> defaults are kept as-is (see the other thread).
> 
> The issue you may be talking about here is that  --
> 
> "when you set the AMR register to 0xffffffffffffffff, it
> just sets it to 0x0c00000000000000."
> 
> To me it looks like, exec/fork are not related to the issue.
> Or are they also somehow connected to the issue?

Yes, this is the other issue.  It is not really important here.

Thanks,
Florian

^ permalink raw reply

* [PATCH 3/3] powerpc/64: hard disable irqs on the panic()ing CPU
From: Nicholas Piggin @ 2018-05-19  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180519043554.26640-1-npiggin@gmail.com>

Similar to previous patches, hard disable interrupts when a CPU is
in panic. This reduces the chance the watchdog has to interfere with
the panic, and avoids any other type of masked interrupt being
executed when crashing which minimises the length of the crash path.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/setup-common.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 0af5c11b9e78..d45fb522fe8a 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -706,12 +706,19 @@ EXPORT_SYMBOL(check_legacy_ioport);
 static int ppc_panic_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
 {
+	/*
+	 * panic does a local_irq_disable, but we really
+	 * want interrupts to be hard disabled.
+	 */
+	hard_irq_disable();
+
 	/*
 	 * If firmware-assisted dump has been registered then trigger
 	 * firmware-assisted dump and let firmware handle everything else.
 	 */
 	crash_fadump(NULL, ptr);
-	ppc_md.panic(ptr);  /* May not return */
+	if (ppc_md.panic)
+		ppc_md.panic(ptr);  /* May not return */
 	return NOTIFY_DONE;
 }
 
@@ -722,7 +729,8 @@ static struct notifier_block ppc_panic_block = {
 
 void __init setup_panic(void)
 {
-	if (!ppc_md.panic)
+	/* PPC64 always does a hard irq disable in its panic handler */
+	if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic)
 		return;
 	atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block);
 }
-- 
2.17.0

^ permalink raw reply related

* [PATCH 2/3] powerpc: smp_send_stop do not offline stopped CPUs
From: Nicholas Piggin @ 2018-05-19  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180519043554.26640-1-npiggin@gmail.com>

Marking CPUs stopped by smp_send_stop as offline can cause warnings
due to cross-CPU wakeups. This trace was noticed on a busy system
running a sysrq+c crash test, after the injected crash:

WARNING: CPU: 51 PID: 1546 at kernel/sched/core.c:1179 set_task_cpu+0x22c/0x240
CPU: 51 PID: 1546 Comm: kworker/u352:1 Tainted: G      D
Workqueue: mlx5e mlx5e_update_stats_work [mlx5_core]
[...]
NIP [c00000000017c21c] set_task_cpu+0x22c/0x240
LR [c00000000017d580] try_to_wake_up+0x230/0x720
Call Trace:
[c000000001017700] runqueues+0x0/0xb00 (unreliable)
[c00000000017d580] try_to_wake_up+0x230/0x720
[c00000000015a214] insert_work+0x104/0x140
[c00000000015adb0] __queue_work+0x230/0x690
[c000003fc5007910] [c00000000015b26c] queue_work_on+0x5c/0x90
[c0080000135fc8f8] mlx5_cmd_exec+0x538/0xcb0 [mlx5_core]
[c008000013608fd0] mlx5_core_access_reg+0x140/0x1d0 [mlx5_core]
[c00800001362777c] mlx5e_update_pport_counters.constprop.59+0x6c/0x90 [mlx5_core]
[c008000013628868] mlx5e_update_ndo_stats+0x28/0x90 [mlx5_core]
[c008000013625558] mlx5e_update_stats_work+0x68/0xb0 [mlx5_core]
[c00000000015bcec] process_one_work+0x1bc/0x5f0
[c00000000015ecac] worker_thread+0xac/0x6b0
[c000000000168338] kthread+0x168/0x1b0
[c00000000000b628] ret_from_kernel_thread+0x5c/0xb4

This happens because firstly the CPU is not really offline in the
usual sense, processes and interrupts have not been migrated away.
Secondly smp_send_stop does not happen atomically on all CPUs, so
one CPU can have marked itself offline, while another CPU is still
running processes or interrupts which can affect the first CPU.

Fix this by just not marking the CPU as offline. It's more like
frozen in time, so offline does not really reflect its state properly
anyway. There should be nothing in the crash/panic path that walks
online CPUs and synchronously waits for them, so this change should
not introduce new hangs.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9ca7148b5881..6d6cf14009cf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -579,9 +579,6 @@ static void nmi_stop_this_cpu(struct pt_regs *regs)
 	nmi_ipi_busy_count--;
 	nmi_ipi_unlock();
 
-	/* Remove this CPU */
-	set_cpu_online(smp_processor_id(), false);
-
 	spin_begin();
 	while (1)
 		spin_cpu_relax();
@@ -596,9 +593,6 @@ void smp_send_stop(void)
 
 static void stop_this_cpu(void *dummy)
 {
-	/* Remove this CPU */
-	set_cpu_online(smp_processor_id(), false);
-
 	hard_irq_disable();
 	spin_begin();
 	while (1)
-- 
2.17.0

^ permalink raw reply related

* [PATCH 1/3] powerpc/64: hard disable irqs in panic_smp_self_stop
From: Nicholas Piggin @ 2018-05-19  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180519043554.26640-1-npiggin@gmail.com>

Similarly to commit 855bfe0de1 ("powerpc: hard disable irqs in
smp_send_stop loop"), irqs should be hard disabled by
panic_smp_self_stop.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/setup_64.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b78f142a4148..d122321ad542 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -380,6 +380,14 @@ void early_setup_secondary(void)
 
 #endif /* CONFIG_SMP */
 
+void panic_smp_self_stop(void)
+{
+	hard_irq_disable();
+	spin_begin();
+	while (1)
+		spin_cpu_relax();
+}
+
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {
-- 
2.17.0

^ permalink raw reply related

* [PATCH 0/3] further reduce warnings and watchdogs in crash/shutdown paths
From: Nicholas Piggin @ 2018-05-19  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin

Patches 1 and 3 are more of the same, just upgrading local_irq_disable
to hard_irq_disable in places.

Patch 2 fixes a warning people have been hitting in crash testing on
busy systems.

Thanks,
Nick

Nicholas Piggin (3):
  powerpc/64: hard disable irqs in panic_smp_self_stop
  powerpc: smp_send_stop do not offline stopped CPUs
  powerpc/64: hard disable irqs on the panic()ing CPU

 arch/powerpc/kernel/setup-common.c | 12 ++++++++++--
 arch/powerpc/kernel/setup_64.c     |  8 ++++++++
 arch/powerpc/kernel/smp.c          |  6 ------
 3 files changed, 18 insertions(+), 8 deletions(-)

-- 
2.17.0

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Andy Lutomirski @ 2018-05-19  1:50 UTC (permalink / raw)
  To: linuxram; +Cc: Florian Weimer, linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <20180519011947.GJ5479@ram.oc3035372033.ibm.com>

On Fri, May 18, 2018 at 6:19 PM Ram Pai <linuxram@us.ibm.com> wrote:

> However the fundamental issue is still the same, as mentioned in the
> other thread.

> "Should the permissions on a key be allowed to be changed, if the key
> is not allocated in the first place?".

> my answer is NO. Lets debate :)

As a preface, here's my two-minute attempt to understand POWER's behavior:
there are two registers, AMR and UAMR.  AMR contains both kernel-relevant
state and user-relevant state.  UAMR specifies which bits of AMR are
available for user code to directly access.  AMR bits outside UAMR are read
as zero and are unaffected by writes.  I'm assuming that the kernel
reserves some subset of AMR bits in advance to correspond to allocatable
pkeys.

Here's my question: given that disallowed AMR bits are read-as-zero, there
can always be a thread that is in the middle of a sequence like:

unsigned long old = amr;
amr |= whatever;
...  <- thread is here
amr = old;

Now another thread calls pkey_alloc(), so UAMR is asynchronously changed,
and the thread will write zero to the relevant AMR bits.  If I understand
correctly, this means that the decision to mask off unallocated keys via
UAMR effectively forces the initial value of newly-allocated keys in other
threads in the allocating process to be zero, whatever zero means.  (I
didn't get far enough in the POWER docs to figure out what zero means.)  So
I don't think you're doing anyone any favors by making UAMR dynamic.

IOW both x86 and POWER screwed up the ISA.

^ permalink raw reply

* Re: pkeys on POWER: Access rights not reset on execve
From: Ram Pai @ 2018-05-19  1:19 UTC (permalink / raw)
  To: Florian Weimer; +Cc: linuxppc-dev, linux-mm, Dave Hansen, Andy Lutomirski
In-Reply-To: <53828769-23c4-b2e3-cf59-239936819c3e@redhat.com>

On Fri, May 18, 2018 at 04:27:14PM +0200, Florian Weimer wrote:
> This test program:
> 
> #include <errno.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <err.h>
> 
> /* Return the value of the AMR register.  */
> static inline unsigned long int
> pkey_read (void)
> {
>   unsigned long int result;
>   __asm__ volatile ("mfspr %0, 13" : "=r" (result));
>   return result;
> }
> 
> /* Overwrite the AMR register with VALUE.  */
> static inline void
> pkey_write (unsigned long int value)
> {
>   __asm__ volatile ("mtspr 13, %0" : : "r" (value));
> }
> 
> int
> main (int argc, char **argv)
> {
>   printf ("AMR (PID %d): 0x%016lx\n", (int) getpid (), pkey_read());
>   if (argc > 1)
>     {
>       int key = syscall (__NR_pkey_alloc, 0, 0);
>       if (key < 0)
>         err (1, "pkey_alloc");
>       printf ("Allocated key (PID %d): %d\n", (int) getpid (), key);
>       return 0;
>     }
> 
>   pid_t pid = fork ();
>   if (pid == 0)
>     {
>       execl ("/proc/self/exe", argv[0], "subprocess", NULL);
>       _exit (1);
>     }
>   if (pid < 0)
>     err (1, "fork");
>   int status;
>   if (waitpid (pid, &status, 0) < 0)
>     err (1, "waitpid");
> 
>   int key = syscall (__NR_pkey_alloc, 0, 0);
>   if (key < 0)
>     err (1, "pkey_alloc");
>   printf ("Allocated key (PID %d): %d\n", (int) getpid (), key);
> 
>   unsigned long int amr = -1;
>   printf ("Setting AMR: 0x%016lx\n", amr);
>   pkey_write (amr);
>   printf ("New AMR value (PID %d, before execl): 0x%016lx\n",
>           (int) getpid (), pkey_read());
>   execl ("/proc/self/exe", argv[0], "subprocess", NULL);
>   err (1, "exec");
>   return 1;
> }
> 
> shows that the AMR register value is not reset on execve:
> 
> AMR (PID 112291): 0x0000000000000000
> AMR (PID 112292): 0x0000000000000000
> Allocated key (PID 112292): 2
> Allocated key (PID 112291): 2
> Setting AMR: 0xffffffffffffffff
> New AMR value (PID 112291, before execl): 0x0c00000000000000
> AMR (PID 112291): 0x0c00000000000000
> Allocated key (PID 112291): 2
> 
> I think this is a real bug and needs to be fixed even if the
> defaults are kept as-is (see the other thread).

The issue you may be talking about here is that  --

"when you set the AMR register to 0xffffffffffffffff, it 
just sets it to 0x0c00000000000000."

To me it looks like, exec/fork are not related to the issue.
Or are they also somehow connected to the issue?


The reason the AMR register does not get set to 0xffffffffffffffff,
is because none of those keys; except key 2, are active. So it ignores
all other bits and just sets the bits corresponding to key 2.

However the fundamental issue is still the same, as mentioned in the
other thread.

"Should the permissions on a key be allowed to be changed, if the key
is not allocated in the first place?".

my answer is NO. Lets debate :)
RP

^ permalink raw reply

* Re: pkeys on POWER: Default AMR, UAMOR values
From: Ram Pai @ 2018-05-19  0:52 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Andy Lutomirski, linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <27e01118-be5c-5f90-78b2-56bb69d2ab95@redhat.com>

On Fri, May 18, 2018 at 11:13:30PM +0200, Florian Weimer wrote:
> On 05/18/2018 09:39 PM, Andy Lutomirski wrote:
> >The difference is that x86 starts out with deny-all instead of allow-all.

Ah!. this explains the discrepency. But still does not explain one
thing.. see below.

> >The POWER semantics make it very hard for a multithreaded program to
> >meaningfully use protection keys to prevent accidental access to important
> >memory.
> 
> And you can change access rights for unallocated keys (unallocated
> at thread start time, allocated later) on x86.  I have extended the
> misc/tst-pkeys test to verify that, and it passes on x86, but not on
> POWER, where the access rights are stuck.

This is something I do not understand. How can a thread change permissions
on a key, that is not even allocated in the first place. Do you consider a key
allocated in some other thread's context, as allocated in this threads
context? If not, does that mean -- On x86, you can activate a key just
by changing its permission?


RP

^ permalink raw reply

* Re: pkeys on POWER: Default AMR, UAMOR values
From: Florian Weimer @ 2018-05-18 21:13 UTC (permalink / raw)
  To: Andy Lutomirski, linuxram; +Cc: linuxppc-dev, Linux-MM, Dave Hansen
In-Reply-To: <CALCETrV_wYPKHna8R2Bu19nsDqF2dJWarLLsyHxbcYD_AgYfPg@mail.gmail.com>

On 05/18/2018 09:39 PM, Andy Lutomirski wrote:
> The difference is that x86 starts out with deny-all instead of allow-all.
> The POWER semantics make it very hard for a multithreaded program to
> meaningfully use protection keys to prevent accidental access to important
> memory.

And you can change access rights for unallocated keys (unallocated at 
thread start time, allocated later) on x86.  I have extended the 
misc/tst-pkeys test to verify that, and it passes on x86, but not on 
POWER, where the access rights are stuck.

I believe this is due to an incorrect UAMOR setting.

Thanks,
Florian

^ permalink raw reply

* Re: pkeys on POWER: Default AMR, UAMOR values
From: Florian Weimer @ 2018-05-18 21:09 UTC (permalink / raw)
  To: Ram Pai; +Cc: linuxppc-dev, linux-mm, Dave Hansen, Andy Lutomirski
In-Reply-To: <20180518174448.GE5479@ram.oc3035372033.ibm.com>

On 05/18/2018 07:44 PM, Ram Pai wrote:
> Florian, is the behavior on x86 any different? A key allocated in the
> context off one thread is not meaningful in the context of any other
> thread.
> 
> Since thread B was created prior to the creation of the key, and the key
> was created in the context of thread A, thread B neither inherits the
> key nor its permissions. Atleast that is how the semantics are supposed
> to work as per the man page.
> 
> man 7 pkey
> 
> " Applications  using  threads  and  protection  keys  should
> be especially careful.  Threads inherit the protection key rights of the
> parent at the time of the clone(2), system call.  Applications should
> either ensure that their own permissions are appropriate for child
> threads at the time when clone(2) is  called,  or ensure that each child
> thread can perform its own initialization of protection key rights."

I reported two separate issues (actually three, but the execve bug is in 
a separate issue).  The default, and the write restrictions.

The default is just a difference to x86 (however, x86 can be booted with 
init_pkru=0 and behaves the same way, but we're probably going to remove 
that).

The POWER implementation has the additional wrinkle that threads 
launched early, before key allocation, can never change access rights 
because they inherited not just the access rights, but also the access 
rights access mask.  This is different from x86, where all threads can 
freely update access rights, and contradicts the behavior in the manpage 
which says that “each child thread can perform its own initialization of 
protection key rights”.  It can't do that if it is launched before key 
allocation, which is not the right behavior IMO.

Thanks,
Florian

^ permalink raw reply

* Re: [RFC v3 1/4] powerpc/hotplug/drcinfo: Fix bugs parsing ibm,drc-info structs
From: Michael Bringmann @ 2018-05-18 20:18 UTC (permalink / raw)
  To: linuxppc-dev, Nathan Fontenot
In-Reply-To: <66c02d35-2bf4-67c1-1ba0-072e9ac6562c@linux.vnet.ibm.com>

See below.

On 05/18/2018 02:57 PM, Nathan Fontenot wrote:
> On 05/17/2018 05:41 PM, Michael Bringmann wrote:
>> [Replace/withdraw previous patch submission to ensure that testing
>> of related patches on similar hardware progresses together.]
>>
>> This patch fixes a memory parsing bug when using of_prop_next_u32
>> calls at the start of a structure.  Depending upon the value of
>> "cur" memory pointer argument to of_prop_next_u32, it will or it
>> won't advance the value of the returned memory pointer by the
>> size of one u32.  This patch corrects the code to deal with that
>> indexing feature when parsing the ibm,drc-info structs for CPUs.
>> Also, need to advance the pointer at the end of_read_drc_info_cell
>> for same reason.
>>
> 
> I see that you provide an update for of_read_drc_info_cell to fix the
> unexpected behavior you're seeing, but I'm not sure why you're updating
> the code in pseries_energy.c and rpaphp_core.c? can you provide some
> additional information as to why these functions also need to be updated.

The changes are related.

of_prop_next_u32() does not read a u32 and then advance the pointer.
It advances the pointer and then reads a u32.  It does an error check
to see whether the u32 read is within the boundary of the property
value, but it returns a pointer to the u32 that was read.

> 
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmware feature" -- end of patch series applied to powerpc next)
>> ---
>> Changes in V3:
>>   -- Rebased patch to 4.17-rc5 kernel
>> ---
>>  arch/powerpc/platforms/pseries/of_helpers.c     |    5 ++---
>>  arch/powerpc/platforms/pseries/pseries_energy.c |    2 ++
>>  drivers/pci/hotplug/rpaphp_core.c               |    1 +
>>  3 files changed, 5 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c
>> index 6df192f..20598b2 100644
>> --- a/arch/powerpc/platforms/pseries/of_helpers.c
>> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
>> @@ -65,9 +65,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>>
>>  	/* Get drc-index-start:encode-int */
>>  	p2 = (const __be32 *)p;
>> -	p2 = of_prop_next_u32(*prop, p2, &data->drc_index_start);
>> -	if (!p2)
>> -		return -EINVAL;
>> +	data->drc_index_start = of_read_number(p2, 1);
> 
> This appears to resolve advancing the pointer for the beginning of a struct.
> 
>>
>>  	/* Get drc-name-suffix-start:encode-int */
>>  	p2 = of_prop_next_u32(*prop, p2, &data->drc_name_suffix_start);
>> @@ -88,6 +86,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>>  	p2 = of_prop_next_u32(*prop, p2, &data->drc_power_domain);
>>  	if (!p2)
>>  		return -EINVAL;
>> +	p2++;
> 
> ...but why is the advancement needed here? of_prop_next_u32 should have advanced it, correct?
> 
> -Nathan
> 
>>
>>  	/* Should now know end of current entry */
>>  	(*curval) = (void *)p2;
>> diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
>> index 6ed2212..c7d84aa 100644
>> --- a/arch/powerpc/platforms/pseries/pseries_energy.c
>> +++ b/arch/powerpc/platforms/pseries/pseries_energy.c
>> @@ -64,6 +64,7 @@ static u32 cpu_to_drc_index(int cpu)
>>  		value = of_prop_next_u32(info, NULL, &num_set_entries);
>>  		if (!value)
>>  			goto err_of_node_put;
>> +		value++;
>>
>>  		for (j = 0; j < num_set_entries; j++) {
>>
>> @@ -126,6 +127,7 @@ static int drc_index_to_cpu(u32 drc_index)
>>  		value = of_prop_next_u32(info, NULL, &num_set_entries);
>>  		if (!value)
>>  			goto err_of_node_put;
>> +		value++;
>>
>>  		for (j = 0; j < num_set_entries; j++) {
>>
>> diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
>> index fb5e084..dccdf62 100644
>> --- a/drivers/pci/hotplug/rpaphp_core.c
>> +++ b/drivers/pci/hotplug/rpaphp_core.c
>> @@ -239,6 +239,7 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
>>  	value = of_prop_next_u32(info, NULL, &entries);
>>  	if (!value)
>>  		return -EINVAL;
>> +	value++;
>>
>>  	for (j = 0; j < entries; j++) {
>>  		of_read_drc_info_cell(&info, &value, &drc);
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v4 2/3] powerpc migration/cpu: Associativity & cpu changes
From: Michael Bringmann @ 2018-05-18 20:12 UTC (permalink / raw)
  To: Nathan Fontenot, linuxppc-dev; +Cc: John Allen, Thomas Falcon, Tyrel Datwyler
In-Reply-To: <1bddc982-3f91-e4b2-4a96-619734ee6bd9@linux.vnet.ibm.com>



On 05/18/2018 02:28 PM, Nathan Fontenot wrote:
> On 05/17/2018 05:26 PM, Michael Bringmann wrote:
>> powerpc migration/cpu: Now apply changes to the associativity of cpus
>> for the topology of LPARS in Post Migration events.  Recognize more
>> changes to the associativity of memory blocks described by the
>> 'cpu' properties when processing the topology of LPARS in Post Migration
>> events.  Previous efforts only recognized whether a memory block's
>> assignment had changed in the property.  Changes here include:
>>
>> * Provide hotplug CPU 'readd by index' operation
>> * Checking for changes in cpu associativity and making 'readd' calls
>>   when differences are observed.
>> * Queue up  changes to CPU properties so that they may take place
>>   after all PowerPC device-tree changes have been applied i.e. after
>>   the device hotplug is released in the mobility code.
> 
> This kinda feels like three different patches in one. any reason to not split
> this into three patches? Perhaps at the least split the last item into it's
> own patch.

E.g.
1) Conditional 'acquire_drc'
2) PSERIES_HP_ELOG_ACTION_READD call to dlpar_cpu_readd_by_index
3) Add lock/unlock device hotplug
> 
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> ---
>> Changes include:
>>   -- Rearrange patches to co-locate CPU property-related changes.
>>   -- Modify dlpar_cpu_add & dlpar_cpu_remove to skip DRC index acquire
>>      or release operations during the CPU readd process.
>>   -- Correct a bug in DRC index selection for queued operation.
>>   -- Rebase to 4.17-rc5 kernel
>> ---
>>  arch/powerpc/platforms/pseries/hotplug-cpu.c |  123 +++++++++++++++++++-------
>>  arch/powerpc/platforms/pseries/mobility.c    |    3 +
>>  2 files changed, 95 insertions(+), 31 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> index a408217..23d4cb8 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
>> @@ -474,7 +474,7 @@ static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
>>  				&cdata);
>>  }
>>
>> -static ssize_t dlpar_cpu_add(u32 drc_index)
>> +static ssize_t dlpar_cpu_add(u32 drc_index, bool acquire_drc)
>>  {
>>  	struct device_node *dn, *parent;
>>  	int rc, saved_rc;
>> @@ -499,19 +499,22 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>>  		return -EINVAL;
>>  	}
>>
>> -	rc = dlpar_acquire_drc(drc_index);
>> -	if (rc) {
>> -		pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
>> -			rc, drc_index);
>> -		of_node_put(parent);
>> -		return -EINVAL;
>> +	if (acquire_drc) {
>> +		rc = dlpar_acquire_drc(drc_index);
>> +		if (rc) {
>> +			pr_warn("Failed to acquire DRC, rc: %d, drc index: %x\n",
>> +				rc, drc_index);
>> +			of_node_put(parent);
>> +			return -EINVAL;
>> +		}
>>  	}
>>
>>  	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
>>  	if (!dn) {
>>  		pr_warn("Failed call to configure-connector, drc index: %x\n",
>>  			drc_index);
>> -		dlpar_release_drc(drc_index);
>> +		if (acquire_drc)
>> +			dlpar_release_drc(drc_index);
>>  		of_node_put(parent);
>>  		return -EINVAL;
>>  	}
>> @@ -526,8 +529,9 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>>  		pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n",
>>  			dn->name, rc, drc_index);
>>
>> -		rc = dlpar_release_drc(drc_index);
>> -		if (!rc)
>> +		if (acquire_drc)
>> +			rc = dlpar_release_drc(drc_index);
>> +		if (!rc || acquire_drc)
>>  			dlpar_free_cc_nodes(dn);
> 
> This seems like it would be more readable if everything were inside the
> if (acquire_drc) block.
> 
>>
>>  		return saved_rc;
>> @@ -540,7 +544,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>>  			dn->name, rc, drc_index);
>>
>>  		rc = dlpar_detach_node(dn);
>> -		if (!rc)
>> +		if (!rc && acquire_drc)
>>  			dlpar_release_drc(drc_index);
>>
>>  		return saved_rc;
>> @@ -608,12 +612,13 @@ static int dlpar_offline_cpu(struct device_node *dn)
>>
>>  }
>>
>> -static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
>> +static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index,
>> +				bool release_drc)
>>  {
>>  	int rc;
>>
>> -	pr_debug("Attempting to remove CPU %s, drc index: %x\n",
>> -		 dn->name, drc_index);
>> +	pr_debug("Attempting to remove CPU %s, drc index: %x (%d)\n",
>> +		 dn->name, drc_index, release_drc);
>>
>>  	rc = dlpar_offline_cpu(dn);
>>  	if (rc) {
>> @@ -621,12 +626,14 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
>>  		return -EINVAL;
>>  	}
>>
>> -	rc = dlpar_release_drc(drc_index);
>> -	if (rc) {
>> -		pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n",
>> -			drc_index, dn->name, rc);
>> -		dlpar_online_cpu(dn);
>> -		return rc;
>> +	if (release_drc) {
>> +		rc = dlpar_release_drc(drc_index);
>> +		if (rc) {
>> +			pr_warn("Failed to release drc (%x) for CPU %s, rc: %d\n",
>> +				drc_index, dn->name, rc);
>> +			dlpar_online_cpu(dn);
>> +			return rc;
>> +		}
>>  	}
>>
>>  	rc = dlpar_detach_node(dn);
>> @@ -635,7 +642,8 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
>>
>>  		pr_warn("Failed to detach CPU %s, rc: %d", dn->name, rc);
>>
>> -		rc = dlpar_acquire_drc(drc_index);
>> +		if (release_drc)
>> +			rc = dlpar_acquire_drc(drc_index);
>>  		if (!rc)
>>  			dlpar_online_cpu(dn);
>>
>> @@ -664,7 +672,7 @@ static struct device_node *cpu_drc_index_to_dn(u32 drc_index)
>>  	return dn;
>>  }
>>
>> -static int dlpar_cpu_remove_by_index(u32 drc_index)
>> +static int dlpar_cpu_remove_by_index(u32 drc_index, bool release_drc)
>>  {
>>  	struct device_node *dn;
>>  	int rc;
>> @@ -676,10 +684,30 @@ static int dlpar_cpu_remove_by_index(u32 drc_index)
>>  		return -ENODEV;
>>  	}
>>
>> -	rc = dlpar_cpu_remove(dn, drc_index);
>> +	rc = dlpar_cpu_remove(dn, drc_index, release_drc);
>>  	of_node_put(dn);
>>  	return rc;
>>  }
>> + 
>> +static int dlpar_cpu_readd_by_index(u32 drc_index)
>> +{
>> +	int rc = 0;
> 
> No need to initialize to 0 here.
> 
>> +
>> +	pr_info("Attempting to re-add CPU, drc index %x\n", drc_index);
>> +
>> +	rc = dlpar_cpu_remove_by_index(drc_index, false);
>> +	if (!rc)
>> +		rc = dlpar_cpu_add(drc_index, false);
>> +
>> +	if (rc)
>> +		pr_info("Failed to update cpu at drc_index %lx\n",
>> +				(unsigned long int)drc_index);
>> +	else
>> +		pr_info("CPU at drc_index %lx was updated\n",
>> +				(unsigned long int)drc_index);
>> +
>> +	return rc;
>> +}
>>
>>  static int find_dlpar_cpus_to_remove(u32 *cpu_drcs, int cpus_to_remove)
>>  {
>> @@ -741,7 +769,7 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
>>  	}
>>
>>  	for (i = 0; i < cpus_to_remove; i++) {
>> -		rc = dlpar_cpu_remove_by_index(cpu_drcs[i]);
>> +		rc = dlpar_cpu_remove_by_index(cpu_drcs[i], true);
>>  		if (rc)
>>  			break;
>>
>> @@ -752,7 +780,7 @@ static int dlpar_cpu_remove_by_count(u32 cpus_to_remove)
>>  		pr_warn("CPU hot-remove failed, adding back removed CPUs\n");
>>
>>  		for (i = 0; i < cpus_removed; i++)
>> -			dlpar_cpu_add(cpu_drcs[i]);
>> +			dlpar_cpu_add(cpu_drcs[i], true);
>>
>>  		rc = -EINVAL;
>>  	} else {
>> @@ -843,7 +871,7 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
>>  	}
>>
>>  	for (i = 0; i < cpus_to_add; i++) {
>> -		rc = dlpar_cpu_add(cpu_drcs[i]);
>> +		rc = dlpar_cpu_add(cpu_drcs[i], true);
>>  		if (rc)
>>  			break;
>>
>> @@ -854,7 +882,7 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add)
>>  		pr_warn("CPU hot-add failed, removing any added CPUs\n");
>>
>>  		for (i = 0; i < cpus_added; i++)
>> -			dlpar_cpu_remove_by_index(cpu_drcs[i]);
>> +			dlpar_cpu_remove_by_index(cpu_drcs[i], true);
>>
>>  		rc = -EINVAL;
>>  	} else {
>> @@ -880,7 +908,7 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
>>  		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
>>  			rc = dlpar_cpu_remove_by_count(count);
>>  		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
>> -			rc = dlpar_cpu_remove_by_index(drc_index);
>> +			rc = dlpar_cpu_remove_by_index(drc_index, true);
>>  		else
>>  			rc = -EINVAL;
>>  		break;
>> @@ -888,10 +916,13 @@ int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
>>  		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
>>  			rc = dlpar_cpu_add_by_count(count);
>>  		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
>> -			rc = dlpar_cpu_add(drc_index);
>> +			rc = dlpar_cpu_add(drc_index, true);
>>  		else
>>  			rc = -EINVAL;
>>  		break;
>> +	case PSERIES_HP_ELOG_ACTION_READD:
>> +		rc = dlpar_cpu_readd_by_index(drc_index);
>> +		break;
>>  	default:
>>  		pr_err("Invalid action (%d) specified\n", hp_elog->action);
>>  		rc = -EINVAL;
>> @@ -913,7 +944,7 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
>>  	if (rc)
>>  		return -EINVAL;
>>
>> -	rc = dlpar_cpu_add(drc_index);
>> +	rc = dlpar_cpu_add(drc_index, true);
>>
>>  	return rc ? rc : count;
>>  }
>> @@ -934,7 +965,7 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t count)
>>  		return -EINVAL;
>>  	}
>>
>> -	rc = dlpar_cpu_remove(dn, drc_index);
>> +	rc = dlpar_cpu_remove(dn, drc_index, true);
>>  	of_node_put(dn);
>>
>>  	return rc ? rc : count;
>> @@ -942,12 +973,38 @@ static ssize_t dlpar_cpu_release(const char *buf, size_t count)
>>
>>  #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
>>
>> +static int pseries_update_cpu(struct of_reconfig_data *pr)
>> +{
>> +	/* So far, we only handle the 'ibm,associativity' property,
>> +	 * here.
>> +	 */
>> +	struct pseries_hp_errorlog *hp_elog;
>> +
>> +        hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
>> +        if(!hp_elog)
>> +                return -ENOMEM;
> 
> You seem to have some odd indentation here, may want to run this through checkpatch

Okay.  I think it happened on one of my build systems which was setting 'ts=4'.
> 
> Also, I think queue_hotplug_event() makes a copy of the hotplug event passed in. If
> so you could just use a hp_elog struct on the stack and avoid the allocation.

Okay.
> 
> -Nathan

Thanks.
Michael
> 
>> +
>> +	hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU;
>> +	hp_elog->action = PSERIES_HP_ELOG_ACTION_READD;
>> +	hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
>> +	hp_elog->_drc_u.drc_index = be32_to_cpu(pr->dn->phandle);
>> +
>> +	queue_hotplug_event(hp_elog, NULL, NULL);
>> +
>> +	kfree(hp_elog);
>> +
>> +	return 0;
>> +}
>> +
>>  static int pseries_smp_notifier(struct notifier_block *nb,
>>  				unsigned long action, void *data)
>>  {
>>  	struct of_reconfig_data *rd = data;
>>  	int err = 0;
>>
>> +	if (strcmp(rd->dn->type, "cpu"))
>> +		return notifier_from_errno(err);
>> +
>>  	switch (action) {
>>  	case OF_RECONFIG_ATTACH_NODE:
>>  		err = pseries_add_processor(rd->dn);
>> @@ -955,6 +1012,10 @@ static int pseries_smp_notifier(struct notifier_block *nb,
>>  	case OF_RECONFIG_DETACH_NODE:
>>  		pseries_remove_processor(rd->dn);
>>  		break;
>> +	case OF_RECONFIG_UPDATE_PROPERTY:
>> +		if (!strcmp(rd->prop->name, "ibm,associativity"))
>> +			err = pseries_update_cpu(rd);
>> +		break;
>>  	}
>>  	return notifier_from_errno(err);
>>  }
>> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
>> index 8a8033a..6d98f84 100644
>> --- a/arch/powerpc/platforms/pseries/mobility.c
>> +++ b/arch/powerpc/platforms/pseries/mobility.c
>> @@ -283,6 +283,8 @@ int pseries_devicetree_update(s32 scope)
>>  	if (!rtas_buf)
>>  		return -ENOMEM;
>>
>> +	lock_device_hotplug();
>> +
>>  	do {
>>  		rc = mobility_rtas_call(update_nodes_token, rtas_buf, scope);
>>  		if (rc && rc != 1)
>> @@ -321,6 +323,7 @@ int pseries_devicetree_update(s32 scope)
>>  	} while (rc == 1);
>>
>>  	kfree(rtas_buf);
>> +	unlock_device_hotplug();
>>  	return rc;
>>  }
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v4 1/3] powerpc migration/drmem: Modify DRMEM code to export more features
From: Michael Bringmann @ 2018-05-18 20:00 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <88f4921f-db9c-5b46-e812-b1ffd5cb66d5@linux.vnet.ibm.com>

See below.

On 05/18/2018 02:17 PM, Nathan Fontenot wrote:
> On 05/17/2018 05:26 PM, Michael Bringmann wrote:
>> powerpc migration/drmem: Export many of the functions of DRMEM to
>> parse "ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during
>> hotplug operations and for Post Migration events.
>>
>> Also modify the DRMEM initialization code to allow it to,
>>
>> * Be called after system initialization
>> * Provide a separate user copy of the LMB array that is produces
>> * Free the user copy upon request
>>
>> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
>> ---
>> Changes in RFC:
>>   -- Separate DRMEM changes into a standalone patch
>>   -- Do not export excess functions.  Make exported names more explicit.
>>   -- Add new iterator to work through a pair of drmem_info arrays.
>>   -- Modify DRMEM code to replace usages of dt_root_addr_cells, and
>>      dt_mem_next_cell, as these are only available at first boot.
>>   -- Rebase to 4.17-rc5 kernel
>> ---
>>  arch/powerpc/include/asm/drmem.h |   10 +++++
>>  arch/powerpc/mm/drmem.c          |   78 +++++++++++++++++++++++++++-----------
>>  2 files changed, 66 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
>> index ce242b9..c964b89 100644
>> --- a/arch/powerpc/include/asm/drmem.h
>> +++ b/arch/powerpc/include/asm/drmem.h
>> @@ -35,6 +35,13 @@ struct drmem_lmb_info {
>>  		&drmem_info->lmbs[0],				\
>>  		&drmem_info->lmbs[drmem_info->n_lmbs - 1])
>>
>> +#define for_each_pair_drmem_lmb(dinfo1, lmb1, dinfo2, lmb2)	\
>> +	for ((lmb1) = (&dinfo1->lmbs[0]),			\
>> +	     (lmb2) = (&dinfo2->lmbs[0]);			\
>> +             ((lmb1) <= (&dinfo1->lmbs[dinfo1->n_lmbs - 1])) &&	\
>> +             ((lmb2) <= (&dinfo2->lmbs[dinfo2->n_lmbs - 1]));	\
>> +	     (lmb1)++, (lmb2)++)
>> +
>>  /*
>>   * The of_drconf_cell_v1 struct defines the layout of the LMB data
>>   * specified in the ibm,dynamic-memory device tree property.
>> @@ -94,6 +101,9 @@ void __init walk_drmem_lmbs(struct device_node *dn,
>>  			void (*func)(struct drmem_lmb *, const __be32 **));
>>  int drmem_update_dt(void);
>>
>> +struct drmem_lmb_info* drmem_init_lmbs(struct property *prop);
>> +void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
>> +
>>  #ifdef CONFIG_PPC_PSERIES
>>  void __init walk_drmem_lmbs_early(unsigned long node,
>>  			void (*func)(struct drmem_lmb *, const __be32 **));
>> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
>> index 3f18036..d9b281c 100644
>> --- a/arch/powerpc/mm/drmem.c
>> +++ b/arch/powerpc/mm/drmem.c
>> @@ -20,6 +20,7 @@
>>
>>  static struct drmem_lmb_info __drmem_info;
>>  struct drmem_lmb_info *drmem_info = &__drmem_info;
>> +static int n_root_addr_cells;
>>
>>  u64 drmem_lmb_memory_max(void)
>>  {
>> @@ -193,12 +194,13 @@ int drmem_update_dt(void)
>>  	return rc;
>>  }
>>
>> -static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
>> +static void read_drconf_v1_cell(struct drmem_lmb *lmb,
>>  				       const __be32 **prop)
>>  {
>>  	const __be32 *p = *prop;
>>
>> -	lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
>> +	lmb->base_addr = of_read_number(p, n_root_addr_cells);
>> +	p += n_root_addr_cells;
>>  	lmb->drc_index = of_read_number(p++, 1);
>>
>>  	p++; /* skip reserved field */
>> @@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
>>  	*prop = p;
>>  }
>>
>> -static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
>> +static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *data,
>>  			void (*func)(struct drmem_lmb *, const __be32 **))
>>  {
>>  	struct drmem_lmb lmb;
>> @@ -221,17 +223,18 @@ static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
>>
>>  	for (i = 0; i < n_lmbs; i++) {
>>  		read_drconf_v1_cell(&lmb, &prop);
>> -		func(&lmb, &usm);
>> +		func(&lmb, &data);
> 
> Is there a need to change the variable name from usm to data (bot here and below)?

Actually, this was your recommendation to me when we talked about
updating this code a couple of months ago.  Before we changed the
code to create a copy of the DRMEM lmbs and compare it.

I will change it back.

> 
>>  	}
>>  }
>>
>> -static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
>> +static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
>>  				       const __be32 **prop)
>>  {
>>  	const __be32 *p = *prop;
>>
>>  	dr_cell->seq_lmbs = of_read_number(p++, 1);
>> -	dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
>> +	dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
>> +	p += n_root_addr_cells;
>>  	dr_cell->drc_index = of_read_number(p++, 1);
>>  	dr_cell->aa_index = of_read_number(p++, 1);
>>  	dr_cell->flags = of_read_number(p++, 1);
>> @@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
>>  	*prop = p;
>>  }
>>
>> -static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
>> +static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *data,
>>  			void (*func)(struct drmem_lmb *, const __be32 **))
>>  {
>>  	struct of_drconf_cell_v2 dr_cell;
>> @@ -263,7 +266,7 @@ static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
>>  			lmb.aa_index = dr_cell.aa_index;
>>  			lmb.flags = dr_cell.flags;
>>
>> -			func(&lmb, &usm);
>> +			func(&lmb, &data);
>>  		}
>>  	}
>>  }
>> @@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(unsigned long node,
>>  	const __be32 *prop, *usm;
>>  	int len;
>>
>> +	if (n_root_addr_cells == 0)
>> +		n_root_addr_cells = dt_root_addr_cells;
>> +
>>  	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
>>  	if (!prop || len < dt_root_size_cells * sizeof(__be32))
>>  		return;
>> @@ -353,24 +359,26 @@ void __init walk_drmem_lmbs(struct device_node *dn,
>>  	}
>>  }
>>
>> -static void __init init_drmem_v1_lmbs(const __be32 *prop)
>> +static void init_drmem_v1_lmbs(const __be32 *prop,
>> +			struct drmem_lmb_info *dinfo)
> 
> Please make sure you line up the function args, here and other places below.

Okay.

> 
>>  {
>>  	struct drmem_lmb *lmb;
>>
>> -	drmem_info->n_lmbs = of_read_number(prop++, 1);
>> -	if (drmem_info->n_lmbs == 0)
>> +	dinfo->n_lmbs = of_read_number(prop++, 1);
>> +	if (dinfo->n_lmbs == 0)
>>  		return;
>>
>> -	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
>> +	dinfo->lmbs = kcalloc(dinfo->n_lmbs, sizeof(*lmb),
>>  				   GFP_KERNEL);> -	if (!drmem_info->lmbs)
>> +	if (!dinfo->lmbs)
>>  		return;
>>
>>  	for_each_drmem_lmb(lmb)
>>  		read_drconf_v1_cell(lmb, &prop);
>>  }
>>
>> -static void __init init_drmem_v2_lmbs(const __be32 *prop)
>> +static void init_drmem_v2_lmbs(const __be32 *prop,
>> +			struct drmem_lmb_info *dinfo)
>>  {
>>  	struct drmem_lmb *lmb;
>>  	struct of_drconf_cell_v2 dr_cell;
>> @@ -386,12 +394,12 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
>>  	p = prop;
>>  	for (i = 0; i < lmb_sets; i++) {
>>  		read_drconf_v2_cell(&dr_cell, &p);
>> -		drmem_info->n_lmbs += dr_cell.seq_lmbs;
>> +		dinfo->n_lmbs += dr_cell.seq_lmbs;
>>  	}
>>
>> -	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
>> +	dinfo->lmbs = kcalloc(dinfo->n_lmbs, sizeof(*lmb),
>>  				   GFP_KERNEL);
>> -	if (!drmem_info->lmbs)
>> +	if (!dinfo->lmbs)
>>  		return;
>>
>>  	/* second pass, read in the LMB information */
>> @@ -402,25 +410,51 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
>>  		read_drconf_v2_cell(&dr_cell, &p);
>>
>>  		for (j = 0; j < dr_cell.seq_lmbs; j++) {
>> -			lmb = &drmem_info->lmbs[lmb_index++];
>> +			lmb = &dinfo->lmbs[lmb_index++];
>>
>>  			lmb->base_addr = dr_cell.base_addr;
>> -			dr_cell.base_addr += drmem_info->lmb_size;
>> +			dr_cell.base_addr += dinfo->lmb_size;
>>
>>  			lmb->drc_index = dr_cell.drc_index;
>>  			dr_cell.drc_index++;
>>
>>  			lmb->aa_index = dr_cell.aa_index;
>> -			lmb->flags = dr_cell.flags;
> 
> Why are you removing the setting of the flags field?

Definitely, an error.  Fixed.

> 
>>  		}
>>  	}
>>  }
>>
>> +void drmem_lmbs_free(struct drmem_lmb_info *dinfo)
>> +{
>> +	if (dinfo) {
>> +		kfree(dinfo->lmbs);
>> +		kfree(dinfo);
>> +	}
>> +}
>> +
>> +struct drmem_lmb_info* drmem_init_lmbs(struct property *prop)
> 
> Small nit, but this should probably be named drmem_lmbs_init. This follows
> more closely to the naming used elsewhere.

Okay.

> 
> -Nathan

Thanks.
Michael

> 
>> +{
>> +	struct drmem_lmb_info *dinfo;
>> +
>> +	dinfo = kzalloc(sizeof(*dinfo), GFP_KERNEL);
>> +	if (!dinfo)
>> +		return NULL;
>> +
>> +	if (!strcmp("ibm,dynamic-memory", prop->name))
>> +		init_drmem_v1_lmbs(prop->value, dinfo);
>> +	else if (!strcmp("ibm,dynamic-memory-v2", prop->name))
>> +		init_drmem_v2_lmbs(prop->value, dinfo);
>> +
>> +	return dinfo;
>> +}
>> +
>>  static int __init drmem_init(void)
>>  {
>>  	struct device_node *dn;
>>  	const __be32 *prop;
>>
>> +	if (n_root_addr_cells == 0)
>> +		n_root_addr_cells = dt_root_addr_cells;
>> +
>>  	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>>  	if (!dn) {
>>  		pr_info("No dynamic reconfiguration memory found\n");
>> @@ -434,11 +468,11 @@ static int __init drmem_init(void)
>>
>>  	prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
>>  	if (prop) {
>> -		init_drmem_v1_lmbs(prop);
>> +		init_drmem_v1_lmbs(prop, drmem_info);
>>  	} else {
>>  		prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
>>  		if (prop)
>> -			init_drmem_v2_lmbs(prop);
>> +			init_drmem_v2_lmbs(prop, drmem_info);
>>  	}
>>
>>  	of_node_put(dn);
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:       (512) 466-0650
mwb@linux.vnet.ibm.com

^ permalink raw reply

* Re: [RFC v3 1/4] powerpc/hotplug/drcinfo: Fix bugs parsing ibm,drc-info structs
From: Nathan Fontenot @ 2018-05-18 19:57 UTC (permalink / raw)
  To: Michael Bringmann, linuxppc-dev; +Cc: John Allen, Tyrel Datwyler, Thomas Falcon
In-Reply-To: <5a950883-d06d-4cdc-2716-8c1ce25f394d@linux.vnet.ibm.com>

On 05/17/2018 05:41 PM, Michael Bringmann wrote:
> [Replace/withdraw previous patch submission to ensure that testing
> of related patches on similar hardware progresses together.]
> 
> This patch fixes a memory parsing bug when using of_prop_next_u32
> calls at the start of a structure.  Depending upon the value of
> "cur" memory pointer argument to of_prop_next_u32, it will or it
> won't advance the value of the returned memory pointer by the
> size of one u32.  This patch corrects the code to deal with that
> indexing feature when parsing the ibm,drc-info structs for CPUs.
> Also, need to advance the pointer at the end of_read_drc_info_cell
> for same reason.
> 

I see that you provide an update for of_read_drc_info_cell to fix the
unexpected behavior you're seeing, but I'm not sure why you're updating
the code in pseries_energy.c and rpaphp_core.c? can you provide some
additional information as to why these functions also need to be updated.

> Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
> Fixes: 3f38000eda48 ("powerpc/firmware: Add definitions for new drc-info firmware feature" -- end of patch series applied to powerpc next)
> ---
> Changes in V3:
>   -- Rebased patch to 4.17-rc5 kernel
> ---
>  arch/powerpc/platforms/pseries/of_helpers.c     |    5 ++---
>  arch/powerpc/platforms/pseries/pseries_energy.c |    2 ++
>  drivers/pci/hotplug/rpaphp_core.c               |    1 +
>  3 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c b/arch/powerpc/platforms/pseries/of_helpers.c
> index 6df192f..20598b2 100644
> --- a/arch/powerpc/platforms/pseries/of_helpers.c
> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
> @@ -65,9 +65,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
> 
>  	/* Get drc-index-start:encode-int */
>  	p2 = (const __be32 *)p;
> -	p2 = of_prop_next_u32(*prop, p2, &data->drc_index_start);
> -	if (!p2)
> -		return -EINVAL;
> +	data->drc_index_start = of_read_number(p2, 1);

This appears to resolve advancing the pointer for the beginning of a struct.

> 
>  	/* Get drc-name-suffix-start:encode-int */
>  	p2 = of_prop_next_u32(*prop, p2, &data->drc_name_suffix_start);
> @@ -88,6 +86,7 @@ int of_read_drc_info_cell(struct property **prop, const __be32 **curval,
>  	p2 = of_prop_next_u32(*prop, p2, &data->drc_power_domain);
>  	if (!p2)
>  		return -EINVAL;
> +	p2++;

...but why is the advancement needed here? of_prop_next_u32 should have advanced it, correct?

-Nathan

> 
>  	/* Should now know end of current entry */
>  	(*curval) = (void *)p2;
> diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
> index 6ed2212..c7d84aa 100644
> --- a/arch/powerpc/platforms/pseries/pseries_energy.c
> +++ b/arch/powerpc/platforms/pseries/pseries_energy.c
> @@ -64,6 +64,7 @@ static u32 cpu_to_drc_index(int cpu)
>  		value = of_prop_next_u32(info, NULL, &num_set_entries);
>  		if (!value)
>  			goto err_of_node_put;
> +		value++;
> 
>  		for (j = 0; j < num_set_entries; j++) {
> 
> @@ -126,6 +127,7 @@ static int drc_index_to_cpu(u32 drc_index)
>  		value = of_prop_next_u32(info, NULL, &num_set_entries);
>  		if (!value)
>  			goto err_of_node_put;
> +		value++;
> 
>  		for (j = 0; j < num_set_entries; j++) {
> 
> diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c
> index fb5e084..dccdf62 100644
> --- a/drivers/pci/hotplug/rpaphp_core.c
> +++ b/drivers/pci/hotplug/rpaphp_core.c
> @@ -239,6 +239,7 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name,
>  	value = of_prop_next_u32(info, NULL, &entries);
>  	if (!value)
>  		return -EINVAL;
> +	value++;
> 
>  	for (j = 0; j < entries; j++) {
>  		of_read_drc_info_cell(&info, &value, &drc);
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox