LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 4/7] powerpc/signal: Include the new stack frame inside the user access block
From: Christophe Leroy @ 2021-06-15  6:41 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <b813c1f4d3dab2f51300eac44d99029aa8e57830.1623739212.git.christophe.leroy@csgroup.eu>

Include the new stack frame inside the user access block and set it up
using unsafe_put_user().

On an mpc 8321 (book3s/32) the improvment is about 4% on a process
sending a signal to itself.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/signal_32.c | 29 +++++++++++++----------------
 arch/powerpc/kernel/signal_64.c | 14 +++++++-------
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 8f05ed0da292..621de6e457b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -781,7 +781,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	struct rt_sigframe __user *frame;
 	struct mcontext __user *mctx;
 	struct mcontext __user *tm_mctx = NULL;
-	unsigned long newsp = 0;
+	unsigned long __user *newsp;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
 	/* Save the thread's msr before get_tm_stackpointer() changes it */
@@ -789,6 +789,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 
 	/* Set up Signal Frame */
 	frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+	newsp = (unsigned long __user *)((unsigned long)frame - (__SIGNAL_FRAMESIZE + 16));
 	mctx = &frame->uc.uc_mcontext;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_mctx = &frame->uc_transact.uc_mcontext;
@@ -798,7 +799,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	else
 		prepare_save_user_regs(1);
 
-	if (!user_access_begin(frame, sizeof(*frame)))
+	if (!user_access_begin(newsp, __SIGNAL_FRAMESIZE + 16 + sizeof(*frame)))
 		goto badframe;
 
 	/* Put the siginfo & fill in most of the ucontext */
@@ -836,6 +837,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	}
 	unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed);
 
+	/* create a stack frame for the caller of the handler */
+	unsafe_put_user(regs->gpr[1], newsp, failed);
+
 	user_access_end();
 
 	if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -847,13 +851,8 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
 #endif
 
-	/* create a stack frame for the caller of the handler */
-	newsp = ((unsigned long)frame) - (__SIGNAL_FRAMESIZE + 16);
-	if (put_user(regs->gpr[1], (u32 __user *)newsp))
-		goto badframe;
-
 	/* Fill registers for signal handler */
-	regs->gpr[1] = newsp;
+	regs->gpr[1] = (unsigned long)newsp;
 	regs->gpr[3] = ksig->sig;
 	regs->gpr[4] = (unsigned long)&frame->info;
 	regs->gpr[5] = (unsigned long)&frame->uc;
@@ -883,7 +882,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 	struct sigframe __user *frame;
 	struct mcontext __user *mctx;
 	struct mcontext __user *tm_mctx = NULL;
-	unsigned long newsp = 0;
+	unsigned long __user *newsp;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
 	/* Save the thread's msr before get_tm_stackpointer() changes it */
@@ -891,6 +890,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 
 	/* Set up Signal Frame */
 	frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+	newsp = (unsigned long __user *)((unsigned long)frame - __SIGNAL_FRAMESIZE);
 	mctx = &frame->mctx;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_mctx = &frame->mctx_transact;
@@ -900,7 +900,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 	else
 		prepare_save_user_regs(1);
 
-	if (!user_access_begin(frame, sizeof(*frame)))
+	if (!user_access_begin(newsp, __SIGNAL_FRAMESIZE + sizeof(*frame)))
 		goto badframe;
 	sc = (struct sigcontext __user *) &frame->sctx;
 
@@ -931,6 +931,8 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 		unsafe_put_user(PPC_INST_SC, &mctx->mc_pad[1], failed);
 		asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
 	}
+	/* create a stack frame for the caller of the handler */
+	unsafe_put_user(regs->gpr[1], newsp, failed);
 	user_access_end();
 
 	regs->link = tramp;
@@ -939,12 +941,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
 #endif
 
-	/* create a stack frame for the caller of the handler */
-	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
-	if (put_user(regs->gpr[1], (u32 __user *)newsp))
-		goto badframe;
-
-	regs->gpr[1] = newsp;
+	regs->gpr[1] = (unsigned long)newsp;
 	regs->gpr[3] = ksig->sig;
 	regs->gpr[4] = (unsigned long) sc;
 	regs->nip = (unsigned long)ksig->ka.sa.sa_handler;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 9ca97b4366df..35c301457fbf 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -847,13 +847,14 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		struct task_struct *tsk)
 {
 	struct rt_sigframe __user *frame;
-	unsigned long newsp = 0;
+	unsigned long __user *newsp;
 	long err = 0;
 	struct pt_regs *regs = tsk->thread.regs;
 	/* Save the thread's msr before get_tm_stackpointer() changes it */
 	unsigned long msr = regs->msr;
 
 	frame = get_sigframe(ksig, tsk, sizeof(*frame), 0);
+	newsp = (unsigned long __user *)((unsigned long)frame - __SIGNAL_FRAMESIZE);
 
 	/*
 	 * This only applies when calling unsafe_setup_sigcontext() and must be
@@ -862,7 +863,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	if (!MSR_TM_ACTIVE(msr))
 		prepare_setup_sigcontext(tsk);
 
-	if (!user_write_access_begin(frame, sizeof(*frame)))
+	if (!user_write_access_begin(newsp, __SIGNAL_FRAMESIZE + sizeof(*frame)))
 		goto badframe;
 
 	unsafe_put_user(&frame->info, &frame->pinfo, badframe_block);
@@ -900,6 +901,9 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	}
 
 	unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), badframe_block);
+	/* Allocate a dummy caller frame for the signal handler. */
+	unsafe_put_user(regs->gpr[1], newsp, badframe_block);
+
 	user_write_access_end();
 
 	/* Save the siginfo outside of the unsafe block. */
@@ -919,10 +923,6 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		regs->nip = (unsigned long) &frame->tramp[0];
 	}
 
-	/* Allocate a dummy caller frame for the signal handler. */
-	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
-	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
-
 	/* Set up "regs" so we "return" to the signal handler. */
 	if (is_elf2_task()) {
 		regs->ctr = (unsigned long) ksig->ka.sa.sa_handler;
@@ -953,7 +953,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	/* enter the signal handler in native-endian mode */
 	regs->msr &= ~MSR_LE;
 	regs->msr |= (MSR_KERNEL & MSR_LE);
-	regs->gpr[1] = newsp;
+	regs->gpr[1] = (unsigned long)newsp;
 	regs->gpr[3] = ksig->sig;
 	regs->result = 0;
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-- 
2.25.0


^ permalink raw reply related

* [PATCH 3/7] powerpc/signal64: Access function descriptor with user access block
From: Christophe Leroy @ 2021-06-15  6:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <b813c1f4d3dab2f51300eac44d99029aa8e57830.1623739212.git.christophe.leroy@csgroup.eu>

Access the function descriptor of the handler within a
user access block.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/signal_64.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 8b2eb758131c..9ca97b4366df 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -936,8 +936,18 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		func_descr_t __user *funct_desc_ptr =
 			(func_descr_t __user *) ksig->ka.sa.sa_handler;
 
-		err |= get_user(regs->ctr, &funct_desc_ptr->entry);
-		err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
+		if (user_read_access_begin(funct_desc_ptr, sizeof(func_descr_t))) {
+			unsafe_get_user(regs->ctr, &funct_desc_ptr->entry, bad_funct_desc_block);
+			unsafe_get_user(regs->gpr[2], &funct_desc_ptr->toc, bad_funct_desc_block);
+		} else {
+			goto bad_funct_desc;
+bad_funct_desc_block:
+			user_read_access_end();
+bad_funct_desc:
+			signal_fault(current, regs, __func__, funct_desc_ptr);
+			return 1;
+		}
+		user_read_access_end();
 	}
 
 	/* enter the signal handler in native-endian mode */
-- 
2.25.0


^ permalink raw reply related

* [PATCH 2/7] powerpc/signal64: Don't read sigaction arguments back from user memory
From: Christophe Leroy @ 2021-06-15  6:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <b813c1f4d3dab2f51300eac44d99029aa8e57830.1623739212.git.christophe.leroy@csgroup.eu>

From: Michael Ellerman <mpe@ellerman.id.au>

When delivering a signal to a sigaction style handler (SA_SIGINFO), we
pass pointers to the siginfo and ucontext via r4 and r5.

Currently we populate the values in those registers by reading the
pointers out of the sigframe in user memory, even though the values in
user memory were written by the kernel just prior:

  unsafe_put_user(&frame->info, &frame->pinfo, badframe_block);
  unsafe_put_user(&frame->uc, &frame->puc, badframe_block);
  ...
  if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
  	err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
  	err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);

ie. we write &frame->info into frame->pinfo, and then read frame->pinfo
back into r4, and similarly for &frame->uc.

The code has always been like this, since linux-fullhistory commit
d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").

There's no reason for us to read the values back from user memory,
rather than just setting the value in the gpr[4/5] directly. In fact
reading the value back from user memory opens up the possibility of
another user thread changing the values before we read them back.
Although any process doing that would be racing against the kernel
delivering the signal, and would risk corrupting the stack, so that
would be a userspace bug.

Note that this is 64-bit only code, so there's no subtlety with the size
of pointers differing between kernel and user. Also the frame variable
is not modified to point elsewhere during the function.

In the past reading the values back from user memory was not costly, but
now that we have KUAP on some CPUs it is, so we'd rather avoid it for
that reason too.

So change the code to just set the values directly, using the same
values we have written to the sigframe previously in the function.

Note also that this matches what our 32-bit signal code does.

Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
this results in a ~4% increase in signals per second on a Power9, from
229,777 to 239,766.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/signal_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f9e1f5428b9e..8b2eb758131c 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -947,8 +947,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	regs->gpr[3] = ksig->sig;
 	regs->result = 0;
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-		err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
-		err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
+		regs->gpr[4] = (unsigned long)&frame->info;
+		regs->gpr[5] = (unsigned long)&frame->uc;
 		regs->gpr[6] = (unsigned long) frame;
 	} else {
 		regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
-- 
2.25.0


^ permalink raw reply related

* [PATCH 1/7] powerpc/signal64: Copy siginfo before changing regs->nip
From: Christophe Leroy @ 2021-06-15  6:40 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

From: Michael Ellerman <mpe@ellerman.id.au>

In commit 96d7a4e06fab ("powerpc/signal64: Rewrite handle_rt_signal64()
to minimise uaccess switches") the 64-bit signal code was rearranged to
use user_write_access_begin/end().

As part of that change the call to copy_siginfo_to_user() was moved
later in the function, so that it could be done after the
user_write_access_end().

In particular it was moved after we modify regs->nip to point to the
signal trampoline. That means if copy_siginfo_to_user() fails we exit
handle_rt_signal64() with an error but with regs->nip modified, whereas
previously we would not modify regs->nip until the copy succeeded.

Returning an error from signal delivery but with regs->nip updated
leaves the process in a sort of half-delivered state. We do immediately
force a SEGV in signal_setup_done(), called from do_signal(), so the
process should never run in the half-delivered state.

However that SEGV is not delivered until we've gone around to
do_notify_resume() again, so it's possible some tracing could observe
the half-delivered state.

There are other cases where we fail signal delivery with regs partly
updated, eg. the write to newsp and SA_SIGINFO, but the latter at least
is very unlikely to fail as it reads back from the frame we just wrote
to.

Looking at other arches they seem to be more careful about leaving regs
unchanged until the copy operations have succeeded, and in general that
seems like good hygenie.

So although the current behaviour is not cleary buggy, it's also not
clearly correct. So move the call to copy_siginfo_to_user() up prior to
the modification of regs->nip, which is closer to the old behaviour, and
easier to reason about.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/signal_64.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index dca66481d0c2..f9e1f5428b9e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -902,6 +902,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), badframe_block);
 	user_write_access_end();
 
+	/* Save the siginfo outside of the unsafe block. */
+	if (copy_siginfo_to_user(&frame->info, &ksig->info))
+		goto badframe;
+
 	/* Make sure signal handler doesn't get spurious FP exceptions */
 	tsk->thread.fp_state.fpscr = 0;
 
@@ -915,11 +919,6 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		regs->nip = (unsigned long) &frame->tramp[0];
 	}
 
-
-	/* Save the siginfo outside of the unsafe block. */
-	if (copy_siginfo_to_user(&frame->info, &ksig->info))
-		goto badframe;
-
 	/* Allocate a dummy caller frame for the signal handler. */
 	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
 	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCH v5 04/17] powerpc/vas: Add platform specific user window operations
From: Haren Myneni @ 2021-06-15  6:37 UTC (permalink / raw)
  To: Nicholas Piggin, herbert, linux-crypto, linuxppc-dev, mpe
In-Reply-To: <1623636838.8tsdy9nisc.astroid@bobo.none>

On Mon, 2021-06-14 at 12:24 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of June 13, 2021 8:57 pm:
> > PowerNV uses registers to open/close VAS windows, and getting the
> > paste address. Whereas the hypervisor calls are used on PowerVM.
> > 
> > This patch adds the platform specific user space window operations
> > and register with the common VAS user space interface.
> > 
> > Signed-off-by: Haren Myneni <haren@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/vas.h              | 14 +++++-
> >  arch/powerpc/platforms/book3s/vas-api.c     | 53 +++++++++++++--
> > ------
> >  arch/powerpc/platforms/powernv/vas-window.c | 45 ++++++++++++++++-
> >  3 files changed, 89 insertions(+), 23 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/vas.h
> > b/arch/powerpc/include/asm/vas.h
> > index bab7891d43f5..85318d7446c7 100644
> > --- a/arch/powerpc/include/asm/vas.h
> > +++ b/arch/powerpc/include/asm/vas.h
> > @@ -5,6 +5,7 @@
> >  
> >  #ifndef _ASM_POWERPC_VAS_H
> >  #define _ASM_POWERPC_VAS_H
> > +#include <uapi/asm/vas-api.h>
> >  
> >  struct vas_window;
> >  
> > @@ -48,6 +49,16 @@ enum vas_cop_type {
> >  	VAS_COP_TYPE_MAX,
> >  };
> >  
> > +/*
> > + * User space window operations used for powernv and powerVM
> > + */
> > +struct vas_user_win_ops {
> > +	struct vas_window * (*open_win)(struct vas_tx_win_open_attr *,
> > +				enum vas_cop_type);
> > +	u64 (*paste_addr)(struct vas_window *);
> > +	int (*close_win)(struct vas_window *);
> > +};
> 
> This looks better, but rather than pull in uapi and the user API 
> structure here, could you just pass in vas_id and flags after the
> common 
> code does the user copy and verifies the version and other details?
> 
> I think it's generally good practice to limit the data that the usre
> can influence as much as possible. Sorry for not picking up on that
> earlier.

The user space pass vas_tx_win_open_attr struct - use only vas_id and
flags right now but it can be extended in future with reserve elements.
So passing the same struct to platform specific API.

do you prefer "struct vas_window * (*open_win)(vas_id, flags, cop)" and
extend later when more elments are used?

Thanks
Haren 

> 
> If that's changed, then
> 
> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Thanks,
> Nick
> 
> > +
> >  /*
> >   * Receive window attributes specified by the (in-kernel) owner of
> > window.
> >   */
> > @@ -175,7 +186,8 @@ void vas_unregister_api_powernv(void);
> >   * used for others in future.
> >   */
> >  int vas_register_coproc_api(struct module *mod, enum vas_cop_type
> > cop_type,
> > -				const char *name);
> > +			    const char *name,
> > +			    const struct vas_user_win_ops *vops);
> >  void vas_unregister_coproc_api(void);
> >  
> >  #endif /* __ASM_POWERPC_VAS_H */
> > diff --git a/arch/powerpc/platforms/book3s/vas-api.c
> > b/arch/powerpc/platforms/book3s/vas-api.c
> > index 72c126d87216..7cfc4b435ae8 100644
> > --- a/arch/powerpc/platforms/book3s/vas-api.c
> > +++ b/arch/powerpc/platforms/book3s/vas-api.c
> > @@ -42,6 +42,7 @@ static struct coproc_dev {
> >  	dev_t devt;
> >  	struct class *class;
> >  	enum vas_cop_type cop_type;
> > +	const struct vas_user_win_ops *vops;
> >  } coproc_device;
> >  
> >  struct coproc_instance {
> > @@ -72,11 +73,10 @@ static int coproc_open(struct inode *inode,
> > struct file *fp)
> >  static int coproc_ioc_tx_win_open(struct file *fp, unsigned long
> > arg)
> >  {
> >  	void __user *uptr = (void __user *)arg;
> > -	struct vas_tx_win_attr txattr = {};
> >  	struct vas_tx_win_open_attr uattr;
> >  	struct coproc_instance *cp_inst;
> >  	struct vas_window *txwin;
> > -	int rc, vasid;
> > +	int rc;
> >  
> >  	cp_inst = fp->private_data;
> >  
> > @@ -93,27 +93,20 @@ static int coproc_ioc_tx_win_open(struct file
> > *fp, unsigned long arg)
> >  	}
> >  
> >  	if (uattr.version != 1) {
> > -		pr_err("Invalid version\n");
> > +		pr_err("Invalid window open API version\n");
> >  		return -EINVAL;
> >  	}
> >  
> > -	vasid = uattr.vas_id;
> > -
> > -	vas_init_tx_win_attr(&txattr, cp_inst->coproc->cop_type);
> > -
> > -	txattr.lpid = mfspr(SPRN_LPID);
> > -	txattr.pidr = mfspr(SPRN_PID);
> > -	txattr.user_win = true;
> > -	txattr.rsvd_txbuf_count = false;
> > -	txattr.pswid = false;
> > -
> > -	pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
> > -				mfspr(SPRN_PID));
> > +	if (!cp_inst->coproc->vops && !cp_inst->coproc->vops->open_win) 
> > {
> > +		pr_err("VAS API is not registered\n");
> > +		return -EACCES;
> > +	}
> >  
> > -	txwin = vas_tx_win_open(vasid, cp_inst->coproc->cop_type,
> > &txattr);
> > +	txwin = cp_inst->coproc->vops->open_win(&uattr,
> > +						cp_inst->coproc-
> > >cop_type);
> >  	if (IS_ERR(txwin)) {
> > -		pr_err("%s() vas_tx_win_open() failed, %ld\n",
> > __func__,
> > -					PTR_ERR(txwin));
> > +		pr_err("%s() VAS window open failed, %ld\n", __func__,
> > +				PTR_ERR(txwin));
> >  		return PTR_ERR(txwin);
> >  	}
> >  
> > @@ -125,9 +118,15 @@ static int coproc_ioc_tx_win_open(struct file
> > *fp, unsigned long arg)
> >  static int coproc_release(struct inode *inode, struct file *fp)
> >  {
> >  	struct coproc_instance *cp_inst = fp->private_data;
> > +	int rc;
> >  
> >  	if (cp_inst->txwin) {
> > -		vas_win_close(cp_inst->txwin);
> > +		if (cp_inst->coproc->vops &&
> > +			cp_inst->coproc->vops->close_win) {
> > +			rc = cp_inst->coproc->vops->close_win(cp_inst-
> > >txwin);
> > +			if (rc)
> > +				return rc;
> > +		}
> >  		cp_inst->txwin = NULL;
> >  	}
> >  
> > @@ -168,7 +167,17 @@ static int coproc_mmap(struct file *fp, struct
> > vm_area_struct *vma)
> >  		return -EINVAL;
> >  	}
> >  
> > -	vas_win_paste_addr(txwin, &paste_addr, NULL);
> > +	if (!cp_inst->coproc->vops && !cp_inst->coproc->vops-
> > >paste_addr) {
> > +		pr_err("%s(): VAS API is not registered\n", __func__);
> > +		return -EACCES;
> > +	}
> > +
> > +	paste_addr = cp_inst->coproc->vops->paste_addr(txwin);
> > +	if (!paste_addr) {
> > +		pr_err("%s(): Window paste address failed\n",
> > __func__);
> > +		return -EINVAL;
> > +	}
> > +
> >  	pfn = paste_addr >> PAGE_SHIFT;
> >  
> >  	/* flags, page_prot from cxl_mmap(), except we want cachable */
> > @@ -208,7 +217,8 @@ static struct file_operations coproc_fops = {
> >   * extended to other coprocessor types later.
> >   */
> >  int vas_register_coproc_api(struct module *mod, enum vas_cop_type
> > cop_type,
> > -				const char *name)
> > +			    const char *name,
> > +			    const struct vas_user_win_ops *vops)
> >  {
> >  	int rc = -EINVAL;
> >  	dev_t devno;
> > @@ -230,6 +240,7 @@ int vas_register_coproc_api(struct module *mod,
> > enum vas_cop_type cop_type,
> >  	}
> >  	coproc_device.class->devnode = coproc_devnode;
> >  	coproc_device.cop_type = cop_type;
> > +	coproc_device.vops = vops;
> >  
> >  	coproc_fops.owner = mod;
> >  	cdev_init(&coproc_device.cdev, &coproc_fops);
> > diff --git a/arch/powerpc/platforms/powernv/vas-window.c
> > b/arch/powerpc/platforms/powernv/vas-window.c
> > index 41712b4b268e..5162e95c4090 100644
> > --- a/arch/powerpc/platforms/powernv/vas-window.c
> > +++ b/arch/powerpc/platforms/powernv/vas-window.c
> > @@ -16,6 +16,7 @@
> >  #include <linux/mmu_context.h>
> >  #include <asm/switch_to.h>
> >  #include <asm/ppc-opcode.h>
> > +#include <asm/vas.h>
> >  #include "vas.h"
> >  #include "copy-paste.h"
> >  
> > @@ -1443,6 +1444,48 @@ struct vas_window
> > *vas_pswid_to_window(struct vas_instance *vinst,
> >  	return window;
> >  }
> >  
> > +static struct vas_window *vas_user_win_open(struct
> > vas_tx_win_open_attr *uattr,
> > +				enum vas_cop_type cop_type)
> > +{
> > +	struct vas_tx_win_attr txattr = {};
> > +
> > +	vas_init_tx_win_attr(&txattr, cop_type);
> > +
> > +	txattr.lpid = mfspr(SPRN_LPID);
> > +	txattr.pidr = mfspr(SPRN_PID);
> > +	txattr.user_win = true;
> > +	txattr.rsvd_txbuf_count = false;
> > +	txattr.pswid = false;
> > +
> > +	pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
> > +				mfspr(SPRN_PID));
> > +
> > +	return vas_tx_win_open(uattr->vas_id, cop_type, &txattr);
> > +}
> > +
> > +static u64 vas_user_win_paste_addr(struct vas_window *win)
> > +{
> > +	u64 paste_addr;
> > +
> > +	vas_win_paste_addr(win, &paste_addr, NULL);
> > +
> > +	return paste_addr;
> > +}
> > +
> > +static int vas_user_win_close(struct vas_window *txwin)
> > +{
> > +
> > +	vas_win_close(txwin);
> > +
> > +	return 0;
> > +}
> > +
> > +static const struct vas_user_win_ops vops =  {
> > +	.open_win	=	vas_user_win_open,
> > +	.paste_addr	=	vas_user_win_paste_addr,
> > +	.close_win	=	vas_user_win_close,
> > +};
> > +
> >  /*
> >   * Supporting only nx-gzip coprocessor type now, but this API code
> >   * extended to other coprocessor types later.
> > @@ -1451,7 +1494,7 @@ int vas_register_api_powernv(struct module
> > *mod, enum vas_cop_type cop_type,
> >  			     const char *name)
> >  {
> >  
> > -	return vas_register_coproc_api(mod, cop_type, name);
> > +	return vas_register_coproc_api(mod, cop_type, name, &vops);
> >  }
> >  EXPORT_SYMBOL_GPL(vas_register_api_powernv);
> >  
> > -- 
> > 2.18.2
> > 
> > 
> > 


^ permalink raw reply

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
From: David Gibson @ 2021-06-15  6:34 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <87czsnoejl.fsf@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 3883 bytes --]

On Tue, Jun 15, 2021 at 11:27:50AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> >> FORM2 introduce a concept of secondary domain which is identical to the
> >> conceept of FORM1 primary domain. Use secondary domain as the numa node
> >> when using persistent memory device. For DAX kmem use the logical domain
> >> id introduced in FORM2. This new numa node
> >> 
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
> >>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
> >>  3 files changed, 45 insertions(+), 10 deletions(-)
> >> 
> >> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> >> index 86cd2af014f7..b9ac6d02e944 100644
> >> --- a/arch/powerpc/mm/numa.c
> >> +++ b/arch/powerpc/mm/numa.c
> >> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
> >>  	return nid;
> >>  }
> >>  
> >> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> >> +{
> >> +	int secondary_index;
> >> +	const __be32 *associativity;
> >> +
> >> +	if (!numa_enabled) {
> >> +		*primary = NUMA_NO_NODE;
> >> +		*secondary = NUMA_NO_NODE;
> >> +		return 0;
> >> +	}
> >> +
> >> +	associativity = of_get_associativity(node);
> >> +	if (!associativity)
> >> +		return -ENODEV;
> >> +
> >> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> >> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> >> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
> >
> > Secondary ID is always the second reference point, but primary depends
> > on the length of resources?  That seems very weird.
> 
> primary_domain_index is distance_ref_point[0]. With Form2 we would find
> both primary and secondary domain ID same for all resources other than
> persistent memory device. The usage w.r.t. persistent memory is
> explained in patch 7.

Right, I misunderstood

> 
> With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> the kernel should use when using persistent memory devices.

This seems kind of bogus.  With Form1, the primary/secondary ID are a
sort of heirarchy of distance (things with same primary ID are very
close, things with same secondary are kinda-close, etc.).  With Form2,
it's referring to their effective node for different purposes.

Using the same terms for different meanings seems unnecessarily
confusing.

> Persistent memory devices
> can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> the numa node number OS should use when using these devices as regular memory. Secondary
> domainID is the numa node number that should be used when using this device as
> persistent memory.

It's weird to me that you'd want to consider them in different nodes
for those different purposes.

> In the later case, we are interested in the locality of the
> device to an established numa node. In the above example, if the last row represents a
> persistent memory device/resource, NUMA node number 40 will be used when using the device
> as regular memory and NUMA node number 0 will be the device numa node when using it as
> a persistent memory device.

I don't really get what you mean by "locality of the device to an
established numa node".  Or at least how that's different from
anything else we're handling here.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
From: David Gibson @ 2021-06-15  6:25 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <87fsxjofw5.fsf@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 16363 bytes --]

On Tue, Jun 15, 2021 at 10:58:42AM +0530, Aneesh Kumar K.V wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> >> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
> >>  arch/powerpc/include/asm/firmware.h       |   3 +-
> >>  arch/powerpc/include/asm/prom.h           |   1 +
> >>  arch/powerpc/kernel/prom_init.c           |   3 +-
> >>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
> >>  arch/powerpc/platforms/pseries/firmware.c |   1 +
> >>  6 files changed, 290 insertions(+), 6 deletions(-)
> >>  create mode 100644 Documentation/powerpc/associativity.rst
> >> 
> >> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
> >> new file mode 100644
> >> index 000000000000..58abedea81d7
> >> --- /dev/null
> >> +++ b/Documentation/powerpc/associativity.rst
> >> @@ -0,0 +1,139 @@
> >> +============================
> >> +NUMA resource associativity
> >> +=============================
> >> +
> >> +Associativity represents the groupings of the various platform resources into
> >> +domains of substantially similar mean performance relative to resources outside
> >> +of that domain. Resources subsets of a given domain that exhibit better
> >> +performance relative to each other than relative to other resources subsets
> >> +are represented as being members of a sub-grouping domain. This performance
> >> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
> >> +From the platform view, these groups are also referred to as domains.
> >> +
> >> +PAPR interface currently supports two different ways of communicating these resource
> >
> > You describe form 2 below as well, which contradicts this.
> 
> Fixed as below.
> 
> PAPR interface currently supports different ways of communicating these resource
> grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
> associativity grouping. Form 0 is the older format and is now considered deprecated.
> 
> Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
> bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.

LGTM.

> >> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
> >> +Form 0 is the older format and is now considered deprecated.
> >> +
> >> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> >> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> >> +A value of 1 indicates the usage of Form 1 associativity.
> >> +
> >> +Form 0
> >> +-----
> >> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> >> +
> >> +Form 1
> >> +-----
> >> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
> >> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
> >> +
> >> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
> >> +representing the resource’s platform grouping domains.
> >> +
> >> +The “ibm,associativity-reference-points” property contains one or more list of numbers
> >> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
> >> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
> >> +
> >> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
> >
> > I thought we used the *least* significant boundary (the smallest
> > grouping, not the largest).  That is, the last index, not the first.
> >
> > Actually... come to think of it, I'm not even sure how to interpret
> > "most significant".  Does that mean a change in grouping at that "most
> > significant" level results in the largest perfomance difference?
> 
> PAPR defines "most significant" as below
> 
> When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
> ity-reference-points” property indicates boundaries between associativity domains presented by the
> “ibm,associativity” property containing “near” and “far” resources. The
> first such boundary in the list represents the 1 based ordinal in the
> associativity lists of the most significant boundary, with subsequent
> entries indicating progressively less significant boundaries

No... that's not a definition.  Like your draft PAPR uses the term
while entirely failing to define it.  From what I can tell about how
it is used the "most significant" boundary corresponds to what Linux
simply thinks of as the node id.  But intuitively, I'd think of that
as the "least significant" boundary, since that's basically the
smallest granularity at which we care about NUMA distances.


> I would interpret it as the boundary where we start defining NUMA
> nodes.

That isn't any clearer to me.

> >> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> >> +recursively comparing if they belong to the same higher-level domains. For mismatch
> >> +at every higher level of the resource group, the kernel doubles the NUMA distance between
> >> +the comparing domains.
> >> +
> >> +Form 2
> >> +-------
> >> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
> >> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
> >> +domain numbering. With numa distance computation now detached from the index value of
> >> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
> >> +same domain index representing resource groups of different
> >> performance/latency characteristics.
> >
> > The meaning of "domain index" is not clear to me here.
> 
> Sorry for the confusion there. domain index is the index where domainID
> is appearing. W.r.t "ibm,associativity"  we have

Ok, I think I eventually deduced that.  We should start out clearly
defining both domainID and index here.

Also.. I think we need to find more distinct terms, because "index" is
being used for both where the ID appears in an associativity array,
and also when an ID appears in the Form2 "lookup-index-table" and the
two usages are totally unconnected.

> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> representing the resource’s platform grouping domains. If we can look at
> an example property.
> 
> { 4, 6, 7, 0, 0}
> { 4, 6, 7, 0, 40}
> 
> With Form 1 both NUMA node 0 and 40 will appear at the same distance.
> They both are at domain index 4. With Form 2 we can represent them with
> different NUMA distance values.

Ok.  Note that PAPR was never clear about what space domain IDs need
to be unique within: do they need to be (a) globally unique (not true
in practice), (b) unique at their index level or (c) unique only
within their "parent" node at the previous index level.

We should take the opportunity with Form2 to make that clearer.

My understanding is that with Form2 it should be entirely feasible to
built a dt have associativity arrays that are always of length 1.  Is
that correct?

> >> +
> >> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
> >> +"ibm,architecture-vec-5" property.
> >> +
> >> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
> >> +the domainIDs present in the system. The offset of the domainID in this property is considered
> >> +the domainID index.
> >
> > You haven't really introduced the term "domainID".  Is "domainID
> > index" the same as "domain index" above?  It's not clear to me.
> 
> The earlier part of the documented said 
> 
> The “ibm,associativity” property contains one or more lists of numbers (domainID)
> representing the resource’s platform grouping domains.
> 
> I will update domain index to domainID index. 
> 
> >
> > The distinction between "domain index" and "primary domain id" is also
> > not clear to me.
> 
> primary domain id is the domainID appearing in the primary domainID
> index. Linux kenrel also use that as the NUMA node number.

nit s/kenrel/kernel/

> Primary domainID index is defined by ibm,associativity-reference-points
> and we consider that as the most significant resource group boundary.
> 
> ibm,associativity-reference-points can be looked at as
> { primary domainID index, secondary domainID index, tertiary domainID index.. }

Ok, explicitly stating that in the doc would help a lot.

> >
> >> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
> >> +N domainID encoded as with encode-int
> >> +
> >> +For ex:
> >> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
> >
> > Above you say "Form 2 allows a large number of primary domain ids at
> > the same domain index", but this encoding doesn't appear to permit
> > that.
> 
> I didn't follow that question.

Ah, that's because I was thinking of index here as the index within
the lookup-index-table, not the index within the associativity
arrays.

> >
> >> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
> >> +distance between resource groups/domains present in the system.
> >> +
> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> >> +
> >> +For ex:
> >> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
> >> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
> >> +
> >> +  | 0    8   40
> >> +--|------------
> >> +  |
> >> +0 | 10   20  80
> >> +  |
> >> +8 | 20   10  160
> >> +  |
> >> +40| 80   160  10
> >
> > What's the reason for multiplying the values by 10 in the expanded
> > table version?
> 
> That was me missing a document update. Since we used 8 bits to encode
> distance at some point we were looking at a SCALE factor. But later
> realized other architectures also restrict distance to 8 bits. I will
> update ibm,numa-distance-table in the document.

Ok.

> >> +
> >> +With Form2 "ibm,associativity" for resources is listed as below:
> >> +
> >> +"ibm,associativity" property for resources in node 0, 8 and 40
> >> +{ 4, 6, 7, 0, 0}
> >> +{ 4, 6, 9, 8, 8}
> >> +{ 4, 6, 7, 0, 40}
> >> +
> >> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
> >> +
> >> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
> >
> > What the heck is the secondary domainID
> 
> domainID appearing the secondary domainID index.

I understand that from the clarifications you've made about, but
second domainID index wasn't any more defined in the original draft.

> ibm,associativity-reference-points gives an indication of different
> hierachy of resource grouping as below.
> 
> ibm,associativity-reference-points can be looked at as
> { primary domainID index, secondary domainID index, tertiary domainID index.. }
> 
> >
> >> +the kernel should use when using persistent memory devices. Persistent memory devices
> >> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> >> +the numa node number OS should use when using these devices as regular memory. Secondary
> >> +domainID is the numa node number that should be used when using this device as
> >> +persistent memory. In the later case, we are interested in the locality of the
> >> +device to an established numa node. In the above example, if the last row represents a
> >> +persistent memory device/resource, NUMA node number 40 will be used when using the device
> >> +as regular memory and NUMA node number 0 will be the device numa node when using it as
> >> +a persistent memory device.
> >> +
> >> +Each resource (drcIndex) now also supports additional optional device tree properties.
> >> +These properties are marked optional because the platform can choose not to export
> >> +them and provide the system topology details using the earlier defined device tree
> >> +properties alone. The optional device tree properties are used when adding new resources
> >> +(DLPAR) and when the platform didn't provide the topology details of the domain which
> >> +contains the newly added resource during boot.
> >> +
> >> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
> >> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
> >> +of the new resource can be obtained from the existing "ibm,associativity" property. This
> >> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
> >> +The value is 1 based array index value.
> >
> > Am I correct in thinking that if we have an entirely form2 world, we'd
> > only need this and the ibm,associativity properties could be dropped?
> 
> Not really. ibm,numa-lookup-index-table was added to have a concise
> representation of numa distance via ibm,numa-distance-table. 
> 
> For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
> numa distance. Instead ibm,numa-lookup-index-table allows us to present
> the same in a 3x3 matrix  distance[index0][index1] is the  distance
> between NUMA node 0 and 4 and distance[index0][index2] is the distance
> between NUMA node 0 and 5

Right, I get the purpose of it, and I realized I misphrashed my
question.  My point is that in a Form2 world, the *only* thing the
associativity array is used for is to deduce its position in
lookup-index-table.  Once you have have that for each resource, you
have everything you need, yes?

> 
> 
> >
> >> +
> >> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
> >> +
> >> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
> >> +from this resource domain to other resources.
> >
> > IIUC this is about extending the global distance table with
> > information for a new node.  Is that correct?
> 
> correct.
> 
> >
> > The global distance table allows for the possibility of asymmetric
> > distances between nodes, but this does not.  Is that intentional?
> 
> This also does, For example with 3 nodes currently present and 4 node
> getting added ibm,numa-distance have 8 elements enabling us to have
> distance[Node0][Node50] being different from distance[Node50][Node0]
> as shown below.

Ok that's not clear from the above.  Rather than "one or more lists of
numbers" I think you want to explicitly give two options.  Either one
list, which gives symmetric distances, or two which gives distances
to, then distance from.

> 
> >
> >> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> >> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> >> +
> >> +For ex:
> >> +ibm,associativity     = { 4, 5, 6, 7, 50}
> >> +ibm,numa-lookup-index = { 4 }
> >> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
> >> +
> >> +resulting in a new toplogy as below.
> >> +  | 0    8   40   50
> >> +--|------------------
> >> +  |
> >> +0 | 10   20  80   160
> >> +  |
> >> +8 | 20   10  160  320
> >> +  |
> >> +40| 80   160  10  80
> >> +  |
> >> +50| 160  320  80  10
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] powerpc/signal64: Don't read sigaction arguments back from user memory
From: Christophe Leroy @ 2021-06-15  6:22 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev, Michael Ellerman; +Cc: cmr
In-Reply-To: <e84f44e5-46a2-4076-b565-038057329be5@csgroup.eu>



Le 14/06/2021 à 13:49, Christophe Leroy a écrit :
> 
> 
> Le 14/06/2021 à 07:49, Nicholas Piggin a écrit :
>> Excerpts from Christophe Leroy's message of June 14, 2021 3:30 pm:
>>>
>>>
>>> Le 14/06/2021 à 03:32, Nicholas Piggin a écrit :
>>>> Excerpts from Michael Ellerman's message of June 10, 2021 5:29 pm:
>>>>> When delivering a signal to a sigaction style handler (SA_SIGINFO), we
>>>>> pass pointers to the siginfo and ucontext via r4 and r5.
>>>>>
>>>>> Currently we populate the values in those registers by reading the
>>>>> pointers out of the sigframe in user memory, even though the values in
>>>>> user memory were written by the kernel just prior:
>>>>>
>>>>>     unsafe_put_user(&frame->info, &frame->pinfo, badframe_block);
>>>>>     unsafe_put_user(&frame->uc, &frame->puc, badframe_block);
>>>>>     ...
>>>>>     if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
>>>>>         err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
>>>>>         err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
>>>>>
>>>>> ie. we write &frame->info into frame->pinfo, and then read frame->pinfo
>>>>> back into r4, and similarly for &frame->uc.
>>>>>
>>>>> The code has always been like this, since linux-fullhistory commit
>>>>> d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").
>>>>>
>>>>> There's no reason for us to read the values back from user memory,
>>>>> rather than just setting the value in the gpr[4/5] directly. In fact
>>>>> reading the value back from user memory opens up the possibility of
>>>>> another user thread changing the values before we read them back.
>>>>> Although any process doing that would be racing against the kernel
>>>>> delivering the signal, and would risk corrupting the stack, so that
>>>>> would be a userspace bug.
>>>>>
>>>>> Note that this is 64-bit only code, so there's no subtlety with the size
>>>>> of pointers differing between kernel and user. Also the frame variable
>>>>> is not modified to point elsewhere during the function.
>>>>>
>>>>> In the past reading the values back from user memory was not costly, but
>>>>> now that we have KUAP on some CPUs it is, so we'd rather avoid it for
>>>>> that reason too.
>>>>>
>>>>> So change the code to just set the values directly, using the same
>>>>> values we have written to the sigframe previously in the function.
>>>>>
>>>>> Note also that this matches what our 32-bit signal code does.
>>>>>
>>>>> Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
>>>>> this results in a ~4% increase in signals per second on a Power9, from
>>>>> 229,777 to 239,766.
>>>>
>>>> Good find, nice improvement. Will make it possible to make the error
>>>> handling much nicer too I think.
>>>>
>>>> Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
>>>>
>>>> You've moved copy_siginfo_to_user right up to the user access unlock,
>>>> could save 2 more KUAP lock/unlocks if we had an unsafe_clear_user. If
>>>> we can move the other user access stuff up as well, the stack frame
>>>> put_user could use unsafe_put_user as well, saving 1 more. Another few
>>>> percent?
>>>
>>> I'm looking at making an 'unsafe' version of copy_siginfo_to_user().
>>> That's straight forward for 'native' signals, but for compat signals that's more tricky.
>>
>> Ah nice. Native is most important at the moment.
>>
> 
> Finally not so easy. We have a quite efficient clear_user() which uses 'dcbz'. When replacing that 
> by a simplistic unsafe_clear_user() on the same model as unsafe_copy_to_user(), performance are 
> degradated on 32s. Need to implement it more efficiently.
> 

Don't know what I did yesterday. Performance is _not_ degraded, it is improved by 5%. I'll send out 
a series soon.

^ permalink raw reply

* Re: [PATCH 2/2] selftests: Skip TM tests on synthetic TM implementations
From: Jordan Niethe @ 2021-06-15  6:06 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Michael Neuling
In-Reply-To: <20210607233709.941102-2-jniethe5@gmail.com>

On Tue, Jun 8, 2021 at 9:37 AM Jordan Niethe <jniethe5@gmail.com> wrote:
>
> Transactional Memory was removed from the architecture in ISA v3.1. For
> threads running in P8/P9 compatibility mode on P10 a synthetic TM
> implementation is provided. In this implementation, tbegin. always sets
> cr0 eq meaning the abort handler is always called. This is not an issue
> as users of TM are expected to have a fallback non transactional way to
> make forward progress in the abort handler.
>
> As the TM self tests exist only to test TM, no alternative path forward
> is provided, leading to them timing out and failing on the synthetic TM
> implementation.
>
> The TEXASR indicates if a transaction failure is due to a synthetic
> implementation. Check for a synthetic implementation and skip the TM
> tests if so.
>
> Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
> ---
>  .../selftests/powerpc/ptrace/ptrace-tm-gpr.c  |  1 +
>  .../powerpc/ptrace/ptrace-tm-spd-gpr.c        |  1 +
>  .../powerpc/ptrace/ptrace-tm-spd-tar.c        |  1 +
>  .../selftests/powerpc/ptrace/ptrace-tm-tar.c  |  1 +
>  .../selftests/powerpc/tm/tm-resched-dscr.c    |  1 +
>  .../selftests/powerpc/tm/tm-signal-stack.c    |  1 +
>  .../testing/selftests/powerpc/tm/tm-syscall.c |  2 +-
>  tools/testing/selftests/powerpc/tm/tm.h       | 36 +++++++++++++++++++
>  8 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
> index 7df7100a29be..67ca297c5cca 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
> @@ -113,6 +113,7 @@ int ptrace_tm_gpr(void)
>         int ret, status;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>         shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
>         pid = fork();
>         if (pid < 0) {
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
> index 8706bea5d015..6f2bce1b6c5d 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
> @@ -119,6 +119,7 @@ int ptrace_tm_spd_gpr(void)
>         int ret, status;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>         shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
>         pid = fork();
>         if (pid < 0) {
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
> index 2ecfa1158e2b..e112a34fbe59 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
> @@ -129,6 +129,7 @@ int ptrace_tm_spd_tar(void)
>         int ret, status;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>         shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
>         pid = fork();
>         if (pid == 0)
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
> index 46ef378a15ec..d0db6df0f0ea 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
> @@ -117,6 +117,7 @@ int ptrace_tm_tar(void)
>         int ret, status;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>         shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
>         pid = fork();
>         if (pid == 0)
> diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
> index 4cdb83964bb3..85c940ae6ff8 100644
> --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
> +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
> @@ -40,6 +40,7 @@ int test_body(void)
>         uint64_t rv, dscr1 = 1, dscr2, texasr;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>
>         printf("Check DSCR TM context switch: ");
>         fflush(stdout);
> diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
> index cdcf8c5bbbc7..68807aac8dd3 100644
> --- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
> +++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
> @@ -35,6 +35,7 @@ int tm_signal_stack()
>         int pid;
>
>         SKIP_IF(!have_htm());
> +       SKIP_IF(htm_is_synthetic());
>
>         pid = fork();
>         if (pid < 0)
> diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
> index becb8207b432..467a6b3134b2 100644
> --- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
> +++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
> @@ -25,7 +25,6 @@ extern int getppid_tm_suspended(void);
>  unsigned retries = 0;
>
>  #define TEST_DURATION 10 /* seconds */
> -#define TM_RETRIES 100
>
>  pid_t getppid_tm(bool suspend)
>  {
> @@ -67,6 +66,7 @@ int tm_syscall(void)
>         struct timeval end, now;
>
>         SKIP_IF(!have_htm_nosc());
> +       SKIP_IF(htm_is_synthetic());
>
>         setbuf(stdout, NULL);
>
> diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
> index c5a1e5c163fc..c03c6e778876 100644
> --- a/tools/testing/selftests/powerpc/tm/tm.h
> +++ b/tools/testing/selftests/powerpc/tm/tm.h
> @@ -10,6 +10,9 @@
>  #include <asm/tm.h>
>
>  #include "utils.h"
> +#include "reg.h"
> +
> +#define TM_RETRIES 100
>
>  static inline bool have_htm(void)
>  {
> @@ -31,6 +34,39 @@ static inline bool have_htm_nosc(void)
>  #endif
>  }
>
> +/*
> + * Transactional Memory was removed in ISA 3.1. A synthetic TM implementation
> + * is provided on P10 for threads running in P8/P9 compatibility  mode. The
> + * synthetic implementation immediately fails after tbegin. This failure sets
> + * Bit 7 (Failure Persistent) and Bit 15 (Implementation-specific).
> + */
> +static inline bool htm_is_synthetic(void)
> +{
> +       int i;
> +
> +       /*
> +        * Per the ISA, the Failure Persistent bit may be incorrect. Try a few
> +        * times in case we got an Implementation-specific failure on a non ISA
> +        * v3.1 system. On these systems the Implementation-specific failure
> +        * should not be persistent.
> +        */
> +       for (i = 0; i < TM_RETRIES; i++) {
> +               asm volatile(
> +               "tbegin.;"
> +               "beq 1f;"
> +               "tend.;"
> +               "1:"
> +               :
> +               :
> +               : "memory");
> +
> +               if ((__builtin_get_texasr() & (TEXASR_FP | TEXASR_IC)) !=
> +                   (TEXASR_FP | TEXASR_IC))
> +                       break;
> +       }
> +       return i == TM_RETRIES;
> +}
> +
>  static inline long failure_code(void)
>  {
>         return __builtin_get_texasru() >> 24;
> --
> 2.25.1
>
There's a couple more tests that need the same treatment, Will send a
new version.

^ permalink raw reply

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
From: Aneesh Kumar K.V @ 2021-06-15  5:57 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <YMgkyfc4g+na5GJZ@yekko>

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
>> FORM2 introduce a concept of secondary domain which is identical to the
>> conceept of FORM1 primary domain. Use secondary domain as the numa node
>> when using persistent memory device. For DAX kmem use the logical domain
>> id introduced in FORM2. This new numa node
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>>  3 files changed, 45 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 86cd2af014f7..b9ac6d02e944 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>>  	return nid;
>>  }
>>  
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
>> +{
>> +	int secondary_index;
>> +	const __be32 *associativity;
>> +
>> +	if (!numa_enabled) {
>> +		*primary = NUMA_NO_NODE;
>> +		*secondary = NUMA_NO_NODE;
>> +		return 0;
>> +	}
>> +
>> +	associativity = of_get_associativity(node);
>> +	if (!associativity)
>> +		return -ENODEV;
>> +
>> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
>> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
>> +		secondary_index = of_read_number(&distance_ref_points[1], 1);
>
> Secondary ID is always the second reference point, but primary depends
> on the length of resources?  That seems very weird.

primary_domain_index is distance_ref_point[0]. With Form2 we would find
both primary and secondary domain ID same for all resources other than
persistent memory device. The usage w.r.t. persistent memory is
explained in patch 7.

With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
the kernel should use when using persistent memory devices. Persistent memory devices
can also be used as regular memory using DAX KMEM driver and primary domainID indicates
the numa node number OS should use when using these devices as regular memory. Secondary
domainID is the numa node number that should be used when using this device as
persistent memory. In the later case, we are interested in the locality of the
device to an established numa node. In the above example, if the last row represents a
persistent memory device/resource, NUMA node number 40 will be used when using the device
as regular memory and NUMA node number 0 will be the device numa node when using it as
a persistent memory device.


>
>> +		*secondary = of_read_number(&associativity[secondary_index], 1);
>> +	}
>> +	if (*primary == 0xffff || *primary >= nr_node_ids)
>> +		*primary = NUMA_NO_NODE;
>> +
>> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
>> +		*secondary = NUMA_NO_NODE;
>> +	return 0;
>> +}
>> +
>>  /* Returns the nid associated with the given device tree node,
>>   * or -1 if not found.
>>   */
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index ef26fe40efb0..9bf2f1f3ddc5 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -18,6 +18,7 @@
>>  #include <asm/plpar_wrappers.h>
>>  #include <asm/papr_pdsm.h>
>>  #include <asm/mce.h>
>> +#include "pseries.h"
>>  
>>  #define BIND_ANY_ADDR (~0ul)
>>  
>> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>>  struct papr_scm_priv {
>>  	struct platform_device *pdev;
>>  	struct device_node *dn;
>> +	int numa_node;
>> +	int target_node;
>>  	uint32_t drc_index;
>>  	uint64_t blocks;
>>  	uint64_t block_size;
>> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	struct nd_mapping_desc mapping;
>>  	struct nd_region_desc ndr_desc;
>>  	unsigned long dimm_flags;
>> -	int target_nid, online_nid;
>>  	ssize_t stat_size;
>>  
>>  	p->bus_desc.ndctl = papr_scm_ndctl;
>> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>>  
>>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
>> -	target_nid = dev_to_node(&p->pdev->dev);
>> -	online_nid = numa_map_to_online_node(target_nid);
>> -	ndr_desc.numa_node = online_nid;
>> -	ndr_desc.target_node = target_nid;
>> +	ndr_desc.numa_node = p->numa_node;
>> +	ndr_desc.target_node = p->target_node;
>>  	ndr_desc.res = &p->res;
>>  	ndr_desc.of_node = p->dn;
>>  	ndr_desc.provider_data = p;
>> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  				ndr_desc.res, p->dn);
>>  		goto err;
>>  	}
>> -	if (target_nid != online_nid)
>> -		dev_info(dev, "Region registered with target node %d and online node %d",
>> -			 target_nid, online_nid);
>>  
>>  	mutex_lock(&papr_ndr_lock);
>>  	list_add_tail(&p->region_list, &papr_nd_regions);
>> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  	struct papr_scm_priv *p;
>>  	const char *uuid_str;
>>  	u64 uuid[2];
>> -	int rc;
>> +	int rc, numa_node;
>>  
>>  	/* check we have all the required DT properties */
>>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
>> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>>  		return -ENODEV;
>>  	}
>>  
>> -
>>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>>  	if (!p)
>>  		return -ENOMEM;
>>  
>> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
>> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
>> +		rc = -ENODEV;
>> +		goto err;
>> +	}
>> +	p->numa_node = numa_map_to_online_node(numa_node);
>> +	if (numa_node != p->numa_node)
>> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
>> +			 p->numa_node, numa_node);
>> +
>>  	/* Initialize the dimm mutex */
>>  	mutex_init(&p->health_mutex);
>>  
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 663a0859cf13..9c2a1fc9ded1 100644
>> --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>>  void pseries_lpar_read_hblkrm_characteristics(void);
>>  
>>  void update_numa_distance(struct device_node *node);
>> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>>  #endif /* _PSERIES_PSERIES_H */
>
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson

^ permalink raw reply

* [PATCH] cpufreq:powernv: Fix init_chip_info initialization in numa=off
From: Pratik R. Sampat @ 2021-06-15  5:09 UTC (permalink / raw)
  To: mpe, rjw, linux-pm, linuxppc-dev, linux-kernel, psampat,
	pratik.r.sampat

In the numa=off kernel command-line configuration init_chip_info() loops
around the number of chips and attempts to copy the cpumask of that node
which is NULL for all iterations after the first chip.

Hence adding a check to bail out after the first initialization if there
is only one node.

Fixes: 053819e0bf84 ("cpufreq: powernv: Handle throttling due to Pmax capping at chip level")
Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
Reported-by: Shirisha Ganta <shirishaganta1@ibm.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index e439b43c19eb..663f9c4b5e3a 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -1078,6 +1078,8 @@ static int init_chip_info(void)
 		INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn);
 		for_each_cpu(cpu, &chips[i].mask)
 			per_cpu(chip_info, cpu) =  &chips[i];
+		if (num_possible_nodes() == 1)
+			break;
 	}
 
 free_and_return:
-- 
2.30.2


^ permalink raw reply related

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
From: Aneesh Kumar K.V @ 2021-06-15  5:28 UTC (permalink / raw)
  To: David Gibson; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <YMgkMg0i/6L1SOPd@yekko>

David Gibson <david@gibson.dropbear.id.au> writes:

> On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
>> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
>>  arch/powerpc/include/asm/firmware.h       |   3 +-
>>  arch/powerpc/include/asm/prom.h           |   1 +
>>  arch/powerpc/kernel/prom_init.c           |   3 +-
>>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
>>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>>  6 files changed, 290 insertions(+), 6 deletions(-)
>>  create mode 100644 Documentation/powerpc/associativity.rst
>> 
>> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
>> new file mode 100644
>> index 000000000000..58abedea81d7
>> --- /dev/null
>> +++ b/Documentation/powerpc/associativity.rst
>> @@ -0,0 +1,139 @@
>> +============================
>> +NUMA resource associativity
>> +=============================
>> +
>> +Associativity represents the groupings of the various platform resources into
>> +domains of substantially similar mean performance relative to resources outside
>> +of that domain. Resources subsets of a given domain that exhibit better
>> +performance relative to each other than relative to other resources subsets
>> +are represented as being members of a sub-grouping domain. This performance
>> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
>> +From the platform view, these groups are also referred to as domains.
>> +
>> +PAPR interface currently supports two different ways of communicating these resource
>
> You describe form 2 below as well, which contradicts this.

Fixed as below.

PAPR interface currently supports different ways of communicating these resource
grouping details to the OS. These are referred to as Form 0, Form 1 and Form2
associativity grouping. Form 0 is the older format and is now considered deprecated.

Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity
bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used.



>
>> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
>> +Form 0 is the older format and is now considered deprecated.
>> +
>> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
>> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
>> +A value of 1 indicates the usage of Form 1 associativity.
>> +
>> +Form 0
>> +-----
>> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
>> +
>> +Form 1
>> +-----
>> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
>> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
>> +
>> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
>> +representing the resource’s platform grouping domains.
>> +
>> +The “ibm,associativity-reference-points” property contains one or more list of numbers
>> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
>> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
>> +
>> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)
>
> I thought we used the *least* significant boundary (the smallest
> grouping, not the largest).  That is, the last index, not the first.
>
> Actually... come to think of it, I'm not even sure how to interpret
> "most significant".  Does that mean a change in grouping at that "most
> significant" level results in the largest perfomance difference?

PAPR defines "most significant" as below

When the “ibm,architecture-vec-5” property byte 5 bit 0 has the value of one, the “ibm,associativ-
ity-reference-points” property indicates boundaries between associativity domains presented by the
“ibm,associativity” property containing “near” and “far” resources. The
first such boundary in the list represents the 1 based ordinal in the
associativity lists of the most significant boundary, with subsequent
entries indicating progressively less significant boundaries

I would interpret it as the boundary where we start defining NUMA nodes.

>
>> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
>> +recursively comparing if they belong to the same higher-level domains. For mismatch
>> +at every higher level of the resource group, the kernel doubles the NUMA distance between
>> +the comparing domains.
>> +
>> +Form 2
>> +-------
>> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
>> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
>> +domain numbering. With numa distance computation now detached from the index value of
>> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
>> +same domain index representing resource groups of different
>> performance/latency characteristics.
>
> The meaning of "domain index" is not clear to me here.

Sorry for the confusion there. domain index is the index where domainID
is appearing. W.r.t "ibm,associativity"  we have 

The “ibm,associativity” property contains one or more lists of numbers (domainID)
representing the resource’s platform grouping domains. If we can look at
an example property.

{ 4, 6, 7, 0, 0}
{ 4, 6, 7, 0, 40}

With Form 1 both NUMA node 0 and 40 will appear at the same distance.
They both are at domain index 4. With Form 2 we can represent them with
different NUMA distance values. 


>
>> +
>> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
>> +"ibm,architecture-vec-5" property.
>> +
>> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
>> +the domainIDs present in the system. The offset of the domainID in this property is considered
>> +the domainID index.
>
> You haven't really introduced the term "domainID".  Is "domainID
> index" the same as "domain index" above?  It's not clear to me.

The earlier part of the documented said 

The “ibm,associativity” property contains one or more lists of numbers (domainID)
representing the resource’s platform grouping domains.

I will update domain index to domainID index. 

>
> The distinction between "domain index" and "primary domain id" is also
> not clear to me.

primary domain id is the domainID appearing in the primary domainID
index. Linux kenrel also use that as the NUMA node number.
Primary domainID index is defined by ibm,associativity-reference-points
and we consider that as the most significant resource group boundary.

ibm,associativity-reference-points can be looked at as
{ primary domainID index, secondary domainID index, tertiary domainID index.. }


>
>> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
>> +N domainID encoded as with encode-int
>> +
>> +For ex:
>> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.
>
> Above you say "Form 2 allows a large number of primary domain ids at
> the same domain index", but this encoding doesn't appear to permit
> that.

I didn't follow that question.

>
>> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
>> +distance between resource groups/domains present in the system.
>> +
>> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> +
>> +For ex:
>> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
>> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
>> +
>> +  | 0    8   40
>> +--|------------
>> +  |
>> +0 | 10   20  80
>> +  |
>> +8 | 20   10  160
>> +  |
>> +40| 80   160  10
>
> What's the reason for multiplying the values by 10 in the expanded
> table version?

That was me missing a document update. Since we used 8 bits to encode
distance at some point we were looking at a SCALE factor. But later
realized other architectures also restrict distance to 8 bits. I will
update ibm,numa-distance-table in the document.

>
>> +
>> +With Form2 "ibm,associativity" for resources is listed as below:
>> +
>> +"ibm,associativity" property for resources in node 0, 8 and 40
>> +{ 4, 6, 7, 0, 0}
>> +{ 4, 6, 9, 8, 8}
>> +{ 4, 6, 7, 0, 40}
>> +
>> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
>> +
>> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes
>
> What the heck is the secondary domainID

domainID appearing the secondary domainID index.
ibm,associativity-reference-points gives an indication of different
hierachy of resource grouping as below.

ibm,associativity-reference-points can be looked at as
{ primary domainID index, secondary domainID index, tertiary domainID index.. }

>
>> +the kernel should use when using persistent memory devices. Persistent memory devices
>> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
>> +the numa node number OS should use when using these devices as regular memory. Secondary
>> +domainID is the numa node number that should be used when using this device as
>> +persistent memory. In the later case, we are interested in the locality of the
>> +device to an established numa node. In the above example, if the last row represents a
>> +persistent memory device/resource, NUMA node number 40 will be used when using the device
>> +as regular memory and NUMA node number 0 will be the device numa node when using it as
>> +a persistent memory device.
>> +
>> +Each resource (drcIndex) now also supports additional optional device tree properties.
>> +These properties are marked optional because the platform can choose not to export
>> +them and provide the system topology details using the earlier defined device tree
>> +properties alone. The optional device tree properties are used when adding new resources
>> +(DLPAR) and when the platform didn't provide the topology details of the domain which
>> +contains the newly added resource during boot.
>> +
>> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
>> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
>> +of the new resource can be obtained from the existing "ibm,associativity" property. This
>> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
>> +The value is 1 based array index value.
>
> Am I correct in thinking that if we have an entirely form2 world, we'd
> only need this and the ibm,associativity properties could be dropped?

Not really. ibm,numa-lookup-index-table was added to have a concise
representation of numa distance via ibm,numa-distance-table. 

For ex: With domainID 0, 4, 5 we could do a 5x5 matrix to represent the
numa distance. Instead ibm,numa-lookup-index-table allows us to present
the same in a 3x3 matrix  distance[index0][index1] is the  distance
between NUMA node 0 and 4 and distance[index0][index2] is the distance
between NUMA node 0 and 5


>
>> +
>> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
>> +
>> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
>> +from this resource domain to other resources.
>
> IIUC this is about extending the global distance table with
> information for a new node.  Is that correct?

correct.

>
> The global distance table allows for the possibility of asymmetric
> distances between nodes, but this does not.  Is that intentional?

This also does, For example with 3 nodes currently present and 4 node
getting added ibm,numa-distance have 8 elements enabling us to have
distance[Node0][Node50] being different from distance[Node50][Node0]
as shown below.

>
>> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
>> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
>> +
>> +For ex:
>> +ibm,associativity     = { 4, 5, 6, 7, 50}
>> +ibm,numa-lookup-index = { 4 }
>> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
>> +
>> +resulting in a new toplogy as below.
>> +  | 0    8   40   50
>> +--|------------------
>> +  |
>> +0 | 10   20  80   160
>> +  |
>> +8 | 20   10  160  320
>> +  |
>> +40| 80   160  10  80
>> +  |
>> +50| 160  320  80  10

^ permalink raw reply

* [PATCH v2] selftests/powerpc: Always test lmw and stmw
From: Jordan Niethe @ 2021-06-15  5:10 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe

Load Multiple Word (lmw) and Store Multiple Word (stmw) will raise an
Alignment Exception:
  - Little Endian mode: always
  - Big Endian mode: address not word aligned

These conditions do not depend on cache inhibited memory. Test the
alignment handler emulation of these instructions regardless of if there
is cache inhibited memory available or not.

Commit dd3a44c06f7b ("selftests/powerpc: Only test lwm/stmw on big
endian") stopped testing lmw/stmw on little endian because newer
binutils (>= 2.36) will not assemble them in little endian mode. The
kernel still emulates these instructions in little endian mode so use
macros to generate them and test them.

Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v2: Use macros for lmw/stmw
---
 .../powerpc/alignment/alignment_handler.c     | 101 +++++++++++++++++-
 .../selftests/powerpc/include/instructions.h  |  10 ++
 2 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
index 33ee34fc0828..26878147f389 100644
--- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c
+++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
@@ -45,6 +45,7 @@
 #include <getopt.h>
 #include <setjmp.h>
 #include <signal.h>
+#include <errno.h>
 
 #include "utils.h"
 #include "instructions.h"
@@ -453,11 +454,6 @@ int test_alignment_handler_integer(void)
 	STORE_DFORM_TEST(stdu);
 	STORE_XFORM_TEST(stdux);
 
-#ifdef __BIG_ENDIAN__
-	LOAD_DFORM_TEST(lmw);
-	STORE_DFORM_TEST(stmw);
-#endif
-
 	return rc;
 }
 
@@ -602,6 +598,99 @@ int test_alignment_handler_fp_prefix(void)
 	return rc;
 }
 
+int test_alignment_handler_multiple(void)
+{
+	int offset, width, r, rc = 0;
+	void *src1, *dst1, *src2, *dst2;
+
+	rc = posix_memalign(&src1, bufsize, bufsize);
+	if (rc) {
+		printf("\n");
+		return rc;
+	}
+
+	rc = posix_memalign(&dst1, bufsize, bufsize);
+	if (rc) {
+		printf("\n");
+		free(src1);
+		return rc;
+	}
+
+	src2 = malloc(bufsize);
+	if (!src2) {
+		printf("\n");
+		free(src1);
+		free(dst1);
+		return -ENOMEM;
+	}
+
+	dst2 = malloc(bufsize);
+	if (!dst2) {
+		printf("\n");
+		free(src1);
+		free(dst1);
+		free(src2);
+		return -ENOMEM;
+	}
+
+	/* lmw */
+	width = 4;
+	printf("\tDoing lmw:\t");
+	for (offset = 0; offset < width; offset++) {
+		preload_data(src1, offset, width);
+		preload_data(src2, offset, width);
+
+		asm volatile(LMW(31, %0, 0)
+			     "std 31, 0(%1)"
+			     :: "r"(src1 + offset), "r"(dst1 + offset), "r"(0)
+			     : "memory", "r31");
+
+		memcpy(dst2 + offset, src1 + offset, width);
+
+		r = test_memcmp(dst1, dst2, width, offset, "test_lmw");
+		if (r && !debug) {
+			printf("FAILED: Wrong Data\n");
+			break;
+		}
+	}
+
+	if (!r)
+		printf("PASSED\n");
+	else
+		rc |= 1;
+
+	/* stmw */
+	width = 4;
+	printf("\tDoing stmw:\t");
+	for (offset = 0; offset < width; offset++) {
+		preload_data(src1, offset, width);
+		preload_data(src2, offset, width);
+
+		asm volatile("ld  31, 0(%0) ;"
+			     STMW(31, %1, 0)
+			     :: "r"(src1 + offset), "r"(dst1 + offset), "r"(0)
+			     : "memory", "r31");
+
+		memcpy(dst2 + offset, src1 + offset, width);
+
+		r = test_memcmp(dst1, dst2, width, offset, "test_stmw");
+		if (r && !debug) {
+			printf("FAILED: Wrong Data\n");
+			break;
+		}
+	}
+	if (!r)
+		printf("PASSED\n");
+	else
+		rc |= 1;
+
+	free(src1);
+	free(src2);
+	free(dst1);
+	free(dst2);
+	return rc;
+}
+
 void usage(char *prog)
 {
 	printf("Usage: %s [options] [path [offset]]\n", prog);
@@ -676,5 +765,7 @@ int main(int argc, char *argv[])
 			   "test_alignment_handler_fp_206");
 	rc |= test_harness(test_alignment_handler_fp_prefix,
 			   "test_alignment_handler_fp_prefix");
+	rc |= test_harness(test_alignment_handler_multiple,
+			   "test_alignment_handler_multiple");
 	return rc;
 }
diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h
index 4efa6314bd96..60605e2bbd3c 100644
--- a/tools/testing/selftests/powerpc/include/instructions.h
+++ b/tools/testing/selftests/powerpc/include/instructions.h
@@ -143,4 +143,14 @@ static inline int paste_last(void *i)
 #define PSTXV0(s, a, r, d)		PREFIX_8LS(0xd8000000, s, a, r, d)
 #define PSTXV1(s, a, r, d)		PREFIX_8LS(0xdc000000, s, a, r, d)
 
+/* Load and Store Multiple Instructions */
+#define LMW(t, a, d)			stringify_in_c(.long (46 << 26) |		\
+						       __PPC_RT(t) |			\
+						       __PPC_RA(a) |			\
+						       ((d) & 0xffff);\n)
+#define STMW(t, a, d)			stringify_in_c(.long (47 << 26) |		\
+						       __PPC_RT(t) |			\
+						       __PPC_RA(a) |			\
+						       ((d) & 0xffff);\n)
+
 #endif /* _SELFTESTS_POWERPC_INSTRUCTIONS_H */
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH 09/16] ps3disk: use memcpy_{from,to}_bvec
From: Herbert Xu @ 2021-06-15  5:02 UTC (permalink / raw)
  To: Ira Weiny, David S. Miller
  Cc: Jens Axboe, linux-arch, Thomas Bogendoerfer, linux-scsi,
	Mike Snitzer, linux-sh, Geoff Levand, Tero Kristo, ceph-devel,
	linux-mips, Dongsheng Yang, linux-kernel, linux-block, dm-devel,
	Thomas Gleixner, linux-csky, linux-mmc, Ilya Dryomov,
	linuxppc-dev, Christoph Hellwig, linux-arm-kernel
In-Reply-To: <20210612040743.GG1600546@iweiny-DESK2.sc.intel.com>

On Fri, Jun 11, 2021 at 09:07:43PM -0700, Ira Weiny wrote:
>
> More recently this was added:
> 
> 7e34e0bbc644 crypto: omap-crypto - fix userspace copied buffer access
> 
> I'm CC'ing Tero and Herbert to see why they added the SLAB check.

Probably because the generic Crypto API has the same check.  This
all goes back to

commit 4f3e797ad07d52d34983354a77b365dfcd48c1b4
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Mon Feb 9 14:22:14 2009 +1100

    crypto: scatterwalk - Avoid flush_dcache_page on slab pages

    It's illegal to call flush_dcache_page on slab pages on a number
    of architectures.  So this patch avoids doing so if PageSlab is
    true.

    In future we can move the flush_dcache_page call to those page
    cache users that actually need it.

    Reported-by: David S. Miller <davem@davemloft.net>
    Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

But I can't find any emails discussing this so let me ask Dave
directly and see if he can tell us what the issue was or might
have been.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v9 01/14] swiotlb: Refactor swiotlb init functions
From: Claire Chang @ 2021-06-15  4:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: heikki.krogerus, thomas.hellstrom, peterz, joonas.lahtinen,
	dri-devel, chris, grant.likely, paulus, Frank Rowand, mingo,
	Marek Szyprowski, sstabellini, Saravana Kannan, Joerg Roedel,
	Rafael J . Wysocki, Bartosz Golaszewski, bskeggs, linux-pci,
	xen-devel, Thierry Reding, intel-gfx, matthew.auld,
	linux-devicetree, Jianxiong Gao, Daniel Vetter, Will Deacon,
	Konrad Rzeszutek Wilk, maarten.lankhorst, airlied, Dan Williams,
	linuxppc-dev, jani.nikula, Rob Herring, rodrigo.vivi,
	Bjorn Helgaas, boris.ostrovsky, Andy Shevchenko, jgross,
	Nicolas Boichat, Greg KH, Randy Dunlap, lkml, Tomasz Figa,
	list@263.net:IOMMU DRIVERS, Jim Quinlan, xypron.glpk,
	Robin Murphy, bauerman
In-Reply-To: <20210614061644.GA28343@lst.de>

On Mon, Jun 14, 2021 at 2:16 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Jun 11, 2021 at 11:26:46PM +0800, Claire Chang wrote:
> > +     spin_lock_init(&mem->lock);
> > +     for (i = 0; i < mem->nslabs; i++) {
> > +             mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
> > +             mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
> > +             mem->slots[i].alloc_size = 0;
> > +     }
> > +
> > +     if (memory_decrypted)
> > +             set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
> > +     memset(vaddr, 0, bytes);
>
> We don't really need to do this call before the memset.  Which means we
> can just move it to the callers that care instead of having a bool
> argument.
>
> Otherwise looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Thanks for the review. Will wait more days for other reviews and send
v10 to address the comments in this and other patches.

^ permalink raw reply

* Re: [RFC PATCH 6/8] powerpc/pseries: Add a helper for form1 cpu distance
From: David Gibson @ 2021-06-15  3:21 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-7-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1701 bytes --]

On Mon, Jun 14, 2021 at 10:10:01PM +0530, Aneesh Kumar K.V wrote:
> This helper is only used with the dispatch trace log collection.
> A later patch will add Form2 affinity support and this change helps
> in keeping that simpler. Also add a comment explaining we don't expect
> the code to be called with FORM0
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/mm/numa.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 64caaf07cf82..696e5bfe1414 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	int dist = 0;
>  
> @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  	return dist;
>  }
>  
> +int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> +	/* We should not get called with FORM0 */
> +	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> +
> +	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +}
> +
>  /* must hold reference to node during call */
>  static const __be32 *of_get_associativity(struct device_node *dev)
>  {

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 8/8] powerpc/papr_scm: Use FORM2 associativity details
From: David Gibson @ 2021-06-15  3:55 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-9-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 5763 bytes --]

On Mon, Jun 14, 2021 at 10:10:03PM +0530, Aneesh Kumar K.V wrote:
> FORM2 introduce a concept of secondary domain which is identical to the
> conceept of FORM1 primary domain. Use secondary domain as the numa node
> when using persistent memory device. For DAX kmem use the logical domain
> id introduced in FORM2. This new numa node
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                    | 28 +++++++++++++++++++++++
>  arch/powerpc/platforms/pseries/papr_scm.c | 26 +++++++++++++--------
>  arch/powerpc/platforms/pseries/pseries.h  |  1 +
>  3 files changed, 45 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 86cd2af014f7..b9ac6d02e944 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -265,6 +265,34 @@ static int associativity_to_nid(const __be32 *associativity)
>  	return nid;
>  }
>  
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary)
> +{
> +	int secondary_index;
> +	const __be32 *associativity;
> +
> +	if (!numa_enabled) {
> +		*primary = NUMA_NO_NODE;
> +		*secondary = NUMA_NO_NODE;
> +		return 0;
> +	}
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return -ENODEV;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		*primary = of_read_number(&associativity[primary_domain_index], 1);
> +		secondary_index = of_read_number(&distance_ref_points[1], 1);

Secondary ID is always the second reference point, but primary depends
on the length of resources?  That seems very weird.

> +		*secondary = of_read_number(&associativity[secondary_index], 1);
> +	}
> +	if (*primary == 0xffff || *primary >= nr_node_ids)
> +		*primary = NUMA_NO_NODE;
> +
> +	if (*secondary == 0xffff || *secondary >= nr_node_ids)
> +		*secondary = NUMA_NO_NODE;
> +	return 0;
> +}
> +
>  /* Returns the nid associated with the given device tree node,
>   * or -1 if not found.
>   */
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index ef26fe40efb0..9bf2f1f3ddc5 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -18,6 +18,7 @@
>  #include <asm/plpar_wrappers.h>
>  #include <asm/papr_pdsm.h>
>  #include <asm/mce.h>
> +#include "pseries.h"
>  
>  #define BIND_ANY_ADDR (~0ul)
>  
> @@ -88,6 +89,8 @@ struct papr_scm_perf_stats {
>  struct papr_scm_priv {
>  	struct platform_device *pdev;
>  	struct device_node *dn;
> +	int numa_node;
> +	int target_node;
>  	uint32_t drc_index;
>  	uint64_t blocks;
>  	uint64_t block_size;
> @@ -923,7 +926,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	struct nd_mapping_desc mapping;
>  	struct nd_region_desc ndr_desc;
>  	unsigned long dimm_flags;
> -	int target_nid, online_nid;
>  	ssize_t stat_size;
>  
>  	p->bus_desc.ndctl = papr_scm_ndctl;
> @@ -974,10 +976,8 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	mapping.size = p->blocks * p->block_size; // XXX: potential overflow?
>  
>  	memset(&ndr_desc, 0, sizeof(ndr_desc));
> -	target_nid = dev_to_node(&p->pdev->dev);
> -	online_nid = numa_map_to_online_node(target_nid);
> -	ndr_desc.numa_node = online_nid;
> -	ndr_desc.target_node = target_nid;
> +	ndr_desc.numa_node = p->numa_node;
> +	ndr_desc.target_node = p->target_node;
>  	ndr_desc.res = &p->res;
>  	ndr_desc.of_node = p->dn;
>  	ndr_desc.provider_data = p;
> @@ -1001,9 +1001,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  				ndr_desc.res, p->dn);
>  		goto err;
>  	}
> -	if (target_nid != online_nid)
> -		dev_info(dev, "Region registered with target node %d and online node %d",
> -			 target_nid, online_nid);
>  
>  	mutex_lock(&papr_ndr_lock);
>  	list_add_tail(&p->region_list, &papr_nd_regions);
> @@ -1096,7 +1093,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>  	struct papr_scm_priv *p;
>  	const char *uuid_str;
>  	u64 uuid[2];
> -	int rc;
> +	int rc, numa_node;
>  
>  	/* check we have all the required DT properties */
>  	if (of_property_read_u32(dn, "ibm,my-drc-index", &drc_index)) {
> @@ -1119,11 +1116,20 @@ static int papr_scm_probe(struct platform_device *pdev)
>  		return -ENODEV;
>  	}
>  
> -
>  	p = kzalloc(sizeof(*p), GFP_KERNEL);
>  	if (!p)
>  		return -ENOMEM;
>  
> +	if (get_primary_and_secondary_domain(dn, &p->target_node, &numa_node)) {
> +		dev_err(&pdev->dev, "%pOF: missing NUMA attributes!\n", dn);
> +		rc = -ENODEV;
> +		goto err;
> +	}
> +	p->numa_node = numa_map_to_online_node(numa_node);
> +	if (numa_node != p->numa_node)
> +		dev_info(&pdev->dev, "Region registered with online node %d and device tree node %d",
> +			 p->numa_node, numa_node);
> +
>  	/* Initialize the dimm mutex */
>  	mutex_init(&p->health_mutex);
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 663a0859cf13..9c2a1fc9ded1 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -114,4 +114,5 @@ void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
>  void update_numa_distance(struct device_node *node);
> +int get_primary_and_secondary_domain(struct device_node *node, int *primary, int *secondary);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 7/8] powerpc/pseries: Add support for FORM2 associativity
From: David Gibson @ 2021-06-15  3:53 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-8-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 20184 bytes --]

On Mon, Jun 14, 2021 at 10:10:02PM +0530, Aneesh Kumar K.V wrote:
> Signed-off-by: Daniel Henrique Barboza <danielhb413@gmail.com>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  Documentation/powerpc/associativity.rst   | 139 ++++++++++++++++++++
>  arch/powerpc/include/asm/firmware.h       |   3 +-
>  arch/powerpc/include/asm/prom.h           |   1 +
>  arch/powerpc/kernel/prom_init.c           |   3 +-
>  arch/powerpc/mm/numa.c                    | 149 +++++++++++++++++++++-
>  arch/powerpc/platforms/pseries/firmware.c |   1 +
>  6 files changed, 290 insertions(+), 6 deletions(-)
>  create mode 100644 Documentation/powerpc/associativity.rst
> 
> diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst
> new file mode 100644
> index 000000000000..58abedea81d7
> --- /dev/null
> +++ b/Documentation/powerpc/associativity.rst
> @@ -0,0 +1,139 @@
> +============================
> +NUMA resource associativity
> +=============================
> +
> +Associativity represents the groupings of the various platform resources into
> +domains of substantially similar mean performance relative to resources outside
> +of that domain. Resources subsets of a given domain that exhibit better
> +performance relative to each other than relative to other resources subsets
> +are represented as being members of a sub-grouping domain. This performance
> +characteristic is presented in terms of NUMA node distance within the Linux kernel.
> +From the platform view, these groups are also referred to as domains.
> +
> +PAPR interface currently supports two different ways of communicating these resource

You describe form 2 below as well, which contradicts this.

> +grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping.
> +Form 0 is the older format and is now considered deprecated.
> +
> +Hypervisor indicates the type/form of associativity used via "ibm,arcitecture-vec-5 property".
> +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1.
> +A value of 1 indicates the usage of Form 1 associativity.
> +
> +Form 0
> +-----
> +Form 0 associativity supports only two NUMA distance (LOCAL and REMOTE).
> +
> +Form 1
> +-----
> +With Form 1 a combination of ibm,associativity-reference-points and ibm,associativity
> +device tree properties are used to determine the NUMA distance between resource groups/domains. 
> +
> +The “ibm,associativity” property contains one or more lists of numbers (domainID)
> +representing the resource’s platform grouping domains.
> +
> +The “ibm,associativity-reference-points” property contains one or more list of numbers
> +(domain index) that represents the 1 based ordinal in the associativity lists of the most
> +significant boundary, with subsequent entries indicating progressively less significant boundaries.
> +
> +Linux kernel uses the domain id of the most significant boundary (aka primary domain)

I thought we used the *least* significant boundary (the smallest
grouping, not the largest).  That is, the last index, not the first.

Actually... come to think of it, I'm not even sure how to interpret
"most significant".  Does that mean a change in grouping at that "most
significant" level results in the largest perfomance difference?

> +as the NUMA node id. Linux kernel computes NUMA distance between two domains by
> +recursively comparing if they belong to the same higher-level domains. For mismatch
> +at every higher level of the resource group, the kernel doubles the NUMA distance between
> +the comparing domains.
> +
> +Form 2
> +-------
> +Form 2 associativity format adds separate device tree properties representing NUMA node distance
> +thereby making the node distance computation flexible. Form 2 also allows flexible primary
> +domain numbering. With numa distance computation now detached from the index value of
> +"ibm,associativity" property, Form 2 allows a large number of primary domain ids at the
> +same domain index representing resource groups of different
> performance/latency characteristics.

The meaning of "domain index" is not clear to me here.

> +
> +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the
> +"ibm,architecture-vec-5" property.
> +
> +"ibm,numa-lookup-index-table" property contains one or more list numbers representing
> +the domainIDs present in the system. The offset of the domainID in this property is considered
> +the domainID index.

You haven't really introduced the term "domainID".  Is "domainID
index" the same as "domain index" above?  It's not clear to me.

The distinction between "domain index" and "primary domain id" is also
not clear to me.

> +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by
> +N domainID encoded as with encode-int
> +
> +For ex:
> +ibm,numa-lookup-index-table =  {4, 0, 8, 250, 252}, domainID index for domainID 8 is 1.

Above you say "Form 2 allows a large number of primary domain ids at
the same domain index", but this encoding doesn't appear to permit
that.

> +"ibm,numa-distance-table" property contains one or more list of numbers representing the NUMA
> +distance between resource groups/domains present in the system.
> +
> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> +
> +For ex:
> +ibm,numa-lookup-index-table =  {3, 0, 8, 40}
> +ibm,numa-distance-table     =  {9, 1, 2, 8, 2, 1, 16, 8, 16, 1}
> +
> +  | 0    8   40
> +--|------------
> +  |
> +0 | 10   20  80
> +  |
> +8 | 20   10  160
> +  |
> +40| 80   160  10

What's the reason for multiplying the values by 10 in the expanded
table version?

> +
> +With Form2 "ibm,associativity" for resources is listed as below:
> +
> +"ibm,associativity" property for resources in node 0, 8 and 40
> +{ 4, 6, 7, 0, 0}
> +{ 4, 6, 9, 8, 8}
> +{ 4, 6, 7, 0, 40}
> +
> +With "ibm,associativity-reference-points"  { 0x4, 0x3, 0x2 }
> +
> +With Form2 the primary domainID and secondary domainID are used to identify the NUMA nodes

What the heck is the secondary domainID?

> +the kernel should use when using persistent memory devices. Persistent memory devices
> +can also be used as regular memory using DAX KMEM driver and primary domainID indicates
> +the numa node number OS should use when using these devices as regular memory. Secondary
> +domainID is the numa node number that should be used when using this device as
> +persistent memory. In the later case, we are interested in the locality of the
> +device to an established numa node. In the above example, if the last row represents a
> +persistent memory device/resource, NUMA node number 40 will be used when using the device
> +as regular memory and NUMA node number 0 will be the device numa node when using it as
> +a persistent memory device.
> +
> +Each resource (drcIndex) now also supports additional optional device tree properties.
> +These properties are marked optional because the platform can choose not to export
> +them and provide the system topology details using the earlier defined device tree
> +properties alone. The optional device tree properties are used when adding new resources
> +(DLPAR) and when the platform didn't provide the topology details of the domain which
> +contains the newly added resource during boot.
> +
> +"ibm,numa-lookup-index" property contains a number representing the domainID index to be used
> +when building the NUMA distance of the numa node to which this resource belongs. The domain id
> +of the new resource can be obtained from the existing "ibm,associativity" property. This
> +can be used to build distance information of a newly onlined NUMA node via DLPAR operation.
> +The value is 1 based array index value.

Am I correct in thinking that if we have an entirely form2 world, we'd
only need this and the ibm,associativity properties could be dropped?

> +
> +prop-encoded-array: An integer encoded as with encode-int specifying the domainID index
> +
> +"ibm,numa-distance" property contains one or more list of numbers presenting the NUMA distance
> +from this resource domain to other resources.

IIUC this is about extending the global distance table with
information for a new node.  Is that correct?

The global distance table allows for the possibility of asymmetric
distances between nodes, but this does not.  Is that intentional?

> +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by
> +N distance values encoded as with encode-bytes. The max distance value we could encode is 255.
> +
> +For ex:
> +ibm,associativity     = { 4, 5, 6, 7, 50}
> +ibm,numa-lookup-index = { 4 }
> +ibm,numa-distance   =  {8, 16, 32, 8, 1, 16, 32, 8, 1}
> +
> +resulting in a new toplogy as below.
> +  | 0    8   40   50
> +--|------------------
> +  |
> +0 | 10   20  80   160
> +  |
> +8 | 20   10  160  320
> +  |
> +40| 80   160  10  80
> +  |
> +50| 160  320  80  10
> diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
> index 60b631161360..97a3bd9ffeb9 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -53,6 +53,7 @@
>  #define FW_FEATURE_ULTRAVISOR	ASM_CONST(0x0000004000000000)
>  #define FW_FEATURE_STUFF_TCE	ASM_CONST(0x0000008000000000)
>  #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)
> +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000)
>  
>  #ifndef __ASSEMBLY__
>  
> @@ -73,7 +74,7 @@ enum {
>  		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
>  		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
>  		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
> -		FW_FEATURE_RPT_INVALIDATE,
> +		FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY,
>  	FW_FEATURE_PSERIES_ALWAYS = 0,
>  	FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
>  	FW_FEATURE_POWERNV_ALWAYS = 0,
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index df9fec9d232c..5c80152e8f18 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop,
>  #define OV5_XCMO		0x0440	/* Page Coalescing */
>  #define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
>  #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
> +#define OV5_FORM2_AFFINITY	0x0520	/* Form2 NUMA affinity */
>  #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
>  #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
>  #define OV5_PFO_HW_RNG		0x1180	/* PFO Random Number Generator */
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 64b9593038a7..496fdac54c29 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1070,7 +1070,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
>  #else
>  		0,
>  #endif
> -		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
> +		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) |
> +		OV5_FEAT(OV5_FORM2_AFFINITY),
>  		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
>  		.micro_checkpoint = 0,
>  		.reserved0 = 0,
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 696e5bfe1414..86cd2af014f7 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  
>  #define FORM0_AFFINITY 0
>  #define FORM1_AFFINITY 1
> +#define FORM2_AFFINITY 2
>  static int affinity_form;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
>  static int max_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
> +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
> +	[0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
> +};
> +static int numa_id_index_table[MAX_NUMNODES];
>  
>  /*
>   * Allocate node_to_cpumask_map based on number of available nodes
> @@ -166,6 +171,27 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> +/*
> + * With FORM2 if we are not using logical domain ids, we will find
> + * both primary and seconday domains with same value. Hence always
> + * start comparison from secondary domains
> + */
> +static int __cpu_form2_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
> +{
> +	int dist = 0;
> +
> +	int i, index;
> +
> +	for (i = 1; i < max_domain_index; i++) {
> +		index = be32_to_cpu(distance_ref_points[i]);
> +		if (cpu1_assoc[index] == cpu2_assoc[index])
> +			break;
> +		dist++;
> +	}
> +
> +	return dist;
> +}
> +
>  static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	int dist = 0;
> @@ -178,7 +204,6 @@ static int __cpu_form1_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  			break;
>  		dist++;
>  	}
> -
>  	return dist;
>  }
>  
> @@ -186,8 +211,9 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
>  	/* We should not get called with FORM0 */
>  	VM_WARN_ON(affinity_form == FORM0_AFFINITY);
> -
> -	return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +	if (affinity_form == FORM1_AFFINITY)
> +		return __cpu_form1_distance(cpu1_assoc, cpu2_assoc);
> +	return __cpu_form2_distance(cpu1_assoc, cpu2_assoc);
>  }
>  
>  /* must hold reference to node during call */
> @@ -201,7 +227,9 @@ int __node_distance(int a, int b)
>  	int i;
>  	int distance = LOCAL_DISTANCE;
>  
> -	if (affinity_form == FORM0_AFFINITY)
> +	if (affinity_form == FORM2_AFFINITY)
> +		return numa_distance_table[a][b];
> +	else if (affinity_form == FORM0_AFFINITY)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -303,15 +331,116 @@ static void initialize_form1_numa_distance(struct device_node *node)
>  
>  /*
>   * Used to update distance information w.r.t newly added node.
> + * ibm,numa-lookup-index -> 4
> + * ibm,numa-distance -> {5, 20, 40, 60, 80, 10 }
>   */
>  void update_numa_distance(struct device_node *node)
>  {
> +	int i, nid, other_nid, other_nid_index = 0;
> +	const __be32 *numa_indexp;
> +	const __u8  *numa_distancep;
> +	int numa_index, max_numa_index, numa_distance;
> +
>  	if (affinity_form == FORM0_AFFINITY)
>  		return;
>  	else if (affinity_form == FORM1_AFFINITY) {
>  		initialize_form1_numa_distance(node);
>  		return;
>  	}
> +	/* FORM2 affinity  */
> +
> +	nid = of_node_to_nid_single(node);
> +	if (nid == NUMA_NO_NODE)
> +		return;
> +
> +	/* Already initialized */
> +	if (numa_distance_table[nid][nid] != -1)
> +		return;
> +	/*
> +	 * update node distance if not already populated.
> +	 */
> +	numa_distancep = of_get_property(node, "ibm,numa-distance", NULL);
> +	if (!numa_distancep)
> +		return;
> +
> +	numa_indexp = of_get_property(node, "ibm,numa-lookup-index", NULL);
> +	if (!numa_indexp)
> +		return;
> +
> +	numa_index = of_read_number(numa_indexp, 1);
> +	/*
> +	 * update the numa_id_index_table. Device tree look at index table as
> +	 * 1 based array indexing.
> +	 */
> +	numa_id_index_table[numa_index - 1] = nid;
> +
> +	max_numa_index = of_read_number((const __be32 *)numa_distancep, 1);
> +	VM_WARN_ON(max_numa_index != 2 * numa_index);
> +	/* Skip the size which is encoded int */
> +	numa_distancep += sizeof(__be32);
> +
> +	/*
> +	 * First fill the distance information from other node to this node.
> +	 */
> +	other_nid_index = 0;
> +	for (i = 0; i < numa_index; i++) {
> +		numa_distance = numa_distancep[i];
> +		other_nid = numa_id_index_table[other_nid_index++];
> +		numa_distance_table[other_nid][nid] = numa_distance;
> +	}
> +
> +	other_nid_index = 0;
> +	for (; i < max_numa_index; i++) {
> +		numa_distance = numa_distancep[i];
> +		other_nid = numa_id_index_table[other_nid_index++];
> +		numa_distance_table[nid][other_nid] = numa_distance;
> +	}
> +}
> +
> +/*
> + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
> + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
> + */
> +static void initialize_form2_numa_distance_lookup_table(struct device_node *root)
> +{
> +	const __u8 *numa_dist_table;
> +	const __be32 *numa_lookup_index;
> +	int numa_dist_table_length;
> +	int max_numa_index, distance_index;
> +	int i, curr_row = 0, curr_column = 0;
> +
> +	numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
> +	max_numa_index = of_read_number(&numa_lookup_index[0], 1);
> +
> +	/* first element of the array is the size and is encode-int */
> +	numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL);
> +	numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1);
> +	/* Skip the size which is encoded int */
> +	numa_dist_table += sizeof(__be32);
> +
> +	pr_debug(" numa_dist_table_len = %d, numa_dist_indexes_len = %d \n",
> +		 numa_dist_table_length, max_numa_index);
> +
> +	for (i = 0; i < max_numa_index; i++)
> +		/* +1 skip the max_numa_index in the property */
> +		numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
> +
> +
> +	VM_WARN_ON(numa_dist_table_length != max_numa_index * max_numa_index);
> +
> +	for (distance_index = 0; distance_index < numa_dist_table_length; distance_index++) {
> +		int nodeA = numa_id_index_table[curr_row];
> +		int nodeB = numa_id_index_table[curr_column++];
> +
> +		numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index];
> +
> +		pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]);
> +		if (curr_column >= max_numa_index) {
> +			curr_row++;
> +			/* reset the column */
> +			curr_column = 0;
> +		}
> +	}
>  }
>  
>  static int __init find_primary_domain_index(void)
> @@ -324,6 +453,9 @@ static int __init find_primary_domain_index(void)
>  	 */
>  	if (firmware_has_feature(FW_FEATURE_OPAL)) {
>  		affinity_form = FORM1_AFFINITY;
> +	} else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
> +		dbg("Using form 2 affinity\n");
> +		affinity_form = FORM2_AFFINITY;
>  	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
>  		dbg("Using form 1 affinity\n");
>  		affinity_form = FORM1_AFFINITY;
> @@ -368,8 +500,17 @@ static int __init find_primary_domain_index(void)
>  
>  		index = of_read_number(&distance_ref_points[1], 1);
>  	} else {
> +		/*
> +		 * Both FORM1 and FORM2 affinity find the primary domain details
> +		 * at the same offset.
> +		 */
>  		index = of_read_number(distance_ref_points, 1);
>  	}
> +	/*
> +	 * If it is FORM2 also initialize the distance table here.
> +	 */
> +	if (affinity_form == FORM2_AFFINITY)
> +		initialize_form2_numa_distance_lookup_table(root);
>  
>  	/*
>  	 * Warn and cap if the hardware supports more than
> diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
> index 5d4c2bc20bba..f162156b7b68 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -123,6 +123,7 @@ vec5_fw_features_table[] = {
>  	{FW_FEATURE_PRRN,		OV5_PRRN},
>  	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
>  	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},
> +	{FW_FEATURE_FORM2_AFFINITY,	OV5_FORM2_AFFINITY},
>  };
>  
>  static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 3/8] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY
From: David Gibson @ 2021-06-15  3:04 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-4-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 6496 bytes --]

On Mon, Jun 14, 2021 at 10:09:58PM +0530, Aneesh Kumar K.V wrote:
> Also make related code cleanup that will allow adding FORM2_AFFINITY in
> later patches. No functional change in this patch.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/include/asm/firmware.h       |  4 +--
>  arch/powerpc/include/asm/prom.h           |  2 +-
>  arch/powerpc/kernel/prom_init.c           |  2 +-
>  arch/powerpc/mm/numa.c                    | 35 ++++++++++++++---------
>  arch/powerpc/platforms/pseries/firmware.c |  2 +-
>  5 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
> index 7604673787d6..60b631161360 100644
> --- a/arch/powerpc/include/asm/firmware.h
> +++ b/arch/powerpc/include/asm/firmware.h
> @@ -44,7 +44,7 @@
>  #define FW_FEATURE_OPAL		ASM_CONST(0x0000000010000000)
>  #define FW_FEATURE_SET_MODE	ASM_CONST(0x0000000040000000)
>  #define FW_FEATURE_BEST_ENERGY	ASM_CONST(0x0000000080000000)
> -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000)
> +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000)
>  #define FW_FEATURE_PRRN		ASM_CONST(0x0000000200000000)
>  #define FW_FEATURE_DRMEM_V2	ASM_CONST(0x0000000400000000)
>  #define FW_FEATURE_DRC_INFO	ASM_CONST(0x0000000800000000)
> @@ -69,7 +69,7 @@ enum {
>  		FW_FEATURE_SPLPAR | FW_FEATURE_LPAR |
>  		FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
>  		FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
> -		FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
> +		FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN |
>  		FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
>  		FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
>  		FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index 324a13351749..df9fec9d232c 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop,
>  #define OV5_MSI			0x0201	/* PCIe/MSI support */
>  #define OV5_CMO			0x0480	/* Cooperative Memory Overcommitment */
>  #define OV5_XCMO		0x0440	/* Page Coalescing */
> -#define OV5_TYPE1_AFFINITY	0x0580	/* Type 1 NUMA affinity */
> +#define OV5_FORM1_AFFINITY	0x0580	/* FORM1 NUMA affinity */
>  #define OV5_PRRN		0x0540	/* Platform Resource Reassignment */
>  #define OV5_HP_EVT		0x0604	/* Hot Plug Event support */
>  #define OV5_RESIZE_HPT		0x0601	/* Hash Page Table resizing */
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 41ed7e33d897..64b9593038a7 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -1070,7 +1070,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = {
>  #else
>  		0,
>  #endif
> -		.associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
> +		.associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN),
>  		.bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT),
>  		.micro_checkpoint = 0,
>  		.reserved0 = 0,
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 5941da201fa3..192067991f8a 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data);
>  
>  static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
> -static int form1_affinity;
> +
> +#define FORM0_AFFINITY 0
> +#define FORM1_AFFINITY 1
> +static int affinity_form;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
>  static int max_domain_index;
> @@ -190,7 +193,7 @@ int __node_distance(int a, int b)
>  	int i;
>  	int distance = LOCAL_DISTANCE;
>  
> -	if (!form1_affinity)
> +	if (affinity_form == FORM0_AFFINITY)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>  {
>  	int i;
>  
> -	if (!form1_affinity)
> +	if (affinity_form != FORM1_AFFINITY)
>  		return;
>  
>  	for (i = 0; i < max_domain_index; i++) {
> @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void)
>  	int index;
>  	struct device_node *root;
>  
> +	/*
> +	 * Check for which form of affinity.
> +	 */
> +	if (firmware_has_feature(FW_FEATURE_OPAL)) {
> +		affinity_form = FORM1_AFFINITY;
> +	} else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
> +		dbg("Using form 1 affinity\n");
> +		affinity_form = FORM1_AFFINITY;
> +	} else
> +		affinity_form = FORM0_AFFINITY;
> +
>  	if (firmware_has_feature(FW_FEATURE_OPAL))
>  		root = of_find_node_by_path("/ibm,opal");
>  	else
> @@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void)
>  	}
>  
>  	max_domain_index /= sizeof(int);
> -
> -	if (firmware_has_feature(FW_FEATURE_OPAL) ||
> -	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> -		dbg("Using form 1 affinity\n");
> -		form1_affinity = 1;
> -	}
> -
> -	if (form1_affinity) {
> -		index = of_read_number(distance_ref_points, 1);
> -	} else {
> +	if (affinity_form == FORM0_AFFINITY) {
>  		if (max_domain_index < 2) {
>  			printk(KERN_WARNING "NUMA: "
> -				"short ibm,associativity-reference-points\n");
> +			       "short ibm,associativity-reference-points\n");
>  			goto err;
>  		}
>  
>  		index = of_read_number(&distance_ref_points[1], 1);
> +	} else {
> +		index = of_read_number(distance_ref_points, 1);
>  	}
>  
>  	/*
> diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
> index 4c7b7f5a2ebc..5d4c2bc20bba 100644
> --- a/arch/powerpc/platforms/pseries/firmware.c
> +++ b/arch/powerpc/platforms/pseries/firmware.c
> @@ -119,7 +119,7 @@ struct vec5_fw_feature {
>  
>  static __initdata struct vec5_fw_feature
>  vec5_fw_features_table[] = {
> -	{FW_FEATURE_TYPE1_AFFINITY,	OV5_TYPE1_AFFINITY},
> +	{FW_FEATURE_FORM1_AFFINITY,	OV5_FORM1_AFFINITY},
>  	{FW_FEATURE_PRRN,		OV5_PRRN},
>  	{FW_FEATURE_DRMEM_V2,		OV5_DRMEM_V2},
>  	{FW_FEATURE_DRC_INFO,		OV5_DRC_INFO},

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 1/8] powerpc/pseries: rename min_common_depth to primary_domain_index
From: David Gibson @ 2021-06-15  3:00 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-2-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 5080 bytes --]

On Mon, Jun 14, 2021 at 10:09:56PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch.

I think this needs a rationale as to why 'primary_domain_index' is a
better name than 'min_common_depth'.  The meaning isn't obvious to me
from either name.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..8365b298ec48 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
>  EXPORT_SYMBOL(node_to_cpumask_map);
>  EXPORT_SYMBOL(node_data);
>  
> -static int min_common_depth;
> +static int primary_domain_index;
>  static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
> @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity)
>  	if (!numa_enabled)
>  		goto out;
>  
> -	if (of_read_number(associativity, 1) >= min_common_depth)
> -		nid = of_read_number(&associativity[min_common_depth], 1);
> +	if (of_read_number(associativity, 1) >= primary_domain_index)
> +		nid = of_read_number(&associativity[primary_domain_index], 1);
>  
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
> @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static int __init find_min_common_depth(void)
> +static int __init find_primary_domain_index(void)
>  {
> -	int depth;
> +	int index;
>  	struct device_node *root;
>  
>  	if (firmware_has_feature(FW_FEATURE_OPAL))
> @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void)
>  	}
>  
>  	if (form1_affinity) {
> -		depth = of_read_number(distance_ref_points, 1);
> +		index = of_read_number(distance_ref_points, 1);
>  	} else {
>  		if (distance_ref_points_depth < 2) {
>  			printk(KERN_WARNING "NUMA: "
> @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void)
>  			goto err;
>  		}
>  
> -		depth = of_read_number(&distance_ref_points[1], 1);
> +		index = of_read_number(&distance_ref_points[1], 1);
>  	}
>  
>  	/*
> @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void)
>  	}
>  
>  	of_node_put(root);
> -	return depth;
> +	return index;
>  
>  err:
>  	of_node_put(root);
> @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  	int nid = default_nid;
>  	int rc, index;
>  
> -	if ((min_common_depth < 0) || !numa_enabled)
> +	if ((primary_domain_index < 0) || !numa_enabled)
>  		return default_nid;
>  
>  	rc = of_get_assoc_arrays(&aa);
>  	if (rc)
>  		return default_nid;
>  
> -	if (min_common_depth <= aa.array_sz &&
> +	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
>  		nid = of_read_number(&aa.arrays[index], 1);
>  
>  		if (nid == 0xffff || nid >= nr_node_ids)
> @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void)
>  		return -1;
>  	}
>  
> -	min_common_depth = find_min_common_depth();
> +	primary_domain_index = find_primary_domain_index();
>  
> -	if (min_common_depth < 0) {
> +	if (primary_domain_index < 0) {
>  		/*
> -		 * if we fail to parse min_common_depth from device tree
> +		 * if we fail to parse primary_domain_index from device tree
>  		 * mark the numa disabled, boot with numa disabled.
>  		 */
>  		numa_enabled = false;
> -		return min_common_depth;
> +		return primary_domain_index;
>  	}
>  
> -	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
> +	dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index);
>  
>  	/*
>  	 * Even though we connect cpus to numa domains later in SMP
> @@ -919,14 +919,14 @@ static void __init find_possible_nodes(void)
>  			goto out;
>  	}
>  
> -	max_nodes = of_read_number(&domains[min_common_depth], 1);
> +	max_nodes = of_read_number(&domains[primary_domain_index], 1);
>  	for (i = 0; i < max_nodes; i++) {
>  		if (!node_possible(i))
>  			node_set(i, node_possible_map);
>  	}
>  
>  	prop_length /= sizeof(int);
> -	if (prop_length > min_common_depth + 2)
> +	if (prop_length > primary_domain_index + 2)
>  		coregroup_enabled = 1;
>  
>  out:
> @@ -1259,7 +1259,7 @@ int cpu_to_coregroup_id(int cpu)
>  		goto out;
>  
>  	index = of_read_number(associativity, 1);
> -	if (index > min_common_depth + 1)
> +	if (index > primary_domain_index + 1)
>  		return of_read_number(&associativity[index - 1], 1);
>  
>  out:

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 4/8] powerpc/pseries: Consolidate DLPAR NUMA distance update
From: David Gibson @ 2021-06-15  3:13 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-5-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 4204 bytes --]

On Mon, Jun 14, 2021 at 10:09:59PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resourced are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call. In
> later patch we will remove updating NUMA distance when we are looking
> for node id from associativity array.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c                        | 41 +++++++++++++++++++
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |  2 +
>  .../platforms/pseries/hotplug-memory.c        |  2 +
>  arch/powerpc/platforms/pseries/pseries.h      |  1 +
>  4 files changed, 46 insertions(+)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 192067991f8a..fec47981c1ef 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -287,6 +287,47 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> +	int i, nid;
> +
> +	if (of_read_number(associativity, 1) >= primary_domain_index) {
> +		nid = of_read_number(&associativity[primary_domain_index], 1);
> +
> +		for (i = 0; i < max_domain_index; i++) {
> +			const __be32 *entry;
> +
> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);
> +		}
> +	}
> +}

This logic is almost identicaly to initialize_distance_lookup_table()
- it would be good if they could be consolidated, so it's clear that
coldplugged and hotplugged nodes are parsing the NUMA information in
the same way.

> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> +	const __be32 *associativity;
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return;
> +
> +	__initialize_form1_numa_distance(associativity);
> +	return;
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> +	if (affinity_form == FORM0_AFFINITY)
> +		return;
> +	else if (affinity_form == FORM1_AFFINITY) {
> +		initialize_form1_numa_distance(node);
> +		return;
> +	}
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>  	int index;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>  		return saved_rc;
>  	}
>  
> +	update_numa_distance(dn);
> +
>  	rc = dlpar_online_cpu(dn);
>  	if (rc) {
>  		saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 8377f1f7c78e..0e602c3b01ea 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>  		return -ENODEV;
>  	}
>  
> +	update_numa_distance(lmb_node);
> +
>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>  	if (!dr_node) {
>  		dlpar_free_cc_nodes(lmb_node);
> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
> index 1f051a786fb3..663a0859cf13 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -113,4 +113,5 @@ extern u32 pseries_security_flavor;
>  void pseries_setup_security_mitigations(void);
>  void pseries_lpar_read_hblkrm_characteristics(void);
>  
> +void update_numa_distance(struct device_node *node);
>  #endif /* _PSERIES_PSERIES_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [RFC PATCH 2/8] powerpc/pseries: rename distance_ref_points_depth to max_domain_index
From: David Gibson @ 2021-06-15  3:01 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210614164003.196094-3-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 3960 bytes --]

On Mon, Jun 14, 2021 at 10:09:57PM +0530, Aneesh Kumar K.V wrote:
> No functional change in this patch

As with 1/8 an explanation of what this actually means and therefore
why this is a better name would be very helpful.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 20 ++++++++++----------
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8365b298ec48..5941da201fa3 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -56,7 +56,7 @@ static int n_mem_addr_cells, n_mem_size_cells;
>  static int form1_affinity;
>  
>  #define MAX_DISTANCE_REF_POINTS 4
> -static int distance_ref_points_depth;
> +static int max_domain_index;
>  static const __be32 *distance_ref_points;
>  static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
> @@ -169,7 +169,7 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  
>  	int i, index;
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		index = be32_to_cpu(distance_ref_points[i]);
>  		if (cpu1_assoc[index] == cpu2_assoc[index])
>  			break;
> @@ -193,7 +193,7 @@ int __node_distance(int a, int b)
>  	if (!form1_affinity)
>  		return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
>  			break;
>  
> @@ -213,7 +213,7 @@ static void initialize_distance_lookup_table(int nid,
>  	if (!form1_affinity)
>  		return;
>  
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> +	for (i = 0; i < max_domain_index; i++) {
>  		const __be32 *entry;
>  
>  		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> @@ -240,7 +240,7 @@ static int associativity_to_nid(const __be32 *associativity)
>  		nid = NUMA_NO_NODE;
>  
>  	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
> +		of_read_number(associativity, 1) >= max_domain_index) {
>  		/*
>  		 * Skip the length field and send start of associativity array
>  		 */
> @@ -310,14 +310,14 @@ static int __init find_primary_domain_index(void)
>  	 */
>  	distance_ref_points = of_get_property(root,
>  					"ibm,associativity-reference-points",
> -					&distance_ref_points_depth);
> +					&max_domain_index);
>  
>  	if (!distance_ref_points) {
>  		dbg("NUMA: ibm,associativity-reference-points not found.\n");
>  		goto err;
>  	}
>  
> -	distance_ref_points_depth /= sizeof(int);
> +	max_domain_index /= sizeof(int);
>  
>  	if (firmware_has_feature(FW_FEATURE_OPAL) ||
>  	    firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
> @@ -328,7 +328,7 @@ static int __init find_primary_domain_index(void)
>  	if (form1_affinity) {
>  		index = of_read_number(distance_ref_points, 1);
>  	} else {
> -		if (distance_ref_points_depth < 2) {
> +		if (max_domain_index < 2) {
>  			printk(KERN_WARNING "NUMA: "
>  				"short ibm,associativity-reference-points\n");
>  			goto err;
> @@ -341,10 +341,10 @@ static int __init find_primary_domain_index(void)
>  	 * Warn and cap if the hardware supports more than
>  	 * MAX_DISTANCE_REF_POINTS domains.
>  	 */
> -	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> +	if (max_domain_index > MAX_DISTANCE_REF_POINTS) {
>  		printk(KERN_WARNING "NUMA: distance array capped at "
>  			"%d entries\n", MAX_DISTANCE_REF_POINTS);
> -		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> +		max_domain_index = MAX_DISTANCE_REF_POINTS;
>  	}
>  
>  	of_node_put(root);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* RE: [PATCH] usb: gadget: fsl: properly remove remnant of MXC support
From: Leo Li @ 2021-06-15  3:54 UTC (permalink / raw)
  To: Joel Stanley
  Cc: Felipe Balbi, Arnd Bergmann, Greg Kroah-Hartman,
	linux-usb@vger.kernel.org, Linux Kernel Mailing List,
	Fabio Estevam, Ran Wang, linuxppc-dev
In-Reply-To: <CACPK8XfUiiBM=KQiqSJ5uSUpOHLTp_wxhNyEw-gYkTBsZjbZVg@mail.gmail.com>



> -----Original Message-----
> From: Joel Stanley <joel@jms.id.au>
> Sent: Monday, June 14, 2021 8:52 PM
> To: Leo Li <leoyang.li@nxp.com>
> Cc: Felipe Balbi <balbi@kernel.org>; Greg Kroah-Hartman
> <gregkh@linuxfoundation.org>; linux-usb@vger.kernel.org; linuxppc-dev
> <linuxppc-dev@lists.ozlabs.org>; Linux Kernel Mailing List <linux-
> kernel@vger.kernel.org>; Arnd Bergmann <arnd@arndb.de>; Ran Wang
> <ran.wang_1@nxp.com>; Fabio Estevam <festevam@gmail.com>
> Subject: Re: [PATCH] usb: gadget: fsl: properly remove remnant of MXC
> support
> 
> On Sat, 12 Jun 2021 at 00:31, Li Yang <leoyang.li@nxp.com> wrote:
> >
> > Commit a390bef7db1f ("usb: gadget: fsl_mxc_udc: Remove the driver")
> > didn't remove all the MXC related stuff which can cause build problem
> > for LS1021 when enabled again in Kconfig.  This patch remove all the
> > remnants.
> >
> > Signed-off-by: Li Yang <leoyang.li@nxp.com>
> 
> Reviewed-by: Joel Stanley <joel@jms.id.au>
> 
> Will you re-submit the kconfig change once this is merged?

I think that we can re-use your previous patch.

Hi Greg,

Can you apply the reverted Kconfig patch again?  Or do you prefer us to re-submit it again?

Regards,
Leo

^ permalink raw reply

* Re: [PATCH v2 09/12] powerpc/inst: Refactor PPC32 and PPC64 versions
From: Jordan Niethe @ 2021-06-15  3:48 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: linux-kernel, Paul Mackerras, Naveen N. Rao, linuxppc-dev
In-Reply-To: <d54c63dcac6d190e1cc0d2fe3259d6e621928cdf.1621516826.git.christophe.leroy@csgroup.eu>

On Thu, May 20, 2021 at 11:50 PM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
> ppc_inst() ppc_inst_prefixed() ppc_inst_swab() can easily
> be made common to both PPC32 and PPC64.
>
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
>  arch/powerpc/include/asm/inst.h | 49 +++++++++------------------------
>  1 file changed, 13 insertions(+), 36 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
> index 32d318c3b180..e009e94e90b2 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -60,9 +60,9 @@ static inline int ppc_inst_primary_opcode(struct ppc_inst x)
>         return ppc_inst_val(x) >> 26;
>  }
>
> -#ifdef CONFIG_PPC64
>  #define ppc_inst(x) ((struct ppc_inst){ .val = (x) })
>
> +#ifdef CONFIG_PPC64
>  #define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y) })
>
>  static inline u32 ppc_inst_suffix(struct ppc_inst x)
> @@ -70,57 +70,34 @@ static inline u32 ppc_inst_suffix(struct ppc_inst x)
>         return x.suffix;
>  }
>
> -static inline bool ppc_inst_prefixed(struct ppc_inst x)
> -{
> -       return ppc_inst_primary_opcode(x) == OP_PREFIX;
> -}
> +#else
> +#define ppc_inst_prefix(x, y) ppc_inst(x)
>
> -static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
>  {
> -       return ppc_inst_prefix(swab32(ppc_inst_val(x)), swab32(ppc_inst_suffix(x)));
> +       return 0;
>  }
>
> +#endif /* CONFIG_PPC64 */
> +
>  static inline struct ppc_inst ppc_inst_read(const unsigned int *ptr)
>  {
> -       u32 val, suffix;
> -
> -       val = *ptr;
> -       if ((val >> 26) == OP_PREFIX) {
> -               suffix = *(ptr + 1);
> -               return ppc_inst_prefix(val, suffix);
> -       } else {
> -               return ppc_inst(val);
> -       }
> +       if (IS_ENABLED(CONFIG_PPC64) && (*ptr >> 26) == OP_PREFIX)
> +               return ppc_inst_prefix(*ptr, *(ptr + 1));
> +       else
> +               return ppc_inst(*ptr);
>  }
>
> -#else
> -
> -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> -
> -#define ppc_inst_prefix(x, y) ppc_inst(x)
> -
>  static inline bool ppc_inst_prefixed(struct ppc_inst x)
>  {
> -       return false;
> -}
> -
> -static inline u32 ppc_inst_suffix(struct ppc_inst x)
> -{
> -       return 0;
> +       return IS_ENABLED(CONFIG_PPC64) && ppc_inst_primary_opcode(x) == OP_PREFIX;
>  }
>
>  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
>  {
> -       return ppc_inst(swab32(ppc_inst_val(x)));
> -}
> -
> -static inline struct ppc_inst ppc_inst_read(const unsigned int *ptr)
> -{
> -       return ppc_inst(*ptr);
> +       return ppc_inst_prefix(swab32(ppc_inst_val(x)), swab32(ppc_inst_suffix(x)));
>  }
>
> -#endif /* CONFIG_PPC64 */
> -
>  static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
>  {
>         if (ppc_inst_val(x) != ppc_inst_val(y))
> --
> 2.25.0
>
Reviewed by: Jordan Niethe <jniethe5@gmail.com>

^ permalink raw reply

* Re: [PATCH v2 08/12] powerpc: Don't use 'struct ppc_inst' to reference instruction location
From: Jordan Niethe @ 2021-06-15  3:47 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: linux-kernel, Paul Mackerras, Naveen N. Rao, linuxppc-dev
In-Reply-To: <871r93vqcb.fsf@mpe.ellerman.id.au>

On Tue, Jun 15, 2021 at 12:01 PM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> > diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
> > index 5a0740ebf132..32d318c3b180 100644
> > --- a/arch/powerpc/include/asm/inst.h
> > +++ b/arch/powerpc/include/asm/inst.h
> > @@ -139,7 +139,7 @@ static inline int ppc_inst_len(struct ppc_inst x)
> >   * Return the address of the next instruction, if the instruction @value was
> >   * located at @location.
> >   */
> > -static inline struct ppc_inst *ppc_inst_next(void *location, struct ppc_inst *value)
> > +static inline unsigned int *ppc_inst_next(unsigned int *location, unsigned int *value)
> >  {
> >       struct ppc_inst tmp;
> >
>
> It's not visible in the diff, but the rest of the function is:
>
>         tmp = ppc_inst_read(value);
>
>         return location + ppc_inst_len(tmp);
> }
>
> And so changing the type of location from void * to int * changes the
> result of that addition, ie. previously it was in units of bytes, now
> it's units of 4 bytes.
>
> To fix it I've kept location as unsigned int *, and added a cast where
> we do the addition. That way users of the function just see unsigned int *,
> the cast to void * is an implementation detail.
>
> We only have a handful of uses of ppc_inst_len(), so maybe that should
> change name and return a result in units of int *. But that can be a
> separate change.
>
> > diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
> > index 87f524e4b09c..302f2a1e0361 100644
> > --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
> > +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
> > @@ -83,7 +83,7 @@ smp_86xx_kick_cpu(int nr)
> >               mdelay(1);
> >
> >       /* Restore the exception vector */
> > -     patch_instruction((struct ppc_inst *)vector, ppc_inst(save_vector));
> > +     patch_instruction(vector, ppc_inst(save_vector));
> >
> >       local_irq_restore(flags);
> >
>
> There was another usage in here:
>
>         /* Setup fake reset vector to call __secondary_start_mpc86xx. */
>         target = (unsigned long) __secondary_start_mpc86xx;
> -       patch_branch((struct ppc_inst *)vector, target, BRANCH_SET_LINK);
> +       patch_branch(vector, target, BRANCH_SET_LINK);
>
>         /* Kick that CPU */
>         smp_86xx_release_core(nr);
>
> I fixed it up.
>
> cheers
fwiw
Reviewed by: Jordan Niethe <jniethe5@gmail.com>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox