LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4 06/11] sections: Provide internal __is_kernel() and __is_kernel_text() helper
From: Kefeng Wang @ 2021-09-30  7:11 UTC (permalink / raw)
  To: arnd, linux-arch, linux-kernel, linuxppc-dev, rostedt, mingo,
	davem, ast, ryabinin.a.a, akpm
  Cc: Kefeng Wang, paulus, linux-alpha, bpf
In-Reply-To: <20210930071143.63410-1-wangkefeng.wang@huawei.com>

An internal __is_kernel() helper which only check the
kernel address ranges, and an internal __is_kernel_text()
helper which only check text section ranges.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 include/asm-generic/sections.h | 29 +++++++++++++++++++++++++++++
 include/linux/kallsyms.h       |  4 ++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 811583ca8bd0..a7abeadddc7a 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -186,4 +186,33 @@ static inline bool is_kernel_inittext(unsigned long addr)
 	       addr < (unsigned long)_einittext;
 }
 
+/**
+ * __is_kernel_text - checks if the pointer address is located in the
+ *                    .text section
+ *
+ * @addr: address to check
+ *
+ * Returns: true if the address is located in .text, false otherwise.
+ * Note: an internal helper, only check the range of _stext to _etext.
+ */
+static inline bool __is_kernel_text(unsigned long addr)
+{
+	return addr >= (unsigned long)_stext &&
+	       addr < (unsigned long)_etext;
+}
+
+/**
+ * __is_kernel - checks if the pointer address is located in the kernel range
+ *
+ * @addr: address to check
+ *
+ * Returns: true if the address is located in the kernel range, false otherwise.
+ * Note: an internal helper, only check the range of _stext to _end.
+ */
+static inline bool __is_kernel(unsigned long addr)
+{
+	return addr >= (unsigned long)_stext &&
+	       addr < (unsigned long)_end;
+}
+
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 8a9d329c927c..5fb17dd4b6fa 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -26,14 +26,14 @@ struct module;
 
 static inline int is_kernel_text(unsigned long addr)
 {
-	if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext))
+	if (__is_kernel_text(addr))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
 
 static inline int is_kernel(unsigned long addr)
 {
-	if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
+	if (__is_kernel(addr))
 		return 1;
 	return in_gate_area_no_mm(addr);
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 03/11] sections: Move and rename core_kernel_data() to is_kernel_core_data()
From: Kefeng Wang @ 2021-09-30  7:11 UTC (permalink / raw)
  To: arnd, linux-arch, linux-kernel, linuxppc-dev, rostedt, mingo,
	davem, ast, ryabinin.a.a, akpm
  Cc: Kefeng Wang, paulus, linux-alpha, bpf
In-Reply-To: <20210930071143.63410-1-wangkefeng.wang@huawei.com>

Move core_kernel_data() into sections.h and rename it to
is_kernel_core_data(), also make it return bool value, then
update all the callers.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 include/asm-generic/sections.h | 16 ++++++++++++++++
 include/linux/kernel.h         |  1 -
 kernel/extable.c               | 18 ------------------
 kernel/trace/ftrace.c          |  2 +-
 net/sysctl_net.c               |  2 +-
 5 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 817309e289db..24780c0f40b1 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -142,6 +142,22 @@ static inline bool init_section_intersects(void *virt, size_t size)
 	return memory_intersects(__init_begin, __init_end, virt, size);
 }
 
+/**
+ * is_kernel_core_data - checks if the pointer address is located in the
+ *			 .data section
+ *
+ * @addr: address to check
+ *
+ * Returns: true if the address is located in .data, false otherwise.
+ * Note: On some archs it may return true for core RODATA, and false
+ *       for others. But will always be true for core RW data.
+ */
+static inline bool is_kernel_core_data(unsigned long addr)
+{
+	return addr >= (unsigned long)_sdata &&
+	       addr < (unsigned long)_edata;
+}
+
 /**
  * is_kernel_rodata - checks if the pointer address is located in the
  *                    .rodata section
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2776423a587e..e5a9af8a4e20 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -230,7 +230,6 @@ extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
 extern int init_kernel_text(unsigned long addr);
-extern int core_kernel_data(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..da26203841d4 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -82,24 +82,6 @@ int notrace core_kernel_text(unsigned long addr)
 	return 0;
 }
 
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- *  for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
-	if (addr >= (unsigned long)_sdata &&
-	    addr < (unsigned long)_edata)
-		return 1;
-	return 0;
-}
-
 int __kernel_text_address(unsigned long addr)
 {
 	if (kernel_text_address(addr))
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7efbc8aaf7f6..f15badf31f52 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -323,7 +323,7 @@ int __register_ftrace_function(struct ftrace_ops *ops)
 	if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
 		return -EBUSY;
 
-	if (!core_kernel_data((unsigned long)ops))
+	if (!is_kernel_core_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
 	add_ftrace_ops(&ftrace_ops_list, ops);
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index f6cb0d4d114c..4b45ed631eb8 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -144,7 +144,7 @@ static void ensure_safe_net_sysctl(struct net *net, const char *path,
 		addr = (unsigned long)ent->data;
 		if (is_module_address(addr))
 			where = "module";
-		else if (core_kernel_data(addr))
+		else if (is_kernel_core_data(addr))
 			where = "kernel";
 		else
 			continue;
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 00/11] sections: Unify kernel sections range check and use
From: Kefeng Wang @ 2021-09-30  7:11 UTC (permalink / raw)
  To: arnd, linux-arch, linux-kernel, linuxppc-dev, rostedt, mingo,
	davem, ast, ryabinin.a.a, akpm
  Cc: Kefeng Wang, paulus, linux-alpha, bpf

There are three head files(kallsyms.h, kernel.h and sections.h) which
include the kernel sections range check, let's make some cleanup and
unify them.

1. cleanup arch specific text/data check and fix address boundary check
   in kallsyms.h
2. make all the basic/core kernel range check function into sections.h
3. update all the callers, and use the helper in sections.h to simplify
   the code

After this series, we have 5 APIs about kernel sections range check in
sections.h

 * is_kernel_rodata()		--- already in sections.h
 * is_kernel_core_data()	--- come from core_kernel_data() in kernel.h
 * is_kernel_inittext()		--- come from kernel.h and kallsyms.h
 * __is_kernel_text()		--- add new internal helper
 * __is_kernel()		--- add new internal helper

Note: For the last two helpers, people should not use directly, consider to
      use corresponding function in kallsyms.h.

v4:
- Use core_kernel_text() in powerpc sugguested Christophe Leroy, build
  test only
- Use is_kernel_text() in alpha and microblaze, build test only on
  next-20210929

v3:
https://lore.kernel.org/linux-arch/20210926072048.190336-1-wangkefeng.wang@huawei.com/
- Add Steven's RB to patch2
- Introduce two internal helper, then use is_kernel_text() in core_kernel_text()
  and is_kernel() in kernel_or_module_addr() suggested by Steven

v2:
https://lore.kernel.org/linux-arch/20210728081320.20394-1-wangkefeng.wang@huawei.com/
- add ACK/RW to patch2, and drop inappropriate fix tag
- keep 'core' to check kernel data, suggestted by Steven Rostedt
  <rostedt@goodmis.org>, rename is_kernel_data() to is_kernel_core_data()
- drop patch8 which is merged
- drop patch9 which is resend independently

v1:
https://lore.kernel.org/linux-arch/20210626073439.150586-1-wangkefeng.wang@huawei.com


Kefeng Wang (11):
  kallsyms: Remove arch specific text and data check
  kallsyms: Fix address-checks for kernel related range
  sections: Move and rename core_kernel_data() to is_kernel_core_data()
  sections: Move is_kernel_inittext() into sections.h
  x86: mm: Rename __is_kernel_text() to is_x86_32_kernel_text()
  sections: Provide internal __is_kernel() and __is_kernel_text() helper
  mm: kasan: Use is_kernel() helper
  extable: Use is_kernel_text() helper
  powerpc/mm: Use core_kernel_text() helper
  microblaze: Use is_kernel_text() helper
  alpha: Use is_kernel_text() helper

 arch/alpha/kernel/traps.c      |  4 +-
 arch/microblaze/mm/pgtable.c   |  3 +-
 arch/powerpc/mm/pgtable_32.c   |  7 +---
 arch/x86/kernel/unwind_orc.c   |  2 +-
 arch/x86/mm/init_32.c          | 14 +++----
 include/asm-generic/sections.h | 75 ++++++++++++++++++++++++++--------
 include/linux/kallsyms.h       | 13 +-----
 include/linux/kernel.h         |  2 -
 kernel/extable.c               | 33 ++-------------
 kernel/locking/lockdep.c       |  3 --
 kernel/trace/ftrace.c          |  2 +-
 mm/kasan/report.c              |  2 +-
 net/sysctl_net.c               |  2 +-
 13 files changed, 78 insertions(+), 84 deletions(-)

-- 
2.26.2


^ permalink raw reply

* Re: [PATCH] KVM: PPC: Book3S HV: Use GLOBAL_TOC for kvmppc_h_set_dabr/xdabr()
From: Jordan Niethe @ 2021-09-30  6:59 UTC (permalink / raw)
  To: Daniel Axtens; +Cc: Michael Neuling, linuxppc-dev, Nicholas Piggin
In-Reply-To: <87bl4a6182.fsf@dja-thinkpad.axtens.net>

On Thu, Sep 30, 2021 at 4:13 PM Daniel Axtens <dja@axtens.net> wrote:
>
> Hi Michael,
>
> > kvmppc_h_set_dabr(), and kvmppc_h_set_xdabr() which jumps into
> > it, need to use _GLOBAL_TOC to setup the kernel TOC pointer, because
> > kvmppc_h_set_dabr() uses LOAD_REG_ADDR() to load dawr_force_enable.
>
> This makes sense. LOAD_REG_ADDR() does ld reg,name@got(r2) and
> _GLOBAL_TOC sets r2 based on r12 and .TOC. .
>
> Looking at
> e.g. https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1610846.html
> it seems that we use GOT and TOC largely interchangeably... so assuming
> I haven't completely misunderstood, the change this patch makes seems to
> make sense to me. :)
>
> > When called from hcall_try_real_mode() we have the kernel TOC in r2,
> > established near the start of kvmppc_interrupt_hv(), so there is no
> > issue.
> >
> > But they can also be called from kvmppc_pseries_do_hcall() which is
> > module code, so the access ends up happening with the kvm-hv module's
> > r2, which will not point at dawr_force_enable and could even cause a
> > fault.
>
> I checked and there isn't anywhere else the functions are called, so
> this will now cover everything.
>
> > With the current code layout and compilers we haven't observed a fault
> > in practice, the load hits somewhere in kvm-hv.ko and silently returns
> > some bogus value.
> >
> > Note that we we expect p8/p9 guests to use the DAWR, but SLOF uses
> > h_set_dabr() to test if sc1 works correctly, see SLOF's
> > lib/libhvcall/brokensc1.c.
>
> I assume that something (the module loader?) patches the callsite to
> restore r2 after the function call? I imagine something must otherwise
> things would fall apart pretty quickly...
>
> > Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option")
>
> That patch seems to only affect the DA_W_R not the DA_B_R - how does it
> cause this bug?

Isn't it that patch which adds the LOAD_REG_ADDR(r11,
dawr_force_enable) to kvmppc_h_set_dabr() which is the problem?

>
> All in all this looks good to me:
> Reviewed-by: Daniel Axtens <dja@axtens.net>
>
> Kind regards,
> Daniel
>
> > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> > ---
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index 90484425a1e6..30a8a07cff18 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -1999,7 +1999,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> >       .globl  hcall_real_table_end
> >  hcall_real_table_end:
> >
> > -_GLOBAL(kvmppc_h_set_xdabr)
> > +_GLOBAL_TOC(kvmppc_h_set_xdabr)
> >  EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
> >       andi.   r0, r5, DABRX_USER | DABRX_KERNEL
> >       beq     6f
> > @@ -2009,7 +2009,7 @@ EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
> >  6:   li      r3, H_PARAMETER
> >       blr
> >
> > -_GLOBAL(kvmppc_h_set_dabr)
> > +_GLOBAL_TOC(kvmppc_h_set_dabr)
> >  EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
> >       li      r5, DABRX_USER | DABRX_KERNEL
> >  3:
> > --
> > 2.25.1

^ permalink raw reply

* Re: [PATCH] powerpc/eeh:Fix some mistakes in comments
From: Daniel Axtens @ 2021-09-30  6:31 UTC (permalink / raw)
  To: Kai Song, linuxppc-dev; +Cc: paulus, Kai Song, oohall, linux-kernel
In-Reply-To: <20210927023507.32564-1-songkai01@inspur.com>

Hi Kai,

Thank you for your contribution to the powerpc kernel!

> Get rid of warning:
> arch/powerpc/kernel/eeh.c:774: warning: expecting prototype for eeh_set_pe_freset(). Prototype was for eeh_set_dev_freset() instead

You haven't said where this warning is from. I thought it might be from
sparse but I couldn't seem to reproduce it - is my version of sparse too
old or are you using a different tool?

>  /**
> - * eeh_set_pe_freset - Check the required reset for the indicated device
> - * @data: EEH device
> + * eeh_set_dev_freset - Check the required reset for the indicated device
> + * @edev: EEH device
>   * @flag: return value
>   *
>   * Each device might have its preferred reset type: fundamental or

This looks like a good and correct change.

I checked through git history with git blame to see when the function
was renamed. There are 2 commits that should have updated the comment:
one renamed the function and one renamed an argument. So, I think this
commit could have:

Fixes: d6c4932fbf24 ("powerpc/eeh: Strengthen types of eeh traversal functions")
Fixes: c270a24c59bd ("powerpc/eeh: Do reset based on PE")

But I don't know if an out of date comment is enough of a 'bug' to
justify a Fixes: tag? (mpe, I'm sure I've asked this before, sorry!)

All up, this is a good correction to the comment.

There are a few other functions in the file that have incorrect
docstrings:

 - eeh_pci_enable - missing parameter

 - eeh_pe_reset and eeh_pe_reset_full - missing parameter

 - eeh_init - missing parameter

 - eeh_pe_inject_err - wrong name for a parameter

Could you fix all of the docstrings in the file at once?

Kind regards,
Daniel

^ permalink raw reply

* Re: [PATCH] KVM: PPC: Book3S HV: Use GLOBAL_TOC for kvmppc_h_set_dabr/xdabr()
From: Daniel Axtens @ 2021-09-30  6:12 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: mikey, npiggin
In-Reply-To: <20210923151031.72408-1-mpe@ellerman.id.au>

Hi Michael,

> kvmppc_h_set_dabr(), and kvmppc_h_set_xdabr() which jumps into
> it, need to use _GLOBAL_TOC to setup the kernel TOC pointer, because
> kvmppc_h_set_dabr() uses LOAD_REG_ADDR() to load dawr_force_enable.

This makes sense. LOAD_REG_ADDR() does ld reg,name@got(r2) and
_GLOBAL_TOC sets r2 based on r12 and .TOC. .

Looking at
e.g. https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1610846.html
it seems that we use GOT and TOC largely interchangeably... so assuming
I haven't completely misunderstood, the change this patch makes seems to
make sense to me. :)

> When called from hcall_try_real_mode() we have the kernel TOC in r2,
> established near the start of kvmppc_interrupt_hv(), so there is no
> issue.
>
> But they can also be called from kvmppc_pseries_do_hcall() which is
> module code, so the access ends up happening with the kvm-hv module's
> r2, which will not point at dawr_force_enable and could even cause a
> fault.

I checked and there isn't anywhere else the functions are called, so
this will now cover everything.

> With the current code layout and compilers we haven't observed a fault
> in practice, the load hits somewhere in kvm-hv.ko and silently returns
> some bogus value.
>
> Note that we we expect p8/p9 guests to use the DAWR, but SLOF uses
> h_set_dabr() to test if sc1 works correctly, see SLOF's
> lib/libhvcall/brokensc1.c.

I assume that something (the module loader?) patches the callsite to
restore r2 after the function call? I imagine something must otherwise
things would fall apart pretty quickly...

> Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option")

That patch seems to only affect the DA_W_R not the DA_B_R - how does it
cause this bug?

All in all this looks good to me:
Reviewed-by: Daniel Axtens <dja@axtens.net>

Kind regards,
Daniel

> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 90484425a1e6..30a8a07cff18 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1999,7 +1999,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	.globl	hcall_real_table_end
>  hcall_real_table_end:
>  
> -_GLOBAL(kvmppc_h_set_xdabr)
> +_GLOBAL_TOC(kvmppc_h_set_xdabr)
>  EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
>  	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
>  	beq	6f
> @@ -2009,7 +2009,7 @@ EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
>  6:	li	r3, H_PARAMETER
>  	blr
>  
> -_GLOBAL(kvmppc_h_set_dabr)
> +_GLOBAL_TOC(kvmppc_h_set_dabr)
>  EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
>  	li	r5, DABRX_USER | DABRX_KERNEL
>  3:
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH kernel] powerpc/iommu: Report the correct most efficient DMA mask for PCI devices
From: Christoph Hellwig @ 2021-09-30  5:43 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: Carol L Soto, iommu, linuxppc-dev, Christoph Hellwig
In-Reply-To: <20210930034454.95794-1-aik@ozlabs.ru>

On Thu, Sep 30, 2021 at 01:44:54PM +1000, Alexey Kardashevskiy wrote:
> and returns the IOMMU table mask on the pseries platform which makes some
> drivers (mpt3sas is one example) choose 32bit DMA even though bypass is
> supported. The powernv platform sort of handles it by having a bigger
> default window with a mask >=40 but it only works as drivers choose
> 63/64bit if the required mask is >32 which is rather pointless.
> 
> This reintroduces the bypass capability check to let drivers make
> a better choice of the DMA mask.
> 
> Fixes: f1565c24b596 ("powerpc: use the generic dma_ops_bypass mode")
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH v4 5/8] bpf ppc64: Add BPF_PROBE_MEM support for JIT
From: Jordan Niethe @ 2021-09-30  4:18 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Ravi Bangoria, songliubraving, daniel, john.fastabend, ast,
	andrii, Paul Mackerras, netdev, naveen.n.rao, yhs, bpf, kpsingh,
	linuxppc-dev, kafai, Hari Bathini
In-Reply-To: <9628c18d-001e-9777-e800-486a83844ac1@csgroup.eu>

On Wed, Sep 29, 2021 at 9:50 PM Christophe Leroy
<christophe.leroy@csgroup.eu> wrote:
>
>
>
> Le 29/09/2021 à 13:18, Hari Bathini a écrit :
> > From: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
> >
> > BPF load instruction with BPF_PROBE_MEM mode can cause a fault
> > inside kernel. Append exception table for such instructions
> > within BPF program.
> >
> > Unlike other archs which uses extable 'fixup' field to pass dest_reg
> > and nip, BPF exception table on PowerPC follows the generic PowerPC
>
>
> For my curiosity, can you explain why we don't want and can't do the
> same on powerpc as on other archs ?

The main thing is on x86, the extable has another field , handler:
struct exception_table_entry { int insn, fixup, handler; };
handler can be used to perform other things before continuing on to fixup.
So for bpf the handler is used to clear the dest register (which is
encoded in the low byte of fixup).
More detail in 3dec541b2e63 ("bpf: Add support for BTF pointers to x86 JIT").

arm64 is an example of an arch that doesn't have a handler field in the extable.
They did something along the lines of this rather than adding a
handler field to the extable.
See 800834285361 ("bpf, arm64: Add BPF exception tables")

>
>
> > exception table design, where it populates both fixup and extable
> > sections within BPF program. fixup section contains two instructions,
> > first instruction clears dest_reg and 2nd jumps to next instruction
> > in the BPF code. extable 'insn' field contains relative offset of
> > the instruction and 'fixup' field contains relative offset of the
> > fixup entry. Example layout of BPF program with extable present:
> >
> >               +------------------+
> >               |                  |
> >               |                  |
> >     0x4020 -->| ld   r27,4(r3)   |
> >               |                  |
> >               |                  |
> >     0x40ac -->| lwz  r3,0(r4)    |
> >               |                  |
> >               |                  |
> >               |------------------|
> >     0x4280 -->| li  r27,0        |  \ fixup entry
> >               | b   0x4024       |  /
> >     0x4288 -->| li  r3,0         |
> >               | b   0x40b0       |
> >               |------------------|
> >     0x4290 -->| insn=0xfffffd90  |  \ extable entry
> >               | fixup=0xffffffec |  /
> >     0x4298 -->| insn=0xfffffe14  |
> >               | fixup=0xffffffec |
> >               +------------------+
> >
> >     (Addresses shown here are chosen random, not real)
> >

^ permalink raw reply

* [PATCH] ASoC: fsl_rpmsg: Add rpmsg audio support for i.MX8ULP
From: Shengjiu Wang @ 2021-09-30  3:26 UTC (permalink / raw)
  To: nicoleotsuka, Xiubo.Lee, festevam, lgirdwood, broonie, perex,
	tiwai, alsa-devel
  Cc: linuxppc-dev, linux-kernel

On i.MX8ULP the audio interface and codec are controlled
by Cortex-M domain, Cortex-M core provides audio service
over rpmsg.

The rpmsg audio function is almost same as i.MX7ULP
platform, so share same configuration.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 sound/soc/fsl/fsl_rpmsg.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c
index 07abad7fe372..8508bc7f239d 100644
--- a/sound/soc/fsl/fsl_rpmsg.c
+++ b/sound/soc/fsl/fsl_rpmsg.c
@@ -174,6 +174,7 @@ static const struct of_device_id fsl_rpmsg_ids[] = {
 	{ .compatible = "fsl,imx8mm-rpmsg-audio", .data = &imx8mm_data},
 	{ .compatible = "fsl,imx8mn-rpmsg-audio", .data = &imx8mn_data},
 	{ .compatible = "fsl,imx8mp-rpmsg-audio", .data = &imx8mp_data},
+	{ .compatible = "fsl,imx8ulp-rpmsg-audio", .data = &imx7ulp_data},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, fsl_rpmsg_ids);
-- 
2.17.1


^ permalink raw reply related

* [PATCH kernel] powerpc/iommu: Report the correct most efficient DMA mask for PCI devices
From: Alexey Kardashevskiy @ 2021-09-30  3:44 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, iommu, Christoph Hellwig, Carol L Soto

According to dma-api.rst, the dma_get_required_mask() helper should return
"the mask that the platform requires to operate efficiently". Which in
the case of PPC64 means the bypass mask and not a mask from an IOMMU table
which is shorter and slower to use due to map/unmap operations (especially
expensive on "pseries").

However the existing implementation ignores the possibility of bypassing
and returns the IOMMU table mask on the pseries platform which makes some
drivers (mpt3sas is one example) choose 32bit DMA even though bypass is
supported. The powernv platform sort of handles it by having a bigger
default window with a mask >=40 but it only works as drivers choose
63/64bit if the required mask is >32 which is rather pointless.

This reintroduces the bypass capability check to let drivers make
a better choice of the DMA mask.

Fixes: f1565c24b596 ("powerpc: use the generic dma_ops_bypass mode")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/dma-iommu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 111249fd619d..d646077bcbcf 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -184,6 +184,14 @@ u64 dma_iommu_get_required_mask(struct device *dev)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 	u64 mask;

+	if (dev_is_pci(dev)) {
+		u64 bypass_mask = dma_direct_get_required_mask(dev);
+
+		if (dma_iommu_dma_supported(dev, bypass_mask)) {
+			dev_info(dev, "%s: returning bypass mask 0x%llx\n", __func__, bypass_mask);
+			return bypass_mask;
+		}
+	}
 	if (!tbl)
 		return 0;

-- 
2.30.2

^ permalink raw reply related

* Re: [RFC PATCH 4/8] powerpc: add CPU field to struct thread_info
From: Kees Cook @ 2021-09-29 23:10 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Peter Zijlstra, Catalin Marinas, Paul Mackerras, linux-riscv,
	Will Deacon, Ard Biesheuvel, open list:S390, Vasily Gorbik,
	Russell King, Christian Borntraeger, Ingo Molnar, Albert Ou,
	Arnd Bergmann, Heiko Carstens, Keith Packard, Borislav Petkov,
	Andy Lutomirski, Paul Walmsley, Thomas Gleixner, Linux ARM,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT),
	Linux Kernel Mailing List, Palmer Dabbelt, Linus Torvalds
In-Reply-To: <878rzf0zmb.fsf@mpe.ellerman.id.au>

On Thu, Sep 30, 2021 at 08:46:04AM +1000, Michael Ellerman wrote:
> Ard Biesheuvel <ardb@kernel.org> writes:
> > On Tue, 28 Sept 2021 at 02:16, Michael Ellerman <mpe@ellerman.id.au> wrote:
> >>
> >> Michael Ellerman <mpe@ellerman.id.au> writes:
> >> > Ard Biesheuvel <ardb@kernel.org> writes:
> >> >> On Tue, 14 Sept 2021 at 14:11, Ard Biesheuvel <ardb@kernel.org> wrote:
> >> >>>
> >> >>> The CPU field will be moved back into thread_info even when
> >> >>> THREAD_INFO_IN_TASK is enabled, so add it back to powerpc's definition
> >> >>> of struct thread_info.
> >> >>>
> >> >>> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> >> >>
> >> >> Michael,
> >> >>
> >> >> Do you have any objections or issues with this patch or the subsequent
> >> >> ones cleaning up the task CPU kludge for ppc32? Christophe indicated
> >> >> that he was happy with it.
> >> >
> >> > No objections, it looks good to me, thanks for cleaning up that horror :)
> >> >
> >> > It didn't apply cleanly to master so I haven't tested it at all, if you can point me at a
> >> > git tree with the dependencies I'd be happy to run some tests over it.
> >>
> >> Actually I realised I can just drop the last patch.
> >>
> >> So that looks fine, passes my standard quick build & boot on qemu tests,
> >> and builds with/without stack protector enabled.
> >>
> >
> > Thanks.
> >
> > Do you have any opinion on how this series should be merged? Kees Cook
> > is willing to take them via his cross-arch tree, or you could carry
> > them if you prefer. Taking it via multiple trees at the same time is
> > going to be tricky, or take two cycles, with I'd prefer to avoid.
> 
> I don't really mind. If Kees is happy to take it then that's OK by me.
> 
> If Kees put the series in a topic branch based off rc2 then I could
> merge that, and avoid any conflicts.

If that helps, yeah, I can make a separate stable branch. Thanks!

-Kees

-- 
Kees Cook

^ permalink raw reply

* Re: [RFC PATCH 4/8] powerpc: add CPU field to struct thread_info
From: Michael Ellerman @ 2021-09-29 22:46 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Peter Zijlstra, Catalin Marinas, Paul Mackerras, linux-riscv,
	Will Deacon, open list:S390, Arnd Bergmann, Russell King,
	Christian Borntraeger, Ingo Molnar, Albert Ou, Kees Cook,
	Vasily Gorbik, Heiko Carstens, Keith Packard, Borislav Petkov,
	Andy Lutomirski, Paul Walmsley, Thomas Gleixner, Linux ARM,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT),
	Linux Kernel Mailing List, Palmer Dabbelt, Linus Torvalds
In-Reply-To: <CAMj1kXFXtbD3=L+QvCnwbyFr-qbWivZ0wRGT0N4LNxANPD8x4g@mail.gmail.com>

Ard Biesheuvel <ardb@kernel.org> writes:
> On Tue, 28 Sept 2021 at 02:16, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>
>> Michael Ellerman <mpe@ellerman.id.au> writes:
>> > Ard Biesheuvel <ardb@kernel.org> writes:
>> >> On Tue, 14 Sept 2021 at 14:11, Ard Biesheuvel <ardb@kernel.org> wrote:
>> >>>
>> >>> The CPU field will be moved back into thread_info even when
>> >>> THREAD_INFO_IN_TASK is enabled, so add it back to powerpc's definition
>> >>> of struct thread_info.
>> >>>
>> >>> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
>> >>
>> >> Michael,
>> >>
>> >> Do you have any objections or issues with this patch or the subsequent
>> >> ones cleaning up the task CPU kludge for ppc32? Christophe indicated
>> >> that he was happy with it.
>> >
>> > No objections, it looks good to me, thanks for cleaning up that horror :)
>> >
>> > It didn't apply cleanly to master so I haven't tested it at all, if you can point me at a
>> > git tree with the dependencies I'd be happy to run some tests over it.
>>
>> Actually I realised I can just drop the last patch.
>>
>> So that looks fine, passes my standard quick build & boot on qemu tests,
>> and builds with/without stack protector enabled.
>>
>
> Thanks.
>
> Do you have any opinion on how this series should be merged? Kees Cook
> is willing to take them via his cross-arch tree, or you could carry
> them if you prefer. Taking it via multiple trees at the same time is
> going to be tricky, or take two cycles, with I'd prefer to avoid.

I don't really mind. If Kees is happy to take it then that's OK by me.

If Kees put the series in a topic branch based off rc2 then I could
merge that, and avoid any conflicts.

cheers

^ permalink raw reply

* Re: [PATCH v5 07/11] PCI: Replace pci_dev::driver usage that gets the driver name
From: Ido Schimmel @ 2021-09-29 15:21 UTC (permalink / raw)
  To: Ido Schimmel, u.kleine-koenig
  Cc: linux-pci, Alexander Duyck, oss-drivers, Paul Mackerras,
	Christoph Hellwig, Herbert Xu, Rafał Miłecki,
	Jesse Brandeburg, Bjorn Helgaas, Jakub Kicinski, Yisen Zhuang,
	Vadym Kochan, Michael Buesch, Jiri Pirko, Salil Mehta, netdev,
	linux-wireless, linux-kernel, Taras Chornyi, Zhou Wang,
	linux-crypto, kernel, Simon Horman, Oliver O'Halloran,
	linuxppc-dev, David S. Miller
In-Reply-To: <YVR74+8Rw6XmTqDD@shredder>

On Wed, Sep 29, 2021 at 05:44:51PM +0300, Ido Schimmel wrote:
> On Wed, Sep 29, 2021 at 10:53:02AM +0200, Uwe Kleine-König wrote:
> > struct pci_dev::driver holds (apart from a constant offset) the same
> > data as struct pci_dev::dev->driver. With the goal to remove struct
> > pci_dev::driver to get rid of data duplication replace getting the
> > driver name by dev_driver_string() which implicitly makes use of struct
> > pci_dev::dev->driver.
> > 
> > Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
> 
> For mlxsw:
> 
> Reviewed-by: Ido Schimmel <idosch@nvidia.com>
> Tested-by: Ido Schimmel <idosch@nvidia.com>
> 
> Thanks

Actually, I found out that after loading and executing another kernel
(or the same one) via kexec I get this splat [1].

[1]
 BUG: unable to handle page fault for address: ffffffffffffffc8         
 #PF: supervisor read access in kernel mode                       
 #PF: error_code(0x0000) - not-present page
 PGD 6e40c067 P4D 6e40c067 PUD 6e40e067 PMD 0 
 Oops: 0000 [#1] SMP                                                    
 CPU: 0 PID: 786 Comm: kexec Not tainted 5.15.0-rc2-custom-45114-g6b0effa5a61f #112
 Hardware name: Mellanox Technologies Ltd. MSN3700/VMOD0005, BIOS 5.11 01/06/2019                                                                              
 RIP: 0010:pci_device_shutdown+0x16/0x40
 Code: 01 00 31 d2 4c 89 e7 89 c6 e8 36 ce 01 00 41 89 c5 eb bb 90 55 48 8d af 40 ff ff ff 53 48 8b 47 68 48 89 fb 48 83 f8 78 74 0e <48> 8b 40 c8 48 85 c0 74 
05 48 89 ef ff d0 80 3d 35 81 b7 01 00 74                                             
 RSP: 0018:ffff95fec0d37db8 EFLAGS: 00010297                            
 RAX: 0000000000000000 RBX: ffff8d70c0f1f0c0 RCX: 0000000000000004
 RDX: ffff8d7115a03a00 RSI: 0000000000000206 RDI: ffff8d70c0f1f0c0      
 RBP: ffff8d70c0f1f000 R08: 0000000000000002 R09: 0000000000000502
 R10: 0000000000000000 R11: 0000000000000006 R12: ffff8d70c0f1f0c0                                                                                             
 R13: ffff8d70c0f1f140 R14: 00000000fee1dead R15: 0000000000000000                                                                                             
 FS:  00007fd3089e0b80(0000) GS:ffff8d7237c00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033                     
 CR2: ffffffffffffffc8 CR3: 0000000155abb001 CR4: 00000000003706f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Call Trace:
  device_shutdown+0x12e/0x180 
  kernel_kexec+0x52/0xb0
  __do_sys_reboot+0x1c0/0x210 
  do_syscall_64+0x35/0x80
  entry_SYSCALL_64_after_hwframe+0x44/0xae
 RIP: 0033:0x7fd308afd557
 Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 89 fa be 69 19 12 28 bf ad de e1 fe b8 a9 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 
c3 48 8b 15 f1 a8 0c 00 f7 d8 64 89 02 b8
 RSP: 002b:00007fff7d01e0a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a9
 RAX: ffffffffffffffda RBX: 00005606db11d380 RCX: 00007fd308afd557
 RDX: 0000000045584543 RSI: 0000000028121969 RDI: 00000000fee1dead
 RBP: 0000000000000000 R08: 0000000000000007 R09: 00007fd308bc8a60
 R10: 0000000000000021 R11: 0000000000000246 R12: 0000000000000003
 R13: 00007fff7d01e1f0 R14: 00005606db11d8c0 R15: 00000000ffffffff
 Modules linked in:
 CR2: ffffffffffffffc8
 ---[ end trace 0cb0bc633a6fde3e ]---

Where:

(gdb) l *(pci_device_shutdown+0x16)
0xffffffff8156abf6 is in pci_device_shutdown (drivers/pci/pci-driver.c:496).
491             struct pci_dev *pci_dev = to_pci_dev(dev);
492             struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
493
494             pm_runtime_resume(dev);
495
496             if (drv && drv->shutdown)
497                     drv->shutdown(pci_dev);
498
499             /*
500              * If this is a kexec reboot, turn off Bus Master bit on the

^ permalink raw reply

* Re: [PATCH v5 07/11] PCI: Replace pci_dev::driver usage that gets the driver name
From: Ido Schimmel @ 2021-09-29 14:44 UTC (permalink / raw)
  To: Uwe Kleine-König
  Cc: linux-pci, Alexander Duyck, oss-drivers, Paul Mackerras,
	Christoph Hellwig, Herbert Xu, Rafał Miłecki,
	Jesse Brandeburg, Bjorn Helgaas, Jakub Kicinski, Yisen Zhuang,
	Vadym Kochan, Michael Buesch, Jiri Pirko, Salil Mehta, netdev,
	linux-wireless, linux-kernel, Taras Chornyi, Zhou Wang,
	linux-crypto, kernel, Simon Horman, Oliver O'Halloran,
	linuxppc-dev, David S. Miller
In-Reply-To: <20210929085306.2203850-8-u.kleine-koenig@pengutronix.de>

On Wed, Sep 29, 2021 at 10:53:02AM +0200, Uwe Kleine-König wrote:
> struct pci_dev::driver holds (apart from a constant offset) the same
> data as struct pci_dev::dev->driver. With the goal to remove struct
> pci_dev::driver to get rid of data duplication replace getting the
> driver name by dev_driver_string() which implicitly makes use of struct
> pci_dev::dev->driver.
> 
> Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>

For mlxsw:

Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>

Thanks

^ permalink raw reply

* Re: [PATCH 00/10] Add Apple M1 support to PASemi i2c driver
From: Wolfram Sang @ 2021-09-29 20:35 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Sven Peter, Hector Martin, Linux Kernel Mailing List, Linux I2C,
	Paul Mackerras, Linux ARM, Olof Johansson, Mohamed Mediouni,
	Mark Kettenis, linuxppc-dev, Alyssa Rosenzweig, Stan Skowronek
In-Reply-To: <CAK8P3a3Lt2QXk+aWLtXUXjjNhKJwNns6d9r=Yh5_aWETuvZTpQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 245 bytes --]


> This looks all very good to me, I had one very minor comment.
> 
> Whole series
> 
> Reviewed-by: Arnd Bergmann <arnd@arndb.de>

Thanks for the series and the review!

Same here, looks good to me and I only had one minor comment.


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 09/10] i2c: pasemi: Add Apple platform driver
From: Wolfram Sang @ 2021-09-29 20:33 UTC (permalink / raw)
  To: Sven Peter
  Cc: Arnd Bergmann, Hector Martin, linux-kernel, linux-i2c,
	Paul Mackerras, linux-arm-kernel, Olof Johansson,
	Mohamed Mediouni, Mark Kettenis, linuxppc-dev, Alyssa Rosenzweig,
	Stan Skowronek
In-Reply-To: <20210926095847.38261-10-sven@svenpeter.dev>

[-- Attachment #1: Type: text/plain, Size: 192 bytes --]


>  drivers/i2c/busses/i2c-pasemi-apple.c | 122 ++++++++++++++++++++++++++

Can't we name it 'i2c-pasemi-platform.c' instead? Makes more sense to me
because the other instance is named -pci.


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v5 10/11] PCI: Replace pci_dev::driver usage by pci_dev::dev.driver
From: Andrew Donnellan @ 2021-09-29 15:44 UTC (permalink / raw)
  To: Uwe Kleine-König
  Cc: Mark Rutland, Peter Zijlstra, H. Peter Anvin,
	Oliver O'Halloran, Jiri Olsa, Christoph Hellwig,
	Stefano Stabellini, Arnd Bergmann, Boris Ostrovsky, x86,
	Alexander Shishkin, Ingo Molnar, Bjorn Helgaas, linux-pci,
	xen-devel, Mathias Nyman, Konrad Rzeszutek Wilk,
	Arnaldo Carvalho de Melo, Borislav Petkov, Bjorn Helgaas,
	Namhyung Kim, Thomas Gleixner, Juergen Gross, Greg Kroah-Hartman,
	linux-usb, linux-perf-users, kernel, Frederic Barrat,
	Paul Mackerras, linuxppc-dev
In-Reply-To: <20210929134330.e5c57t7mtwu5iner@pengutronix.de>

On 29/9/21 11:43 pm, Uwe Kleine-König wrote:> I'm not a huge fan either, 
I used it to keep the control flow as is and
> without introducing several calls to to_pci_driver.
> 
> The whole code looks as follows:
> 
> 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> 		struct pci_driver *afu_drv;
> 		if (afu_dev->dev.driver &&
> 		    (afu_drv = to_pci_driver(afu_dev->dev.driver))->err_handler &&
> 		    afu_drv->err_handler->resume)
> 			afu_drv->err_handler->resume(afu_dev);
> 	}
> 
> Without assignment in the if it could look as follows:
> 
> 	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
> 		struct pci_driver *afu_drv;
> 
> 		if (!afu_dev->dev.driver)
> 			continue;
> 
> 		afu_drv = to_pci_driver(afu_dev->dev.driver));
> 
> 		if (afu_drv->err_handler && afu_drv->err_handler->resume)
> 			afu_drv->err_handler->resume(afu_dev);
> 	}
> 
> Fine for me.

This looks fine.

As an aside while writing my email I discovered the existence of 
container_of_safe(), a version of container_of() that handles the null 
and err ptr cases... if to_pci_driver() used that, the null check in the 
caller could be moved until after the to_pci_driver() call which would 
be neater.

But then, grep tells me that container_of_safe() is used precisely zero 
times in the entire tree. Interesting.

> (Sidenote: What happens if the device is unbound directly after the
> check for afu_dev->dev.driver? This is a problem the old code had, too
> (assuming it is a real problem, didn't check deeply).)

Looking at any of the cxl PCI error handling paths brings back 
nightmares from a few years ago... Fred: I wonder if we need to add a 
lock here?

-- 
Andrew Donnellan              OzLabs, ADL Canberra
ajd@linux.ibm.com             IBM Australia Limited

^ permalink raw reply

* [PATCH v1 6/6] x86: remove memory hotplug support on X86_32
From: David Hildenbrand @ 2021-09-29 14:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

CONFIG_MEMORY_HOTPLUG was marked BROKEN over one year and we just
restricted it to 64 bit. Let's remove the unused x86 32bit implementation
and simplify the Kconfig.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/x86/Kconfig      |  6 +++---
 arch/x86/mm/init_32.c | 31 -------------------------------
 2 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22d274e..85f4762429f1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,7 +62,7 @@ config X86
 	select ARCH_32BIT_OFF_T			if X86_32
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
-	select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+	select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
 	select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
 	select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
@@ -1615,7 +1615,7 @@ config ARCH_SELECT_MEMORY_MODEL
 
 config ARCH_MEMORY_PROBE
 	bool "Enable sysfs memory/probe interface"
-	depends on X86_64 && MEMORY_HOTPLUG
+	depends on MEMORY_HOTPLUG
 	help
 	  This option enables a sysfs memory/probe interface for testing.
 	  See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
@@ -2395,7 +2395,7 @@ endmenu
 
 config ARCH_HAS_ADD_PAGES
 	def_bool y
-	depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
 config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
 	def_bool y
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bd90b8fe81e4..5cd7ea6d645c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -779,37 +779,6 @@ void __init mem_init(void)
 	test_wp_bit();
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
-		    struct mhp_params *params)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-	int ret;
-
-	/*
-	 * The page tables were already mapped at boot so if the caller
-	 * requests a different mapping type then we must change all the
-	 * pages with __set_memory_prot().
-	 */
-	if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
-		ret = __set_memory_prot(start, nr_pages, params->pgprot);
-		if (ret)
-			return ret;
-	}
-
-	return __add_pages(nid, start_pfn, nr_pages, params);
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long nr_pages = size >> PAGE_SHIFT;
-
-	__remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
 int kernel_set_to_readonly __read_mostly;
 
 static void mark_nxdata_nx(void)
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 4/6] mm/memory_hotplug: remove HIGHMEM leftovers
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

We don't support CONFIG_MEMORY_HOTPLUG on 32 bit and consequently not
HIGHMEM. Let's remove any leftover code -- including the unused
"status_change_nid_high" field part of the memory notifier.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 Documentation/core-api/memory-hotplug.rst     |  3 --
 .../zh_CN/core-api/memory-hotplug.rst         |  4 ---
 include/linux/memory.h                        |  1 -
 mm/memory_hotplug.c                           | 36 ++-----------------
 4 files changed, 2 insertions(+), 42 deletions(-)

diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index de7467e48067..682259ee633a 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
 		unsigned long start_pfn;
 		unsigned long nr_pages;
 		int status_change_nid_normal;
-		int status_change_nid_high;
 		int status_change_nid;
 	}
 
@@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify::
 - nr_pages is # of pages of online/offline memory.
 - status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
   is (will be) set/clear, if this is -1, then nodemask status is not changed.
-- status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
-  is (will be) set/clear, if this is -1, then nodemask status is not changed.
 - status_change_nid is set node id when N_MEMORY of nodemask is (will be)
   set/clear. It means a new(memoryless) node gets new memory by online and a
   node loses all memory. If this is -1, then nodemask status is not changed.
diff --git a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
index 161f4d2c18cc..9a204eb196f2 100644
--- a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
+++ b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
@@ -63,7 +63,6 @@ memory_notify结构体的指针::
 		unsigned long start_pfn;
 		unsigned long nr_pages;
 		int status_change_nid_normal;
-		int status_change_nid_high;
 		int status_change_nid;
 	}
 
@@ -74,9 +73,6 @@ memory_notify结构体的指针::
 - status_change_nid_normal是当nodemask的N_NORMAL_MEMORY被设置/清除时设置节
   点id，如果是-1，则nodemask状态不改变。
 
-- status_change_nid_high是当nodemask的N_HIGH_MEMORY被设置/清除时设置的节点
-  id，如果这个值为-1，那么nodemask状态不会改变。
-
 - status_change_nid是当nodemask的N_MEMORY被（将）设置/清除时设置的节点id。这
   意味着一个新的（没上线的）节点通过联机获得新的内存，而一个节点失去了所有的内
   存。如果这个值为-1，那么nodemask的状态就不会改变。
diff --git a/include/linux/memory.h b/include/linux/memory.h
index dd6e608c3e0b..c46ff374d48d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,7 +96,6 @@ struct memory_notify {
 	unsigned long start_pfn;
 	unsigned long nr_pages;
 	int status_change_nid_normal;
-	int status_change_nid_high;
 	int status_change_nid;
 };
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8d7b2b593a26..95c927c8bfb8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,7 +21,6 @@
 #include <linux/memory.h>
 #include <linux/memremap.h>
 #include <linux/memory_hotplug.h>
-#include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
@@ -585,10 +584,6 @@ void generic_online_page(struct page *page, unsigned int order)
 	debug_pagealloc_map_pages(page, 1 << order);
 	__free_pages_core(page, order);
 	totalram_pages_add(1UL << order);
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages_add(1UL << order);
-#endif
 }
 EXPORT_SYMBOL_GPL(generic_online_page);
 
@@ -625,16 +620,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
 
 	arg->status_change_nid = NUMA_NO_NODE;
 	arg->status_change_nid_normal = NUMA_NO_NODE;
-	arg->status_change_nid_high = NUMA_NO_NODE;
 
 	if (!node_state(nid, N_MEMORY))
 		arg->status_change_nid = nid;
 	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
 		arg->status_change_nid_normal = nid;
-#ifdef CONFIG_HIGHMEM
-	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
-		arg->status_change_nid_high = nid;
-#endif
 }
 
 static void node_states_set_node(int node, struct memory_notify *arg)
@@ -642,9 +632,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 	if (arg->status_change_nid_normal >= 0)
 		node_set_state(node, N_NORMAL_MEMORY);
 
-	if (arg->status_change_nid_high >= 0)
-		node_set_state(node, N_HIGH_MEMORY);
-
 	if (arg->status_change_nid >= 0)
 		node_set_state(node, N_MEMORY);
 }
@@ -1801,7 +1788,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 
 	arg->status_change_nid = NUMA_NO_NODE;
 	arg->status_change_nid_normal = NUMA_NO_NODE;
-	arg->status_change_nid_high = NUMA_NO_NODE;
 
 	/*
 	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1816,24 +1802,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
 		arg->status_change_nid_normal = zone_to_nid(zone);
 
-#ifdef CONFIG_HIGHMEM
-	/*
-	 * node_states[N_HIGH_MEMORY] contains nodes which
-	 * have normal memory or high memory.
-	 * Here we add the present_pages belonging to ZONE_HIGHMEM.
-	 * If the zone is within the range of [0..ZONE_HIGHMEM), and
-	 * we determine that the zones in that range become empty,
-	 * we need to clear the node for N_HIGH_MEMORY.
-	 */
-	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
-	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
-		arg->status_change_nid_high = zone_to_nid(zone);
-#endif
-
 	/*
-	 * We have accounted the pages from [0..ZONE_NORMAL), and
-	 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
-	 * as well.
+	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
+	 * does not apply as we don't support 32bit.
 	 * Here we count the possible pages from ZONE_MOVABLE.
 	 * If after having accounted all the pages, we see that the nr_pages
 	 * to be offlined is over or equal to the accounted pages,
@@ -1851,9 +1822,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
 	if (arg->status_change_nid_normal >= 0)
 		node_clear_state(node, N_NORMAL_MEMORY);
 
-	if (arg->status_change_nid_high >= 0)
-		node_clear_state(node, N_HIGH_MEMORY);
-
 	if (arg->status_change_nid >= 0)
 		node_clear_state(node, N_MEMORY);
 }
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 5/6] mm/memory_hotplug: remove stale function declarations
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

These functions no longer exist.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/memory_hotplug.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e5a867c950b2..be48e003a518 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -98,9 +98,6 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
 	seqlock_init(&zone->span_seqlock);
 }
-extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
-extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
-extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 extern void adjust_present_page_count(struct page *page,
 				      struct memory_group *group,
 				      long nr_pages);
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 3/6] mm/memory_hotplug: restrict CONFIG_MEMORY_HOTPLUG to 64 bit
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

32 bit support is broken in various ways: for example, we can online
memory that should actually go to ZONE_HIGHMEM to ZONE_MOVABLE or in
some cases even to one of the other kernel zones.

We marked it BROKEN in commit b59d02ed0869 ("mm/memory_hotplug: disable the
functionality for 32b") almost one year ago. According to that commit
it might be broken at least since 2017. Further, there is hardly a sane use
case nowadays.

Let's just depend completely on 64bit, dropping the "BROKEN" dependency to
make clear that we are not going to support it again. Next, we'll remove
some HIGHMEM leftovers from memory hotplug code to clean up.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index ea8762cd8e1e..88273dd5c6d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -125,7 +125,7 @@ config MEMORY_HOTPLUG
 	select MEMORY_ISOLATION
 	depends on SPARSEMEM
 	depends on ARCH_ENABLE_MEMORY_HOTPLUG
-	depends on 64BIT || BROKEN
+	depends on 64BIT
 	select NUMA_KEEP_MEMINFO if NUMA

 config MEMORY_HOTPLUG_DEFAULT_ONLINE
-- 
2.31.1

^ permalink raw reply related

* [PATCH v1 2/6] mm/memory_hotplug: remove CONFIG_MEMORY_HOTPLUG_SPARSE
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

CONFIG_MEMORY_HOTPLUG depends on CONFIG_SPARSEMEM, so there is no need for
CONFIG_MEMORY_HOTPLUG_SPARSE anymore; adjust all instances to use
CONFIG_MEMORY_HOTPLUG and remove CONFIG_MEMORY_HOTPLUG_SPARSE.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/powerpc/include/asm/machdep.h            |  2 +-
 arch/powerpc/kernel/setup_64.c                |  2 +-
 arch/powerpc/platforms/powernv/setup.c        |  4 ++--
 arch/powerpc/platforms/pseries/setup.c        |  2 +-
 drivers/base/Makefile                         |  2 +-
 drivers/base/node.c                           |  9 ++++-----
 drivers/virtio/Kconfig                        |  2 +-
 include/linux/memory.h                        | 18 +++++++-----------
 include/linux/node.h                          |  4 ++--
 lib/Kconfig.debug                             |  2 +-
 mm/Kconfig                                    |  4 ----
 mm/memory_hotplug.c                           |  2 --
 tools/testing/selftests/memory-hotplug/config |  1 -
 13 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 764f2732a821..d8a2ca007082 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -32,7 +32,7 @@ struct machdep_calls {
 	void		(*iommu_save)(void);
 	void		(*iommu_restore)(void);
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 	unsigned long	(*memory_block_size)(void);
 #endif
 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index eaa79a0996d1..21f15d82f062 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 unsigned long memory_block_size_bytes(void)
 {
 	if (ppc_md.memory_block_size)
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a8db3f153063..ad56a54ac9c5 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long pnv_memory_block_size(void)
 {
 	/*
@@ -553,7 +553,7 @@ define_machine(powernv) {
 #ifdef CONFIG_KEXEC_CORE
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 	.memory_block_size	= pnv_memory_block_size,
 #endif
 };
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index f79126f16258..d29f6f1f7f37 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -1089,7 +1089,7 @@ define_machine(pseries) {
 	.machine_kexec          = pSeries_machine_kexec,
 	.kexec_cpu_down         = pseries_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 	.memory_block_size	= pseries_memory_block_size,
 #endif
 };
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index ef8e44a7d288..02f7f1358e86 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -13,7 +13,7 @@ obj-y			+= power/
 obj-$(CONFIG_ISA_BUS_API)	+= isa.o
 obj-y				+= firmware_loader/
 obj-$(CONFIG_NUMA)	+= node.o
-obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 ifeq ($(CONFIG_SYSFS),y)
 obj-$(CONFIG_MODULES)	+= module.o
 endif
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c56d34f8158f..b5a4ba18f9f9 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -629,7 +629,7 @@ static void node_device_release(struct device *dev)
 {
 	struct node *node = to_node(dev);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
 	/*
 	 * We schedule the work only when a memory section is
 	 * onlined/offlined on this node. When we come here,
@@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 	return 0;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static int __ref get_nid_for_pfn(unsigned long pfn)
 {
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 #endif	/* CONFIG_HUGETLBFS */
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
-#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
-    !defined(CONFIG_HUGETLBFS)
+#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS)
 static inline int node_memory_callback(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index ce1b3f6ec325..3654def9915c 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -98,7 +98,7 @@ config VIRTIO_MEM
 	default m
 	depends on X86_64
 	depends on VIRTIO
-	depends on MEMORY_HOTPLUG_SPARSE
+	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on CONTIG_ALLOC
 	help
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 7efc0a7c14c9..dd6e608c3e0b 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -110,7 +110,7 @@ struct mem_section;
 #define SLAB_CALLBACK_PRI       1
 #define IPC_CALLBACK_PRI        10
 
-#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifndef CONFIG_MEMORY_HOTPLUG
 static inline void memory_dev_init(void)
 {
 	return;
@@ -126,7 +126,11 @@ static inline int memory_notify(unsigned long val, void *v)
 {
 	return 0;
 }
-#else
+#define hotplug_memory_notifier(fn, pri)	({ 0; })
+/* These aren't inline functions due to a GCC bug. */
+#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
+#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
+#else /* CONFIG_MEMORY_HOTPLUG */
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size,
@@ -149,9 +153,6 @@ struct memory_group *memory_group_find_by_id(int mgid);
 typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 			       struct memory_group *excluded, void *arg);
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-
-#ifdef CONFIG_MEMORY_HOTPLUG
 #define hotplug_memory_notifier(fn, pri) ({		\
 	static __meminitdata struct notifier_block fn##_mem_nb =\
 		{ .notifier_call = fn, .priority = pri };\
@@ -159,12 +160,7 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 })
 #define register_hotmemory_notifier(nb)		register_memory_notifier(nb)
 #define unregister_hotmemory_notifier(nb) 	unregister_memory_notifier(nb)
-#else
-#define hotplug_memory_notifier(fn, pri)	({ 0; })
-/* These aren't inline functions due to a GCC bug. */
-#define register_hotmemory_notifier(nb)    ({ (void)(nb); 0; })
-#define unregister_hotmemory_notifier(nb)  ({ (void)(nb); })
-#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
  * Kernel text modification mutex, used for code patching. Users of this lock
diff --git a/include/linux/node.h b/include/linux/node.h
index 8e5a29897936..bb21fd631b16 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -85,7 +85,7 @@ struct node {
 	struct device	dev;
 	struct list_head access_list;
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
 	struct work_struct	node_work;
 #endif
 #ifdef CONFIG_HMEM_REPORTING
@@ -98,7 +98,7 @@ struct memory_block;
 extern struct node *node_devices[];
 typedef  void (*node_registration_func_t)(struct node *);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
 void link_mem_sections(int nid, unsigned long start_pfn,
 		       unsigned long end_pfn,
 		       enum meminit_context context);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2a9b6dcdac4f..669fee1d26b8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -877,7 +877,7 @@ config DEBUG_MEMORY_INIT
 
 config MEMORY_NOTIFIER_ERROR_INJECT
 	tristate "Memory hotplug notifier error injection module"
-	depends on MEMORY_HOTPLUG_SPARSE && NOTIFIER_ERROR_INJECTION
+	depends on MEMORY_HOTPLUG && NOTIFIER_ERROR_INJECTION
 	help
 	  This option provides the ability to inject artificial errors to
 	  memory hotplug notifier chain callbacks.  It is controlled through
diff --git a/mm/Kconfig b/mm/Kconfig
index b7fb3f0b485e..ea8762cd8e1e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,10 +128,6 @@ config MEMORY_HOTPLUG
 	depends on 64BIT || BROKEN
 	select NUMA_KEEP_MEMINFO if NUMA
 
-config MEMORY_HOTPLUG_SPARSE
-	def_bool y
-	depends on SPARSEMEM && MEMORY_HOTPLUG
-
 config MEMORY_HOTPLUG_DEFAULT_ONLINE
 	bool "Online the newly added memory blocks by default"
 	depends on MEMORY_HOTPLUG
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fd0be32a281..8d7b2b593a26 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -220,7 +220,6 @@ static void release_memory_resource(struct resource *res)
 	kfree(res);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
 		const char *reason)
 {
@@ -1163,7 +1162,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
 	mem_hotplug_done();
 	return ret;
 }
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 static void reset_node_present_pages(pg_data_t *pgdat)
 {
diff --git a/tools/testing/selftests/memory-hotplug/config b/tools/testing/selftests/memory-hotplug/config
index a7e8cd5bb265..1eef042a31e1 100644
--- a/tools/testing/selftests/memory-hotplug/config
+++ b/tools/testing/selftests/memory-hotplug/config
@@ -1,5 +1,4 @@
 CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTPLUG_SPARSE=y
 CONFIG_NOTIFIER_ERROR_INJECTION=y
 CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
 CONFIG_MEMORY_HOTREMOVE=y
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 1/6] mm/memory_hotplug: remove CONFIG_X86_64_ACPI_NUMA dependency from CONFIG_MEMORY_HOTPLUG
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport
In-Reply-To: <20210929143600.49379-1-david@redhat.com>

SPARSEMEM is the only possible memory model for x86-64, FLATMEM is not
possible:
	config ARCH_FLATMEM_ENABLE
		def_bool y
		depends on X86_32 && !NUMA

And X86_64_ACPI_NUMA (obviously) only supports x86-64:
	config X86_64_ACPI_NUMA
		def_bool y
		depends on X86_64 && NUMA && ACPI && PCI

Let's just remove the CONFIG_X86_64_ACPI_NUMA dependency, as it does no
longer make sense.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index d16ba9249bc5..b7fb3f0b485e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -123,7 +123,7 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	select MEMORY_ISOLATION
-	depends on SPARSEMEM || X86_64_ACPI_NUMA
+	depends on SPARSEMEM
 	depends on ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on 64BIT || BROKEN
 	select NUMA_KEEP_MEMINFO if NUMA
-- 
2.31.1


^ permalink raw reply related

* [PATCH v1 0/6] mm/memory_hotplug: Kconfig and 32 bit cleanups
From: David Hildenbrand @ 2021-09-29 14:35 UTC (permalink / raw)
  To: linux-kernel
  Cc: Michal Hocko, Michael S. Tsirkin, Peter Zijlstra, Jason Wang,
	Dave Hansen, x86, virtualization, linux-mm, Paul Mackerras,
	linux-kselftest, H. Peter Anvin, Shuah Khan, Alex Shi,
	Jonathan Corbet, David Hildenbrand, Ingo Molnar, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, Oscar Salvador,
	Greg Kroah-Hartman, linux-doc, Rafael J. Wysocki, Andrew Morton,
	linuxppc-dev, Mike Rapoport

Some cleanups around CONFIG_MEMORY_HOTPLUG, including removing 32 bit
leftovers of memory hotplug support.

Compile-tested on various architectures, quickly tested memory hotplug
on x86-64.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Alex Shi <alexs@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Cc: linux-kselftest@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: virtualization@lists.linux-foundation.org

David Hildenbrand (6):
  mm/memory_hotplug: remove CONFIG_X86_64_ACPI_NUMA dependency from
    CONFIG_MEMORY_HOTPLUG
  mm/memory_hotplug: remove CONFIG_MEMORY_HOTPLUG_SPARSE
  mm/memory_hotplug: restrict CONFIG_MEMORY_HOTPLUG to 64 bit
  mm/memory_hotplug: remove HIGHMEM leftovers
  mm/memory_hotplug: remove stale function declarations
  x86: remove memory hotplug support on X86_32

 Documentation/core-api/memory-hotplug.rst     |  3 --
 .../zh_CN/core-api/memory-hotplug.rst         |  4 --
 arch/powerpc/include/asm/machdep.h            |  2 +-
 arch/powerpc/kernel/setup_64.c                |  2 +-
 arch/powerpc/platforms/powernv/setup.c        |  4 +-
 arch/powerpc/platforms/pseries/setup.c        |  2 +-
 arch/x86/Kconfig                              |  6 +--
 arch/x86/mm/init_32.c                         | 31 ---------------
 drivers/base/Makefile                         |  2 +-
 drivers/base/node.c                           |  9 ++---
 drivers/virtio/Kconfig                        |  2 +-
 include/linux/memory.h                        | 19 ++++------
 include/linux/memory_hotplug.h                |  3 --
 include/linux/node.h                          |  4 +-
 lib/Kconfig.debug                             |  2 +-
 mm/Kconfig                                    |  8 +---
 mm/memory_hotplug.c                           | 38 +------------------
 tools/testing/selftests/memory-hotplug/config |  1 -
 18 files changed, 28 insertions(+), 114 deletions(-)

base-commit: 5816b3e6577eaa676ceb00a848f0fd65fe2adc29
-- 
2.31.1

^ permalink raw reply

* Re: [PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes
From: Michael Ellerman @ 2021-09-29 13:49 UTC (permalink / raw)
  To: Pratik R. Sampat, benh, paulus, shuah, farosas, kjain,
	linuxppc-dev, kvm-ppc, linux-kselftest, linux-kernel, psampat,
	pratik.r.sampat
In-Reply-To: <20210928115102.57117-2-psampat@linux.ibm.com>

Hi Pratik,

Some comments inline below ...

"Pratik R. Sampat" <psampat@linux.ibm.com> writes:
> Adds a generic interface to represent the energy and frequency related
> PAPR attributes on the system using the new H_CALL
> "H_GET_ENERGY_SCALE_INFO".

In Linux "generic" would usually mean something that is not architecture
specific, but this is architecture specific.

It's also not generic across different types of attributes, it's only
for information from this particular hcall (right?).

I think you mean generic in contrast to lparcfg, which I guess makes
some sense, this is more generic than lparcfg :)

But I think it'd be better to just say "a sysfs interface".

> H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> information in the lparcfg, however the H_GET_EM_PARMS H_CALL
> will be deprecated P10 onwards.
>
> The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> hcall(
>   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
>   uint64 flags,           // Per the flag request
>   uint64 firstAttributeId,// The attribute id
>   uint64 bufferAddress,   // Guest physical address of the output buffer
>   uint64 bufferSize       // The size in bytes of the output buffer
> );

As specified in PAPR+ v2.11, section 14.14.3.

> This H_CALL can query either all the attributes at once with
> firstAttributeId = 0, flags = 0 as well as query only one attribute
> at a time with firstAttributeId = id, flags = 1.
>
> The output buffer consists of the following
> 1. number of attributes              - 8 bytes
> 2. array offset to the data location - 8 bytes
> 3. version info                      - 1 byte
     (possible future expansion here)
> 4. A data array of size num attributes, which contains the following:
>   a. attribute ID              - 8 bytes
>   b. attribute value in number - 8 bytes
>   c. attribute name in string  - 64 bytes
>   d. attribute value in string - 64 bytes
>
> The new H_CALL exports information in direct string value format, hence
> a new interface has been introduced in
> /sys/firmware/papr/energy_scale_info to export this information to
> userspace in an extensible pass-through format.

I'm not really sure what "extensible pass-through format" means? Do you
mean the kernel doesn't interpret the values, so firmware can add new
values and they will just appear without any kernel changes required?

> The H_CALL returns the name, numeric value and string value (if exists)
>
> The format of exposing the sysfs information is as follows:
> /sys/firmware/papr/energy_scale_info/
>    |-- <id>/
>      |-- desc
>      |-- value
>      |-- value_desc (if exists)
>    |-- <id>/
>      |-- desc
>      |-- value
>      |-- value_desc (if exists)
> ...
>
> The energy information that is exported is useful for userspace tools
> such as powerpc-utils. Currently these tools infer the
> "power_mode_data" value in the lparcfg, which in turn is obtained from
> the to be deprecated H_GET_EM_PARMS H_CALL.
> On future platforms, such userspace utilities will have to look at the
> data returned from the new H_CALL being populated in this new sysfs
> interface and report this information directly without the need of
> interpretation.

I don't see anything here or in PAPR about how/if the values change over
time.

Are we expected to always read the values from the hypervisor, or are we
allowed to cache them for any period of time?

The reason I ask is currently we're doing 3 hcalls for each attribute,
once for desc, value and value_desc. Is there any issue with the
consistency of the values given we're doing 3 separate calls?

Or are the values static? Or static except for LPM? In which case we
could read them once at boot/LPM. Or maybe we not know if/how the values
will change, especially in future?

> diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> new file mode 100644
> index 000000000000..139a576c7c9d
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> @@ -0,0 +1,26 @@
> +What:		/sys/firmware/papr/energy_scale_info
> +Date:		June 2021
> +Contact:	Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
> +Description:	Directory hosting a set of platform attributes like
> +		energy/frequency on Linux running as a PAPR guest.
> +
> +		Each file in a directory contains a platform
> +		attribute hierarchy pertaining to performance/
> +		energy-savings mode and processor frequency.
> +
> +What:		/sys/firmware/papr/energy_scale_info/<id>
> +		/sys/firmware/papr/energy_scale_info/<id>/desc
> +		/sys/firmware/papr/energy_scale_info/<id>/value
> +		/sys/firmware/papr/energy_scale_info/<id>/value_desc
> +Date:		June 2021
> +Contact:	Linux for PowerPC mailing list <linuxppc-dev@ozlabs.org>
> +Description:	Energy, frequency attributes directory for POWERVM servers
> +
> +		This directory provides energy, frequency, folding information. It
> +		contains below sysfs attributes:
> +
> +		- desc: String description of the attribute <id>
> +
> +		- value: Numeric value of attribute <id>
> +
> +		- value_desc: String value of attribute <id>
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 9bcf345cb208..38980fef7a3d 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -323,7 +323,8 @@
>  #define H_SCM_PERFORMANCE_STATS 0x418
>  #define H_RPT_INVALIDATE	0x448
>  #define H_SCM_FLUSH		0x44C
> -#define MAX_HCALL_OPCODE	H_SCM_FLUSH
> +#define H_GET_ENERGY_SCALE_INFO	0x450
> +#define MAX_HCALL_OPCODE	H_GET_ENERGY_SCALE_INFO
>  
>  /* Scope args for H_SCM_UNBIND_ALL */
>  #define H_UNBIND_SCOPE_ALL (0x1)
> @@ -641,6 +642,27 @@ struct hv_gpci_request_buffer {
>  	uint8_t bytes[HGPCI_MAX_DATA_BYTES];
>  } __packed;
>  

None of the following needs to be in this header AFAICS. It can all go
in the C file.

> +#define ESI_VERSION	0x1
> +#define MAX_ESI_ATTRS	10

Where does that max come from? I don't see it mentioned in PAPR.

PAPR implies any number of attributes up to 2^64 can be returned, and if
the buffer is too small H_PARTIAL is returned.

We really mustn't hard code a limit of 10 attributes. We need a loop that
reads N attributes, and if we receive H_PARTIAL, we repeat the hcall,
until we've received all the attributes.

To test that is working on an existing system, which presumably only has
a small number of attributes, you should shrink the buffer to the size
of 1 attribute, to make sure the loop logic works.

> +#define MAX_BUF_SZ	(sizeof(struct h_energy_scale_info_hdr) + \
> +			(sizeof(struct energy_scale_attribute) * MAX_ESI_ATTRS))
>
> +struct energy_scale_attribute {
> +	__be64 id;
> +	__be64 value;
> +	unsigned char desc[64];
> +	unsigned char value_desc[64];

I prefer u8.

> +} __packed;
> +
> +struct h_energy_scale_info_hdr {
> +	__be64 num_attrs;
> +	__be64 array_offset;
> +	__u8 data_header_version;

You can use u8 here, __u8 is only needed in uapi headers.

> +} __packed;

So because of array_offset, we don't actually know what the size of the
header plus a single attribute will be.

Because the header tells us the offset to the first attribute, when the
header expands in a future version of the hcall, sizeof(hdr) +
sizeof(energy_scale_attribute) will not necessarily be large enough to
fit a single attribute. I guess maybe that's why you're always
allocating space for 10 of them (MAX_BUF_SZ).

To be truly forward compatible we should do the hcall at startup with
the current known size of the structs, and if we receive H_P4 try again
with increasing sizes until it succeeds.

That's probably overly complex though, given we'd probably only expect
the header to grow by a few 10s of bytes in future.

Maybe we should just allocate 2KB for the buffer, which is still small
as far as allocations go, but gives us a huge amount of headroom for the
header to grow.


> +/* /sys/firmware/papr */
> +extern struct kobject *papr_kobj;
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_HVCALL_H */
> diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
> index 830a126e095d..38cd0ed0a617 100644
> --- a/arch/powerpc/kvm/trace_hv.h
> +++ b/arch/powerpc/kvm/trace_hv.h
> @@ -115,6 +115,7 @@
>  	{H_VASI_STATE,			"H_VASI_STATE"}, \
>  	{H_ENABLE_CRQ,			"H_ENABLE_CRQ"}, \
>  	{H_GET_EM_PARMS,		"H_GET_EM_PARMS"}, \
> +	{H_GET_ENERGY_SCALE_INFO,	"H_GET_ENERGY_SCALE_INFO"}, \
>  	{H_SET_MPP,			"H_SET_MPP"}, \
>  	{H_GET_MPP,			"H_GET_MPP"}, \
>  	{H_HOME_NODE_ASSOCIATIVITY,	"H_HOME_NODE_ASSOCIATIVITY"}, \
> diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
> index 4cda0ef87be0..c4c19f6a5975 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -6,7 +6,8 @@ obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
>  			   of_helpers.o \
>  			   setup.o iommu.o event_sources.o ras.o \
>  			   firmware.o power.o dlpar.o mobility.o rng.o \
> -			   pci.o pci_dlpar.o eeh_pseries.o msi.o
> +			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
> +			   papr_platform_attributes.o

I don't really mind, but is there a reason we're calling it
papr_platform_attributes.c and not energy_info.c or something?

Is the plan to support more attributes in future?

>  obj-$(CONFIG_SMP)	+= smp.o
>  obj-$(CONFIG_SCANLOG)	+= scanlog.o
>  obj-$(CONFIG_KEXEC_CORE)	+= kexec.o
> diff --git a/arch/powerpc/platforms/pseries/papr_platform_attributes.c b/arch/powerpc/platforms/pseries/papr_platform_attributes.c
> new file mode 100644
> index 000000000000..84ddce52e519
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/papr_platform_attributes.c
> @@ -0,0 +1,312 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Platform energy and frequency attributes driver
> + *
> + * This driver creates a sys file at /sys/firmware/papr/ which encapsulates a
> + * directory structure containing files in keyword - value pairs that specify
> + * energy and frequency configuration of the system.
> + *
> + * The format of exposing the sysfs information is as follows:
> + * /sys/firmware/papr/energy_scale_info/
> + *  |-- <id>/
> + *    |-- desc
> + *    |-- value
> + *    |-- value_desc (if exists)
> + *  |-- <id>/
> + *    |-- desc
> + *    |-- value
> + *    |-- value_desc (if exists)
> + *
> + * Copyright 2021 IBM Corp.
> + */
> +
> +#include <asm/hvcall.h>
> +#include <asm/machdep.h>
> +
> +#include "pseries.h"
> +
> +/*
> + * Flag attributes to fetch either all or one attribute from the HCALL
> + * flag = BE(0) => fetch all attributes with firstAttributeId = 0
> + * flag = BE(1) => fetch a single attribute with firstAttributeId = id
> + */
> +#define ESI_FLAGS_ALL		0
> +#define ESI_FLAGS_SINGLE	PPC_BIT(0)

I dislike PPC_BIT(). It's obscure and not helpful for people reading the
code compared to a simple (1ull << 63).

> +
> +#define MAX_ATTRS		3
> +
> +struct papr_attr {
> +	u64 id;
> +	struct kobj_attribute kobj_attr;
> +};
> +struct papr_group {
> +	struct attribute_group pg;
> +	struct papr_attr pgattrs[MAX_ATTRS];
> +} *pgs;

pgs is not a great name for a global.

Can it be static?

Also would be more normal to define the structure separately from
declaring *pgs.

> +
> +/* /sys/firmware/papr */
> +struct kobject *papr_kobj;

static?

> +/* /sys/firmware/papr/energy_scale_info */
> +struct kobject *esi_kobj;

static?


The following three functions are almost exactly the same, except they
print a different field with a different format string. Seems like there
could be some shared code.

> +/*
> + * Extract and export the description of the energy scale attributes
> + */
> +static ssize_t papr_show_desc(struct kobject *kobj,
> +			       struct kobj_attribute *kobj_attr,
> +			       char *buf)
> +{
> +	struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr,
> +					       kobj_attr);
> +	struct h_energy_scale_info_hdr *t_hdr;
> +	struct energy_scale_attribute *t_esi;

What's the "t_", temp? I don't really like it. hdr and esi would be fine IMHO.

> +	char *t_buf;
> +	int ret = 0;

That's initialisation is pointless, you override it below.

> +	t_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
> +	if (t_buf == NULL)
> +		return -ENOMEM;
> +
> +	ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_SINGLE,
> +				 pattr->id, virt_to_phys(t_buf),
> +				 MAX_BUF_SZ);
> +
> +	if (ret != H_SUCCESS) {
> +		pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> +		goto out;

Here you're going to return ret back to userspace. But ret currently
holds a hcall return value, hcall return values are not necessarily
valid Linux error codes.

eg. H_PARTIAL is 5, H_BUSY is 1, etc. Those are potentially read as
success by your caller.

You must never return a hcall error back to Linux code that's not
expecting it.

Here it should probably just return -EIO I guess.

> +	}
> +
> +	t_hdr = (struct h_energy_scale_info_hdr *) t_buf;
> +	t_esi = (struct energy_scale_attribute *)
> +		(t_buf + be64_to_cpu(t_hdr->array_offset));

You should check that array_offset + sizeof(attribute) doesn't point
past the end of your buffer.

> +
> +	ret = snprintf(buf, sizeof(t_esi->desc), "%s\n", t_esi->desc);
> +	if (ret < 0)
> +		ret = -EIO;
> +out:
> +	kfree(t_buf);
> +
> +	return ret;
> +}
> +
> +/*
> + * Extract and export the numeric value of the energy scale attributes
> + */
> +static ssize_t papr_show_value(struct kobject *kobj,
> +				struct kobj_attribute *kobj_attr,
> +				char *buf)
> +{
> +	struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr,
> +					       kobj_attr);
> +	struct h_energy_scale_info_hdr *t_hdr;
> +	struct energy_scale_attribute *t_esi;
> +	char *t_buf;
> +	int ret = 0;
> +
> +	t_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
> +	if (t_buf == NULL)
> +		return -ENOMEM;
> +
> +	ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_SINGLE,
> +				 pattr->id, virt_to_phys(t_buf),
> +				 MAX_BUF_SZ);
> +
> +	if (ret != H_SUCCESS) {
> +		pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> +		goto out;
> +	}
> +
> +	t_hdr = (struct h_energy_scale_info_hdr *) t_buf;
> +	t_esi = (struct energy_scale_attribute *)
> +		(t_buf + be64_to_cpu(t_hdr->array_offset));
> +
> +	ret = snprintf(buf, sizeof(t_esi->value), "%llu\n",
> +		       be64_to_cpu(t_esi->value));
> +	if (ret < 0)
> +		ret = -EIO;
> +out:
> +	kfree(t_buf);
> +
> +	return ret;
> +}
> +
> +/*
> + * Extract and export the value description in string format of the energy
> + * scale attributes
> + */
> +static ssize_t papr_show_value_desc(struct kobject *kobj,
> +				     struct kobj_attribute *kobj_attr,
> +				     char *buf)
> +{
> +	struct papr_attr *pattr = container_of(kobj_attr, struct papr_attr,
> +					       kobj_attr);
> +	struct h_energy_scale_info_hdr *t_hdr;
> +	struct energy_scale_attribute *t_esi;
> +	char *t_buf;
> +	int ret = 0;
> +
> +	t_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
> +	if (t_buf == NULL)
> +		return -ENOMEM;
> +
> +	ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_SINGLE,
> +				 pattr->id, virt_to_phys(t_buf),
> +				 MAX_BUF_SZ);
> +
> +	if (ret != H_SUCCESS) {
> +		pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> +		goto out;
> +	}
> +
> +	t_hdr = (struct h_energy_scale_info_hdr *) t_buf;
> +	t_esi = (struct energy_scale_attribute *)
> +		(t_buf + be64_to_cpu(t_hdr->array_offset));
> +
> +	ret = snprintf(buf, sizeof(t_esi->value_desc), "%s\n",
> +		       t_esi->value_desc);
> +	if (ret < 0)
> +		ret = -EIO;
> +out:
> +	kfree(t_buf);
> +
> +	return ret;
> +}
> +
> +static struct papr_ops_info {
> +	const char *attr_name;
> +	ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *kobj_attr,
> +			char *buf);
> +} ops_info[MAX_ATTRS] = {
> +	{ "desc", papr_show_desc },
> +	{ "value", papr_show_value },
> +	{ "value_desc", papr_show_value_desc },
> +};
> +
> +static void add_attr(u64 id, int index, struct papr_attr *attr)
> +{
> +	attr->id = id;
> +	sysfs_attr_init(&attr->kobj_attr.attr);
> +	attr->kobj_attr.attr.name = ops_info[index].attr_name;
> +	attr->kobj_attr.attr.mode = 0444;
> +	attr->kobj_attr.show = ops_info[index].show;
> +}
> +
> +static int add_attr_group(u64 id, struct papr_group *pg, bool show_val_desc)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_ATTRS; i++) {
> +		if (!strcmp(ops_info[i].attr_name, "value_desc") &&
> +		    !show_val_desc) {
> +			continue;
> +		}
> +		add_attr(id, i, &pg->pgattrs[i]);
> +		pg->pg.attrs[i] = &pg->pgattrs[i].kobj_attr.attr;
> +	}
> +
> +	return sysfs_create_group(esi_kobj, &pg->pg);
> +}
> +
> +static int __init papr_init(void)
> +{
> +	struct h_energy_scale_info_hdr *esi_hdr;
> +	struct energy_scale_attribute *esi_attrs;
> +	uint64_t num_attrs;
> +	int ret, idx, i;
> +	char *esi_buf;
> +
> +	if (!firmware_has_feature(FW_FEATURE_LPAR))
> +		return -ENXIO;
> +
> +	esi_buf = kmalloc(MAX_BUF_SZ, GFP_KERNEL);
> +	if (esi_buf == NULL)
> +		return -ENOMEM;
> +	/*
> +	 * hcall(
> +	 * uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
> +	 * uint64 flags,            // Per the flag request
> +	 * uint64 firstAttributeId, // The attribute id
> +	 * uint64 bufferAddress,    // Guest physical address of the output buffer
> +	 * uint64 bufferSize);      // The size in bytes of the output buffer
> +	 */
> +	ret = plpar_hcall_norets(H_GET_ENERGY_SCALE_INFO, ESI_FLAGS_ALL, 0,
> +				 virt_to_phys(esi_buf), MAX_BUF_SZ);
> +	if (ret != H_SUCCESS) {
> +		pr_warn("hcall failed: H_GET_ENERGY_SCALE_INFO");
> +		goto out;

This prints a warning on all existing systems (that don't support the
new hcall), which is not ideal :)

The proper way to handle discovery of the hcall is by looking at
ibm,hypertas-functions and setting a FW_FEATURE flag based on that.

See fw_hypertas_feature_init().

> +	}

Also the hcall can return H_PARTIAL if the buffer isn't big enough.

> +	esi_hdr = (struct h_energy_scale_info_hdr *) esi_buf;
> +	if (esi_hdr->data_header_version != ESI_VERSION) {
> +		pr_warn("H_GET_ENERGY_SCALE_INFO VER MISMATCH - EXP: 0x%x, REC: 0x%x",
> +			ESI_VERSION, esi_hdr->data_header_version);

That shouldn't be a warning. PAPR says any changes will be backward
compatible. If we need to check the version at all, it would be to make
sure the hypervisor version is >= the version we expect. But I'm not
sure it's necessary to check at all?

> +	}
> +
> +	num_attrs = be64_to_cpu(esi_hdr->num_attrs);
> +	esi_attrs = (struct energy_scale_attribute *)
> +		    (esi_buf + be64_to_cpu(esi_hdr->array_offset));
> +
> +	pgs = kcalloc(num_attrs, sizeof(*pgs), GFP_KERNEL);
> +	if (!pgs)
> +		goto out;
> +
> +	papr_kobj = kobject_create_and_add("papr", firmware_kobj);
> +	if (!papr_kobj) {
> +		pr_warn("kobject_create_and_add papr failed\n");
> +		goto out_pgs;
> +	}
> +
> +	esi_kobj = kobject_create_and_add("energy_scale_info", papr_kobj);
> +	if (!esi_kobj) {
> +		pr_warn("kobject_create_and_add energy_scale_info failed\n");
> +		goto out_kobj;
> +	}
> +
> +	for (idx = 0; idx < num_attrs; idx++) {
> +		bool show_val_desc = true;
> +
> +		pgs[idx].pg.attrs = kcalloc(MAX_ATTRS + 1,
> +					    sizeof(*pgs[idx].pg.attrs),
> +					    GFP_KERNEL);
> +		if (!pgs[idx].pg.attrs) {
> +			goto out_pgattrs;
> +		}
> +
> +		pgs[idx].pg.name = kasprintf(GFP_KERNEL, "%lld",
> +					     be64_to_cpu(esi_attrs[idx].id));
> +		if (pgs[idx].pg.name == NULL) {
> +			goto out_pgattrs;
> +		}
> +		/* Do not add the value description if it does not exist */
> +		if (strnlen(esi_attrs[idx].value_desc,
> +			    sizeof(esi_attrs[idx].value_desc)) == 0)
> +			show_val_desc = false;
> +
> +		if (add_attr_group(be64_to_cpu(esi_attrs[idx].id), &pgs[idx],
> +				   show_val_desc)) {
> +			pr_warn("Failed to create papr attribute group %s\n",
> +				pgs[idx].pg.name);
> +			goto out_pgattrs;
> +		}
> +	}
> +
> +	kfree(esi_buf);
> +	return 0;
> +
> +out_pgattrs:

Is this right? If you failed the allocation for i = 1, then the attrs
for i = 0 would already be registered with sysfs, and then you free them
while they're still in use?

To simplify it you can probably do all the allocations first before
registering anything.

> +	for (i = 0; i < idx ; i++) {
> +		kfree(pgs[i].pg.attrs);
> +		kfree(pgs[i].pg.name);
> +	}
> +	kobject_put(esi_kobj);
> +out_kobj:
> +	kobject_put(papr_kobj);
> +out_pgs:
> +	kfree(pgs);
> +out:

If you're freeing something the label should be "out_free_thing". So in
this case out_free_esi_buf.

> +	kfree(esi_buf);
> +
> +	return -ENOMEM;
> +}
> +
> +machine_device_initcall(pseries, papr_init);
> -- 
> 2.31.1


cheers

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox