LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2/2] powerpc/pmem: Add flush routines using new pmem store and sync instruction
From: Aneesh Kumar K.V @ 2020-05-11 11:34 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: alistair, oohall, Aneesh Kumar K.V
In-Reply-To: <20200511113443.36569-1-aneesh.kumar@linux.ibm.com>

Start using dcbstps; phwsync; sequence for flushing persistent memory range.
The new instructions are implemented as a variant of dcbf and hwsync and on
POWER9 they will be executed as those instructions.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/cacheflush.h | 31 +++++++++++++++++++++++++++
 arch/powerpc/lib/pmem.c               |  8 +++----
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index e92191b390f3..edb59edeeacb 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -74,6 +74,37 @@ static inline void flush_dcache_range(unsigned long start, unsigned long stop)
 
 }
 
+static inline void clean_pmem_range(unsigned long start, unsigned long stop)
+{
+	unsigned long shift = l1_dcache_shift();
+	unsigned long bytes = l1_dcache_bytes();
+	void *addr = (void *)(start & ~(bytes - 1));
+	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+	unsigned long i;
+
+	for (i = 0; i < size >> shift; i++, addr += bytes)
+		asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
+
+
+	asm volatile(PPC_PHWSYNC():::"memory");
+}
+
+static inline void flush_pmem_range(unsigned long start, unsigned long stop)
+{
+	unsigned long shift = l1_dcache_shift();
+	unsigned long bytes = l1_dcache_bytes();
+	void *addr = (void *)(start & ~(bytes - 1));
+	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
+	unsigned long i;
+
+	for (i = 0; i < size >> shift; i++, addr += bytes)
+		asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
+
+
+	asm volatile(PPC_PHWSYNC():::"memory");
+}
+
+
 /*
  * Write any modified data cache blocks out to memory.
  * Does not invalidate the corresponding cache lines (especially for
diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
index 0666a8d29596..624708bcaaad 100644
--- a/arch/powerpc/lib/pmem.c
+++ b/arch/powerpc/lib/pmem.c
@@ -15,14 +15,14 @@
 void arch_wb_cache_pmem(void *addr, size_t size)
 {
 	unsigned long start = (unsigned long) addr;
-	flush_dcache_range(start, start + size);
+	clean_pmem_range(start, start + size);
 }
 EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 
 void arch_invalidate_pmem(void *addr, size_t size)
 {
 	unsigned long start = (unsigned long) addr;
-	flush_dcache_range(start, start + size);
+	flush_pmem_range(start, start + size);
 }
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 
@@ -35,7 +35,7 @@ long __copy_from_user_flushcache(void *dest, const void __user *src,
 	unsigned long copied, start = (unsigned long) dest;
 
 	copied = __copy_from_user(dest, src, size);
-	flush_dcache_range(start, start + size);
+	clean_pmem_range(start, start + size);
 
 	return copied;
 }
@@ -45,7 +45,7 @@ void *memcpy_flushcache(void *dest, const void *src, size_t size)
 	unsigned long start = (unsigned long) dest;
 
 	memcpy(dest, src, size);
-	flush_dcache_range(start, start + size);
+	clean_pmem_range(start, start + size);
 
 	return dest;
 }
-- 
2.26.2


^ permalink raw reply related

* [PATCH 1/2] powerpc/pmem: Add new instructions for persistent storage and sync
From: Aneesh Kumar K.V @ 2020-05-11 11:34 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: alistair, oohall, Aneesh Kumar K.V

POWER10 introduces two new variants of dcbf instructions (dcbstps and dcbfps)
that can be used to write modified locations back to persistent storage.

Additionally, POWER10 also introduce phwsync and plwsync which can be used
to establish order of these writes to persistent storage.

This patch exposes these instructions to the rest of the kernel. The existing
dcbf and hwsync instructions in P9 are adequate to enable appropriate
synchronization with OpenCAPI-hosted persistent storage. Hence the new
instructions are added as a variant of the old ones that old hardware
won't differentiate.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/ppc-opcode.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..e8ca55e5d31e 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -216,6 +216,8 @@
 #define PPC_INST_STWCX			0x7c00012d
 #define PPC_INST_LWSYNC			0x7c2004ac
 #define PPC_INST_SYNC			0x7c0004ac
+#define PPC_INST_PHWSYNC		0x7c8004ac
+#define PPC_INST_PLWSYNC		0x7ca004ac
 #define PPC_INST_SYNC_MASK		0xfc0007fe
 #define PPC_INST_ISYNC			0x4c00012c
 #define PPC_INST_LXVD2X			0x7c000698
@@ -281,6 +283,8 @@
 #define PPC_INST_TABORT			0x7c00071d
 #define PPC_INST_TSR			0x7c0005dd
 
+#define PPC_INST_DCBF			0x7c0000ac
+
 #define PPC_INST_NAP			0x4c000364
 #define PPC_INST_SLEEP			0x4c0003a4
 #define PPC_INST_WINKLE			0x4c0003e4
@@ -529,6 +533,14 @@
 #define STBCIX(s,a,b)		stringify_in_c(.long PPC_INST_STBCIX | \
 				       __PPC_RS(s) | __PPC_RA(a) | __PPC_RB(b))
 
+#define	PPC_DCBFPS(a, b)	stringify_in_c(.long PPC_INST_DCBF |	\
+				       ___PPC_RA(a) | ___PPC_RB(b) | (4 << 21))
+#define	PPC_DCBSTPS(a, b)	stringify_in_c(.long PPC_INST_DCBF |	\
+				       ___PPC_RA(a) | ___PPC_RB(b) | (6 << 21))
+
+#define	PPC_PHWSYNC()		stringify_in_c(.long PPC_INST_PHWSYNC)
+#define	PPC_PLWSYNC()		stringify_in_c(.long PPC_INST_PLWSYNC)
+
 /*
  * Define what the VSX XX1 form instructions will look like, then add
  * the 128 bit load store instructions based on that.
-- 
2.26.2


^ permalink raw reply related

* [powerpc:next] BUILD SUCCESS 1f12096aca212af8fad3ef58d5673cde691a1452
From: kbuild test robot @ 2020-05-11 11:40 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next
branch HEAD: 1f12096aca212af8fad3ef58d5673cde691a1452  Merge the lockless page table walk rework into next

elapsed time: 5675m

configs tested: 178
configs skipped: 1

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                               allnoconfig
arm64                            allyesconfig
arm64                               defconfig
arm64                            allmodconfig
arm64                             allnoconfig
sparc                            allyesconfig
m68k                             allyesconfig
m68k                             allmodconfig
um                                  defconfig
sparc64                             defconfig
c6x                              allyesconfig
i386                             allyesconfig
mips                             allmodconfig
s390                             allmodconfig
m68k                              allnoconfig
m68k                                defconfig
alpha                               defconfig
arc                              allyesconfig
um                                allnoconfig
s390                              allnoconfig
alpha                            allyesconfig
sh                               allmodconfig
openrisc                            defconfig
openrisc                         allyesconfig
sh                                allnoconfig
xtensa                           allyesconfig
riscv                               defconfig
mips                              allnoconfig
parisc                           allyesconfig
i386                              allnoconfig
i386                                defconfig
i386                              debian-10.3
ia64                             allmodconfig
ia64                                defconfig
ia64                              allnoconfig
ia64                             allyesconfig
m68k                           sun3_defconfig
nios2                               defconfig
nios2                            allyesconfig
c6x                               allnoconfig
nds32                               defconfig
nds32                             allnoconfig
csky                             allyesconfig
csky                                defconfig
h8300                            allyesconfig
h8300                            allmodconfig
xtensa                              defconfig
arc                                 defconfig
microblaze                        allnoconfig
mips                             allyesconfig
parisc                            allnoconfig
parisc                              defconfig
parisc                           allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          rhel-kconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a004-20200507
x86_64               randconfig-a006-20200507
x86_64               randconfig-a002-20200507
x86_64               randconfig-a005-20200508
x86_64               randconfig-a003-20200508
x86_64               randconfig-a001-20200508
x86_64               randconfig-a006-20200508
x86_64               randconfig-a004-20200508
x86_64               randconfig-a002-20200508
x86_64               randconfig-a005-20200511
x86_64               randconfig-a003-20200511
x86_64               randconfig-a006-20200511
x86_64               randconfig-a004-20200511
x86_64               randconfig-a001-20200511
x86_64               randconfig-a002-20200511
i386                 randconfig-a005-20200509
i386                 randconfig-a004-20200509
i386                 randconfig-a003-20200509
i386                 randconfig-a002-20200509
i386                 randconfig-a001-20200509
i386                 randconfig-a006-20200509
i386                 randconfig-a006-20200510
i386                 randconfig-a005-20200510
i386                 randconfig-a003-20200510
i386                 randconfig-a001-20200510
i386                 randconfig-a004-20200510
i386                 randconfig-a002-20200510
i386                 randconfig-a006-20200511
i386                 randconfig-a005-20200511
i386                 randconfig-a003-20200511
i386                 randconfig-a001-20200511
i386                 randconfig-a004-20200511
i386                 randconfig-a002-20200511
i386                 randconfig-a005-20200508
i386                 randconfig-a004-20200508
i386                 randconfig-a003-20200508
i386                 randconfig-a002-20200508
i386                 randconfig-a001-20200508
i386                 randconfig-a006-20200508
i386                 randconfig-a005-20200507
i386                 randconfig-a004-20200507
i386                 randconfig-a001-20200507
i386                 randconfig-a002-20200507
i386                 randconfig-a003-20200507
i386                 randconfig-a006-20200507
x86_64               randconfig-a015-20200507
x86_64               randconfig-a014-20200507
x86_64               randconfig-a012-20200507
x86_64               randconfig-a013-20200507
x86_64               randconfig-a011-20200507
x86_64               randconfig-a016-20200507
x86_64               randconfig-a014-20200508
x86_64               randconfig-a012-20200508
x86_64               randconfig-a016-20200508
x86_64               randconfig-a016-20200510
x86_64               randconfig-a012-20200510
x86_64               randconfig-a015-20200510
x86_64               randconfig-a013-20200510
x86_64               randconfig-a014-20200510
x86_64               randconfig-a011-20200510
x86_64               randconfig-a015-20200509
x86_64               randconfig-a014-20200509
x86_64               randconfig-a011-20200509
x86_64               randconfig-a013-20200509
x86_64               randconfig-a012-20200509
x86_64               randconfig-a016-20200509
x86_64               randconfig-a016-20200511
x86_64               randconfig-a012-20200511
x86_64               randconfig-a014-20200511
i386                 randconfig-a012-20200507
i386                 randconfig-a016-20200507
i386                 randconfig-a014-20200507
i386                 randconfig-a011-20200507
i386                 randconfig-a015-20200507
i386                 randconfig-a013-20200507
i386                 randconfig-a012-20200509
i386                 randconfig-a014-20200509
i386                 randconfig-a016-20200509
i386                 randconfig-a011-20200509
i386                 randconfig-a013-20200509
i386                 randconfig-a015-20200509
i386                 randconfig-a012-20200511
i386                 randconfig-a016-20200511
i386                 randconfig-a014-20200511
i386                 randconfig-a011-20200511
i386                 randconfig-a013-20200511
i386                 randconfig-a015-20200511
i386                 randconfig-a012-20200508
i386                 randconfig-a014-20200508
i386                 randconfig-a016-20200508
i386                 randconfig-a011-20200508
i386                 randconfig-a013-20200508
i386                 randconfig-a015-20200508
i386                 randconfig-a012-20200510
i386                 randconfig-a016-20200510
i386                 randconfig-a014-20200510
i386                 randconfig-a011-20200510
i386                 randconfig-a013-20200510
i386                 randconfig-a015-20200510
riscv                            allyesconfig
riscv                             allnoconfig
riscv                            allmodconfig
s390                             allyesconfig
s390                                defconfig
sparc                               defconfig
sparc64                           allnoconfig
sparc64                          allyesconfig
sparc64                          allmodconfig
um                               allmodconfig
um                               allyesconfig
x86_64                                   rhel
x86_64                               rhel-7.6
x86_64                    rhel-7.6-kselftests
x86_64                         rhel-7.2-clear
x86_64                                    lkp
x86_64                              fedora-25
x86_64                                  kexec

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH] powerpc/kvm: silence kmemleak false positives
From: Qian Cai @ 2020-05-11 11:43 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linux-kernel, kvm-ppc, catalin.marinas, linuxppc-dev
In-Reply-To: <87y2pybu38.fsf@mpe.ellerman.id.au>

> On May 11, 2020, at 7:15 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> There is kmemleak_alloc_phys(), which according to the docs can be used
> for tracking a phys address.
> 
> Did you try that?

Caitlin, feel free to give your thoughts here.

My understanding is that it seems the doc is a bit misleading. kmemleak_alloc_phys() is to allocate kmemleak objects for a physical address range, so  kmemleak could scan those memory pointers within for possible referencing other memory. It was only used in memblock so far, but those new memory allocations here contain no reference to other memory.

In this case, we have already had kmemleak objects for those memory allocation. It is just that other pointers reference those memory by their physical address which is a known kmemleak limitation won’t be able to track the the connection. Thus, we always use kmemleak_ignore() to not reporting those as leaks and don’t scan those because they do not contain other memory reference.

^ permalink raw reply

* [PATCH v4 1/2] powerpc/64s/hash: Add stress_slb kernel boot option to increase SLB faults
From: Michael Ellerman @ 2020-05-11 12:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: npiggin

From: Nicholas Piggin <npiggin@gmail.com>

This option increases the number of SLB misses by limiting the number
of kernel SLB entries, and increased flushing of cached lookaside
information. This helps stress test difficult to hit paths in the
kernel.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Relocate the code into arch/powerpc/mm, s/torture/stress/]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../admin-guide/kernel-parameters.txt         |   5 +
 arch/powerpc/mm/book3s64/hash_utils.c         |   6 +
 arch/powerpc/mm/book3s64/internal.h           |  16 ++
 arch/powerpc/mm/book3s64/slb.c                | 166 +++++++++++++-----
 4 files changed, 148 insertions(+), 45 deletions(-)
 create mode 100644 arch/powerpc/mm/book3s64/internal.h

v4: mpe: Relocate the code into arch/powerpc/mm, s/torture/stress/

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f2a93c8679e8..26ef1d74e642 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -871,6 +871,11 @@
 			can be useful when debugging issues that require an SLB
 			miss to occur.
 
+	stress_slb	[PPC]
+			Limits the number of kernel SLB entries, and flushes
+			them frequently to increase the rate of SLB faults
+			on kernel addresses.
+
 	disable=	[IPV6]
 			See Documentation/networking/ipv6.txt.
 
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 3d727f73a8db..622c6e8e9fa6 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -66,6 +66,9 @@
 
 #include <mm/mmu_decl.h>
 
+#include "internal.h"
+
+
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
 #else
@@ -870,6 +873,9 @@ static void __init htab_initialize(void)
 		printk(KERN_INFO "Using 1TB segments\n");
 	}
 
+	if (stress_slb_enabled)
+		static_branch_enable(&stress_slb_key);
+
 	/*
 	 * Calculate the required size of the htab.  We want the number of
 	 * PTEGs to equal one half the number of real pages.
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
new file mode 100644
index 000000000000..7eda0d30d765
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+
+#include <linux/jump_label.h>
+
+extern bool stress_slb_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_slb_key);
+
+static inline bool stress_slb(void)
+{
+	return static_branch_unlikely(&stress_slb_key);
+}
+
+#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 716204aee3da..8141e8b40ee5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -25,6 +25,9 @@
 #include <asm/udbg.h>
 #include <asm/code-patching.h>
 
+#include "internal.h"
+
+
 enum slb_index {
 	LINEAR_INDEX	= 0, /* Kernel linear map  (0xc000000000000000) */
 	KSTACK_INDEX	= 1, /* Kernel stack map */
@@ -54,6 +57,17 @@ static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
 	return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
 }
 
+bool stress_slb_enabled __initdata;
+
+static int __init parse_stress_slb(char *p)
+{
+	stress_slb_enabled = true;
+	return 0;
+}
+early_param("stress_slb", parse_stress_slb);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key);
+
 static void assert_slb_presence(bool present, unsigned long ea)
 {
 #ifdef CONFIG_DEBUG_VM
@@ -68,7 +82,7 @@ static void assert_slb_presence(bool present, unsigned long ea)
 	 * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
 	 * ignores all other bits from 0-27, so just clear them all.
 	 */
-	ea &= ~((1UL << 28) - 1);
+	ea &= ~((1UL << SID_SHIFT) - 1);
 	asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
 
 	WARN_ON(present == (tmp == 0));
@@ -153,14 +167,42 @@ void slb_flush_all_realmode(void)
 	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
 }
 
+static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside)
+{
+	struct slb_shadow *p = get_slb_shadow();
+	unsigned long ksp_esid_data, ksp_vsid_data;
+	u32 ih;
+
+	/*
+	 * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+	 * information created with Class=0 entries, which we use for kernel
+	 * SLB entries (the SLB entries themselves are still invalidated).
+	 *
+	 * Older processors will ignore this optimisation. Over-invalidation
+	 * is fine because we never rely on lookaside information existing.
+	 */
+	if (preserve_kernel_lookaside)
+		ih = 1;
+	else
+		ih = 0;
+
+	ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+	ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+	asm volatile(PPC_SLBIA(%0)"	\n"
+		     "slbmte	%1, %2	\n"
+		     :: "i" (ih),
+			"r" (ksp_vsid_data),
+			"r" (ksp_esid_data)
+		     : "memory");
+}
+
 /*
  * This flushes non-bolted entries, it can be run in virtual mode. Must
  * be called with interrupts disabled.
  */
 void slb_flush_and_restore_bolted(void)
 {
-	struct slb_shadow *p = get_slb_shadow();
-
 	BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
 
 	WARN_ON(!irqs_disabled());
@@ -171,13 +213,10 @@ void slb_flush_and_restore_bolted(void)
 	 */
 	hard_irq_disable();
 
-	asm volatile("isync\n"
-		     "slbia\n"
-		     "slbmte  %0, %1\n"
-		     "isync\n"
-		     :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
-			"r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
-		     : "memory");
+	isync();
+	__slb_flush_and_restore_bolted(false);
+	isync();
+
 	assert_slb_presence(true, get_paca()->kstack);
 
 	get_paca()->slb_cache_ptr = 0;
@@ -400,6 +439,30 @@ void preload_new_slb_context(unsigned long start, unsigned long sp)
 	local_irq_enable();
 }
 
+static void slb_cache_slbie_kernel(unsigned int index)
+{
+	unsigned long slbie_data = get_paca()->slb_cache[index];
+	unsigned long ksp = get_paca()->kstack;
+
+	slbie_data <<= SID_SHIFT;
+	slbie_data |= 0xc000000000000000ULL;
+	if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
+		return;
+	slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
+
+	asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+static void slb_cache_slbie_user(unsigned int index)
+{
+	unsigned long slbie_data = get_paca()->slb_cache[index];
+
+	slbie_data <<= SID_SHIFT;
+	slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+	slbie_data |= SLBIE_C; /* user slbs have C=1 */
+
+	asm volatile("slbie %0" : : "r" (slbie_data));
+}
 
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
@@ -414,8 +477,14 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 	 * which would update the slb_cache/slb_cache_ptr fields in the PACA.
 	 */
 	hard_irq_disable();
-	asm volatile("isync" : : : "memory");
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+	isync();
+	if (stress_slb()) {
+		__slb_flush_and_restore_bolted(false);
+		isync();
+		get_paca()->slb_cache_ptr = 0;
+		get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+
+	} else if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 		/*
 		 * SLBIA IH=3 invalidates all Class=1 SLBEs and their
 		 * associated lookaside structures, which matches what
@@ -423,47 +492,29 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 		 * cache.
 		 */
 		asm volatile(PPC_SLBIA(3));
+
 	} else {
 		unsigned long offset = get_paca()->slb_cache_ptr;
 
 		if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
 		    offset <= SLB_CACHE_ENTRIES) {
-			unsigned long slbie_data = 0;
-
-			for (i = 0; i < offset; i++) {
-				unsigned long ea;
-
-				ea = (unsigned long)
-					get_paca()->slb_cache[i] << SID_SHIFT;
-				/*
-				 * Could assert_slb_presence(true) here, but
-				 * hypervisor or machine check could have come
-				 * in and removed the entry at this point.
-				 */
-
-				slbie_data = ea;
-				slbie_data |= user_segment_size(slbie_data)
-						<< SLBIE_SSIZE_SHIFT;
-				slbie_data |= SLBIE_C; /* user slbs have C=1 */
-				asm volatile("slbie %0" : : "r" (slbie_data));
-			}
+			/*
+			 * Could assert_slb_presence(true) here, but
+			 * hypervisor or machine check could have come
+			 * in and removed the entry at this point.
+			 */
+
+			for (i = 0; i < offset; i++)
+				slb_cache_slbie_user(i);
 
 			/* Workaround POWER5 < DD2.1 issue */
 			if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
-				asm volatile("slbie %0" : : "r" (slbie_data));
+				slb_cache_slbie_user(0);
 
 		} else {
-			struct slb_shadow *p = get_slb_shadow();
-			unsigned long ksp_esid_data =
-				be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
-			unsigned long ksp_vsid_data =
-				be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
-			asm volatile(PPC_SLBIA(1) "\n"
-				     "slbmte	%0,%1\n"
-				     "isync"
-				     :: "r"(ksp_vsid_data),
-					"r"(ksp_esid_data));
+			/* Flush but retain kernel lookaside information */
+			__slb_flush_and_restore_bolted(true);
+			isync();
 
 			get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
 		}
@@ -503,7 +554,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 	 * address accesses by the kernel (user mode won't happen until
 	 * rfid, which is safe).
 	 */
-	asm volatile("isync" : : : "memory");
+	isync();
 }
 
 void slb_set_size(u16 size)
@@ -571,6 +622,9 @@ static void slb_cache_update(unsigned long esid_data)
 	if (cpu_has_feature(CPU_FTR_ARCH_300))
 		return; /* ISAv3.0B and later does not use slb_cache */
 
+	if (stress_slb())
+		return;
+
 	/*
 	 * Now update slb cache entries
 	 */
@@ -580,7 +634,7 @@ static void slb_cache_update(unsigned long esid_data)
 		 * We have space in slb cache for optimized switch_slb().
 		 * Top 36 bits from esid_data as per ISA
 		 */
-		local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+		local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
 		local_paca->slb_cache_ptr++;
 	} else {
 		/*
@@ -671,6 +725,28 @@ static long slb_insert_entry(unsigned long ea, unsigned long context,
 	 * accesses user memory before it returns to userspace with rfid.
 	 */
 	assert_slb_presence(false, ea);
+	if (stress_slb()) {
+		int slb_cache_index = local_paca->slb_cache_ptr;
+
+		/*
+		 * stress_slb() does not use slb cache, repurpose as a
+		 * cache of inserted (non-bolted) kernel SLB entries. All
+		 * non-bolted kernel entries are flushed on any user fault,
+		 * or if there are already 3 non-boled kernel entries.
+		 */
+		BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3);
+		if (!kernel || slb_cache_index == 3) {
+			int i;
+
+			for (i = 0; i < slb_cache_index; i++)
+				slb_cache_slbie_kernel(i);
+			slb_cache_index = 0;
+		}
+
+		if (kernel)
+			local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+		local_paca->slb_cache_ptr = slb_cache_index;
+	}
 	asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
 
 	barrier();
-- 
2.25.1


^ permalink raw reply related

* [PATCH v4 2/2] powerpc/64s/hash: Add stress_hpt kernel boot option to increase hash faults
From: Michael Ellerman @ 2020-05-11 12:58 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: npiggin
In-Reply-To: <20200511125825.3081305-1-mpe@ellerman.id.au>

From: Nicholas Piggin <npiggin@gmail.com>

This option increases the number of hash misses by limiting the number
of kernel HPT entries, by accessing the address immediately after
installing the PTE, then removing it again (except in the case of CI
entries that must not be accessed, these are removed on the next hash
fault).

This helps stress test difficult to hit paths in the kernel.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Relocate the code into arch/powerpc/mm, s/torture/stress/]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../admin-guide/kernel-parameters.txt         |  9 ++++
 arch/powerpc/mm/book3s64/hash_4k.c            |  6 +++
 arch/powerpc/mm/book3s64/hash_64k.c           | 11 ++++
 arch/powerpc/mm/book3s64/hash_utils.c         | 54 +++++++++++++++++++
 arch/powerpc/mm/book3s64/internal.h           | 10 ++++
 5 files changed, 90 insertions(+)

v4: mpe: Relocate the code into arch/powerpc/mm, s/torture/stress/

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 26ef1d74e642..c446a176f9c5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -876,6 +876,15 @@
 			them frequently to increase the rate of SLB faults
 			on kernel addresses.
 
+	stress_hpt	[PPC]
+			Limits the number of kernel HPT entries in the hash
+			page table to increase the rate of hash page table
+			faults on kernel addresses.
+
+			This may hang when run on processors / emulators which
+			do not have a TLB, or flush it more often than
+			required, QEMU seems to have problems.
+
 	disable=	[IPV6]
 			See Documentation/networking/ipv6.txt.
 
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
index 22e787123cdf..ff65b3028745 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -16,6 +16,9 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 
+#include "internal.h"
+
+
 int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		   pte_t *ptep, unsigned long trap, unsigned long flags,
 		   int ssize, int subpg_prot)
@@ -118,6 +121,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		}
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
 		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+		if (stress_hpt())
+			hpt_do_stress(ea, access, rflags, hpte_group);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
index 7084ce2951e6..11d21ec4e14d 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -16,6 +16,9 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 
+#include "internal.h"
+
+
 /*
  * Return true, if the entry has a slot value which
  * the software considers as invalid.
@@ -216,6 +219,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
 	new_pte |= H_PAGE_HASHPTE;
 
+	if (stress_hpt())
+		hpt_do_stress(ea, access, rflags, hpte_group);
+
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
@@ -327,7 +333,12 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
 		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+		if (stress_hpt())
+			hpt_do_stress(ea, access, rflags, hpte_group);
 	}
+
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 622c6e8e9fa6..f048d23338de 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -857,6 +857,20 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	pr_info("Partition table %p\n", partition_tb);
 }
 
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static bool stress_hpt_enabled __initdata;
+
+/* per-CPU array allocated if we enable stress_hpt. */
+static unsigned long *stress_hpt_last_group __ro_after_init;
+
+static int __init parse_stress_hpt(char *p)
+{
+	stress_hpt_enabled = true;
+	return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
 static void __init htab_initialize(void)
 {
 	unsigned long table;
@@ -876,6 +890,15 @@ static void __init htab_initialize(void)
 	if (stress_slb_enabled)
 		static_branch_enable(&stress_slb_key);
 
+	if (stress_hpt_enabled) {
+		unsigned long *tmp;
+		static_branch_enable(&stress_hpt_key);
+		tmp = memblock_alloc(sizeof(unsigned long) * NR_CPUS,
+				     sizeof(unsigned long));
+		memset(tmp, 0xff, sizeof(unsigned long) * NR_CPUS);
+		stress_hpt_last_group = tmp;
+	}
+
 	/*
 	 * Calculate the required size of the htab.  We want the number of
 	 * PTEGs to equal one half the number of real pages.
@@ -1860,6 +1883,37 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
 	return slot;
 }
 
+void hpt_do_stress(unsigned long ea, unsigned long access,
+		   unsigned long rflags, unsigned long hpte_group)
+{
+	unsigned long last_group;
+	int cpu = raw_smp_processor_id();
+
+	last_group = stress_hpt_last_group[cpu];
+	if (last_group != -1UL) {
+		while (mmu_hash_ops.hpte_remove(last_group) != -1)
+			;
+		stress_hpt_last_group[cpu] = -1UL;
+	}
+
+	if (ea >= PAGE_OFFSET) {
+		/*
+		 * We would really like to prefetch here to get the TLB loaded,
+		 * then remove the PTE before returning to userspace, to
+		 * increase the hash fault rate.
+		 *
+		 * Unfortunately QEMU TCG does not model the TLB in a way that
+		 * makes this possible, and systemsim (mambo) emulator does not
+		 * bring in TLBs with prefetches (although loads/stores do
+		 * work for non-CI PTEs).
+		 *
+		 * So remember this PTE and clear it on the next hash fault.
+		 */
+		stress_hpt_last_group[cpu] = hpte_group;
+	}
+}
+
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 {
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
index 7eda0d30d765..de824375b555 100644
--- a/arch/powerpc/mm/book3s64/internal.h
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -13,4 +13,14 @@ static inline bool stress_slb(void)
 	return static_branch_unlikely(&stress_slb_key);
 }
 
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+	return static_branch_unlikely(&stress_hpt_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long access,
+		   unsigned long rflags, unsigned long hpte_group);
+
 #endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v4 02/14] arm: add support for folded p4d page tables
From: Mike Rapoport @ 2020-05-11 14:15 UTC (permalink / raw)
  To: Marek Szyprowski, Andrew Morton
  Cc: Rich Felker, linux-ia64, Geert Uytterhoeven, Fenghua Yu, linux-mm,
	Paul Mackerras, Will Deacon, kvmarm, Jonas Bonn, Brian Cain,
	linux-hexagon, linux-sh, Russell King, Ley Foon Tan,
	Catalin Marinas, uclinux-h8-devel, linux-arch, Arnd Bergmann,
	Bartlomiej Zolnierkiewicz, Łukasz Stelmach, kvm-ppc,
	Stefan Kristiansson, openrisc, Stafford Horne, Guan Xuetao,
	linux-arm-kernel, Tony Luck, Yoshinori Sato, linux-kernel,
	Marc Zyngier, nios2-dev, linuxppc-dev, Mike Rapoport
In-Reply-To: <665dade8-727a-3318-6779-3998080da18f@samsung.com>

Hi Marek,

On Mon, May 11, 2020 at 08:36:41AM +0200, Marek Szyprowski wrote:
> Hi Mike,
> 
> On 08.05.2020 19:42, Mike Rapoport wrote:
> > On Fri, May 08, 2020 at 08:53:27AM +0200, Marek Szyprowski wrote:
> >> On 07.05.2020 18:11, Mike Rapoport wrote:
> >>> On Thu, May 07, 2020 at 02:16:56PM +0200, Marek Szyprowski wrote:
> >>>> On 14.04.2020 17:34, Mike Rapoport wrote:
> >>>>> From: Mike Rapoport <rppt@linux.ibm.com>
> >>>>>
> >>>>> Implement primitives necessary for the 4th level folding, add walks of p4d
> >>>>> level where appropriate, and remove __ARCH_USE_5LEVEL_HACK.
> >>>>>
> >>>>> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > Can you please try the patch below:
> >
> > diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> > index 963b5284d284..f86b3d17928e 100644
> > --- a/arch/arm/mm/init.c
> > +++ b/arch/arm/mm/init.c
> > @@ -571,7 +571,7 @@ static inline void section_update(unsigned long addr, pmdval_t mask,
> >   {
> >   	pmd_t *pmd;
> >   
> > -	pmd = pmd_off_k(addr);
> > +	pmd = pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, addr), addr), addr), addr);
> >   
> >   #ifdef CONFIG_ARM_LPAE
> >   	pmd[0] = __pmd((pmd_val(pmd[0]) & mask) | prot);
> This fixes kexec issue! Thanks!
> 
> 
> Feel free to add:
> 
> Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Fixes: 218f1c390557 ("arm: add support for folded p4d page tables")
> Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

Thanks for testing!

The patch is still in mmotm tree, so I don't think "Fixes" apply.

Andrew, would you like me to send the fix as a formal patch or will pick
it up as a fixup?

> Best regards
> -- 
> Marek Szyprowski, PhD
> Samsung R&D Institute Poland
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 31/31] module: move the set_fs hack for flush_icache_range to m68k
From: Christoph Hellwig @ 2020-05-11 15:11 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdU_OxNoKfO=i903kx0mgk0-i2h4u2ase3m9_dn6oFh_5g@mail.gmail.com>

On Mon, May 11, 2020 at 09:40:39AM +0200, Geert Uytterhoeven wrote:
> On Sun, May 10, 2020 at 9:57 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > flush_icache_range generally operates on kernel addresses, but for some
> > reason m68k needed a set_fs override.  Move that into the m68k code
> > insted of keeping it in the module loader.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
> Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>

Btw, do you know what part of flush_icache_range relied on set_fs?
Do any of the m68k maintainers have an idea how to handle that in
a nicer way when we can split the implementations?

^ permalink raw reply

* Re: sort out the flush_icache_range mess
From: Christoph Hellwig @ 2020-05-11 15:13 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdXazsBw0mjJd0uFHQud7qbb5-Uw-PTDB3+-M=huRWOfgQ@mail.gmail.com>

On Mon, May 11, 2020 at 09:46:17AM +0200, Geert Uytterhoeven wrote:
> Hi Christoph,
> 
> On Sun, May 10, 2020 at 9:55 AM Christoph Hellwig <hch@lst.de> wrote:
> > none of which really are used by a typical MMU enabled kernel, as a.out can
> > only be build for alpha and m68k to start with.
> 
> Quoting myself:
> "I think it's safe to assume no one still runs a.out binaries on m68k."
> http://lore.kernel.org/r/CAMuHMdW+m0Q+j3rsQdMXnrEPm+XB5Y2AQrxW5sD1mZAKgmEqoA@mail.gmail.com

Do you want to drop the:

    select HAVE_AOUT if MMU

for m68k then?

Note that we'll still need flush_icache_user_range for m68k with mmu,
as it also allows binfmt_flat for mmu configs.

^ permalink raw reply

* Re: [PATCH 02/31] arm64: fix the flush_icache_range arguments in machine_kexec
From: Christoph Hellwig @ 2020-05-11 15:15 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: linux-ia64, linux-sh, Roman Zippel, linux-mips, linux-mm,
	sparclinux, linux-riscv, Will Deacon, Christoph Hellwig,
	linux-arch, linux-c6x-dev, linux-hexagon, x86, linux-xtensa,
	Arnd Bergmann, Jessica Yu, linux-um, linux-m68k, openrisc,
	linux-arm-kernel, Michal Simek, linux-kernel, james.morse,
	linux-alpha, linux-fsdevel, Andrew Morton, linuxppc-dev
In-Reply-To: <20200511110014.GA19176@gaia>

On Mon, May 11, 2020 at 12:00:14PM +0100, Catalin Marinas wrote:
> Anyway, I think Christoph's patch needs to go in with a fixes tag:
> 
> Fixes: d28f6df1305a ("arm64/kexec: Add core kexec support")
> Cc: <stable@vger.kernel.org> # 4.8.x-
> 
> and we'll change these functions/helpers going forward for arm64.
> 
> Happy to pick this up via the arm64 for-next/fixes branch.

Please do, there are no dependencies on it in this series (I originally
planned to switch flush_icache_range to pass a kernel pointer + len
instead of the strange unsigned long start and end.  That still looks
very useful, but the series already is way too large, so I'm going to
defer that change for another merge window).

^ permalink raw reply

* Re: [PATCH 31/31] module: move the set_fs hack for flush_icache_range to m68k
From: Geert Uytterhoeven @ 2020-05-11 15:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <20200511151120.GA28634@lst.de>

Hi Christoph,

On Mon, May 11, 2020 at 5:11 PM Christoph Hellwig <hch@lst.de> wrote:
> On Mon, May 11, 2020 at 09:40:39AM +0200, Geert Uytterhoeven wrote:
> > On Sun, May 10, 2020 at 9:57 AM Christoph Hellwig <hch@lst.de> wrote:
> > >
> > > flush_icache_range generally operates on kernel addresses, but for some
> > > reason m68k needed a set_fs override.  Move that into the m68k code
> > > insted of keeping it in the module loader.
> > >
> > > Signed-off-by: Christoph Hellwig <hch@lst.de>
> >
> > Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
> > Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
>
> Btw, do you know what part of flush_icache_range relied on set_fs?
> Do any of the m68k maintainers have an idea how to handle that in
> a nicer way when we can split the implementations?

arch/m68k/mm/cache.c:virt_to_phys_slow()

All instructions that look up addresses in the page tables look at the
source/destination function codes (SFC/DFC) to know if they have to use
the supervisor or user page tables.
So the actual implementation is the same: set_fs() merely configures
SFC/DFC, to select the address space to use.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: sort out the flush_icache_range mess
From: Geert Uytterhoeven @ 2020-05-11 15:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <20200511151356.GB28634@lst.de>

Hi Christoph,

On Mon, May 11, 2020 at 5:14 PM Christoph Hellwig <hch@lst.de> wrote:
> On Mon, May 11, 2020 at 09:46:17AM +0200, Geert Uytterhoeven wrote:
> > On Sun, May 10, 2020 at 9:55 AM Christoph Hellwig <hch@lst.de> wrote:
> > > none of which really are used by a typical MMU enabled kernel, as a.out can
> > > only be build for alpha and m68k to start with.
> >
> > Quoting myself:
> > "I think it's safe to assume no one still runs a.out binaries on m68k."
> > http://lore.kernel.org/r/CAMuHMdW+m0Q+j3rsQdMXnrEPm+XB5Y2AQrxW5sD1mZAKgmEqoA@mail.gmail.com
>
> Do you want to drop the:
>
>     select HAVE_AOUT if MMU
>
> for m68k then?

If that helps to reduce maintenance, it's fine for me.
That leaves alpha as the sole user?

> Note that we'll still need flush_icache_user_range for m68k with mmu,
> as it also allows binfmt_flat for mmu configs.

Understood.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 03/31] MIPS: unexport __flush_icache_user_range
From: Thomas Bogendoerfer @ 2020-05-11 16:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-ia64, linux-sh, Roman Zippel, linux-mips, linux-mm,
	sparclinux, linux-riscv, linux-arch, linux-c6x-dev, linux-hexagon,
	x86, linux-xtensa, Arnd Bergmann, Jessica Yu, linux-um,
	linux-m68k, openrisc, linux-arm-kernel, Michal Simek,
	linux-kernel, linux-alpha, linux-fsdevel, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20200510075510.987823-4-hch@lst.de>

On Sun, May 10, 2020 at 09:54:42AM +0200, Christoph Hellwig wrote:
> __flush_icache_user_range is not used in modular code, so unexport it.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  arch/mips/mm/cache.c | 1 -
>  1 file changed, 1 deletion(-)

applied to mips-next.

Thomas.

-- 
Crap can work. Given enough thrust pigs will fly, but it's not necessarily a
good idea.                                                [ RFC1925, 2.3 ]

^ permalink raw reply

* Re: sort out the flush_icache_range mess
From: Christoph Hellwig @ 2020-05-11 16:35 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdU1xAmyWysi5xRoaRL7PFurPncvEL0CcEY0V_sUz3EJPw@mail.gmail.com>

On Mon, May 11, 2020 at 05:25:45PM +0200, Geert Uytterhoeven wrote:
> > Do you want to drop the:
> >
> >     select HAVE_AOUT if MMU
> >
> > for m68k then?
> 
> If that helps to reduce maintenance, it's fine for me.
> That leaves alpha as the sole user?

Yes.  And for alpha it isn't classic Linux a.out, but the OSF/1 ECOFF
format, which might lead to further cleanups eventually.

^ permalink raw reply

* Re: [PATCH 31/31] module: move the set_fs hack for flush_icache_range to m68k
From: Christoph Hellwig @ 2020-05-11 16:37 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-ia64@vger.kernel.org, Linux-sh list, Roman Zippel,
	open list:BROADCOM NVRAM DRIVER, Linux MM, sparclinux,
	linux-riscv, Christoph Hellwig, Linux-Arch, linux-c6x-dev,
	open list:QUALCOMM HEXAGON..., the arch/x86 maintainers,
	open list:TENSILICA XTENSA PORT (xtensa), Arnd Bergmann,
	Jessica Yu, linux-um, linux-m68k, Openrisc, Linux ARM,
	Michal Simek, Linux Kernel Mailing List, alpha, Linux FS Devel,
	Andrew Morton, linuxppc-dev
In-Reply-To: <CAMuHMdW1S91i3x0unNcJnypHse7ifynGb4dZcVhJaemR3GH1Pg@mail.gmail.com>

On Mon, May 11, 2020 at 05:24:30PM +0200, Geert Uytterhoeven wrote:
> > Btw, do you know what part of flush_icache_range relied on set_fs?
> > Do any of the m68k maintainers have an idea how to handle that in
> > a nicer way when we can split the implementations?
> 
> arch/m68k/mm/cache.c:virt_to_phys_slow()
> 
> All instructions that look up addresses in the page tables look at the
> source/destination function codes (SFC/DFC) to know if they have to use
> the supervisor or user page tables.
> So the actual implementation is the same: set_fs() merely configures
> SFC/DFC, to select the address space to use.

So instead of the magic instructions could we use the normal kernel
virt to phys helpers instead of switching the addresses space?  Something
like this patch on top of the series:

diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c
index 5ecb3310e8745..5a861a14c1e69 100644
--- a/arch/m68k/mm/cache.c
+++ b/arch/m68k/mm/cache.c
@@ -71,47 +71,87 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr)
 	return 0;
 }
 
-/* Push n pages at kernel virtual address and clear the icache */
-/* RZ: use cpush %bc instead of cpush %dc, cinv %ic */
-void flush_icache_user_range(unsigned long address, unsigned long endaddr)
+static inline void coldfire_flush_icache_range(unsigned long start,
+		unsigned long end)
 {
-	if (CPU_IS_COLDFIRE) {
-		unsigned long start, end;
-		start = address & ICACHE_SET_MASK;
-		end = endaddr & ICACHE_SET_MASK;
-		if (start > end) {
-			flush_cf_icache(0, end);
-			end = ICACHE_MAX_ADDR;
-		}
-		flush_cf_icache(start, end);
-	} else if (CPU_IS_040_OR_060) {
-		address &= PAGE_MASK;
-
-		do {
-			asm volatile ("nop\n\t"
-				      ".chip 68040\n\t"
-				      "cpushp %%bc,(%0)\n\t"
-				      ".chip 68k"
-				      : : "a" (virt_to_phys_slow(address)));
-			address += PAGE_SIZE;
-		} while (address < endaddr);
-	} else {
-		unsigned long tmp;
-		asm volatile ("movec %%cacr,%0\n\t"
-			      "orw %1,%0\n\t"
-			      "movec %0,%%cacr"
-			      : "=&d" (tmp)
-			      : "di" (FLUSH_I));
+	start &= ICACHE_SET_MASK;
+	end &= ICACHE_SET_MASK;
+
+	if (start > end) {
+		flush_cf_icache(0, end);
+		end = ICACHE_MAX_ADDR;
 	}
+	flush_cf_icache(start, end);
+}
+
+static inline void mc68040_flush_icache_user_range(unsigned long start,
+		unsigned long end)
+{
+	start &= PAGE_MASK;
+
+	do {
+		asm volatile ("nop\n\t"
+			      ".chip 68040\n\t"
+			      "cpushp %%bc,(%0)\n\t"
+			      ".chip 68k"
+			      : : "a" (virt_to_phys_slow(start)));
+		start += PAGE_SIZE;
+	} while (start < end);
+}
+
+static inline void mc68020_flush_icache_range(unsigned long start,
+		unsigned long end)
+{
+	unsigned long tmp;
+
+	asm volatile ("movec %%cacr,%0\n\t"
+		      "orw %1,%0\n\t"
+		      "movec %0,%%cacr"
+		      : "=&d" (tmp)
+		      : "di" (FLUSH_I));
+}
+
+void flush_icache_user_range(unsigned long start, unsigned long end)
+{
+	if (CPU_IS_COLDFIRE)
+		coldfire_flush_icache_range(start, end);
+	else if (CPU_IS_040_OR_060)
+		mc68040_flush_icache_user_range(start, end);
+	else
+		mc68020_flush_icache_range(start, end);
 }
 
-void flush_icache_range(unsigned long address, unsigned long endaddr)
+static inline void mc68040_flush_icache_range(unsigned long start,
+		unsigned long end)
 {
-	mm_segment_t old_fs = get_fs();
+	start &= PAGE_MASK;
+
+	do {
+		void *vaddr = (void *)start;
+		phys_addr_t paddr;
+
+		if (is_vmalloc_addr(vaddr))
+			paddr = page_to_phys(vmalloc_to_page(vaddr));
+		else
+			paddr = virt_to_phys(vaddr);
+
+		asm volatile ("nop\n\t"
+			      ".chip 68040\n\t"
+			      "cpushp %%bc,(%0)\n\t"
+			      ".chip 68k"
+			      : : "a" (paddr));
+		start += PAGE_SIZE;
+	} while (start < end);
+}
 
-	set_fs(KERNEL_DS);
-	flush_icache_user_range(address, endaddr);
-	set_fs(old_fs);
+void flush_icache_range(unsigned long start, unsigned long end)
+{
+	if (CPU_IS_COLDFIRE)
+		coldfire_flush_icache_range(start, end);
+	else if (CPU_IS_040_OR_060)
+		mc68040_flush_icache_range(start, end);
+	else
+		mc68020_flush_icache_range(start, end);
 }
 EXPORT_SYMBOL(flush_icache_range);
 

^ permalink raw reply related

* Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics
From: Jonathan Adams @ 2020-05-11 17:02 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Emanuele Giuseppe Esposito, linux-s390, kvm list,
	David Hildenbrand, Cornelia Huck, Emanuele Giuseppe Esposito,
	LKML, kvm-ppc, linux-mips, Christian Borntraeger, Alexander Viro,
	linux-fsdevel, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <29982969-92f6-b6d0-aeae-22edb401e3ac@redhat.com>

On Fri, May 8, 2020 at 2:44 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> [Answering for Emanuele because he's not available until Monday]
>
> On 07/05/20 19:45, Jonathan Adams wrote:
> > This is good work.  As David Rientjes mentioned, I'm currently investigating
> > a similar project, based on a google-internal debugfs-based FS we call
> > "metricfs".  It's
> > designed in a slightly different fashion than statsfs here is, and the
> > statistics exported are
> > mostly fed into our OpenTelemetry-like system.  We're motivated by
> > wanting an upstreamed solution, so that we can upstream the metrics we
> > create that are of general interest, and lower the overall rebasing
> > burden for our tree.
>
> Cool.  We included a public reading API exactly so that there could be
> other "frontends".  I was mostly thinking of BPF as an in-tree user, but
> your metricfs could definitely use the reading API.
>
> >  - the 8/16/32/64 signed/unsigned integers seems like a wart, and the
> > built-in support to grab any offset from a structure doesn't seem like
> > much of an advantage. A simpler interface would be to just support an> "integer" (possibly signed/unsigned) type, which is always 64-bit, and
> > allow the caller to provide a function pointer to retrieve the value,
> > with one or two void *s cbargs.  Then the framework could provide an
> > offset-based callback (or callbacks) similar to the existing
> > functionality, and a similar one for per-CPU based statistics.  A
> > second "clear" callback could be optionally provided to allow for
> > statistics to be cleared, as in your current proposal.
>
> Ok, so basically splitting get_simple_value into many separate
> callbacks.  The callbacks would be in a struct like
>
> struct stats_fs_type {
>         uint64_t (*get)(struct stats_fs_value *, void *);
>         void (*clear)(struct stats_fs_value *, void *);
>         bool signed;
> }
...
> struct stats_fs_type stats_fs_type_u8 = {
>         stats_fs_get_u8,
>         stats_fs_clear_u8,
>         false
> };
>
> and custom types can be defined using "&(struct stats_fs_type) {...}".

That makes sense.

> >  - Beyond the statistic's type, one *very* useful piece of metadata
> > for telemetry tools is knowing whether a given statistic is
> > "cumulative" (an unsigned counter which is only ever increased), as
> > opposed to a floating value (like "amount of memory used").
>
> Good idea.  Also, clearing does not make sense for a floating value, so
> we can use cumulative/floating to get a default for the mode: KVM
> statistics for example are mostly cumulative and mode 644, except a few
> that are floating and those are all mode 444.  Therefore it makes sense
> to add cumulative/floating even before outputting it as metadata.
>
> > I'm more
> > concerned with getting the statistics model and capabilities right
> > from the beginning, because those are harder to adjust later.
>
> Agreed.
>
> > 1. Each metricfs metric can have one or two string or integer "keys".
> > If these exist, they expand the metric from a single value into a
> > multi-dimensional table. For example, we use this to report a hash
> > table we keep of functions calling "WARN()", in a 'warnings'
> > statistic:
> >
> > % cat .../warnings/values
> > x86_pmu_stop 1
> > %
> >
> > Indicates that the x86_pmu_stop() function has had a WARN() fire once
> > since the system was booted.  If multiple functions have fired
> > WARN()s, they are listed in this table with their own counts. [1]  We
> > also use these to report per-CPU counters on a CPU-by-CPU basis:
> >
> > % cat .../irq_x86/NMI/values
> > 0 42
> > 1 18
> > ... one line per cpu
> > % cat .../rx_bytes/values
> > lo 501360681
> > eth0 1457631256
>
> These seem like two different things.

I see your point; I agree that there are two different things here.

> The percpu and per-interface values are best represented as subordinate
> sources, one per CPU and one per interface.  For interfaces I would just
> use a separate directory, but it doesn't really make sense for CPUs.  So
> if we can cater for it in the model, it's better.  For example:
>
> - add a new argument to statsfs_create_source and statsfs_create_values
> that makes it not create directories and files respectively.
>
> - add a new "aggregate function" STATS_FS_LIST that directs the parent
> to build a table of all the simple values below it
>
> We can also add a helper statsfs_add_values_percpu that creates a new
> source for each CPU, I think.

I think I'd characterize this slightly differently; we have a set of
statistics which are essentially "in parallel":

  - a variety of statistics, N CPUs they're available for, or
  - a variety of statistics, N interfaces they're available for.
  - a variety of statistics, N kvm object they're available for.

Recreating a parallel hierarchy of statistics any time we add/subtract
a CPU or interface seems like a lot of overhead.  Perhaps a better
model would
be some sort of "parameter enumn" (naming is hard; parameter set?), so
when a CPU/network interface/etc is added you'd add its ID to the
"CPUs" we know about, and at removal time you'd take it out; it would
have an associated cbarg for the value getting callback.

Does that make sense as a design?

I'm working on characterizing all of our metricfs usage; I'll see if
this looks like it mostly covers our usecases.

> The warnings one instead is a real hash table.  It should be possible to
> implement it as some kind of customized aggregation, that is implemented
> in the client instead of coming from subordinate sources.  The
> presentation can then just use STATS_FS_LIST.  I don't see anything in
> the design that is a blocker.

Yes; though if it's low-enough overhead, you could imagine having a
dynamically-updated parameter enum based on the hash table.

> > 2.  We also export some metadata about each statistic.  For example,
> > the metadata for the NMI counter above looks like:
> >
> > % cat .../NMI/annotations
> > DESCRIPTION Non-maskable\ interrupts
> > CUMULATIVE
> > % cat .../NMI/fields
> > cpu value
> > int int
> > %
>
> Good idea.  I would prefer per-directory dot-named files for this.  For
> example a hypothetical statsfs version of /proc/interrupts could be like
> this:
>
> $ cat /sys/kernel/stats/interrupts/.schema
> 0                                          // Name
> CUMULATIVE                                 // Flags
> int:int                                    // Type(s)
> IR-IO-APIC    2-edge      timer            // Description
> ...
> LOC
> CUMULATIVE
> int:int
> Local timer interrupts
> ...
> $ cat /sys/kernel/stats/interrupts/LOC
> 0 4286815
> 1 4151572
> 2 4199361
> 3 4229248
>
> > 3. We have a (very few) statistics where the value itself is a string,
> > usually for device statuses.
>
> Maybe in addition to CUMULATIVE and FLOATING we can have ENUM
> properties, and a table to convert those enums to strings.  Aggregation
> could also be used to make a histogram out of enums in subordinate
> sources, e.g.
>
> $ cat /sys/kernel/stats/kvm/637-1/vcpu_state
> running 12
> uninitialized 0
> halted 4

That's along similar lines to the parameter enums, yeah.

> So in general I'd say the sources/values model holds up.  We certainly
> want to:
>
> - switch immediately to callbacks instead of the type constants (so that
> core statsfs code only does signed/unsigned)
>
> - add a field to distinguish cumulative and floating properties (and use
> it to determine the default file mode)

Yup, these make sense.

> - add a new argument to statsfs_create_source and statsfs_create_values
> that makes it not create directories and files respectively
>
> - add a new API to look for a statsfs_value recursively in all the
> subordinate sources, and pass the source/value pair to a callback
> function; and reimplement recursive aggregation and clear in terms of
> this function.

This is where I think a little iteration on the "parameter enums"
should happen before jumping into implementation.

> > For our use cases, we generally don't both output a statistic and it's
> > aggregation from the kernel; either we sum up things in the kernel
> > (e.g. over a bunch of per-cpu or per-memcg counters) and only have the
> > result statistic, or we expect user-space to sum up the data if it's
> > interested.  The tabular form makes it pretty easy to do so (i.e. you
> > can use awk(1) to sum all of the per-cpu NMI counters).
>
> Yep, the above "not create a dentry" flag would handle the case where
> you sum things up in the kernel because the more fine grained counters
> would be overwhelming.

nodnod; or the callback could handle the sum itself.

Thanks,
- jonathan

^ permalink raw reply

* Re: [PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics
From: Paolo Bonzini @ 2020-05-11 17:34 UTC (permalink / raw)
  To: Jonathan Adams
  Cc: Emanuele Giuseppe Esposito, linux-s390, kvm list,
	David Hildenbrand, Cornelia Huck, Emanuele Giuseppe Esposito,
	LKML, kvm-ppc, linux-mips, Christian Borntraeger, Alexander Viro,
	linux-fsdevel, Vitaly Kuznetsov, linuxppc-dev, Jim Mattson
In-Reply-To: <CA+VK+GOccmwVov9Fx1eMZkzivBduWRuoyAuCRtjMfM4LemRkgw@mail.gmail.com>

Hi Jonathan, I think the remaining sticky point is this one:

On 11/05/20 19:02, Jonathan Adams wrote:
> I think I'd characterize this slightly differently; we have a set of
> statistics which are essentially "in parallel":
> 
>   - a variety of statistics, N CPUs they're available for, or
>   - a variety of statistics, N interfaces they're available for.
>   - a variety of statistics, N kvm object they're available for.
> 
> Recreating a parallel hierarchy of statistics any time we add/subtract
> a CPU or interface seems like a lot of overhead.  Perhaps a better 
> model would be some sort of "parameter enumn" (naming is hard;
> parameter set?), so when a CPU/network interface/etc is added you'd
> add its ID to the "CPUs" we know about, and at removal time you'd
> take it out; it would have an associated cbarg for the value getting
> callback.
> 
>> Yep, the above "not create a dentry" flag would handle the case where
>> you sum things up in the kernel because the more fine grained counters
>> would be overwhelming.
>
> nodnod; or the callback could handle the sum itself.

In general for statsfs we took a more explicit approach where each
addend in a sum is a separate stats_fs_source.  In this version of the
patches it's also a directory, but we'll take your feedback and add both
the ability to hide directories (first) and to list values (second).

So, in the cases of interfaces and KVM objects I would prefer to keep
each addend separate.

For CPUs that however would be pretty bad.  Many subsystems might
accumulate stats percpu for performance reason, which would then be
exposed as the sum (usually).  So yeah, native handling of percpu values
makes sense.  I think it should fit naturally into the same custom
aggregation framework as hash table keys, we'll see if there's any devil
in the details.

Core kernel stats such as /proc/interrupts or /proc/stat are the
exception here, since individual per-CPU values can be vital for
debugging.  For those, creating a source per stat, possibly on-the-fly
at hotplug/hot-unplug time because NR_CPUS can be huge, would still be
my preferred way to do it.

Thanks,

Paolo

^ permalink raw reply

* Re: [PATCH v2 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline
From: Srikar Dronamraju @ 2020-05-11 17:47 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Linus Torvalds, linux-kernel, Michal Hocko, linux-mm, Mel Gorman,
	Kirill A. Shutemov, Andrew Morton, linuxppc-dev,
	Christopher Lameter, Vlastimil Babka
In-Reply-To: <ce9d47bc-f92c-dd22-0d59-e8d59c913526@redhat.com>

* David Hildenbrand <david@redhat.com> [2020-05-08 15:42:12]:

Hi David,

Thanks for the steps to tryout.

> > 
> > #! /bin/bash
> > sudo x86_64-softmmu/qemu-system-x86_64 \
> >     --enable-kvm \
> >     -m 4G,maxmem=20G,slots=2 \
> >     -smp sockets=2,cores=2 \
> >     -numa node,nodeid=0,cpus=0-1,mem=4G -numa node,nodeid=1,cpus=2-3,mem=0G \
> 
> Sorry, this line has to be
> 
> -numa node,nodeid=0,cpus=0-3,mem=4G -numa node,nodeid=1,mem=0G \
> 
> >     -kernel /home/dhildenb/git/linux/arch/x86_64/boot/bzImage \
> >     -append "console=ttyS0 rd.shell rd.luks=0 rd.lvm=0 rd.md=0 rd.dm=0" \
> >     -initrd /boot/initramfs-5.2.8-200.fc30.x86_64.img \
> >     -machine pc,nvdimm \
> >     -nographic \
> >     -nodefaults \
> >     -chardev stdio,id=serial \
> >     -device isa-serial,chardev=serial \
> >     -chardev socket,id=monitor,path=/var/tmp/monitor,server,nowait \
> >     -mon chardev=monitor,mode=readline
> > 
> > to get a cpu-less and memory-less node 1. Never tried with node 0.
> > 

I tried 

qemu-system-x86_64 -enable-kvm -m 4G,maxmem=20G,slots=2 -smp sockets=2,cores=2 -cpu host -numa node,nodeid=0,cpus=0-3,mem=4G -numa node,nodeid=1,mem=0G -vga none -nographic -serial mon:stdio /home/srikar/fedora.qcow2

and the resulting guest was.

[root@localhost ~]# numactl -H
available: 1 nodes (0)
node 0 cpus: 0 1 2 3
node 0 size: 3927 MB
node 0 free: 3316 MB
node distances:
node   0
  0:  10

[root@localhost ~]# lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
Address sizes:       40 bits physical, 48 bits virtual
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  1
Core(s) per socket:  2
Socket(s):           2
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               46
Model name:          Intel(R) Xeon(R) CPU           X7560  @ 2.27GHz
Stepping:            6
CPU MHz:             2260.986
BogoMIPS:            4521.97
Virtualization:      VT-x
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            4096K
L3 cache:            16384K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology cpuid tsc_known_freq pni vmx ssse3 cx16 sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer hypervisor lahf_lm cpuid_fault pti ssbd ibrs ibpb tpr_shadow vnmi flexpriority ept vpid tsc_adjust arat umip arch_capabilities

[root@localhost ~]# cat /sys/devices/system/node/online
0
[root@localhost ~]# cat /sys/devices/system/node/possible
0-1

---------------------------------------------------------------------------------

I also tried

qemu-system-x86_64 -enable-kvm -m 4G,maxmem=20G,slots=2 -smp sockets=2,cores=2 -cpu host -numa node,nodeid=1,cpus=0-3,mem=4G -numa node,nodeid=0,mem=0G -vga none -nographic -serial mon:stdio /home/srikar/fedora.qcow2

and the resulting guest was.

[root@localhost ~]# numactl -H
available: 1 nodes (0)
node 0 cpus: 0 1 2 3
node 0 size: 3927 MB
node 0 free: 3316 MB
node distances:
node   0
  0:  10

[root@localhost ~]# lscpu
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
Address sizes:       40 bits physical, 48 bits virtual
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  1
Core(s) per socket:  2
Socket(s):           2
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               46
Model name:          Intel(R) Xeon(R) CPU           X7560  @ 2.27GHz
Stepping:            6
CPU MHz:             2260.986
BogoMIPS:            4521.97
Virtualization:      VT-x
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            4096K
L3 cache:            16384K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology cpuid tsc_known_freq pni vmx ssse3 cx16 sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer hypervisor lahf_lm cpuid_fault pti ssbd ibrs ibpb tpr_shadow vnmi flexpriority ept vpid tsc_adjust arat umip arch_capabilities

[root@localhost ~]# cat /sys/devices/system/node/online
0
[root@localhost ~]# cat /sys/devices/system/node/possible
0-1

Even without my patch, both the combinations, I am still unable to see a
cpuless, memoryless node being online. And the interesting part being even
if I mark node 0 as cpuless,memoryless and node 1 as actual node, the system
somewhere marks node 0 as the actual node.

> 
> David / dhildenb
> 

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH V3 2/3] mm/hugetlb: Define a generic fallback for is_hugepage_only_range()
From: Mike Kravetz @ 2020-05-11 18:52 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm, akpm
  Cc: Rich Felker, linux-ia64, linux-sh, Catalin Marinas,
	Heiko Carstens, linux-kernel, James E.J. Bottomley,
	Paul Mackerras, H. Peter Anvin, sparclinux, linux-riscv,
	Will Deacon, linux-arch, linux-s390, Yoshinori Sato, Helge Deller,
	x86, Russell King, Christian Borntraeger, Ingo Molnar, Fenghua Yu,
	Vasily Gorbik, Thomas Bogendoerfer, Borislav Petkov,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Tony Luck,
	linux-parisc, linux-mips, Palmer Dabbelt, linuxppc-dev,
	David S. Miller
In-Reply-To: <c21ab871-da06-baf6-ba31-80b13402b8c9@arm.com>

On 5/10/20 8:14 PM, Anshuman Khandual wrote:
> On 05/09/2020 03:52 AM, Mike Kravetz wrote:
>> On 5/7/20 8:07 PM, Anshuman Khandual wrote:
>>
>> Did you try building without CONFIG_HUGETLB_PAGE defined?  I'm guessing
> 
> Yes I did for multiple platforms (s390, arm64, ia64, x86, powerpc etc).
> 
>> that you need a stub for is_hugepage_only_range().  Or, perhaps add this
>> to asm-generic/hugetlb.h?
>>
> There is already a stub (include/linux/hugetlb.h) when !CONFIG_HUGETLB_PAGE.
> 

Thanks!  I missed that stub in the existing code.  I like the removal of
redundant code.

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>

-- 
Mike Kravetz

^ permalink raw reply

* Re: [PATCH v8 5/5] powerpc/hv-24x7: Update post_mobility_fixup() to handle migration
From: Nathan Lynch @ 2020-05-11 19:40 UTC (permalink / raw)
  To: Kajol Jain
  Cc: ravi.bangoria, maddy, anju, peterz, gregkh, suka,
	alexander.shishkin, mingo, mpetlan, yao.jin, ak, mamatha4, acme,
	jmario, namhyung, linuxppc-dev, jolsa, kan.liang
In-Reply-To: <20200506110737.14904-6-kjain@linux.ibm.com>

Hello,

Kajol Jain <kjain@linux.ibm.com> writes:
> Function 'read_sys_info_pseries()' is added to get system parameter
> values like number of sockets and chips per socket.
> and it gets these details via rtas_call with token
> "PROCESSOR_MODULE_INFO".
>
> Incase lpar migrate from one system to another, system
> parameter details like chips per sockets or number of sockets might
> change. So, it needs to be re-initialized otherwise, these values
> corresponds to previous system values.
> This patch adds a call to 'read_sys_info_pseries()' from
> 'post-mobility_fixup()' to re-init the physsockets and physchips values
>
> Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/mobility.c | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)

Please cc me on patches for this code, thanks.

I see no technical problems with how this patch handles partition
migration. However:

"Update post_mobility_fixup() to handle migration" is not an appropriate
summary for this change. post_mobility_fixup() already handles
migration. A better summary would be

"powerpc/pseries: update hv-24x7 info after migration"


> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
> index b571285f6c14..0fb8f1e6e9d2 100644
> --- a/arch/powerpc/platforms/pseries/mobility.c
> +++ b/arch/powerpc/platforms/pseries/mobility.c
> @@ -42,6 +42,12 @@ struct update_props_workarea {
>  #define MIGRATION_SCOPE	(1)
>  #define PRRN_SCOPE -2
>  
> +#ifdef CONFIG_HV_PERF_CTRS
> +void read_sys_info_pseries(void);
> +#else
> +static inline void read_sys_info_pseries(void) { }
> +#endif

This should go in a header.


>  static int mobility_rtas_call(int token, char *buf, s32 scope)
>  {
>  	int rc;
> @@ -371,6 +377,16 @@ void post_mobility_fixup(void)
>  	/* Possibly switch to a new RFI flush type */
>  	pseries_setup_rfi_flush();
>  
> +	/*
> +	 * In case an Lpar migrates from one system to another, system
> +	 * parameter details like chips per sockets, cores per chip and
> +	 * number of sockets details might change.
> +	 * So, they needs to be re-initialized otherwise the
> +	 * values will correspond to the previous system.
> +	 * Call read_sys_info_pseries() to reinitialise the values.
> +	 */

This is needlessly verbose; any literate reader of this code knows this
is used immediately after resuming from a suspend (migration). If you
give your hook a more descriptive name, the comment becomes unnecessary.


> +	read_sys_info_pseries();
> +


^ permalink raw reply

* Re: [PATCH V3 3/3] mm/hugetlb: Define a generic fallback for arch_clear_hugepage_flags()
From: Mike Kravetz @ 2020-05-11 20:22 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm, akpm
  Cc: Rich Felker, linux-ia64, linux-sh, Catalin Marinas,
	Heiko Carstens, linux-kernel, James E.J. Bottomley,
	Paul Mackerras, H. Peter Anvin, sparclinux, linux-riscv,
	Will Deacon, linux-arch, linux-s390, Yoshinori Sato, Helge Deller,
	x86, Russell King, Christian Borntraeger, Ingo Molnar, Fenghua Yu,
	Vasily Gorbik, Thomas Bogendoerfer, Borislav Petkov,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Tony Luck,
	linux-parisc, linux-mips, Palmer Dabbelt, linuxppc-dev,
	David S. Miller
In-Reply-To: <1588907271-11920-4-git-send-email-anshuman.khandual@arm.com>

On 5/7/20 8:07 PM, Anshuman Khandual wrote:
> There are multiple similar definitions for arch_clear_hugepage_flags() on
> various platforms. Lets just add it's generic fallback definition for
> platforms that do not override. This help reduce code duplication.
> 
> Cc: Russell King <linux@armlinux.org.uk>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Tony Luck <tony.luck@intel.com>
> Cc: Fenghua Yu <fenghua.yu@intel.com>
> Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
> Cc: Helge Deller <deller@gmx.de>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Paul Walmsley <paul.walmsley@sifive.com>
> Cc: Palmer Dabbelt <palmer@dabbelt.com>
> Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
> Cc: Vasily Gorbik <gor@linux.ibm.com>
> Cc: Christian Borntraeger <borntraeger@de.ibm.com>
> Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
> Cc: Rich Felker <dalias@libc.org>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Borislav Petkov <bp@alien8.de>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Mike Kravetz <mike.kravetz@oracle.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: x86@kernel.org
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-ia64@vger.kernel.org
> Cc: linux-mips@vger.kernel.org
> Cc: linux-parisc@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-riscv@lists.infradead.org
> Cc: linux-s390@vger.kernel.org
> Cc: linux-sh@vger.kernel.org
> Cc: sparclinux@vger.kernel.org
> Cc: linux-mm@kvack.org
> Cc: linux-arch@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>

Thanks!
Removing duplicate code is good.

Acked-by: Mike Kravetz <mike.kravetz@oracle.com>

-- 
Mike Kravetz

^ permalink raw reply

* Re: [PATCH v8 2/5] powerpc/hv-24x7: Add rtas call in hv-24x7 driver to get processor details
From: Nathan Lynch @ 2020-05-11 21:07 UTC (permalink / raw)
  To: Kajol Jain
  Cc: ravi.bangoria, maddy, anju, peterz, gregkh, suka,
	alexander.shishkin, mingo, mpetlan, yao.jin, ak, mamatha4, acme,
	jmario, namhyung, linuxppc-dev, jolsa, kan.liang
In-Reply-To: <20200506110737.14904-3-kjain@linux.ibm.com>

Hi,

Kajol Jain <kjain@linux.ibm.com> writes:
> diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
> index 48e8f4b17b91..8cf242aad98f 100644
> --- a/arch/powerpc/perf/hv-24x7.c
> +++ b/arch/powerpc/perf/hv-24x7.c
> @@ -20,6 +20,7 @@
>  #include <asm/io.h>
>  #include <linux/byteorder/generic.h>
>  
> +#include <asm/rtas.h>
>  #include "hv-24x7.h"
>  #include "hv-24x7-catalog.h"
>  #include "hv-common.h"
> @@ -57,6 +58,75 @@ static bool is_physical_domain(unsigned domain)
>  	}
>  }
>  
> +/*
> + * The Processor Module Information system parameter allows transferring
> + * of certain processor module information from the platform to the OS.
> + * Refer PAPR+ document to get parameter token value as '43'.
> + */
> +
> +#define PROCESSOR_MODULE_INFO   43
> +#define PROCESSOR_MAX_LENGTH	(8 * 1024)
> +
> +DEFINE_SPINLOCK(rtas_local_data_buf_lock);
> +EXPORT_SYMBOL(rtas_local_data_buf_lock);

This should be static and not exported, correct?

> +
> +static u32 phys_sockets;	/* Physical sockets */
> +static u32 phys_chipspersocket;	/* Physical chips per socket*/
> +static u32 phys_coresperchip; /* Physical cores per chip */
> +
> +/*
> + * Function read_sys_info_pseries() make a rtas_call which require
> + * data buffer of size 8K. As standard 'rtas_data_buf' is of size
> + * 4K, we are adding new local buffer 'rtas_local_data_buf'.

Sorry if this has been covered before but I don't understand why it
would require a larger buffer; by my reading this call will return *ten
bytes* of output. Also, current versions of PAPR+ limit the output
length to 4002 bytes. I feel like I'm missing something.


> + */
> +static __be16 rtas_local_data_buf[PROCESSOR_MAX_LENGTH] __cacheline_aligned;
> +
> +/*
> + * read_sys_info_pseries()
> + * Retrieve the number of sockets and chips per socket and cores per
> + * chip details through the get-system-parameter rtas call.
> + */
> +void read_sys_info_pseries(void)
> +{
> +	int call_status, len, ntypes;
> +
> +	/*
> +	 * Making system parameter: chips and sockets and cores per chip
> +	 * default to 1.
> +	 */
> +	phys_sockets = 1;
> +	phys_chipspersocket = 1;
> +	phys_coresperchip = 1;
> +	memset(rtas_local_data_buf, 0, PROCESSOR_MAX_LENGTH * sizeof(__be16));

Modifying global state outside of any critical section...? How do
you prevent readers from seeing inconsistent results?


> +	spin_lock(&rtas_local_data_buf_lock);
> +
> +	call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> +				NULL,
> +				PROCESSOR_MODULE_INFO,
> +				__pa(rtas_local_data_buf),
> +				PROCESSOR_MAX_LENGTH);
> +
> +	spin_unlock(&rtas_local_data_buf_lock);

Using this lock this way fails to provide any protection to the data
buffer or the phys_* variables.


> +
> +	if (call_status != 0) {
> +		pr_info("Error calling get-system-parameter (0x%x)\n",
> +			call_status);

To be robust, this should handle busy (-2) and extended delay (990x)
statuses. And if it's going to log errors it should use pr_err() and use
decimal, not hex, to report the RTAS call status, since that's how
they're specified in PAPR+.


^ permalink raw reply

* Re: [PATCH RFC 2/4] powerpc: Add Microwatt platform
From: Michael Ellerman @ 2020-05-12  1:56 UTC (permalink / raw)
  To: Paul Mackerras, linuxppc-dev, Benjamin Herrenschmidt,
	Michael Neuling, Anton Blanchard
In-Reply-To: <20200509050255.GC1464954@thinks.paulus.ozlabs.org>

Paul Mackerras <paulus@ozlabs.org> writes:
> Microwatt is a FPGA-based implementation of the Power ISA.  It
> currently only implements little-endian 64-bit mode, and does
> not (yet) support SMP.

... or FP or VSX or Altivec?

What about transactional memory?

> This adds a new machine type to support FPGA-based SoCs with a
> Microwatt core.
>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
>  arch/powerpc/Kconfig                      |    2 +-
>  arch/powerpc/configs/microwatt_defconfig  | 1418 +++++++++++++++++++++
>  arch/powerpc/platforms/Kconfig            |    1 +
>  arch/powerpc/platforms/Makefile           |    1 +
>  arch/powerpc/platforms/microwatt/Kconfig  |    9 +
>  arch/powerpc/platforms/microwatt/Makefile |    1 +
>  arch/powerpc/platforms/microwatt/setup.c  |   40 +
>  7 files changed, 1471 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/configs/microwatt_defconfig
>  create mode 100644 arch/powerpc/platforms/microwatt/Kconfig
>  create mode 100644 arch/powerpc/platforms/microwatt/Makefile
>  create mode 100644 arch/powerpc/platforms/microwatt/setup.c
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 497b7d0b2d7e..97286b8312f5 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -407,7 +407,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
>  
>  config MATH_EMULATION
>  	bool "Math emulation"
> -	depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
> +	depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE || PPC_MICROWATT
>  	help
>  	  Some PowerPC chips designed for embedded applications do not have
>  	  a floating-point unit and therefore do not implement the
> diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig
> new file mode 100644
> index 000000000000..f4f4c965a786
> --- /dev/null
> +++ b/arch/powerpc/configs/microwatt_defconfig
> @@ -0,0 +1,1418 @@
> +#
> +# Automatically generated file; DO NOT EDIT.
> +# Linux/powerpc 5.6.0 Kernel Configuration
> +#
> +
> +#
> +# Compiler: powerpc64le-linux-gnu-gcc (GCC) 9.2.1 20190827 (Red Hat Cross 9.2.1-1)
> +#
> +CONFIG_CC_IS_GCC=y
> +CONFIG_GCC_VERSION=90201
> +CONFIG_CLANG_VERSION=0
> +CONFIG_CC_HAS_ASM_GOTO=y
> +CONFIG_CC_HAS_ASM_INLINE=y
> +CONFIG_CC_HAS_WARN_MAYBE_UNINITIALIZED=y
> +CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED=y
> +CONFIG_IRQ_WORK=y
> +CONFIG_BUILDTIME_TABLE_SORT=y
> +CONFIG_THREAD_INFO_IN_TASK=y

This should be a minimised config generated with savedefconfig.

> diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
> index 1f8025383caa..5b701acc6afa 100644
> --- a/arch/powerpc/platforms/Kconfig
> +++ b/arch/powerpc/platforms/Kconfig
> @@ -20,6 +20,7 @@ source "arch/powerpc/platforms/embedded6xx/Kconfig"
>  source "arch/powerpc/platforms/44x/Kconfig"
>  source "arch/powerpc/platforms/40x/Kconfig"
>  source "arch/powerpc/platforms/amigaone/Kconfig"
> +source "arch/powerpc/platforms/microwatt/Kconfig"
>  
>  config KVM_GUEST
>  	bool "KVM Guest support"
> diff --git a/arch/powerpc/platforms/Makefile b/arch/powerpc/platforms/Makefile
> index 143d4417f6cc..edcb54cdb1a8 100644
> --- a/arch/powerpc/platforms/Makefile
> +++ b/arch/powerpc/platforms/Makefile
> @@ -22,3 +22,4 @@ obj-$(CONFIG_PPC_CELL)		+= cell/
>  obj-$(CONFIG_PPC_PS3)		+= ps3/
>  obj-$(CONFIG_EMBEDDED6xx)	+= embedded6xx/
>  obj-$(CONFIG_AMIGAONE)		+= amigaone/
> +obj-$(CONFIG_PPC_MICROWATT)	+= microwatt/
> diff --git a/arch/powerpc/platforms/microwatt/Kconfig b/arch/powerpc/platforms/microwatt/Kconfig
> new file mode 100644
> index 000000000000..981f722ae9ce
> --- /dev/null
> +++ b/arch/powerpc/platforms/microwatt/Kconfig
> @@ -0,0 +1,9 @@
> +# SPDX-License-Identifier: GPL-2.0
> +config PPC_MICROWATT
> +	depends on PPC64 && PPC_BOOK3S

That can just be PPC_BOOK3S_64

I think it should also depend on !CPU_BIG_ENDIAN && !SMP ?

Should it also select MATH_EMULATION?
And MATH_EMULATION_FULL ?

> +	bool "Microwatt SoC platform"
> +	select PPC_XICS
> +	select PPC_NATIVE
> +	help
> +          This option enables support for FPGA-based Microwatt implementations.
> +
> diff --git a/arch/powerpc/platforms/microwatt/Makefile b/arch/powerpc/platforms/microwatt/Makefile
> new file mode 100644
> index 000000000000..e6885b3b2ee7
> --- /dev/null
> +++ b/arch/powerpc/platforms/microwatt/Makefile
> @@ -0,0 +1 @@
> +obj-y	+= setup.o
> diff --git a/arch/powerpc/platforms/microwatt/setup.c b/arch/powerpc/platforms/microwatt/setup.c
> new file mode 100644
> index 000000000000..3cfc5955a6fe
> --- /dev/null
> +++ b/arch/powerpc/platforms/microwatt/setup.c
> @@ -0,0 +1,40 @@
> +/*
> + * Microwatt FPGA-based SoC platform setup code.
> + *
> + * Copyright 2020 Paul Mackerras (paulus@ozlabs.org), IBM Corp.
> + */
> +
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/stddef.h>
> +#include <linux/init.h>
> +#include <linux/of.h>
> +#include <asm/machdep.h>
> +#include <asm/time.h>
> +
> +static void __init microwatt_calibrate_decr(void)
> +{
> +	ppc_tb_freq = 100000000;
> +	ppc_proc_freq = 100000000;
> +}

Why can't those come via the device tree?

> +
> +static void __init microwatt_setup_arch(void)
> +{
> +}

That can be NULL in ppc_md.

> +static void __init microwatt_init_IRQ(void)
> +{
> +}

Can be NULL in ppc_md.

> +static int __init microwatt_probe(void)
> +{
> +	return of_machine_is_compatible("microwatt-soc");
> +}
> +
> +define_machine(microwatt) {
> +	.name			= "microwatt",
> +	.probe			= microwatt_probe,
> +	.setup_arch		= microwatt_setup_arch,
> +	.init_IRQ		= microwatt_init_IRQ,
> +	.calibrate_decr		= microwatt_calibrate_decr,
> +};
> -- 
> 2.25.3

cheers

^ permalink raw reply

* Re: [PATCH RFC 3/4] powerpc/microwatt: Add early debug UART support for Microwatt
From: Michael Ellerman @ 2020-05-12  1:57 UTC (permalink / raw)
  To: Paul Mackerras, linuxppc-dev, Benjamin Herrenschmidt,
	Michael Neuling, Anton Blanchard
In-Reply-To: <20200509050340.GD1464954@thinks.paulus.ozlabs.org>

Paul Mackerras <paulus@ozlabs.org> writes:
> Currently microwatt-based SoCs come with a "potato" UART.  This
> adds udbg support for the potato UART, giving us both an early
> debug console, and a runtime console using the hvc-udbg support.

Can y'all get a real UART?

There's more code here than in the platform itself.

cheers

^ permalink raw reply

* Re: [PATCH 2/3] ASoC: fsl_esai: Add support for imx8qm
From: Shengjiu Wang @ 2020-05-12  2:48 UTC (permalink / raw)
  To: Mark Brown
  Cc: open list:OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS,
	Linux-ALSA, Timur Tabi, Xiubo Li, Fabio Estevam, Shengjiu Wang,
	Liam Girdwood, Takashi Iwai, Nicolin Chen, Rob Herring,
	linuxppc-dev, linux-kernel
In-Reply-To: <CAA+D8ANDHHejFD1rYmFOG24yivpEJa+xO-WpVr=Vzfz9yW9H7g@mail.gmail.com>

Hi Mark, Nicolin

On Wed, May 6, 2020 at 10:33 AM Shengjiu Wang <shengjiu.wang@gmail.com> wrote:
>
> Hi
>
> On Fri, May 1, 2020 at 6:23 PM Mark Brown <broonie@kernel.org> wrote:
> >
> > On Fri, May 01, 2020 at 04:12:05PM +0800, Shengjiu Wang wrote:
> > > The difference for esai on imx8qm is that DMA device is EDMA.
> > >
> > > EDMA requires the period size to be multiple of maxburst. Otherwise
> > > the remaining bytes are not transferred and thus noise is produced.
> >
> > If this constraint comes from the DMA controller then normally you'd
> > expect the DMA controller integration to be enforcing this - is there no
> > information in the DMA API that lets us know that this constraint is
> > there?
>
> No, I can't find one API for this.
> Do you have a recommendation?
>
could you please recommend which DMA API can I use?

best regards
wang shengjiu

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox