LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH 3/9] arc: remove support for DISCONTIGMEM
From: David Hildenbrand @ 2021-06-09 10:53 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-4-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> DISCONTIGMEM was replaced by FLATMEM with freeing of the unused memory map
> in v5.11.
> 
> Remove the support for DISCONTIGMEM entirely.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 5/9] mm: remove CONFIG_DISCONTIGMEM
From: David Hildenbrand @ 2021-06-09 10:55 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-6-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> There are no architectures that support DISCONTIGMEM left.
> 
> Remove the configuration option and the dead code it was guarding in the
> generic memory management code.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   include/asm-generic/memory_model.h | 37 ++++--------------------------
>   include/linux/mmzone.h             |  4 ++--
>   mm/Kconfig                         | 25 +++-----------------
>   mm/memory.c                        |  3 +--
>   mm/page_alloc.c                    | 13 -----------
>   5 files changed, 10 insertions(+), 72 deletions(-)
> 
> diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
> index 7637fb46ba4f..a2c8ed60233a 100644
> --- a/include/asm-generic/memory_model.h
> +++ b/include/asm-generic/memory_model.h
> @@ -6,47 +6,18 @@
>   
>   #ifndef __ASSEMBLY__
>   
> +/*
> + * supports 3 memory models.
> + */
>   #if defined(CONFIG_FLATMEM)
>   
>   #ifndef ARCH_PFN_OFFSET
>   #define ARCH_PFN_OFFSET		(0UL)
>   #endif
>   
> -#elif defined(CONFIG_DISCONTIGMEM)
> -
> -#ifndef arch_pfn_to_nid
> -#define arch_pfn_to_nid(pfn)	pfn_to_nid(pfn)
> -#endif
> -
> -#ifndef arch_local_page_offset
> -#define arch_local_page_offset(pfn, nid)	\
> -	((pfn) - NODE_DATA(nid)->node_start_pfn)
> -#endif
> -
> -#endif /* CONFIG_DISCONTIGMEM */
> -
> -/*
> - * supports 3 memory models.
> - */
> -#if defined(CONFIG_FLATMEM)
> -
>   #define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
>   #define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
>   				 ARCH_PFN_OFFSET)
> -#elif defined(CONFIG_DISCONTIGMEM)
> -
> -#define __pfn_to_page(pfn)			\
> -({	unsigned long __pfn = (pfn);		\
> -	unsigned long __nid = arch_pfn_to_nid(__pfn);  \
> -	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
> -})
> -
> -#define __page_to_pfn(pg)						\
> -({	const struct page *__pg = (pg);					\
> -	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
> -	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
> -	 __pgdat->node_start_pfn;					\
> -})
>   
>   #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
>   
> @@ -70,7 +41,7 @@
>   	struct mem_section *__sec = __pfn_to_section(__pfn);	\
>   	__section_mem_map_addr(__sec) + __pfn;		\
>   })
> -#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
> +#endif /* CONFIG_FLATMEM/SPARSEMEM */
>   
>   /*
>    * Convert a physical address to a Page Frame Number and back
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 0d53eba1c383..2b41e252a995 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -738,8 +738,8 @@ struct zonelist {
>   	struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
>   };
>   
> -#ifndef CONFIG_DISCONTIGMEM
> -/* The array of struct pages - for discontigmem use pgdat->lmem_map */
> +#ifdef CONFIG_FLATMEM
> +/* The array of struct pages for flatmem */
>   extern struct page *mem_map;
>   #endif
>   
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 02d44e3420f5..218b96ccc84a 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -19,7 +19,7 @@ choice
>   
>   config FLATMEM_MANUAL
>   	bool "Flat Memory"
> -	depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
> +	depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE
>   	help
>   	  This option is best suited for non-NUMA systems with
>   	  flat address space. The FLATMEM is the most efficient
> @@ -32,21 +32,6 @@ config FLATMEM_MANUAL
>   
>   	  If unsure, choose this option (Flat Memory) over any other.
>   
> -config DISCONTIGMEM_MANUAL
> -	bool "Discontiguous Memory"
> -	depends on ARCH_DISCONTIGMEM_ENABLE
> -	help
> -	  This option provides enhanced support for discontiguous
> -	  memory systems, over FLATMEM.  These systems have holes
> -	  in their physical address spaces, and this option provides
> -	  more efficient handling of these holes.
> -
> -	  Although "Discontiguous Memory" is still used by several
> -	  architectures, it is considered deprecated in favor of
> -	  "Sparse Memory".
> -
> -	  If unsure, choose "Sparse Memory" over this option.
> -
>   config SPARSEMEM_MANUAL
>   	bool "Sparse Memory"
>   	depends on ARCH_SPARSEMEM_ENABLE
> @@ -62,17 +47,13 @@ config SPARSEMEM_MANUAL
>   
>   endchoice
>   
> -config DISCONTIGMEM
> -	def_bool y
> -	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
> -
>   config SPARSEMEM
>   	def_bool y
>   	depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL
>   
>   config FLATMEM
>   	def_bool y
> -	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
> +	depends on !SPARSEMEM || FLATMEM_MANUAL
>   
>   config FLAT_NODE_MEM_MAP
>   	def_bool y
> @@ -85,7 +66,7 @@ config FLAT_NODE_MEM_MAP
>   #
>   config NEED_MULTIPLE_NODES
>   	def_bool y
> -	depends on DISCONTIGMEM || NUMA
> +	depends on NUMA
>   
>   #
>   # SPARSEMEM_EXTREME (which is the default) does some bootmem
> diff --git a/mm/memory.c b/mm/memory.c
> index 730daa00952b..7c7b6ea02504 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -90,8 +90,7 @@
>   #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
>   #endif
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> -/* use the per-pgdat data instead for discontigmem - mbligh */
> +#ifdef CONFIG_FLATMEM
>   unsigned long max_mapnr;
>   EXPORT_SYMBOL(max_mapnr);
>   
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index aaa1655cf682..6fc22482eaa8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -331,20 +331,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
>   
>   int min_free_kbytes = 1024;
>   int user_min_free_kbytes = -1;
> -#ifdef CONFIG_DISCONTIGMEM
> -/*
> - * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
> - * are not on separate NUMA nodes. Functionally this works but with
> - * watermark_boost_factor, it can reclaim prematurely as the ranges can be
> - * quite small. By default, do not boost watermarks on discontigmem as in
> - * many cases very high-order allocations like THP are likely to be
> - * unsupported and the premature reclaim offsets the advantage of long-term
> - * fragmentation avoidance.
> - */
> -int watermark_boost_factor __read_mostly;
> -#else
>   int watermark_boost_factor __read_mostly = 15000;
> -#endif
>   int watermark_scale_factor = 10;
>   
>   static unsigned long nr_kernel_pages __initdata;
> 

Nice

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 6/9] arch, mm: remove stale mentions of DISCONIGMEM
From: David Hildenbrand @ 2021-06-09 10:55 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-7-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> There are several places that mention DISCONTIGMEM in comments or have stale
> code guarded by CONFIG_DISCONTIGMEM.
> 
> Remove the dead code and update the comments.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   arch/ia64/kernel/topology.c     | 5 ++---
>   arch/ia64/mm/numa.c             | 5 ++---
>   arch/mips/include/asm/mmzone.h  | 6 ------
>   arch/mips/mm/init.c             | 3 ---
>   arch/nds32/include/asm/memory.h | 6 ------
>   arch/xtensa/include/asm/page.h  | 4 ----
>   include/linux/gfp.h             | 4 ++--
>   7 files changed, 6 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
> index 09fc385c2acd..3639e0a7cb3b 100644
> --- a/arch/ia64/kernel/topology.c
> +++ b/arch/ia64/kernel/topology.c
> @@ -3,9 +3,8 @@
>    * License.  See the file "COPYING" in the main directory of this archive
>    * for more details.
>    *
> - * This file contains NUMA specific variables and functions which can
> - * be split away from DISCONTIGMEM and are used on NUMA machines with
> - * contiguous memory.
> + * This file contains NUMA specific variables and functions which are used on
> + * NUMA machines with contiguous memory.
>    * 		2002/08/07 Erich Focht <efocht@ess.nec.de>
>    * Populate cpu entries in sysfs for non-numa systems as well
>    *  	Intel Corporation - Ashok Raj
> diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
> index 46b6e5f3a40f..d6579ec3ea32 100644
> --- a/arch/ia64/mm/numa.c
> +++ b/arch/ia64/mm/numa.c
> @@ -3,9 +3,8 @@
>    * License.  See the file "COPYING" in the main directory of this archive
>    * for more details.
>    *
> - * This file contains NUMA specific variables and functions which can
> - * be split away from DISCONTIGMEM and are used on NUMA machines with
> - * contiguous memory.
> + * This file contains NUMA specific variables and functions which are used on
> + * NUMA machines with contiguous memory.
>    *
>    *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
>    */
> diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h
> index b826b8473e95..7649ab45e80c 100644
> --- a/arch/mips/include/asm/mmzone.h
> +++ b/arch/mips/include/asm/mmzone.h
> @@ -20,10 +20,4 @@
>   #define nid_to_addrbase(nid) 0
>   #endif
>   
> -#ifdef CONFIG_DISCONTIGMEM
> -
> -#define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
> -
> -#endif /* CONFIG_DISCONTIGMEM */
> -
>   #endif /* _ASM_MMZONE_H_ */
> diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
> index c36358758969..97f6ca341448 100644
> --- a/arch/mips/mm/init.c
> +++ b/arch/mips/mm/init.c
> @@ -454,9 +454,6 @@ void __init mem_init(void)
>   	BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (_PFN_SHIFT > PAGE_SHIFT));
>   
>   #ifdef CONFIG_HIGHMEM
> -#ifdef CONFIG_DISCONTIGMEM
> -#error "CONFIG_HIGHMEM and CONFIG_DISCONTIGMEM dont work together yet"
> -#endif
>   	max_mapnr = highend_pfn ? highend_pfn : max_low_pfn;
>   #else
>   	max_mapnr = max_low_pfn;
> diff --git a/arch/nds32/include/asm/memory.h b/arch/nds32/include/asm/memory.h
> index 940d32842793..62faafbc28e4 100644
> --- a/arch/nds32/include/asm/memory.h
> +++ b/arch/nds32/include/asm/memory.h
> @@ -76,18 +76,12 @@
>    *  virt_to_page(k)	convert a _valid_ virtual address to struct page *
>    *  virt_addr_valid(k)	indicates whether a virtual address is valid
>    */
> -#ifndef CONFIG_DISCONTIGMEM
> -
>   #define ARCH_PFN_OFFSET		PHYS_PFN_OFFSET
>   #define pfn_valid(pfn)		((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
>   
>   #define virt_to_page(kaddr)	(pfn_to_page(__pa(kaddr) >> PAGE_SHIFT))
>   #define virt_addr_valid(kaddr)	((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory)
>   
> -#else /* CONFIG_DISCONTIGMEM */
> -#error CONFIG_DISCONTIGMEM is not supported yet.
> -#endif /* !CONFIG_DISCONTIGMEM */
> -
>   #define page_to_phys(page)	(page_to_pfn(page) << PAGE_SHIFT)
>   
>   #endif
> diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h
> index 37ce25ef92d6..493eb7083b1a 100644
> --- a/arch/xtensa/include/asm/page.h
> +++ b/arch/xtensa/include/asm/page.h
> @@ -192,10 +192,6 @@ static inline unsigned long ___pa(unsigned long va)
>   #define pfn_valid(pfn) \
>   	((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr)
>   
> -#ifdef CONFIG_DISCONTIGMEM
> -# error CONFIG_DISCONTIGMEM not supported
> -#endif
> -
>   #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
>   #define page_to_virt(page)	__va(page_to_pfn(page) << PAGE_SHIFT)
>   #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 11da8af06704..dbe1f5fc901d 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -494,8 +494,8 @@ static inline int gfp_zonelist(gfp_t flags)
>    * There are two zonelists per node, one for all zones with memory and
>    * one containing just zones from the node the zonelist belongs to.
>    *
> - * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
> - * optimized to &contig_page_data at compile-time.
> + * For the case of non-NUMA systems the NODE_DATA() gets optimized to
> + * &contig_page_data at compile-time.
>    */
>   static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
>   {
> 

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 7/9] docs: remove description of DISCONTIGMEM
From: David Hildenbrand @ 2021-06-09 10:56 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-8-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> Remove description of DISCONTIGMEM from the "Memory Models" document and
> update VM sysctl description so that it won't mention DISCONTIGMEM.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   Documentation/admin-guide/sysctl/vm.rst | 12 +++----
>   Documentation/vm/memory-model.rst       | 45 ++-----------------------
>   2 files changed, 8 insertions(+), 49 deletions(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index 586cd4b86428..ddbd71d592e0 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -936,12 +936,12 @@ allocations, THP and hugetlbfs pages.
>   
>   To make it sensible with respect to the watermark_scale_factor
>   parameter, the unit is in fractions of 10,000. The default value of
> -15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
> -watermark will be reclaimed in the event of a pageblock being mixed due
> -to fragmentation. The level of reclaim is determined by the number of
> -fragmentation events that occurred in the recent past. If this value is
> -smaller than a pageblock then a pageblocks worth of pages will be reclaimed
> -(e.g.  2MB on 64-bit x86). A boost factor of 0 will disable the feature.
> +15,000 means that up to 150% of the high watermark will be reclaimed in the
> +event of a pageblock being mixed due to fragmentation. The level of reclaim
> +is determined by the number of fragmentation events that occurred in the
> +recent past. If this value is smaller than a pageblock then a pageblocks
> +worth of pages will be reclaimed (e.g.  2MB on 64-bit x86). A boost factor
> +of 0 will disable the feature.
>   
>   
>   watermark_scale_factor
> diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
> index ce398a7dc6cd..30e8fbed6914 100644
> --- a/Documentation/vm/memory-model.rst
> +++ b/Documentation/vm/memory-model.rst
> @@ -14,15 +14,11 @@ for the CPU. Then there could be several contiguous ranges at
>   completely distinct addresses. And, don't forget about NUMA, where
>   different memory banks are attached to different CPUs.
>   
> -Linux abstracts this diversity using one of the three memory models:
> -FLATMEM, DISCONTIGMEM and SPARSEMEM. Each architecture defines what
> +Linux abstracts this diversity using one of the two memory models:
> +FLATMEM and SPARSEMEM. Each architecture defines what
>   memory models it supports, what the default memory model is and
>   whether it is possible to manually override that default.
>   
> -.. note::
> -   At time of this writing, DISCONTIGMEM is considered deprecated,
> -   although it is still in use by several architectures.
> -
>   All the memory models track the status of physical page frames using
>   struct page arranged in one or more arrays.
>   
> @@ -63,43 +59,6 @@ straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the
>   The `ARCH_PFN_OFFSET` defines the first page frame number for
>   systems with physical memory starting at address different from 0.
>   
> -DISCONTIGMEM
> -============
> -
> -The DISCONTIGMEM model treats the physical memory as a collection of
> -`nodes` similarly to how Linux NUMA support does. For each node Linux
> -constructs an independent memory management subsystem represented by
> -`struct pglist_data` (or `pg_data_t` for short). Among other
> -things, `pg_data_t` holds the `node_mem_map` array that maps
> -physical pages belonging to that node. The `node_start_pfn` field of
> -`pg_data_t` is the number of the first page frame belonging to that
> -node.
> -
> -The architecture setup code should call :c:func:`free_area_init_node` for
> -each node in the system to initialize the `pg_data_t` object and its
> -`node_mem_map`.
> -
> -Every `node_mem_map` behaves exactly as FLATMEM's `mem_map` -
> -every physical page frame in a node has a `struct page` entry in the
> -`node_mem_map` array. When DISCONTIGMEM is enabled, a portion of the
> -`flags` field of the `struct page` encodes the node number of the
> -node hosting that page.
> -
> -The conversion between a PFN and the `struct page` in the
> -DISCONTIGMEM model became slightly more complex as it has to determine
> -which node hosts the physical page and which `pg_data_t` object
> -holds the `struct page`.
> -
> -Architectures that support DISCONTIGMEM provide :c:func:`pfn_to_nid`
> -to convert PFN to the node number. The opposite conversion helper
> -:c:func:`page_to_nid` is generic as it uses the node number encoded in
> -page->flags.
> -
> -Once the node number is known, the PFN can be used to index
> -appropriate `node_mem_map` array to access the `struct page` and
> -the offset of the `struct page` from the `node_mem_map` plus
> -`node_start_pfn` is the PFN of that page.
> -
>   SPARSEMEM
>   =========
>   
> 

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 8/9] mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA
From: David Hildenbrand @ 2021-06-09 10:56 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-9-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> After removal of DISCONTIGMEM, the NEED_MULTIPLE_NODES and NUMA
> configuration options are equivalent.
> 
> Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead.
> 
> Done with
> 
> 	$ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \
> 		$(git grep -wl CONFIG_NEED_MULTIPLE_NODES)
> 	$ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \
> 		$(git grep -wl NEED_MULTIPLE_NODES)
> 
> with manual tweaks afterwards.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   arch/arm64/Kconfig                |  2 +-
>   arch/ia64/Kconfig                 |  2 +-
>   arch/mips/Kconfig                 |  2 +-
>   arch/mips/include/asm/mmzone.h    |  2 +-
>   arch/mips/include/asm/page.h      |  2 +-
>   arch/mips/mm/init.c               |  4 ++--
>   arch/powerpc/Kconfig              |  2 +-
>   arch/powerpc/include/asm/mmzone.h |  4 ++--
>   arch/powerpc/kernel/setup_64.c    |  2 +-
>   arch/powerpc/kernel/smp.c         |  2 +-
>   arch/powerpc/kexec/core.c         |  4 ++--
>   arch/powerpc/mm/Makefile          |  2 +-
>   arch/powerpc/mm/mem.c             |  4 ++--
>   arch/riscv/Kconfig                |  2 +-
>   arch/s390/Kconfig                 |  2 +-
>   arch/sh/include/asm/mmzone.h      |  4 ++--
>   arch/sh/kernel/topology.c         |  2 +-
>   arch/sh/mm/Kconfig                |  2 +-
>   arch/sh/mm/init.c                 |  2 +-
>   arch/sparc/Kconfig                |  2 +-
>   arch/sparc/include/asm/mmzone.h   |  4 ++--
>   arch/sparc/kernel/smp_64.c        |  2 +-
>   arch/sparc/mm/init_64.c           | 12 ++++++------
>   arch/x86/Kconfig                  |  2 +-
>   arch/x86/kernel/setup_percpu.c    |  6 +++---
>   arch/x86/mm/init_32.c             |  4 ++--
>   include/asm-generic/topology.h    |  2 +-
>   include/linux/memblock.h          |  6 +++---
>   include/linux/mm.h                |  4 ++--
>   include/linux/mmzone.h            |  8 ++++----
>   kernel/crash_core.c               |  2 +-
>   mm/Kconfig                        |  9 ---------
>   mm/memblock.c                     |  8 ++++----
>   mm/page_alloc.c                   |  6 +++---
>   34 files changed, 58 insertions(+), 67 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 9f1d8566bbf9..d01a1545ab8f 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1035,7 +1035,7 @@ config NODES_SHIFT
>   	int "Maximum NUMA Nodes (as a power of 2)"
>   	range 1 10
>   	default "4"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	help
>   	  Specify the maximum number of NUMA Nodes available on the target
>   	  system.  Increases memory reserved to accommodate various tables.
> diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
> index 279252e3e0f7..da22a35e6f03 100644
> --- a/arch/ia64/Kconfig
> +++ b/arch/ia64/Kconfig
> @@ -302,7 +302,7 @@ config NODES_SHIFT
>   	int "Max num nodes shift(3-10)"
>   	range 3 10
>   	default "10"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	help
>   	  This option specifies the maximum number of nodes in your SSI system.
>   	  MAX_NUMNODES will be 2^(This value).
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index ed51970c08e7..4704a16c2e44 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -2867,7 +2867,7 @@ config RANDOMIZE_BASE_MAX_OFFSET
>   config NODES_SHIFT
>   	int
>   	default "6"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   
>   config HW_PERF_EVENTS
>   	bool "Enable hardware performance counter support for perf events"
> diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h
> index 7649ab45e80c..602a21aee9d4 100644
> --- a/arch/mips/include/asm/mmzone.h
> +++ b/arch/mips/include/asm/mmzone.h
> @@ -8,7 +8,7 @@
>   
>   #include <asm/page.h>
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   # include <mmzone.h>
>   #endif
>   
> diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h
> index 195ff4e9771f..96bc798c1ec1 100644
> --- a/arch/mips/include/asm/page.h
> +++ b/arch/mips/include/asm/page.h
> @@ -239,7 +239,7 @@ static inline int pfn_valid(unsigned long pfn)
>   
>   /* pfn_valid is defined in linux/mmzone.h */
>   
> -#elif defined(CONFIG_NEED_MULTIPLE_NODES)
> +#elif defined(CONFIG_NUMA)
>   
>   #define pfn_valid(pfn)							\
>   ({									\
> diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
> index 97f6ca341448..19347dc6bbf8 100644
> --- a/arch/mips/mm/init.c
> +++ b/arch/mips/mm/init.c
> @@ -394,7 +394,7 @@ void maar_init(void)
>   	}
>   }
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   void __init paging_init(void)
>   {
>   	unsigned long max_zone_pfns[MAX_NR_ZONES];
> @@ -473,7 +473,7 @@ void __init mem_init(void)
>   				0x80000000 - 4, KCORE_TEXT);
>   #endif
>   }
> -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* !CONFIG_NUMA */
>   
>   void free_init_pages(const char *what, unsigned long begin, unsigned long end)
>   {
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 088dd2afcfe4..14b132cf95e2 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -671,7 +671,7 @@ config NODES_SHIFT
>   	int
>   	default "8" if PPC64
>   	default "4"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   
>   config USE_PERCPU_NUMA_NODE_ID
>   	def_bool y
> diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
> index 6cda76b57c5d..4c6c6dbd182f 100644
> --- a/arch/powerpc/include/asm/mmzone.h
> +++ b/arch/powerpc/include/asm/mmzone.h
> @@ -18,7 +18,7 @@
>    *    flags field of the struct page
>    */
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   
>   extern struct pglist_data *node_data[];
>   /*
> @@ -41,7 +41,7 @@ u64 memory_hotplug_max(void);
>   
>   #else
>   #define memory_hotplug_max() memblock_end_of_DRAM()
> -#endif /* CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* CONFIG_NUMA */
>   #ifdef CONFIG_FA_DUMP
>   #define __HAVE_ARCH_RESERVED_KERNEL_PAGES
>   #endif
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index e42b85e4f1aa..a35fbf4d0bce 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -788,7 +788,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
>   					size_t align)
>   {
>   	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int node = early_cpu_to_node(cpu);
>   	void *ptr;
>   
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 2e05c783440a..a5209ea3859e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1047,7 +1047,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
>   			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
>   						GFP_KERNEL, cpu_to_node(cpu));
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   		/*
>   		 * numa_node_id() works after this.
>   		 */
> diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
> index 56da5eb2b923..48525e8b5730 100644
> --- a/arch/powerpc/kexec/core.c
> +++ b/arch/powerpc/kexec/core.c
> @@ -68,11 +68,11 @@ void machine_kexec_cleanup(struct kimage *image)
>   void arch_crash_save_vmcoreinfo(void)
>   {
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	VMCOREINFO_SYMBOL(node_data);
>   	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
>   #endif
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   	VMCOREINFO_SYMBOL(contig_page_data);
>   #endif
>   #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
> index c3df3a8501d4..2ffcf540f08b 100644
> --- a/arch/powerpc/mm/Makefile
> +++ b/arch/powerpc/mm/Makefile
> @@ -13,7 +13,7 @@ obj-y				:= fault.o mem.o pgtable.o mmap.o maccess.o \
>   obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
>   obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
>   obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64/
> -obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
> +obj-$(CONFIG_NUMA) += numa.o
>   obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
>   obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
>   obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 043bbeaf407c..7a266991315f 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -126,7 +126,7 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
>   }
>   #endif
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   void __init mem_topology_setup(void)
>   {
>   	max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
> @@ -161,7 +161,7 @@ static int __init mark_nonram_nosave(void)
>   
>   	return 0;
>   }
> -#else /* CONFIG_NEED_MULTIPLE_NODES */
> +#else /* CONFIG_NUMA */
>   static int __init mark_nonram_nosave(void)
>   {
>   	return 0;
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a8ad8eb76120..e985dbf9ff27 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -331,7 +331,7 @@ config NODES_SHIFT
>   	int "Maximum NUMA Nodes (as a power of 2)"
>   	range 1 10
>   	default "2"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	help
>   	  Specify the maximum number of NUMA Nodes available on the target
>   	  system.  Increases memory reserved to accommodate various tables.
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index b4c7c34069f8..707afbcd81c2 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -475,7 +475,7 @@ config NUMA
>   
>   config NODES_SHIFT
>   	int
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	default "1"
>   
>   config SCHED_SMT
> diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h
> index 6552a088dc97..7b8dead2723d 100644
> --- a/arch/sh/include/asm/mmzone.h
> +++ b/arch/sh/include/asm/mmzone.h
> @@ -2,7 +2,7 @@
>   #ifndef __ASM_SH_MMZONE_H
>   #define __ASM_SH_MMZONE_H
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   #include <linux/numa.h>
>   
>   extern struct pglist_data *node_data[];
> @@ -31,7 +31,7 @@ static inline void
>   setup_bootmem_node(int nid, unsigned long start, unsigned long end)
>   {
>   }
> -#endif /* CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* CONFIG_NUMA */
>   
>   /* Platform specific mem init */
>   void __init plat_mem_setup(void);
> diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c
> index 7a989eed3b18..76af6db9daa2 100644
> --- a/arch/sh/kernel/topology.c
> +++ b/arch/sh/kernel/topology.c
> @@ -46,7 +46,7 @@ static int __init topology_init(void)
>   {
>   	int i, ret;
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	for_each_online_node(i)
>   		register_one_node(i);
>   #endif
> diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
> index d551a9cac41e..ba569cfb4368 100644
> --- a/arch/sh/mm/Kconfig
> +++ b/arch/sh/mm/Kconfig
> @@ -120,7 +120,7 @@ config NODES_SHIFT
>   	int
>   	default "3" if CPU_SUBTYPE_SHX3
>   	default "1"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   
>   config ARCH_FLATMEM_ENABLE
>   	def_bool y
> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
> index 168d7d4dd735..ce26c7f8950a 100644
> --- a/arch/sh/mm/init.c
> +++ b/arch/sh/mm/init.c
> @@ -211,7 +211,7 @@ void __init allocate_pgdat(unsigned int nid)
>   
>   	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	NODE_DATA(nid) = memblock_alloc_try_nid(
>   				sizeof(struct pglist_data),
>   				SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index 164a5254c91c..c72f52c704cd 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -265,7 +265,7 @@ config NODES_SHIFT
>   	int "Maximum NUMA Nodes (as a power of 2)"
>   	range 4 5 if SPARC64
>   	default "5"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	help
>   	  Specify the maximum number of NUMA Nodes available on the target
>   	  system.  Increases memory reserved to accommodate various tables.
> diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h
> index 6543fb97a849..a236d8aa893a 100644
> --- a/arch/sparc/include/asm/mmzone.h
> +++ b/arch/sparc/include/asm/mmzone.h
> @@ -2,7 +2,7 @@
>   #ifndef _SPARC64_MMZONE_H
>   #define _SPARC64_MMZONE_H
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   
>   #include <linux/cpumask.h>
>   
> @@ -13,6 +13,6 @@ extern struct pglist_data *node_data[];
>   extern int numa_cpu_lookup_table[];
>   extern cpumask_t numa_cpumask_lookup_table[];
>   
> -#endif /* CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* CONFIG_NUMA */
>   
>   #endif /* _SPARC64_MMZONE_H */
> diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
> index e38d8bf454e8..c89a5971fb0d 100644
> --- a/arch/sparc/kernel/smp_64.c
> +++ b/arch/sparc/kernel/smp_64.c
> @@ -1546,7 +1546,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
>   					size_t align)
>   {
>   	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int node = cpu_to_node(cpu);
>   	void *ptr;
>   
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index e454f179cf5d..06e938d03f3b 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -903,7 +903,7 @@ struct node_mem_mask {
>   static struct node_mem_mask node_masks[MAX_NUMNODES];
>   static int num_node_masks;
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   
>   struct mdesc_mlgroup {
>   	u64	node;
> @@ -1059,7 +1059,7 @@ static void __init allocate_node_data(int nid)
>   {
>   	struct pglist_data *p;
>   	unsigned long start_pfn, end_pfn;
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   
>   	NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data),
>   					     SMP_CACHE_BYTES, nid);
> @@ -1080,7 +1080,7 @@ static void __init allocate_node_data(int nid)
>   
>   static void init_node_masks_nonnuma(void)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int i;
>   #endif
>   
> @@ -1090,7 +1090,7 @@ static void init_node_masks_nonnuma(void)
>   	node_masks[0].match = 0;
>   	num_node_masks = 1;
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	for (i = 0; i < NR_CPUS; i++)
>   		numa_cpu_lookup_table[i] = 0;
>   
> @@ -1098,7 +1098,7 @@ static void init_node_masks_nonnuma(void)
>   #endif
>   }
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   struct pglist_data *node_data[MAX_NUMNODES];
>   
>   EXPORT_SYMBOL(numa_cpu_lookup_table);
> @@ -2487,7 +2487,7 @@ int page_in_phys_avail(unsigned long paddr)
>   
>   static void __init register_page_bootmem_info(void)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int i;
>   
>   	for_each_online_node(i)
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 0045e1b44190..5d523ff70fe7 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1597,7 +1597,7 @@ config NODES_SHIFT
>   	default "10" if MAXSMP
>   	default "6" if X86_64
>   	default "3"
> -	depends on NEED_MULTIPLE_NODES
> +	depends on NUMA
>   	help
>   	  Specify the maximum number of NUMA Nodes available on the target
>   	  system.  Increases memory reserved to accommodate various tables.
> diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
> index 0941d2f44f2a..78a32b956e81 100644
> --- a/arch/x86/kernel/setup_percpu.c
> +++ b/arch/x86/kernel/setup_percpu.c
> @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
>    */
>   static bool __init pcpu_need_numa(void)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	pg_data_t *last = NULL;
>   	unsigned int cpu;
>   
> @@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
>   					unsigned long align)
>   {
>   	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int node = early_cpu_to_node(cpu);
>   	void *ptr;
>   
> @@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
>   
>   static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	if (early_cpu_to_node(from) == early_cpu_to_node(to))
>   		return LOCAL_DISTANCE;
>   	else
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 21ffb03f6c72..74b78840182d 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -651,7 +651,7 @@ void __init find_low_pfn_range(void)
>   		highmem_pfn_init();
>   }
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   void __init initmem_init(void)
>   {
>   #ifdef CONFIG_HIGHMEM
> @@ -677,7 +677,7 @@ void __init initmem_init(void)
>   
>   	setup_bootmem_allocator();
>   }
> -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* !CONFIG_NUMA */
>   
>   void __init setup_bootmem_allocator(void)
>   {
> diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
> index 5aa8705df87e..4dbe715be65b 100644
> --- a/include/asm-generic/topology.h
> +++ b/include/asm-generic/topology.h
> @@ -45,7 +45,7 @@
>   #endif
>   
>   #ifndef cpumask_of_node
> -  #ifdef CONFIG_NEED_MULTIPLE_NODES
> +  #ifdef CONFIG_NUMA
>       #define cpumask_of_node(node)	((node) == 0 ? cpu_online_mask : cpu_none_mask)
>     #else
>       #define cpumask_of_node(node)	((void)(node), cpu_online_mask)
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index 5984fff3f175..552309342c38 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -50,7 +50,7 @@ struct memblock_region {
>   	phys_addr_t base;
>   	phys_addr_t size;
>   	enum memblock_flags flags;
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int nid;
>   #endif
>   };
> @@ -347,7 +347,7 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
>   int memblock_set_node(phys_addr_t base, phys_addr_t size,
>   		      struct memblock_type *type, int nid);
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   static inline void memblock_set_region_node(struct memblock_region *r, int nid)
>   {
>   	r->nid = nid;
> @@ -366,7 +366,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
>   {
>   	return 0;
>   }
> -#endif /* CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* CONFIG_NUMA */
>   
>   /* Flags for memblock allocation APIs */
>   #define MEMBLOCK_ALLOC_ANYWHERE	(~(phys_addr_t)0)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c274f75efcf9..cf66f0ea7956 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -46,7 +46,7 @@ extern int sysctl_page_lock_unfairness;
>   
>   void init_mm_internals(void);
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
> +#ifndef CONFIG_NUMA		/* Don't use mapnrs, do it properly */
>   extern unsigned long max_mapnr;
>   
>   static inline void set_max_mapnr(unsigned long limit)
> @@ -2457,7 +2457,7 @@ extern void get_pfn_range_for_nid(unsigned int nid,
>   			unsigned long *start_pfn, unsigned long *end_pfn);
>   extern unsigned long find_min_pfn_with_active_regions(void);
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   static inline int early_pfn_to_nid(unsigned long pfn)
>   {
>   	return 0;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2b41e252a995..ad42f440c704 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -985,7 +985,7 @@ extern int movable_zone;
>   #ifdef CONFIG_HIGHMEM
>   static inline int zone_movable_is_highmem(void)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	return movable_zone == ZONE_HIGHMEM;
>   #else
>   	return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
> @@ -1041,17 +1041,17 @@ extern int percpu_pagelist_fraction;
>   extern char numa_zonelist_order[];
>   #define NUMA_ZONELIST_ORDER_LEN	16
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   
>   extern struct pglist_data contig_page_data;
>   #define NODE_DATA(nid)		(&contig_page_data)
>   #define NODE_MEM_MAP(nid)	mem_map
>   
> -#else /* CONFIG_NEED_MULTIPLE_NODES */
> +#else /* CONFIG_NUMA */
>   
>   #include <asm/mmzone.h>
>   
> -#endif /* !CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* !CONFIG_NUMA */
>   
>   extern struct pglist_data *first_online_pgdat(void);
>   extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 825284baaf46..53eb8bc6026d 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -455,7 +455,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>   	VMCOREINFO_SYMBOL(_stext);
>   	VMCOREINFO_SYMBOL(vmap_area_list);
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   	VMCOREINFO_SYMBOL(mem_map);
>   	VMCOREINFO_SYMBOL(contig_page_data);
>   #endif
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 218b96ccc84a..bffe4bd859f3 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -59,15 +59,6 @@ config FLAT_NODE_MEM_MAP
>   	def_bool y
>   	depends on !SPARSEMEM
>   
> -#
> -# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
> -# to represent different areas of memory.  This variable allows
> -# those dependencies to exist individually.
> -#
> -config NEED_MULTIPLE_NODES
> -	def_bool y
> -	depends on NUMA
> -
>   #
>   # SPARSEMEM_EXTREME (which is the default) does some bootmem
>   # allocations when sparse_init() is called.  If this cannot
> diff --git a/mm/memblock.c b/mm/memblock.c
> index afaefa8fc6ab..123feef5259d 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -92,7 +92,7 @@
>    * system initialization completes.
>    */
>   
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   struct pglist_data __refdata contig_page_data;
>   EXPORT_SYMBOL(contig_page_data);
>   #endif
> @@ -607,7 +607,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
>   		 * area, insert that portion.
>   		 */
>   		if (rbase > base) {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   			WARN_ON(nid != memblock_get_region_node(rgn));
>   #endif
>   			WARN_ON(flags != rgn->flags);
> @@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
>   int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
>   				      struct memblock_type *type, int nid)
>   {
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   	int start_rgn, end_rgn;
>   	int i, ret;
>   
> @@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
>   		size = rgn->size;
>   		end = base + size - 1;
>   		flags = rgn->flags;
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   		if (memblock_get_region_node(rgn) != MAX_NUMNODES)
>   			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
>   				 memblock_get_region_node(rgn));
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6fc22482eaa8..8f08135d3eb4 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1596,7 +1596,7 @@ void __free_pages_core(struct page *page, unsigned int order)
>   	__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
>   }
>   
> -#ifdef CONFIG_NEED_MULTIPLE_NODES
> +#ifdef CONFIG_NUMA
>   
>   /*
>    * During memory init memblocks map pfns to nids. The search is expensive and
> @@ -1646,7 +1646,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
>   
>   	return nid;
>   }
> -#endif /* CONFIG_NEED_MULTIPLE_NODES */
> +#endif /* CONFIG_NUMA */
>   
>   void __init memblock_free_pages(struct page *page, unsigned long pfn,
>   							unsigned int order)
> @@ -7276,7 +7276,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
>   	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
>   				__func__, pgdat->node_id, (unsigned long)pgdat,
>   				(unsigned long)pgdat->node_mem_map);
> -#ifndef CONFIG_NEED_MULTIPLE_NODES
> +#ifndef CONFIG_NUMA
>   	/*
>   	 * With no DISCONTIG, the global mem_map is just set as node 0's
>   	 */
> 

Nice

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH 9/9] mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM
From: David Hildenbrand @ 2021-06-09 10:58 UTC (permalink / raw)
  To: Mike Rapoport, Andrew Morton
  Cc: linux-ia64, linux-sh, linux-mips, linux-mm, sparclinux,
	linux-riscv, linux-arch, linux-s390, Jonathan Corbet, linux-doc,
	Mike Rapoport, Geert Uytterhoeven, Matt Turner, linux-snps-arc,
	linux-xtensa, Arnd Bergmann, linux-m68k, Ivan Kokshaysky,
	linux-arm-kernel, Richard Henderson, Vineet Gupta, kexec,
	linux-kernel, linux-alpha, linuxppc-dev
In-Reply-To: <20210602105348.13387-10-rppt@kernel.org>

On 02.06.21 12:53, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> After removal of the DISCONTIGMEM memory model the FLAT_NODE_MEM_MAP
> configuration option is equivalent to FLATMEM.
> 
> Drop CONFIG_FLAT_NODE_MEM_MAP and use CONFIG_FLATMEM instead.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   include/linux/mmzone.h | 4 ++--
>   kernel/crash_core.c    | 2 +-
>   mm/Kconfig             | 4 ----
>   mm/page_alloc.c        | 6 +++---
>   mm/page_ext.c          | 2 +-
>   5 files changed, 7 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index ad42f440c704..2698cdbfbf75 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -775,7 +775,7 @@ typedef struct pglist_data {
>   	struct zonelist node_zonelists[MAX_ZONELISTS];
>   
>   	int nr_zones; /* number of populated zones in this node */
> -#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
> +#ifdef CONFIG_FLATMEM	/* means !SPARSEMEM */
>   	struct page *node_mem_map;
>   #ifdef CONFIG_PAGE_EXTENSION
>   	struct page_ext *node_page_ext;
> @@ -865,7 +865,7 @@ typedef struct pglist_data {
>   
>   #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
>   #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
> -#ifdef CONFIG_FLAT_NODE_MEM_MAP
> +#ifdef CONFIG_FLATMEM
>   #define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
>   #else
>   #define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 53eb8bc6026d..2b8446ea7105 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -483,7 +483,7 @@ static int __init crash_save_vmcoreinfo_init(void)
>   	VMCOREINFO_OFFSET(page, compound_head);
>   	VMCOREINFO_OFFSET(pglist_data, node_zones);
>   	VMCOREINFO_OFFSET(pglist_data, nr_zones);
> -#ifdef CONFIG_FLAT_NODE_MEM_MAP
> +#ifdef CONFIG_FLATMEM
>   	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
>   #endif
>   	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index bffe4bd859f3..ded98fb859ab 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -55,10 +55,6 @@ config FLATMEM
>   	def_bool y
>   	depends on !SPARSEMEM || FLATMEM_MANUAL
>   
> -config FLAT_NODE_MEM_MAP
> -	def_bool y
> -	depends on !SPARSEMEM
> -
>   #
>   # SPARSEMEM_EXTREME (which is the default) does some bootmem
>   # allocations when sparse_init() is called.  If this cannot
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8f08135d3eb4..f039736541eb 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6444,7 +6444,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
>   	}
>   }
>   
> -#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
> +#if !defined(CONFIG_FLATMEM)
>   /*
>    * Only struct pages that correspond to ranges defined by memblock.memory
>    * are zeroed and initialized by going through __init_single_page() during
> @@ -7241,7 +7241,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
>   	}
>   }
>   
> -#ifdef CONFIG_FLAT_NODE_MEM_MAP
> +#ifdef CONFIG_FLATMEM
>   static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
>   {
>   	unsigned long __maybe_unused start = 0;
> @@ -7289,7 +7289,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
>   }
>   #else
>   static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
> -#endif /* CONFIG_FLAT_NODE_MEM_MAP */
> +#endif /* CONFIG_FLATMEM */
>   
>   #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
>   static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
> diff --git a/mm/page_ext.c b/mm/page_ext.c
> index df6f74aac8e1..293b2685fc48 100644
> --- a/mm/page_ext.c
> +++ b/mm/page_ext.c
> @@ -191,7 +191,7 @@ void __init page_ext_init_flatmem(void)
>   	panic("Out of memory");
>   }
>   
> -#else /* CONFIG_FLAT_NODE_MEM_MAP */
> +#else /* CONFIG_FLATMEM */
>   
>   struct page_ext *lookup_page_ext(const struct page *page)
>   {
> 

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH v2 0/9] Remove DISCONTIGMEM memory model
From: Arnd Bergmann @ 2021-06-09 11:30 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-ia64, Linux-sh list, open list:BROADCOM NVRAM DRIVER,
	Linux-MM, sparclinux, linux-riscv, linux-arch, linux-s390,
	Jonathan Corbet, open list:DOCUMENTATION, Mike Rapoport,
	Geert Uytterhoeven, Matt Turner,
	open list:SYNOPSYS ARC ARCHITECTURE,
	open list:TENSILICA XTENSA PORT (xtensa), linux-m68k,
	Ivan Kokshaysky, Linux ARM, Richard Henderson, Vineet Gupta,
	kexec, Linux Kernel Mailing List, alpha, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20210604064916.26580-1-rppt@kernel.org>

On Fri, Jun 4, 2021 at 8:49 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> Hi,
>
> SPARSEMEM memory model was supposed to entirely replace DISCONTIGMEM a
> (long) while ago. The last architectures that used DISCONTIGMEM were
> updated to use other memory models in v5.11 and it is about the time to
> entirely remove DISCONTIGMEM from the kernel.
>
> This set removes DISCONTIGMEM from alpha, arc and m68k, simplifies memory
> model selection in mm/Kconfig and replaces usage of redundant
> CONFIG_NEED_MULTIPLE_NODES and CONFIG_FLAT_NODE_MEM_MAP with CONFIG_NUMA
> and CONFIG_FLATMEM respectively.
>
> I've also removed NUMA support on alpha that was BROKEN for more than 15
> years.
>
> There were also minor updates all over arch/ to remove mentions of
> DISCONTIGMEM in comments and #ifdefs.

Hi Mike and Andrew,

It looks like everyone is happy with this version so far. How should we merge it
for linux-next? I'm happy to take it through the asm-generic tree, but linux-mm
would fit at least as well. In case we go for linux-mm, feel free to add

Acked-by: Arnd Bergmann <arnd@arndb.de>

for the whole series.

^ permalink raw reply

* Re: [PATCH 2/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats
From: kajoljain @ 2021-06-09 12:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: nvdimm, santosh, maddy, ira.weiny, rnsastry, linux-kernel,
	atrajeev, aneesh.kumar, vaibhav, dan.j.williams, linuxppc-dev,
	tglx
In-Reply-To: <YL+qpL/+ReGfqXce@hirez.programming.kicks-ass.net>



On 6/8/21 11:06 PM, Peter Zijlstra wrote:
> On Tue, Jun 08, 2021 at 05:26:58PM +0530, Kajol Jain wrote:
>> +static int nvdimm_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
>> +{
>> +	struct nvdimm_pmu *nd_pmu;
>> +	u32 target;
>> +	int nodeid;
>> +	const struct cpumask *cpumask;
>> +
>> +	nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
>> +
>> +	/* Clear it, incase given cpu is set in nd_pmu->arch_cpumask */
>> +	cpumask_test_and_clear_cpu(cpu, &nd_pmu->arch_cpumask);
>> +
>> +	/*
>> +	 * If given cpu is not same as current designated cpu for
>> +	 * counter access, just return.
>> +	 */
>> +	if (cpu != nd_pmu->cpu)
>> +		return 0;
>> +
>> +	/* Check for any active cpu in nd_pmu->arch_cpumask */
>> +	target = cpumask_any(&nd_pmu->arch_cpumask);
>> +	nd_pmu->cpu = target;
>> +
>> +	/*
>> +	 * Incase we don't have any active cpu in nd_pmu->arch_cpumask,
>> +	 * check in given cpu's numa node list.
>> +	 */
>> +	if (target >= nr_cpu_ids) {
>> +		nodeid = cpu_to_node(cpu);
>> +		cpumask = cpumask_of_node(nodeid);
>> +		target = cpumask_any_but(cpumask, cpu);
>> +		nd_pmu->cpu = target;
>> +
>> +		if (target >= nr_cpu_ids)
>> +			return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int nvdimm_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
>> +{
>> +	struct nvdimm_pmu *nd_pmu;
>> +
>> +	nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
>> +
>> +	if (nd_pmu->cpu >= nr_cpu_ids)
>> +		nd_pmu->cpu = cpu;
>> +
>> +	return 0;
>> +}
> 
>> +static int nvdimm_pmu_cpu_hotplug_init(struct nvdimm_pmu *nd_pmu)
>> +{
>> +	int nodeid, rc;
>> +	const struct cpumask *cpumask;
>> +
>> +	/*
>> +	 * Incase cpu hotplug is not handled by arch specific code
>> +	 * they can still provide required cpumask which can be used
>> +	 * to get designatd cpu for counter access.
>> +	 * Check for any active cpu in nd_pmu->arch_cpumask.
>> +	 */
>> +	if (!cpumask_empty(&nd_pmu->arch_cpumask)) {
>> +		nd_pmu->cpu = cpumask_any(&nd_pmu->arch_cpumask);
>> +	} else {
>> +		/* pick active cpu from the cpumask of device numa node. */
>> +		nodeid = dev_to_node(nd_pmu->dev);
>> +		cpumask = cpumask_of_node(nodeid);
>> +		nd_pmu->cpu = cpumask_any(cpumask);
>> +	}
>> +
>> +	rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/nvdimm:online",
>> +				     nvdimm_pmu_cpu_online, nvdimm_pmu_cpu_offline);
>> +
> 
> Did you actually test this hotplug stuff?
> 
> That is, create a counter, unplug the CPU the counter was on, and
> continue counting? "perf stat -I" is a good option for this, concurrent
> with a hotplug.
>
> Because I don't think it's actually correct. The thing is perf core is
> strictly per-cpu, and it will place the event on a specific CPU context.
> If you then unplug that CPU, nothing will touch the events on that CPU
> anymore.
> 
> What drivers that span CPUs need to do is call
> perf_pmu_migrate_context() whenever the CPU they were assigned to goes
> away. Please have a look at arch/x86/events/rapl.c or
> arch/x86/events/amd/power.c for relatively simple drivers that have this
> property.
> 


Hi Peter,
    I primarily tested off-lining multiple CPUs and checking that the cpumask file is updated as expected,
followed by perf stat commands.
But I missed the scenario where a CPU is off-lined while perf stat is running. My bad, thanks
for pointing it out.
I will fix this issue and send a new version of the patchset.

Thanks,
Kajol Jain
> 

^ permalink raw reply

* Re: [PATCH] powerpc/bpf: Use bctrl for making function calls
From: Naveen N. Rao @ 2021-06-09 13:11 UTC (permalink / raw)
  To: bpf, Christophe Leroy, linuxppc-dev
In-Reply-To: <4c371bd1-1fcf-54c1-d0a2-836d40887893@csgroup.eu>

Christophe Leroy wrote:
> 
> 
> Le 09/06/2021 à 11:00, Naveen N. Rao a écrit :
>> blrl corrupts the link stack. Instead use bctrl when making function
>> calls from BPF programs.
> 
> What's the link stack ? Is it the PPC64 branch predictor stack ?

c974809a26a13e ("powerpc/vdso: Avoid link stack corruption in 
__get_datapage()") has a good write up on the link stack.

> 
>> 
>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>>   arch/powerpc/include/asm/ppc-opcode.h |  1 +
>>   arch/powerpc/net/bpf_jit_comp32.c     |  4 ++--
>>   arch/powerpc/net/bpf_jit_comp64.c     | 12 ++++++------
>>   3 files changed, 9 insertions(+), 8 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
>> index ac41776661e963..1abacb8417d562 100644
>> --- a/arch/powerpc/include/asm/ppc-opcode.h
>> +++ b/arch/powerpc/include/asm/ppc-opcode.h
>> @@ -451,6 +451,7 @@
>>   #define PPC_RAW_MTLR(r)			(0x7c0803a6 | ___PPC_RT(r))
>>   #define PPC_RAW_MFLR(t)			(PPC_INST_MFLR | ___PPC_RT(t))
>>   #define PPC_RAW_BCTR()			(PPC_INST_BCTR)
>> +#define PPC_RAW_BCTRL()			(PPC_INST_BCTRL)
> 
> Can you use the numeric value instead of the PPC_INST_BCTRL, to avoid conflict with 
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/4ca2bfdca2f47a293d05f61eb3c4e487ee170f1f.1621506159.git.christophe.leroy@csgroup.eu/

Sure. I'll post a v2.

- Naveen


^ permalink raw reply

* Re: [PATCH v2 0/9] Remove DISCONTIGMEM memory model
From: Mike Rapoport @ 2021-06-09 14:50 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linux-ia64, Linux-sh list, open list:BROADCOM NVRAM DRIVER,
	Linux-MM, sparclinux, linux-riscv, linux-arch, linux-s390,
	Jonathan Corbet, open list:DOCUMENTATION, Mike Rapoport,
	Geert Uytterhoeven, Matt Turner,
	open list:SYNOPSYS ARC ARCHITECTURE,
	open list:TENSILICA XTENSA PORT (xtensa), linux-m68k,
	Ivan Kokshaysky, Linux ARM, Richard Henderson, Vineet Gupta,
	kexec, Linux Kernel Mailing List, alpha, Andrew Morton,
	linuxppc-dev
In-Reply-To: <CAK8P3a2tZDJDqgr9-1vJrnbDhd_36eKq8LMEznDkU7rvuAnAag@mail.gmail.com>

Hi Arnd,

On Wed, Jun 09, 2021 at 01:30:39PM +0200, Arnd Bergmann wrote:
> On Fri, Jun 4, 2021 at 8:49 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > From: Mike Rapoport <rppt@linux.ibm.com>
> >
> > Hi,
> >
> > SPARSEMEM memory model was supposed to entirely replace DISCONTIGMEM a
> > (long) while ago. The last architectures that used DISCONTIGMEM were
> > updated to use other memory models in v5.11 and it is about the time to
> > entirely remove DISCONTIGMEM from the kernel.
> >
> > This set removes DISCONTIGMEM from alpha, arc and m68k, simplifies memory
> > model selection in mm/Kconfig and replaces usage of redundant
> > CONFIG_NEED_MULTIPLE_NODES and CONFIG_FLAT_NODE_MEM_MAP with CONFIG_NUMA
> > and CONFIG_FLATMEM respectively.
> >
> > I've also removed NUMA support on alpha that was BROKEN for more than 15
> > years.
> >
> > There were also minor updates all over arch/ to remove mentions of
> > DISCONTIGMEM in comments and #ifdefs.
> 
> Hi Mike and Andrew,
> 
> It looks like everyone is happy with this version so far. How should we merge it
> for linux-next? I'm happy to take it through the asm-generic tree, but linux-mm
> would fit at least as well. In case we go for linux-mm, feel free to add

Andrew already took it into mmotm.
 
> Acked-by: Arnd Bergmann <arnd@arndb.de>

Thanks!

> for the whole series.

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 1/1] of/pci: Add IORESOURCE_MEM_64 to resource flags for 64-bit memory addresses
From: Bjorn Helgaas @ 2021-06-09 18:50 UTC (permalink / raw)
  To: Rob Herring
  Cc: Leonardo Bras, devicetree, Alexey Kardashevskiy, PCI,
	linuxppc-dev, linux-kernel@vger.kernel.org, Frank Rowand
In-Reply-To: <CAL_Jsq+WwAeziGN4EfPAWfA0fieAjfcxfi29=StOx0GeKjAe_g@mail.gmail.com>

On Thu, Apr 15, 2021 at 01:59:52PM -0500, Rob Herring wrote:
> On Thu, Apr 15, 2021 at 1:01 PM Leonardo Bras <leobras.c@gmail.com> wrote:
> >
> > Many other resource flag parsers already add this flag when the input
> > has bits 24 & 25 set, so update this one to do the same.

[Adding this to the thread for archaeological purposes since it didn't
make it to the commit log]

The other resource flag parsers appear to be:

  pci_parse_of_flags(u32 addr0, ...)    # powerpc/kernel/pci_of_scan.c
    unsigned int as = addr0 & OF_PCI_ADDR0_SPACE_MASK;
    if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64)
      flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
      if (as == OF_PCI_ADDR0_SPACE_MMIO64)
        flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64;

  pci_parse_of_flags(u32 addr0)         # sparc/kernel/pci.c
    if (addr0 & 0x02000000) {
      flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY;
      if (addr0 & 0x01000000)
        flags |= IORESOURCE_MEM_64 | PCI_BASE_ADDRESS_MEM_TYPE_64;

  of_bus_pci_get_flags(... addr)        # drivers/of/address.c (this one)
    u32 w = be32_to_cpup(addr);
    switch((w >> 24) & 0x03) {
    case 0x02: /* 32 bits */
      flags |= IORESOURCE_MEM;
      break;
    case 0x03: /* 64 bits */
      flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
      break;

It's super annoying to have three copies of essentially the same
thing.  Even more annoying that they test the same things in three
completely different ways.  But I remember looking at this several
years ago, and it wasn't as simple to unify these as I had hoped.

> Many others? Looks like sparc and powerpc to me. Those would be the
> ones I worry about breaking. Sparc doesn't use of/address.c so it's
> fine. Powerpc version of the flags code was only fixed in 2019, so I
> don't think powerpc will care either.

I'm guessing you're referring to df5be5be8735 ("powerpc/pci/of: Fix OF
flags parsing for 64bit BARs").

> I noticed both sparc and powerpc set PCI_BASE_ADDRESS_MEM_TYPE_64 in
> the flags. AFAICT, that's not set anywhere outside of arch code. So
> never for riscv, arm and arm64 at least. That leads me to
> pci_std_update_resource() which is where the PCI code sets BARs and
> just copies the flags in PCI_BASE_ADDRESS_MEM_MASK ignoring
> IORESOURCE_* flags. So it seems like 64-bit is still not handled and
> neither is prefetch.
> 
> > Some devices (like virtio-net) have more than one memory resource
> > (like MMIO32 and MMIO64) and without this flag it would be needed to
> > verify the address range to know which one is which.
> >
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> >  drivers/of/address.c | 5 ++++-
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/of/address.c b/drivers/of/address.c
> > index 73ddf2540f3f..dc7147843783 100644
> > --- a/drivers/of/address.c
> > +++ b/drivers/of/address.c
> > @@ -116,9 +116,12 @@ static unsigned int of_bus_pci_get_flags(const __be32 *addr)
> >                 flags |= IORESOURCE_IO;
> >                 break;
> >         case 0x02: /* 32 bits */
> > -       case 0x03: /* 64 bits */
> >                 flags |= IORESOURCE_MEM;
> >                 break;
> > +
> > +       case 0x03: /* 64 bits */
> > +               flags |= IORESOURCE_MEM | IORESOURCE_MEM_64;
> > +               break;
> >         }
> >         if (w & 0x40000000)
> >                 flags |= IORESOURCE_PREFETCH;
> > --
> > 2.30.2
> >

^ permalink raw reply

* Re: [RFC] powerpc/pseries: Interface to represent PAPR firmware attributes
From: Fabiano Rosas @ 2021-06-10  0:03 UTC (permalink / raw)
  To: Pratik Sampat, mpe, benh, paulus, linuxppc-dev, kvm-ppc,
	linux-kernel, pratik.r.sampat
In-Reply-To: <5c9cb57b-e9d8-0361-8be7-60dc9618db34@linux.ibm.com>

Pratik Sampat <psampat@linux.ibm.com> writes:

>>> 3. version info                      - 1 byte
>>> 4. A data array of size num attributes, which contains the following:
>>>    a. attribute ID              - 8 bytes
>>>    b. attribute value in number - 8 bytes
>>>    c. attribute name in string  - 64 bytes
>>>    d. attribute value in string - 64 bytes
>> Is this new hypercall already present in the spec? These seem a bit
>> underspecified to me.
>
> Yes, it is present in the spec. I probably summarized a little more than needed
> here and I could expand upon below.
>
> The input buffer receives the following data:
>
> 1. “flags”:
> 	a. Bit 0: singleAttribute
> 		If set to 1, only return the single attribute matching firstAttributeId.
> 	b. Bits 1-63: Reserved
> 2. “firstAttributeId”: The first attribute to retrieve
> 3. “bufferAddress”: The logical real address of the start of the output buffer
> 4. “bufferSize”: The size in bytes of the output buffer
> 	
>
>  From the document, the format of the output buffer is as follows:
>
> Table 1 --> output buffer
> ================================================================================
> | Field Name           | Byte   | Length   |  Description
> |                      | Offset | in Bytes |
> ================================================================================
> | NumberOf             |        |          | Number of Attributes in Buffer
> | AttributesInBuffer   | 0x000  | 0x08     |
> --------------------------------------------------------------------------------
> | AttributeArrayOffset | 0x008  | 0x08     | Byte offset to start of Array
> |                      |        |          | of Attributes
> |                      |        |          |
> --------------------------------------------------------------------------------
> | OutputBufferData     |        |          | Version of the Header.
> | HeaderVersion        | 0x010  | 0x01     | The header will be always
> | AttributesInBuffer   |        |          | backward compatible, and changes
> |                      |        |          | will not impact the Array of
> |                      |        |          | attributes.
> |                      |        |          | Current version = 0x01

This is not clear to me. In the event of a header version change, is the
total set of attributes guaranteed to remain the same? Or only the array
layout? We might not need to expose the version information after all.

> --------------------------------------------------------------------------------
> | ArrayOfAttributes    |        |          | The array will contain
> |                      |        |          | "NumberOfAttributesInBuffer"
> |                      |        |          | array elements not to exceed
> |                      |        |          | the size of the buffer.
> |                      |        |          | Layout of the array is
> |                      |        |          | detailed in Table 2.
> --------------------------------------------------------------------------------
>
>
> Table 2 --> Array of attributes
> ================================================================================
> | Field Name           | Byte   | Length   |  Description
> |                      | Offset | in Bytes |
> ================================================================================
> | 1st AttributeId      | 0x000  | 0x08     | The ID of the Attribute
> --------------------------------------------------------------------------------
> | 1st AttributeValue   | 0x008  | 0x08     | The numerical value of
> |                      |        |          | the attribute
> --------------------------------------------------------------------------------
> | 1st AttributeString  | 0x010  | 0x40     | The ASCII string
> | Description          |        |          | description of the
> |                      |        |          | attribute, up to 63
> |                      |        |          | characters plus a NULL
> |                      |        |          | terminator.

There is a slight disconnect in that this is called "description" by the
spec, which makes me think they could eventually have something more
verbose than what you'd expect from "name".

So they could give us either: "Frequency" or "The Frequency in GigaHertz".

> --------------------------------------------------------------------------------
> | 1st AttributeValue   | 0x050  | 0x40     | The ASCII string
> | StringDescription    |        |          | description of the
> |                      |        |          | attribute value, up to 63
> |                      |        |          | characters plus a NULL
> |                      |        |          | terminator. If this
> |                      |        |          | contains only a NULL
> |                      |        |          | terminator, then there is
> |                      |        |          | no ASCII string
> |                      |        |          | associated with AttributeValue.
> --------------------------------------------------------------------------------
> | ....                 |        |          |
>
>
>>
>>> The new H_CALL exports information in direct string value format, hence
>>> a new interface has been introduced in /sys/firmware/papr to export
>> Hm.. Maybe this should be something less generic than "papr"?
>
> The interface naming was inspired from /sys/firmware/opal's naming convention.
> We believed the name PAPR could serve as more generic name to be used by both
> Linux running on PHYP and linux on KVM.

Right, I agree with that rationale, but /opal has identifiable elements
in it whereas /papr would have the generic "attr_X_name", which does not
give much hint about what they are.

We also expect people to iterate the "attr_X_*" files, so if we decide
to add something else under /papr in the future, that would potentially
cause issues with any tool that just lists the content of the directory.

So maybe we should be proactive and put the hcall stuff inside a
subdirectory already. /papr/energy_scale_attrs comes to mind, but I
don't have a strong opinion on the particular name.

>
> If you have something more concrete in mind, please let me know. I'm open to
> suggestions.
>
>>
>>> this information to userspace in an extensible pass-through format.
>>> The H_CALL returns the name, numeric value and string value. As string
>>> values are in human readable format, therefore if the string value
>>> exists then that is given precedence over the numeric value.
>> So the hypervisor could simply not send the string representation? How
>> will the userspace tell the difference since they are reading everything
>> from a file?
>>
>> Overall I'd say we should give the data in a more structured way and let
>> the user-facing tool do the formatting and presentation.
>
> That's a valid concern; the design for this was inspired by hwmon's interface
> for housing the sensor information.
>
> One alternative to add more structure to this format could be to introduce:
> attr_X_name, attr_X_num_val, attr_X_str_val
>
> However, in some cases like min/max frequency the string value is empty. In
> that case the file attr_X_str_val will also be empty.
> Is that an acceptable format of having empty files that in some cases will
> never be populated?

I'm thinking yes, but I'm not sure. Let's see if someone else has a say
in this.

> We also went ahead to confirm with the SPEC team that if a string value exists
> in their buffer, that must be given precedence.

Huh.. That must be a recommendation only. The hypervisor has no control
over how people present the information in userspace.

>
> Another alternative format could be to keep attr_X_name, attr_X_val intact but
> change what X means. Currently X is just an iteratively increasing number. But
> X can also serve as an ID which we get from H_CALL output buffer.

This seems like a good idea. It makes it easier to correlate the
attribute with what is in PAPR.

>
> In this case, we should also include some versioning so that the tool now also
> has cognizance of contents of each file.
>
>>> The format of exposing the sysfs information is as follows:
>>> /sys/firmware/papr/
>>>    |-- attr_0_name
>>>    |-- attr_0_val
>>>    |-- attr_1_name
>>>    |-- attr_1_val
>>> ...
>> How do we keep a stable interface with userspace? Say the hypervisor
>> decides to add or remove attributes, change their order, string
>> representation, etc? It will inform us via the version field, but that
>> is lost when we output this to sysfs.
>>
>> I get that if the userspace just iterate over the contents of the
>> directory then nothing breaks, but there is not much else it could do it
>> seems.
>
> Fair point, having the version exposed to the sysfs does seem crucial.
>
> Currently in ppc-utils we iterate over all the information, however as you
> rightly pointed out there may be other tools needing just specific information.
> The alternative I suggested a few sentences above to include ID based attribute
> naming and versioning may be a more elegant way of solving this problem.
>
> What are your thoughts on a design like this?
>

Based on all the new information you provided, I'd say present all the
data and group it under the ID:

/sys/firmware/papr/energy_scale_attrs/
   |-- <id>/
     |-- desc
     |-- value
     |-- value_desc
   |-- <id>/
     |-- desc
     |-- value
     |-- value_desc

Is that workable?

>>> The energy information that is exported is useful for userspace tools
>>> such as powerpc-utils. Currently these tools infer the
>>> "power_mode_data" value in the lparcfg, which in turn is obtained from
>>> the to be deprecated H_GET_EM_PARMS H_CALL.
>>> On future platforms, such userspace utilities will have to look at the
>>> data returned from the new H_CALL being populated in this new sysfs
>>> interface and report this information directly without the need of
>>> interpretation.
>>>
>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>
> Thanks
> Pratik

^ permalink raw reply

* [PATCH] fs: btrfs: Disable BTRFS on platforms having 256K pages
From: Christophe Leroy @ 2021-06-10  5:23 UTC (permalink / raw)
  To: Chris Mason, Josef Bacik, David Sterba
  Cc: linux-hexagon, linuxppc-dev, linux-kernel, linux-btrfs

With a config having PAGE_SIZE set to 256K, BTRFS build fails
with the following message

 include/linux/compiler_types.h:326:38: error: call to '__compiletime_assert_791' declared with attribute error: BUILD_BUG_ON failed: (BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0

BTRFS_MAX_COMPRESSED being 128K, BTRFS cannot support platforms with
256K pages at the time being.

There are two platforms that can select 256K pages:
 - hexagon
 - powerpc

Disable BTRFS when 256K page size is selected.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 fs/btrfs/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 68b95ad82126..520a0f6a7d9e 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -18,6 +18,8 @@ config BTRFS_FS
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select SRCU
+	depends on !PPC_256K_PAGES	# powerpc
+	depends on !PAGE_SIZE_256KB	# hexagon
 
 	help
 	  Btrfs is a general purpose copy-on-write filesystem with extents,
-- 
2.25.0


^ permalink raw reply related

* [PATCH] ASoC:fsl_spdif:Remove superfluous error message around platform_get_irq()
From:  Zhongjun Tan @ 2021-06-10  4:00 UTC (permalink / raw)
  To: timur, nicoleotsuka, Xiubo.Lee, festevam, shengjiu.wang,
	lgirdwood, broonie, perex, tiwai
  Cc: alsa-devel, linuxppc-dev, linux-kernel, Tan Zhongjun

From: Tan Zhongjun <tanzhongjun@yulong.com>

platform_get_irq() already prints an error message when the interrupt is
missing, hence there is no need to duplicate that message.

Signed-off-by: Tan Zhongjun <tanzhongjun@yulong.com>
---
 sound/soc/fsl/fsl_spdif.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/fsl/fsl_spdif.c b/sound/soc/fsl/fsl_spdif.c
index 2a76714eb8e6..29cefd459241 100644
--- a/sound/soc/fsl/fsl_spdif.c
+++ b/sound/soc/fsl/fsl_spdif.c
@@ -1368,10 +1368,8 @@ static int fsl_spdif_probe(struct platform_device *pdev)
 
 	for (i = 0; i < spdif_priv->soc->interrupts; i++) {
 		irq = platform_get_irq(pdev, i);
-		if (irq < 0) {
-			dev_err(&pdev->dev, "no irq for node %s\n", pdev->name);
+		if (irq < 0)
 			return irq;
-		}
 
 		ret = devm_request_irq(&pdev->dev, irq, spdif_isr, 0,
 				       dev_name(&pdev->dev), spdif_priv);
-- 
2.17.1


^ permalink raw reply related

* [PATCH] powerpc/signal64: Don't read sigaction arguments back from user memory
From: Michael Ellerman @ 2021-06-10  7:29 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: cmr

When delivering a signal to a sigaction style handler (SA_SIGINFO), we
pass pointers to the siginfo and ucontext via r4 and r5.

Currently we populate the values in those registers by reading the
pointers out of the sigframe in user memory, even though the values in
user memory were written by the kernel just prior:

  unsafe_put_user(&frame->info, &frame->pinfo, badframe_block);
  unsafe_put_user(&frame->uc, &frame->puc, badframe_block);
  ...
  if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
  	err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
  	err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);

ie. we write &frame->info into frame->pinfo, and then read frame->pinfo
back into r4, and similarly for &frame->uc.

The code has always been like this, since linux-fullhistory commit
d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").

There's no reason for us to read the values back from user memory,
rather than just setting the value in the gpr[4/5] directly. In fact
reading the value back from user memory opens up the possibility of
another user thread changing the values before we read them back.
Although any process doing that would be racing against the kernel
delivering the signal, and would risk corrupting the stack, so that
would be a userspace bug.

Note that this is 64-bit only code, so there's no subtlety with the size
of pointers differing between kernel and user. Also the frame variable
is not modified to point elsewhere during the function.

In the past reading the values back from user memory was not costly, but
now that we have KUAP on some CPUs it is, so we'd rather avoid it for
that reason too.

So change the code to just set the values directly, using the same
values we have written to the sigframe previously in the function.

Note also that this matches what our 32-bit signal code does.

Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
this results in a ~4% increase in signals per second on a Power9, from
229,777 to 239,766.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/signal_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index dca66481d0c2..f58e7a98d0df 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -948,8 +948,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	regs->gpr[3] = ksig->sig;
 	regs->result = 0;
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-		err |= get_user(regs->gpr[4], (unsigned long __user *)&frame->pinfo);
-		err |= get_user(regs->gpr[5], (unsigned long __user *)&frame->puc);
+		regs->gpr[4] = (unsigned long)&frame->info;
+		regs->gpr[5] = (unsigned long)&frame->uc;
 		regs->gpr[6] = (unsigned long) frame;
 	} else {
 		regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
-- 
2.25.1

^ permalink raw reply related

* Re: [RFC] powerpc/pseries: Interface to represent PAPR firmware attributes
From: Pratik Sampat @ 2021-06-10  8:01 UTC (permalink / raw)
  To: Fabiano Rosas, mpe, benh, paulus, linuxppc-dev, kvm-ppc,
	linux-kernel, pratik.r.sampat
In-Reply-To: <87tum6vb58.fsf@linux.ibm.com>



On 10/06/21 5:33 am, Fabiano Rosas wrote:
> Pratik Sampat <psampat@linux.ibm.com> writes:
>
>>>> 3. version info                      - 1 byte
>>>> 4. A data array of size num attributes, which contains the following:
>>>>     a. attribute ID              - 8 bytes
>>>>     b. attribute value in number - 8 bytes
>>>>     c. attribute name in string  - 64 bytes
>>>>     d. attribute value in string - 64 bytes
>>> Is this new hypercall already present in the spec? These seem a bit
>>> underspecified to me.
>> Yes, it is present in the spec. I probably summarized a little more than needed
>> here and I could expand upon below.
>>
>> The input buffer receives the following data:
>>
>> 1. “flags”:
>> 	a. Bit 0: singleAttribute
>> 		If set to 1, only return the single attribute matching firstAttributeId.
>> 	b. Bits 1-63: Reserved
>> 2. “firstAttributeId”: The first attribute to retrieve
>> 3. “bufferAddress”: The logical real address of the start of the output buffer
>> 4. “bufferSize”: The size in bytes of the output buffer
>> 	
>>
>>   From the document, the format of the output buffer is as follows:
>>
>> Table 1 --> output buffer
>> ================================================================================
>> | Field Name           | Byte   | Length   |  Description
>> |                      | Offset | in Bytes |
>> ================================================================================
>> | NumberOf             |        |          | Number of Attributes in Buffer
>> | AttributesInBuffer   | 0x000  | 0x08     |
>> --------------------------------------------------------------------------------
>> | AttributeArrayOffset | 0x008  | 0x08     | Byte offset to start of Array
>> |                      |        |          | of Attributes
>> |                      |        |          |
>> --------------------------------------------------------------------------------
>> | OutputBufferData     |        |          | Version of the Header.
>> | HeaderVersion        | 0x010  | 0x01     | The header will be always
>> | AttributesInBuffer   |        |          | backward compatible, and changes
>> |                      |        |          | will not impact the Array of
>> |                      |        |          | attributes.
>> |                      |        |          | Current version = 0x01
> This is not clear to me. In the event of a header version change, is the
> total set of attributes guaranteed to remain the same? Or only the array
> layout? We might not need to expose the version information after all.

I believe, the way versioning currently works is that if any new attribute is
added/modified to the list, this will entail a new version.

Regardless, the older attributes and their ids will not change and will still
be backwards compatible.

If the versioning does change, this patch introduces a version check and will
fail to populate the sysfs, so a tool like powerpc-utils will not read
incorrect/non-coherent information.

So I'm now also inclined to believe that versioning information may not need
to be exposed to userspace.

>> --------------------------------------------------------------------------------
>> | ArrayOfAttributes    |        |          | The array will contain
>> |                      |        |          | "NumberOfAttributesInBuffer"
>> |                      |        |          | array elements not to exceed
>> |                      |        |          | the size of the buffer.
>> |                      |        |          | Layout of the array is
>> |                      |        |          | detailed in Table 2.
>> --------------------------------------------------------------------------------
>>
>>
>> Table 2 --> Array of attributes
>> ================================================================================
>> | Field Name           | Byte   | Length   |  Description
>> |                      | Offset | in Bytes |
>> ================================================================================
>> | 1st AttributeId      | 0x000  | 0x08     | The ID of the Attribute
>> --------------------------------------------------------------------------------
>> | 1st AttributeValue   | 0x008  | 0x08     | The numerical value of
>> |                      |        |          | the attribute
>> --------------------------------------------------------------------------------
>> | 1st AttributeString  | 0x010  | 0x40     | The ASCII string
>> | Description          |        |          | description of the
>> |                      |        |          | attribute, up to 63
>> |                      |        |          | characters plus a NULL
>> |                      |        |          | terminator.
> There is a slight disconnect in that this is called "description" by the
> spec, which makes me think they could eventually have something more
> verbose than what you'd expect from "name".
>
> So they could give us either: "Frequency" or "The Frequency in GigaHertz".

Yes, the description can be more verbose, like I can see attributes with the
description as "Minimum Frequency (MHz)". That's probably why parsing based on
IDs is a better approach.

>
>> --------------------------------------------------------------------------------
>> | 1st AttributeValue   | 0x050  | 0x40     | The ASCII string
>> | StringDescription    |        |          | description of the
>> |                      |        |          | attribute value, up to 63
>> |                      |        |          | characters plus a NULL
>> |                      |        |          | terminator. If this
>> |                      |        |          | contains only a NULL
>> |                      |        |          | terminator, then there is
>> |                      |        |          | no ASCII string
>> |                      |        |          | associated with AttributeValue.
>> --------------------------------------------------------------------------------
>> | ....                 |        |          |
>>
>>
>>>> The new H_CALL exports information in direct string value format, hence
>>>> a new interface has been introduced in /sys/firmware/papr to export
>>> Hm.. Maybe this should be something less generic than "papr"?
>> The interface naming was inspired from /sys/firmware/opal's naming convention.
>> We believed the name PAPR could serve as more generic name to be used by both
>> Linux running on PHYP and linux on KVM.
> Right, I agree with that rationale, but /opal has identifiable elements
> in it whereas /papr would have the generic "attr_X_name", which does not
> give much hint about what they are.
>
> We also expect people to iterate the "attr_X_*" files, so if we decide
> to add something else under /papr in the future, that would potentially
> cause issues with any tool that just lists the content of the directory.
>
> So maybe we should be proactive and put the hcall stuff inside a
> subdirectory already. /papr/energy_scale_attrs comes to mind, but I
> don't have a strong opinion on the particular name.

Encapsulating it within another directory like energy_scale_attrs does make
sense and keeps the PAPR directory open to more such information going forward.

>> If you have something more concrete in mind, please let me know. I'm open to
>> suggestions.
>>
>>>> this information to userspace in an extensible pass-through format.
>>>> The H_CALL returns the name, numeric value and string value. As string
>>>> values are in human readable format, therefore if the string value
>>>> exists then that is given precedence over the numeric value.
>>> So the hypervisor could simply not send the string representation? How
>>> will the userspace tell the difference since they are reading everything
>>> from a file?
>>>
>>> Overall I'd say we should give the data in a more structured way and let
>>> the user-facing tool do the formatting and presentation.
>> That's a valid concern; the design for this was inspired by hwmon's interface
>> for housing the sensor information.
>>
>> One alternative to add more structure to this format could be to introduce:
>> attr_X_name, attr_X_num_val, attr_X_str_val
>>
>> However, in some cases like min/max frequency the string value is empty. In
>> that case the file attr_X_str_val will also be empty.
>> Is that an acceptable format of having empty files that in some cases will
>> never be populated?
> I'm thinking yes, but I'm not sure. Let's see if someone else has a say
> in this.

Sure, if we can have empty sysfs files, then this presents a coherent interface.

@mpe, can you weigh in here, can we have an interface where we have the following structure:
/sys/firmware/papr/energy_scale_attrs/
    |-- <id>/
      |-- desc
      |-- value
      |-- value_desc
where value_desc can be empty in some cases?
If so, can we leave them empty or do we need to have them populated with a
string "NULL"/"NONE"?

>
>> We also went ahead to confirm with the SPEC team that if a string value exists
>> in their buffer, that must be given precedence.
> Huh.. That must be a recommendation only. The hypervisor has no control
> over how people present the information in userspace.
>
>> Another alternative format could be to keep attr_X_name, attr_X_val intact but
>> change what X means. Currently X is just an iteratively increasing number. But
>> X can also serve as an ID which we get from H_CALL output buffer.
> This seems like a good idea. It makes it easier to correlate the
> attribute with what is in PAPR.
>
>> In this case, we should also include some versioning so that the tool now also
>> has cognizance of contents of each file.
>>
>>>> The format of exposing the sysfs information is as follows:
>>>> /sys/firmware/papr/
>>>>     |-- attr_0_name
>>>>     |-- attr_0_val
>>>>     |-- attr_1_name
>>>>     |-- attr_1_val
>>>> ...
>>> How do we keep a stable interface with userspace? Say the hypervisor
>>> decides to add or remove attributes, change their order, string
>>> representation, etc? It will inform us via the version field, but that
>>> is lost when we output this to sysfs.
>>>
>>> I get that if the userspace just iterate over the contents of the
>>> directory then nothing breaks, but there is not much else it could do it
>>> seems.
>> Fair point, having the version exposed to the sysfs does seem crucial.
>>
>> Currently in ppc-utils we iterate over all the information, however as you
>> rightly pointed out there may be other tools needing just specific information.
>> The alternative I suggested a few sentences above to include ID based attribute
>> naming and versioning may be a more elegant way of solving this problem.
>>
>> What are your thoughts on a design like this?
>>
> Based on all the new information you provided, I'd say present all the
> data and group it under the ID:
>
> /sys/firmware/papr/energy_scale_attrs/
>     |-- <id>/
>       |-- desc
>       |-- value
>       |-- value_desc
>     |-- <id>/
>       |-- desc
>       |-- value
>       |-- value_desc
>
> Is that workable?

If we can confirm if value descriptions can be empty, then I too think this is
a good interface to introduce for energy attributes.

Thanks for your feedback.
Pratik

>>>> The energy information that is exported is useful for userspace tools
>>>> such as powerpc-utils. Currently these tools infer the
>>>> "power_mode_data" value in the lparcfg, which in turn is obtained from
>>>> the to be deprecated H_GET_EM_PARMS H_CALL.
>>>> On future platforms, such userspace utilities will have to look at the
>>>> data returned from the new H_CALL being populated in this new sysfs
>>>> interface and report this information directly without the need of
>>>> interpretation.
>>>>
>>>> Signed-off-by: Pratik R. Sampat <psampat@linux.ibm.com>
>> Thanks
>> Pratik


^ permalink raw reply

* [PATCH 0/5] cpufreq: cppc: Fix suspend/resume specific races with FIE code
From: Viresh Kumar @ 2021-06-10  8:23 UTC (permalink / raw)
  To: Rafael Wysocki, Qian Cai, Benjamin Herrenschmidt, Jonathan Corbet,
	Len Brown, Michael Ellerman, Paul Mackerras, Srinivas Pandruvada,
	Viresh Kumar
  Cc: Vincent Guittot, linux-doc, Dirk Brandewie, linuxppc-dev,
	linux-pm, linux-kernel, Ionela Voinescu

Hi Qian,

It would be helpful if you can test this patchset and confirm if the races you
mentioned went away or not and that the FIE code works as we wanted it to.

I don't have a real setup and so it won't be easy for me to test this out.

I have already sent a temporary fix for 5.13 and this patchset is targeted for
5.14 and is based over that.

-------------------------8<-------------------------

The CPPC driver currently stops the frequency invariance related
kthread_work and irq_work from cppc_freq_invariance_exit() which is only
called during driver's removal.

This is not sufficient as the CPUs can get hot-plugged out while the
driver is in use, the same also happens during system suspend/resume.

In such cases we can reach a state where the CPU is removed by the
kernel but its kthread_work or irq_work aren't stopped.

Fix this by implementing the start_cpu() and stop_cpu() callbacks in the
cpufreq core, which will be called for each CPU's addition/removal.

A similar call was already available in the cpufreq core, which isn't required
anymore and so its users are migrated to use exit() callback instead.

This is targeted for v5.14-rc1.

--
Viresh

Viresh Kumar (5):
  cpufreq: cppc: Migrate to ->exit() callback instead of ->stop_cpu()
  cpufreq: intel_pstate: Migrate to ->exit() callback instead of
    ->stop_cpu()
  cpufreq: powernv: Migrate to ->exit() callback instead of
    ->stop_cpu()
  cpufreq: Add start_cpu() and stop_cpu() callbacks
  cpufreq: cppc: Fix suspend/resume specific races with the FIE code

 Documentation/cpu-freq/cpu-drivers.rst |   7 +-
 drivers/cpufreq/Kconfig.arm            |   1 -
 drivers/cpufreq/cppc_cpufreq.c         | 163 ++++++++++++++-----------
 drivers/cpufreq/cpufreq.c              |  11 +-
 drivers/cpufreq/intel_pstate.c         |   9 +-
 drivers/cpufreq/powernv-cpufreq.c      |  23 ++--
 include/linux/cpufreq.h                |   5 +-
 7 files changed, 119 insertions(+), 100 deletions(-)

-- 
2.31.1.272.g89b43f80a514

^ permalink raw reply

* [PATCH 3/5] cpufreq: powernv: Migrate to ->exit() callback instead of ->stop_cpu()
From: Viresh Kumar @ 2021-06-10  8:23 UTC (permalink / raw)
  To: Rafael Wysocki, Qian Cai, Viresh Kumar, Michael Ellerman,
	Benjamin Herrenschmidt, Paul Mackerras
  Cc: Ionela Voinescu, Vincent Guittot, linuxppc-dev, linux-kernel,
	linux-pm
In-Reply-To: <cover.1623313323.git.viresh.kumar@linaro.org>

commit 367dc4aa932b ("cpufreq: Add stop CPU callback to cpufreq_driver
interface") added the stop_cpu() callback to allow the drivers to do
clean up before the CPU is completely down and its state cannot be
modified.

At that time the CPU hotplug framework used to call the cpufreq core's
registered notifier for different events like CPU_DOWN_PREPARE and
CPU_POST_DEAD. The stop_cpu() callback was called during the
CPU_DOWN_PREPARE event.

This is no longer the case, cpuhp_cpufreq_offline() is called only once
by the CPU hotplug core now and we don't really need two separate
callbacks for cpufreq drivers, i.e. stop_cpu() and exit(), as everything
can be done from the exit() callback itself.

Migrate to using the exit() callback instead of stop_cpu().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/powernv-cpufreq.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index e439b43c19eb..005600cef273 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -875,7 +875,15 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 {
-	/* timer is deleted in cpufreq_cpu_stop() */
+	struct powernv_smp_call_data freq_data;
+	struct global_pstate_info *gpstates = policy->driver_data;
+
+	freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
+	freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
+	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
+	if (gpstates)
+		del_timer_sync(&gpstates->timer);
+
 	kfree(policy->driver_data);
 
 	return 0;
@@ -1007,18 +1015,6 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
 	.priority	= 0,
 };
 
-static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
-{
-	struct powernv_smp_call_data freq_data;
-	struct global_pstate_info *gpstates = policy->driver_data;
-
-	freq_data.pstate_id = idx_to_pstate(powernv_pstate_info.min);
-	freq_data.gpstate_id = idx_to_pstate(powernv_pstate_info.min);
-	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
-	if (gpstates)
-		del_timer_sync(&gpstates->timer);
-}
-
 static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
 					unsigned int target_freq)
 {
@@ -1042,7 +1038,6 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
 	.target_index	= powernv_cpufreq_target_index,
 	.fast_switch	= powernv_fast_switch,
 	.get		= powernv_cpufreq_get,
-	.stop_cpu	= powernv_cpufreq_stop_cpu,
 	.attr		= powernv_cpu_freq_attr,
 };
 
-- 
2.31.1.272.g89b43f80a514


^ permalink raw reply related

* [PATCH 1/6] selftest/mremap_test: Update the test to handle pagesize other than 4K
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

Instead of hardcoding a 4K page size, fetch it using sysconf(). For the performance
measurements, the test still assumes 2M and 1G are hugepage sizes.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 tools/testing/selftests/vm/mremap_test.c | 113 ++++++++++++-----------
 1 file changed, 61 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index 9c391d016922..c9a5461eb786 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -45,14 +45,15 @@ enum {
 	_4MB = 4ULL << 20,
 	_1GB = 1ULL << 30,
 	_2GB = 2ULL << 30,
-	PTE = _4KB,
 	PMD = _2MB,
 	PUD = _1GB,
 };
 
+#define PTE page_size
+
 #define MAKE_TEST(source_align, destination_align, size,	\
 		  overlaps, should_fail, test_name)		\
-{								\
+(struct test){							\
 	.name = test_name,					\
 	.config = {						\
 		.src_alignment = source_align,			\
@@ -252,12 +253,17 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
 	return 0;
 }
 
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
 int main(int argc, char **argv)
 {
 	int failures = 0;
 	int i, run_perf_tests;
 	unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
 	unsigned int pattern_seed;
+	struct test test_cases[MAX_TEST];
+	struct test perf_test_cases[MAX_PERF_TEST];
+	int page_size;
 	time_t t;
 
 	pattern_seed = (unsigned int) time(&t);
@@ -268,56 +274,59 @@ int main(int argc, char **argv)
 	ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
 		       threshold_mb, pattern_seed);
 
-	struct test test_cases[] = {
-		/* Expected mremap failures */
-		MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Source and Destination Regions Overlapping"),
-		MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Destination Address Misaligned (1KB-aligned)"),
-		MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
-		  "mremap - Source Address Misaligned (1KB-aligned)"),
-
-		/* Src addr PTE aligned */
-		MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "8KB mremap - Source PTE-aligned, Destination PTE-aligned"),
-
-		/* Src addr 1MB aligned */
-		MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"),
-		MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"),
-
-		/* Src addr PMD aligned */
-		MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"),
-		MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "4MB mremap - Source PMD-aligned, Destination PMD-aligned"),
-
-		/* Src addr PUD aligned */
-		MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"),
-		MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PMD-aligned"),
-		MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "2GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-	};
-
-	struct test perf_test_cases[] = {
-		/*
-		 * mremap 1GB region - Page table level aligned time
-		 * comparison.
-		 */
-		MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PTE-aligned, Destination PTE-aligned"),
-		MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PMD-aligned, Destination PMD-aligned"),
-		MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-		  "1GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-	};
+	page_size = sysconf(_SC_PAGESIZE);
+
+	/* Expected mremap failures */
+	test_cases[0] =	MAKE_TEST(page_size, page_size, page_size,
+				  OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source and Destination Regions Overlapping");
+
+	test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Destination Address Misaligned (1KB-aligned)");
+	test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+				  NON_OVERLAPPING, EXPECT_FAILURE,
+				  "mremap - Source Address Misaligned (1KB-aligned)");
+
+	/* Src addr PTE aligned */
+	test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+				  NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+	/* Src addr 1MB aligned */
+	test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+	test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+	/* Src addr PMD aligned */
+	test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+	test_cases[7] =	MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+	test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+	/* Src addr PUD aligned */
+	test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				  "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+	test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+	test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+	test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				   "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+	perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+					"1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+	/*
+	 * mremap 1GB region - Page table level aligned time
+	 * comparison.
+	 */
+	perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+	perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+				       "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
 
 	run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
 				(threshold_mb * _1MB >= _1GB);
-- 
2.31.1


^ permalink raw reply related

* [PATCH 0/6] mremap fixes
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev

This patch series is split out from [PATCH v7 00/11] Speedup mremap on ppc64
(https://lore.kernel.org/linux-mm/20210607055131.156184-1-aneesh.kumar@linux.ibm.com)
dropping ppc64 specific changes.

I will send the ppc64 specific changes separately once we agree on how to handle the
TLB flush changes.


Aneesh Kumar K.V (6):
  selftest/mremap_test: Update the test to handle pagesize other than 4K
  selftest/mremap_test: Avoid crash with static build
  mm/mremap: Convert huge PUD move to separate helper
  mm/mremap: Don't enable optimized PUD move if page table levels is 2
  mm/mremap: Use pmd/pud_poplulate to update page table entries
  mm/mremap: hold the rmap lock in write mode when moving page table
    entries.

 mm/mremap.c                              |  93 +++++++++++++++---
 tools/testing/selftests/vm/mremap_test.c | 118 ++++++++++++-----------
 2 files changed, 143 insertions(+), 68 deletions(-)

-- 
2.31.1


^ permalink raw reply

* [PATCH 2/6] selftest/mremap_test: Avoid crash with static build
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

With a large mmap map size, we can overlap with the text area and using
MAP_FIXED results in unmapping that area. Switch to MAP_FIXED_NOREPLACE
and handle the EEXIST error.

Reviewed-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 tools/testing/selftests/vm/mremap_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
index c9a5461eb786..0624d1bd71b5 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -75,9 +75,10 @@ static void *get_source_mapping(struct config c)
 retry:
 	addr += c.src_alignment;
 	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-			MAP_FIXED | MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+			MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+			-1, 0);
 	if (src_addr == MAP_FAILED) {
-		if (errno == EPERM)
+		if (errno == EPERM || errno == EEXIST)
 			goto retry;
 		goto error;
 	}
-- 
2.31.1


^ permalink raw reply related

* [PATCH 4/6] mm/mremap: Don't enable optimized PUD move if page table levels is 2
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

With a two-level page table, don't enable move_normal_pud.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 92ab7d24a587..795a7d628b53 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -276,7 +276,7 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
 }
 #endif
 
-#ifdef CONFIG_HAVE_MOVE_PUD
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
 {
-- 
2.31.1


^ permalink raw reply related

* [PATCH 3/6] mm/mremap: Convert huge PUD move to separate helper
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

With TRANSPARENT_HUGEPAGE_PUD enabled the kernel can find huge PUD entries.
Add a helper to move huge PUD entries on mremap().

This will be used by a later patch to optimize mremap of PUD_SIZE aligned
level 4 PTE mapped address.

This also makes sure we support mremap on huge PUD entries even with
CONFIG_HAVE_MOVE_PUD disabled.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 7 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 47c255b60150..92ab7d24a587 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -324,10 +324,62 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
 }
 #endif
 
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_PUD
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	spinlock_t *old_ptl, *new_ptl;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t pud;
+
+	/*
+	 * The destination pud shouldn't be established, free_pgtables()
+	 * should have released it.
+	 */
+	if (WARN_ON_ONCE(!pud_none(*new_pud)))
+		return false;
+
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * ptlocks because exclusive mmap_lock prevents deadlock.
+	 */
+	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	new_ptl = pud_lockptr(mm, new_pud);
+	if (new_ptl != old_ptl)
+		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+	/* Clear the pud */
+	pud = *old_pud;
+	pud_clear(old_pud);
+
+	VM_BUG_ON(!pud_none(*new_pud));
+
+	/* Set the new pud */
+	/* mark soft_dirty when we add pud level soft dirty support */
+	set_pud_at(mm, new_addr, new_pud, pud);
+	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
+	spin_unlock(old_ptl);
+
+	return true;
+}
+#else
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+	WARN_ON_ONCE(1);
+	return false;
+
+}
+#endif
+
 enum pgt_entry {
 	NORMAL_PMD,
 	HPAGE_PMD,
 	NORMAL_PUD,
+	HPAGE_PUD,
 };
 
 /*
@@ -347,6 +399,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
 		mask = PMD_MASK;
 		size = PMD_SIZE;
 		break;
+	case HPAGE_PUD:
 	case NORMAL_PUD:
 		mask = PUD_MASK;
 		size = PUD_SIZE;
@@ -395,6 +448,11 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
 			move_huge_pmd(vma, old_addr, new_addr, old_entry,
 				      new_entry);
 		break;
+	case HPAGE_PUD:
+		moved = move_huge_pud(vma, old_addr, new_addr, old_entry,
+				      new_entry);
+		break;
+
 	default:
 		WARN_ON_ONCE(1);
 		break;
@@ -414,6 +472,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	unsigned long extent, old_end;
 	struct mmu_notifier_range range;
 	pmd_t *old_pmd, *new_pmd;
+	pud_t *old_pud, *new_pud;
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
@@ -429,15 +488,22 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		 * PUD level if possible.
 		 */
 		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
-		if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
-			pud_t *old_pud, *new_pud;
 
-			old_pud = get_old_pud(vma->vm_mm, old_addr);
-			if (!old_pud)
+		old_pud = get_old_pud(vma->vm_mm, old_addr);
+		if (!old_pud)
+			continue;
+		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+		if (!new_pud)
+			break;
+		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+			if (extent == HPAGE_PUD_SIZE) {
+				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
+					       old_pud, new_pud, need_rmap_locks);
+				/* We ignore and continue on error? */
 				continue;
-			new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
-			if (!new_pud)
-				break;
+			}
+		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+
 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
 					   old_pud, new_pud, need_rmap_locks))
 				continue;
-- 
2.31.1


^ permalink raw reply related

* [PATCH 5/6] mm/mremap: Use pmd/pud_poplulate to update page table entries
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Linus Torvalds, npiggin, kaleshsingh, joel,
	Kirill A . Shutemov, linuxppc-dev
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

pmd/pud_populate is the right interface to be used to set the respective
page table entries. Some architectures like ppc64 do assume that set_pmd/pud_at
can only be used to set a hugepage PTE. Since we are not setting up a hugepage
PTE here, use the pmd/pud_populate interface.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 795a7d628b53..dacfa9111ab1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -26,6 +26,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 #include "internal.h"
 
@@ -258,8 +259,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 
 	VM_BUG_ON(!pmd_none(*new_pmd));
 
-	/* Set the new pmd */
-	set_pmd_at(mm, new_addr, new_pmd, pmd);
+	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
 	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
@@ -306,8 +306,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 
 	VM_BUG_ON(!pud_none(*new_pud));
 
-	/* Set the new pud */
-	set_pud_at(mm, new_addr, new_pud, pud);
+	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
 	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
-- 
2.31.1


^ permalink raw reply related

* [PATCH 6/6] mm/mremap: hold the rmap lock in write mode when moving page table entries.
From: Aneesh Kumar K.V @ 2021-06-10  8:35 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: Aneesh Kumar K.V, Hugh Dickins, Linus Torvalds, npiggin,
	kaleshsingh, joel, Kirill A . Shutemov, linuxppc-dev,
	Kirill A . Shutemov
In-Reply-To: <20210610083549.386085-1-aneesh.kumar@linux.ibm.com>

To avoid a race between rmap walk and mremap, mremap does take_rmap_locks().
The lock was taken to ensure that rmap walk doesn't miss a page table entry due to
PTE moves via move_page_tables(). The kernel does further optimization of
this lock such that if we are going to find the newly added vma after the
old vma, the rmap lock is not taken. This is because rmap walk would find the
vmas in the same order and if we don't find the page table attached to
older vma we would find it with the new vma which we would iterate later.

As explained in commit eb66ae030829 ("mremap: properly flush TLB before releasing the page")
mremap is special in that it doesn't take ownership of the page. The
optimized version for PUD/PMD aligned mremap also doesn't hold the ptl lock.
This can result in stale TLB entries as shown below.

This patch updates the rmap locking requirement in mremap to handle the race condition
explained below with optimized mremap::

Optimized PMD move

    CPU 1                           CPU 2                                   CPU 3

    mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

    mmap_write_lock_killable()

                                    addr = old_addr
                                    lock(pte_ptl)
    lock(pmd_ptl)
    pmd = *old_pmd
    pmd_clear(old_pmd)
    flush_tlb_range(old_addr)

    *new_pmd = pmd
                                                                            *new_addr = 10; and fills
                                                                            TLB with new addr
                                                                            and old pfn

    unlock(pmd_ptl)
                                    ptep_clear_flush()
                                    old pfn is free.
                                                                            Stale TLB entry

Optimized PUD move also suffers from a similar race.
Both of the above race conditions can be fixed if we force the mremap path to take the rmap lock.

Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Fixes: c49dd3401802 ("mm: speedup mremap on 1GB or larger regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index dacfa9111ab1..b8eed7645cea 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -504,7 +504,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
 
 			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
-					   old_pud, new_pud, need_rmap_locks))
+					   old_pud, new_pud, true))
 				continue;
 		}
 
@@ -531,7 +531,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 			 * moving at the PMD level if possible.
 			 */
 			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
-					   old_pmd, new_pmd, need_rmap_locks))
+					   old_pmd, new_pmd, true))
 				continue;
 		}
 
-- 
2.31.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox