[v1 resend 01/12] mm/zone_device: support large zone device private folios

All of lore.kernel.org
 help / color / mirror / Atom feed

* [v1 resend 01/12] mm/zone_device: support large zone device private folios
  2025-07-03 23:34 [v1 resend 00/12] THP support for zone device page migration Balbir Singh
@ 2025-07-03 23:35 ` Balbir Singh
  2025-07-07  5:28   ` Alistair Popple
  0 siblings, 1 reply; 4+ messages in thread
From: Balbir Singh @ 2025-07-03 23:35 UTC (permalink / raw)
  To: linux-mm
  Cc: akpm, linux-kernel, Balbir Singh, Karol Herbst, Lyude Paul,
	Danilo Krummrich, David Airlie, Simona Vetter,
	Jérôme Glisse, Shuah Khan, David Hildenbrand,
	Barry Song, Baolin Wang, Ryan Roberts, Matthew Wilcox, Peter Xu,
	Zi Yan, Kefeng Wang, Jane Chu, Alistair Popple, Donet Tom

Add routines to support allocation of large order zone device folios
and helper functions for zone device folios, to check if a folio is
device private and helpers for setting zone device data.

When large folios are used, the existing page_free() callback in
pgmap is called when the folio is freed, this is true for both
PAGE_SIZE and higher order pages.

Cc: Karol Herbst <kherbst@redhat.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: "Jérôme Glisse" <jglisse@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Donet Tom <donettom@linux.ibm.com>

Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
 include/linux/memremap.h | 22 +++++++++++++++++-
 mm/memremap.c            | 50 +++++++++++++++++++++++++++++-----------
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 4aa151914eab..11d586dd8ef1 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -169,6 +169,18 @@ static inline bool folio_is_device_private(const struct folio *folio)
 	return is_device_private_page(&folio->page);
 }
 
+static inline void *folio_zone_device_data(const struct folio *folio)
+{
+	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
+	return folio->page.zone_device_data;
+}
+
+static inline void folio_set_zone_device_data(struct folio *folio, void *data)
+{
+	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
+	folio->page.zone_device_data = data;
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
@@ -199,7 +211,7 @@ static inline bool folio_is_fsdax(const struct folio *folio)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void zone_device_page_init(struct page *page);
+void init_zone_device_folio(struct folio *folio, unsigned int order);
 void *memremap_pages(struct dev_pagemap *pgmap, int nid);
 void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
@@ -209,6 +221,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
 
 unsigned long memremap_compat_align(void);
+
+static inline void zone_device_page_init(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	init_zone_device_folio(folio, 0);
+}
+
 #else
 static inline void *devm_memremap_pages(struct device *dev,
 		struct dev_pagemap *pgmap)
diff --git a/mm/memremap.c b/mm/memremap.c
index b0ce0d8254bd..4085a3893e64 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -427,20 +427,21 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
 void free_zone_device_folio(struct folio *folio)
 {
 	struct dev_pagemap *pgmap = folio->pgmap;
+	unsigned int nr = folio_nr_pages(folio);
+	int i;
+	bool anon = folio_test_anon(folio);
+	struct page *page = folio_page(folio, 0);
 
 	if (WARN_ON_ONCE(!pgmap))
 		return;
 
 	mem_cgroup_uncharge(folio);
 
-	/*
-	 * Note: we don't expect anonymous compound pages yet. Once supported
-	 * and we could PTE-map them similar to THP, we'd have to clear
-	 * PG_anon_exclusive on all tail pages.
-	 */
-	if (folio_test_anon(folio)) {
-		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-		__ClearPageAnonExclusive(folio_page(folio, 0));
+	WARN_ON_ONCE(folio_test_large(folio) && !anon);
+
+	for (i = 0; i < nr; i++) {
+		if (anon)
+			__ClearPageAnonExclusive(folio_page(folio, i));
 	}
 
 	/*
@@ -464,10 +465,19 @@ void free_zone_device_folio(struct folio *folio)
 
 	switch (pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
+		if (folio_test_large(folio)) {
+			folio_unqueue_deferred_split(folio);
+
+			percpu_ref_put_many(&folio->pgmap->ref, nr - 1);
+		}
+		pgmap->ops->page_free(page);
+		put_dev_pagemap(pgmap);
+		page->mapping = NULL;
+		break;
 	case MEMORY_DEVICE_COHERENT:
 		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
 			break;
-		pgmap->ops->page_free(folio_page(folio, 0));
+		pgmap->ops->page_free(page);
 		put_dev_pagemap(pgmap);
 		break;
 
@@ -491,14 +501,28 @@ void free_zone_device_folio(struct folio *folio)
 	}
 }
 
-void zone_device_page_init(struct page *page)
+void init_zone_device_folio(struct folio *folio, unsigned int order)
 {
+	struct page *page = folio_page(folio, 0);
+
+	VM_BUG_ON(order > MAX_ORDER_NR_PAGES);
+
+	WARN_ON_ONCE(order && order != HPAGE_PMD_ORDER);
+
 	/*
 	 * Drivers shouldn't be allocating pages after calling
 	 * memunmap_pages().
 	 */
-	WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
-	set_page_count(page, 1);
+	WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order));
+	folio_set_count(folio, 1);
 	lock_page(page);
+
+	/*
+	 * Only PMD level migration is supported for THP migration
+	 */
+	if (order > 1) {
+		prep_compound_page(page, order);
+		folio_set_large_rmappable(folio);
+	}
 }
-EXPORT_SYMBOL_GPL(zone_device_page_init);
+EXPORT_SYMBOL_GPL(init_zone_device_folio);
-- 
2.49.0



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [v1 resend 01/12] mm/zone_device: support large zone device private folios
@ 2025-07-04 16:48 kernel test robot
  0 siblings, 0 replies; 4+ messages in thread
From: kernel test robot @ 2025-07-04 16:48 UTC (permalink / raw)
  To: oe-kbuild; +Cc: lkp

:::::: 
:::::: Manual check reason: "__compiletime_assert_NNN"
:::::: 

BCC: lkp@intel.com
CC: llvm@lists.linux.dev
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20250703233511.2028395-2-balbirs@nvidia.com>
References: <20250703233511.2028395-2-balbirs@nvidia.com>
TO: Balbir Singh <balbirs@nvidia.com>
TO: linux-mm@kvack.org
CC: akpm@linux-foundation.org
CC: linux-kernel@vger.kernel.org
CC: Balbir Singh <balbirs@nvidia.com>
CC: Karol Herbst <kherbst@redhat.com>
CC: Lyude Paul <lyude@redhat.com>
CC: Danilo Krummrich <dakr@kernel.org>
CC: David Airlie <airlied@gmail.com>
CC: Simona Vetter <simona@ffwll.ch>
CC: "Jérôme Glisse" <jglisse@redhat.com>
CC: Shuah Khan <skhan@linuxfoundation.org>
CC: David Hildenbrand <david@redhat.com>
CC: Barry Song <baohua@kernel.org>
CC: Baolin Wang <baolin.wang@linux.alibaba.com>
CC: Ryan Roberts <ryan.roberts@arm.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Peter Xu <peterx@redhat.com>
CC: Zi Yan <ziy@nvidia.com>
CC: Kefeng Wang <wangkefeng.wang@huawei.com>
CC: Jane Chu <jane.chu@oracle.com>
CC: Alistair Popple <apopple@nvidia.com>
CC: Donet Tom <donettom@linux.ibm.com>

Hi Balbir,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on akpm-mm/mm-nonmm-unstable shuah-kselftest/next shuah-kselftest/fixes linus/master v6.16-rc4 next-20250704]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Balbir-Singh/mm-zone_device-support-large-zone-device-private-folios/20250704-073807
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20250703233511.2028395-2-balbirs%40nvidia.com
patch subject: [v1 resend 01/12] mm/zone_device: support large zone device private folios
:::::: branch date: 17 hours ago
:::::: commit date: 17 hours ago
config: arm64-randconfig-002-20250704 (https://download.01.org/0day-ci/archive/20250705/202507050047.XLLs8aYY-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project 61529d9e36fa86782a2458e6bdeedf7f376ef4b5)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250705/202507050047.XLLs8aYY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/r/202507050047.XLLs8aYY-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/memremap.c:510:33: error: call to '__compiletime_assert_714' declared with 'error' attribute: BUILD_BUG failed
     510 |         WARN_ON_ONCE(order && order != HPAGE_PMD_ORDER);
         |                                        ^
   include/linux/huge_mm.h:114:26: note: expanded from macro 'HPAGE_PMD_ORDER'
     114 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
         |                          ^
   include/linux/huge_mm.h:110:28: note: expanded from macro 'HPAGE_PMD_SHIFT'
     110 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
         |                            ^
   include/linux/build_bug.h:59:21: note: expanded from macro 'BUILD_BUG'
      59 | #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
         |                     ^
   note: (skipping 2 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/compiler_types.h:556:2: note: expanded from macro '_compiletime_assert'
     556 |         __compiletime_assert(condition, msg, prefix, suffix)
         |         ^
   include/linux/compiler_types.h:549:4: note: expanded from macro '__compiletime_assert'
     549 |                         prefix ## suffix();                             \
         |                         ^
   <scratch space>:16:1: note: expanded from here
      16 | __compiletime_assert_714
         | ^
   1 error generated.


vim +510 mm/memremap.c

75e55d8a107edb Christoph Hellwig 2022-02-16  503  
082c3dc9c91f7e Balbir Singh      2025-07-04  504  void init_zone_device_folio(struct folio *folio, unsigned int order)
ef233450898f88 Alistair Popple   2022-09-28  505  {
082c3dc9c91f7e Balbir Singh      2025-07-04  506  	struct page *page = folio_page(folio, 0);
082c3dc9c91f7e Balbir Singh      2025-07-04  507  
082c3dc9c91f7e Balbir Singh      2025-07-04  508  	VM_BUG_ON(order > MAX_ORDER_NR_PAGES);
082c3dc9c91f7e Balbir Singh      2025-07-04  509  
082c3dc9c91f7e Balbir Singh      2025-07-04 @510  	WARN_ON_ONCE(order && order != HPAGE_PMD_ORDER);

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [v1 resend 01/12] mm/zone_device: support large zone device private folios
  2025-07-03 23:35 ` [v1 resend 01/12] mm/zone_device: support large zone device private folios Balbir Singh
@ 2025-07-07  5:28   ` Alistair Popple
  2025-07-08  6:47     ` Balbir Singh
  0 siblings, 1 reply; 4+ messages in thread
From: Alistair Popple @ 2025-07-07  5:28 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, akpm, linux-kernel, Karol Herbst, Lyude Paul,
	Danilo Krummrich, David Airlie, Simona Vetter,
	Jérôme Glisse, Shuah Khan, David Hildenbrand,
	Barry Song, Baolin Wang, Ryan Roberts, Matthew Wilcox, Peter Xu,
	Zi Yan, Kefeng Wang, Jane Chu, Donet Tom

On Fri, Jul 04, 2025 at 09:35:00AM +1000, Balbir Singh wrote:
> Add routines to support allocation of large order zone device folios
> and helper functions for zone device folios, to check if a folio is
> device private and helpers for setting zone device data.
> 
> When large folios are used, the existing page_free() callback in
> pgmap is called when the folio is freed, this is true for both
> PAGE_SIZE and higher order pages.
> 
> Cc: Karol Herbst <kherbst@redhat.com>
> Cc: Lyude Paul <lyude@redhat.com>
> Cc: Danilo Krummrich <dakr@kernel.org>
> Cc: David Airlie <airlied@gmail.com>
> Cc: Simona Vetter <simona@ffwll.ch>
> Cc: "Jérôme Glisse" <jglisse@redhat.com>
> Cc: Shuah Khan <shuah@kernel.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Barry Song <baohua@kernel.org>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
> Cc: Jane Chu <jane.chu@oracle.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Donet Tom <donettom@linux.ibm.com>
> 
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> ---
>  include/linux/memremap.h | 22 +++++++++++++++++-
>  mm/memremap.c            | 50 +++++++++++++++++++++++++++++-----------
>  2 files changed, 58 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 4aa151914eab..11d586dd8ef1 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -169,6 +169,18 @@ static inline bool folio_is_device_private(const struct folio *folio)
>  	return is_device_private_page(&folio->page);
>  }
>  
> +static inline void *folio_zone_device_data(const struct folio *folio)
> +{
> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
> +	return folio->page.zone_device_data;
> +}
> +
> +static inline void folio_set_zone_device_data(struct folio *folio, void *data)
> +{
> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
> +	folio->page.zone_device_data = data;
> +}
> +
>  static inline bool is_pci_p2pdma_page(const struct page *page)
>  {
>  	return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
> @@ -199,7 +211,7 @@ static inline bool folio_is_fsdax(const struct folio *folio)
>  }
>  
>  #ifdef CONFIG_ZONE_DEVICE
> -void zone_device_page_init(struct page *page);
> +void init_zone_device_folio(struct folio *folio, unsigned int order);
>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
>  void memunmap_pages(struct dev_pagemap *pgmap);
>  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
> @@ -209,6 +221,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>  bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
>  
>  unsigned long memremap_compat_align(void);
> +
> +static inline void zone_device_page_init(struct page *page)
> +{
> +	struct folio *folio = page_folio(page);
> +
> +	init_zone_device_folio(folio, 0);

Minor nit, but why not call this zone_device_folio_init() to keep the naming
consistent with zone_device_page_init()?

> +}
> +
>  #else
>  static inline void *devm_memremap_pages(struct device *dev,
>  		struct dev_pagemap *pgmap)
> diff --git a/mm/memremap.c b/mm/memremap.c
> index b0ce0d8254bd..4085a3893e64 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -427,20 +427,21 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  void free_zone_device_folio(struct folio *folio)
>  {
>  	struct dev_pagemap *pgmap = folio->pgmap;
> +	unsigned int nr = folio_nr_pages(folio);
> +	int i;
> +	bool anon = folio_test_anon(folio);
> +	struct page *page = folio_page(folio, 0);
>  
>  	if (WARN_ON_ONCE(!pgmap))
>  		return;
>  
>  	mem_cgroup_uncharge(folio);
>  
> -	/*
> -	 * Note: we don't expect anonymous compound pages yet. Once supported
> -	 * and we could PTE-map them similar to THP, we'd have to clear
> -	 * PG_anon_exclusive on all tail pages.
> -	 */
> -	if (folio_test_anon(folio)) {
> -		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> -		__ClearPageAnonExclusive(folio_page(folio, 0));
> +	WARN_ON_ONCE(folio_test_large(folio) && !anon);
> +
> +	for (i = 0; i < nr; i++) {

The above comment says we should do this for all tail pages, but this appears to
do it for the head page as well. Is there a particular reason for that?

> +		if (anon)
> +			__ClearPageAnonExclusive(folio_page(folio, i));
>  	}
>  
>  	/*
> @@ -464,10 +465,19 @@ void free_zone_device_folio(struct folio *folio)
>  
>  	switch (pgmap->type) {
>  	case MEMORY_DEVICE_PRIVATE:
> +		if (folio_test_large(folio)) {
> +			folio_unqueue_deferred_split(folio);
> +
> +			percpu_ref_put_many(&folio->pgmap->ref, nr - 1);
> +		}
> +		pgmap->ops->page_free(page);
> +		put_dev_pagemap(pgmap);

Why is this needed/added, and where is the associated get_dev_pagemap()? Note
that the whole {get|put}_dev_pagemap() thing is basically unused now. Which
reminds me I should send a patch to remove it.

> +		page->mapping = NULL;
> +		break;
>  	case MEMORY_DEVICE_COHERENT:
>  		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
>  			break;
> -		pgmap->ops->page_free(folio_page(folio, 0));
> +		pgmap->ops->page_free(page);
>  		put_dev_pagemap(pgmap);
>  		break;
>  
> @@ -491,14 +501,28 @@ void free_zone_device_folio(struct folio *folio)
>  	}
>  }
>  
> -void zone_device_page_init(struct page *page)
> +void init_zone_device_folio(struct folio *folio, unsigned int order)

See above for some bike-shedding on the name.

>  {
> +	struct page *page = folio_page(folio, 0);
> +
> +	VM_BUG_ON(order > MAX_ORDER_NR_PAGES);
> +
> +	WARN_ON_ONCE(order && order != HPAGE_PMD_ORDER);
> +
>  	/*
>  	 * Drivers shouldn't be allocating pages after calling
>  	 * memunmap_pages().
>  	 */
> -	WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
> -	set_page_count(page, 1);
> +	WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order));
> +	folio_set_count(folio, 1);
>  	lock_page(page);
> +
> +	/*
> +	 * Only PMD level migration is supported for THP migration
> +	 */
> +	if (order > 1) {
> +		prep_compound_page(page, order);

Shouldn't this happen for order > 0 not 1? What about calling
INIT_LIST_HEAD(&folio->_deferred_list)? Last time I looked prep_compound_page()
didn't do that and I see above you are calling folio_unqueue_deferred_split() so
I assume you need to do this for DEVICE_PRIVATE pages too.

> +		folio_set_large_rmappable(folio);
> +	}
>  }
> -EXPORT_SYMBOL_GPL(zone_device_page_init);
> +EXPORT_SYMBOL_GPL(init_zone_device_folio);
> -- 
> 2.49.0
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [v1 resend 01/12] mm/zone_device: support large zone device private folios
  2025-07-07  5:28   ` Alistair Popple
@ 2025-07-08  6:47     ` Balbir Singh
  0 siblings, 0 replies; 4+ messages in thread
From: Balbir Singh @ 2025-07-08  6:47 UTC (permalink / raw)
  To: Alistair Popple
  Cc: linux-mm, akpm, linux-kernel, Karol Herbst, Lyude Paul,
	Danilo Krummrich, David Airlie, Simona Vetter,
	Jérôme Glisse, Shuah Khan, David Hildenbrand,
	Barry Song, Baolin Wang, Ryan Roberts, Matthew Wilcox, Peter Xu,
	Zi Yan, Kefeng Wang, Jane Chu, Donet Tom

On 7/7/25 15:28, Alistair Popple wrote:
> On Fri, Jul 04, 2025 at 09:35:00AM +1000, Balbir Singh wrote:
>> Add routines to support allocation of large order zone device folios
>> and helper functions for zone device folios, to check if a folio is
>> device private and helpers for setting zone device data.
>>
>> When large folios are used, the existing page_free() callback in
>> pgmap is called when the folio is freed, this is true for both
>> PAGE_SIZE and higher order pages.
>>
>> Cc: Karol Herbst <kherbst@redhat.com>
>> Cc: Lyude Paul <lyude@redhat.com>
>> Cc: Danilo Krummrich <dakr@kernel.org>
>> Cc: David Airlie <airlied@gmail.com>
>> Cc: Simona Vetter <simona@ffwll.ch>
>> Cc: "Jérôme Glisse" <jglisse@redhat.com>
>> Cc: Shuah Khan <shuah@kernel.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Barry Song <baohua@kernel.org>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>> Cc: Matthew Wilcox <willy@infradead.org>
>> Cc: Peter Xu <peterx@redhat.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
>> Cc: Jane Chu <jane.chu@oracle.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: Donet Tom <donettom@linux.ibm.com>
>>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> ---
>>  include/linux/memremap.h | 22 +++++++++++++++++-
>>  mm/memremap.c            | 50 +++++++++++++++++++++++++++++-----------
>>  2 files changed, 58 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index 4aa151914eab..11d586dd8ef1 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -169,6 +169,18 @@ static inline bool folio_is_device_private(const struct folio *folio)
>>  	return is_device_private_page(&folio->page);
>>  }
>>  
>> +static inline void *folio_zone_device_data(const struct folio *folio)
>> +{
>> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
>> +	return folio->page.zone_device_data;
>> +}
>> +
>> +static inline void folio_set_zone_device_data(struct folio *folio, void *data)
>> +{
>> +	VM_BUG_ON_FOLIO(!folio_is_device_private(folio), folio);
>> +	folio->page.zone_device_data = data;
>> +}
>> +
>>  static inline bool is_pci_p2pdma_page(const struct page *page)
>>  {
>>  	return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
>> @@ -199,7 +211,7 @@ static inline bool folio_is_fsdax(const struct folio *folio)
>>  }
>>  
>>  #ifdef CONFIG_ZONE_DEVICE
>> -void zone_device_page_init(struct page *page);
>> +void init_zone_device_folio(struct folio *folio, unsigned int order);
>>  void *memremap_pages(struct dev_pagemap *pgmap, int nid);
>>  void memunmap_pages(struct dev_pagemap *pgmap);
>>  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
>> @@ -209,6 +221,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>>  bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
>>  
>>  unsigned long memremap_compat_align(void);
>> +
>> +static inline void zone_device_page_init(struct page *page)
>> +{
>> +	struct folio *folio = page_folio(page);
>> +
>> +	init_zone_device_folio(folio, 0);
> 
> Minor nit, but why not call this zone_device_folio_init() to keep the naming
> consistent with zone_device_page_init()?
> 

Ack, will do!

>> +}
>> +
>>  #else
>>  static inline void *devm_memremap_pages(struct device *dev,
>>  		struct dev_pagemap *pgmap)
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index b0ce0d8254bd..4085a3893e64 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -427,20 +427,21 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>>  void free_zone_device_folio(struct folio *folio)
>>  {
>>  	struct dev_pagemap *pgmap = folio->pgmap;
>> +	unsigned int nr = folio_nr_pages(folio);
>> +	int i;
>> +	bool anon = folio_test_anon(folio);
>> +	struct page *page = folio_page(folio, 0);
>>  
>>  	if (WARN_ON_ONCE(!pgmap))
>>  		return;
>>  
>>  	mem_cgroup_uncharge(folio);
>>  
>> -	/*
>> -	 * Note: we don't expect anonymous compound pages yet. Once supported
>> -	 * and we could PTE-map them similar to THP, we'd have to clear
>> -	 * PG_anon_exclusive on all tail pages.
>> -	 */
>> -	if (folio_test_anon(folio)) {
>> -		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
>> -		__ClearPageAnonExclusive(folio_page(folio, 0));
>> +	WARN_ON_ONCE(folio_test_large(folio) && !anon);
>> +
>> +	for (i = 0; i < nr; i++) {
> 
> The above comment says we should do this for all tail pages, but this appears to
> do it for the head page as well. Is there a particular reason for that?
> 

The original code clears the head page (when the folio is not large), the only
page. I don't think the head page can be skipped.

>> +		if (anon)
>> +			__ClearPageAnonExclusive(folio_page(folio, i));
>>  	}
>>  
>>  	/*
>> @@ -464,10 +465,19 @@ void free_zone_device_folio(struct folio *folio)
>>  
>>  	switch (pgmap->type) {
>>  	case MEMORY_DEVICE_PRIVATE:
>> +		if (folio_test_large(folio)) {
>> +			folio_unqueue_deferred_split(folio);
>> +
>> +			percpu_ref_put_many(&folio->pgmap->ref, nr - 1);
>> +		}
>> +		pgmap->ops->page_free(page);
>> +		put_dev_pagemap(pgmap);
> 
> Why is this needed/added, and where is the associated get_dev_pagemap()? Note
> that the whole {get|put}_dev_pagemap() thing is basically unused now. Which
> reminds me I should send a patch to remove it.
> 

Thanks, I'll remove these bits

>> +		page->mapping = NULL;
>> +		break;
>>  	case MEMORY_DEVICE_COHERENT:
>>  		if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
>>  			break;
>> -		pgmap->ops->page_free(folio_page(folio, 0));
>> +		pgmap->ops->page_free(page);
>>  		put_dev_pagemap(pgmap);
>>  		break;
>>  
>> @@ -491,14 +501,28 @@ void free_zone_device_folio(struct folio *folio)
>>  	}
>>  }
>>  
>> -void zone_device_page_init(struct page *page)
>> +void init_zone_device_folio(struct folio *folio, unsigned int order)
> 
> See above for some bike-shedding on the name.
> 

Ack

>>  {
>> +	struct page *page = folio_page(folio, 0);
>> +
>> +	VM_BUG_ON(order > MAX_ORDER_NR_PAGES);
>> +
>> +	WARN_ON_ONCE(order && order != HPAGE_PMD_ORDER);
>> +
>>  	/*
>>  	 * Drivers shouldn't be allocating pages after calling
>>  	 * memunmap_pages().
>>  	 */
>> -	WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
>> -	set_page_count(page, 1);
>> +	WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order));
>> +	folio_set_count(folio, 1);
>>  	lock_page(page);
>> +
>> +	/*
>> +	 * Only PMD level migration is supported for THP migration
>> +	 */
>> +	if (order > 1) {
>> +		prep_compound_page(page, order);
> 
> Shouldn't this happen for order > 0 not 1? What about calling
> INIT_LIST_HEAD(&folio->_deferred_list)? Last time I looked prep_compound_page()
> didn't do that and I see above you are calling folio_unqueue_deferred_split() so
> I assume you need to do this for DEVICE_PRIVATE pages too.
> 

order == 1 has no deferred_list. prep_compound_page handles the INIT_LIST_HEAD



>> +		folio_set_large_rmappable(folio);
>> +	}
>>  }
>> -EXPORT_SYMBOL_GPL(zone_device_page_init);
>> +EXPORT_SYMBOL_GPL(init_zone_device_folio);
>> -- 
>> 2.49.0
>>


Thanks for the review
Balbir


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-07-08  6:47 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-04 16:48 [v1 resend 01/12] mm/zone_device: support large zone device private folios kernel test robot
  -- strict thread matches above, loose matches on Subject: below --
2025-07-03 23:34 [v1 resend 00/12] THP support for zone device page migration Balbir Singh
2025-07-03 23:35 ` [v1 resend 01/12] mm/zone_device: support large zone device private folios Balbir Singh
2025-07-07  5:28   ` Alistair Popple
2025-07-08  6:47     ` Balbir Singh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.