All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
@ 2026-04-29 10:49 David Hildenbrand (Arm)
  2026-04-29 15:29 ` Lance Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-04-29 10:49 UTC (permalink / raw)
  To: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu,
	Andrew Morton, Lu Baolu, Lance Yang
  Cc: linux-kernel, linux-mm, stable, David Hildenbrand (Arm)

In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
from freeing non-boot page tables through __free_pages() to
pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the page_ptdesc(page)
is wrong. But worse, pagetable_free() calls

	__free_pages(page, compound_order(page));

As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
except for HVO, which doesn't apply here -- we will only free the first
page when freeing a PMD-sized vmemmap page, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing.
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up, let's fix that
while touching it.

Note that we'll try to get rid of that bootmem info handling soon. For
now, we'll handle it similar to free_pagetable(), just avoiding the
ifdef.

Tested-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Cc: stable@vger.kernel.org
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
---
Reproduced and tested with a simple VM with a virtio-mem device,
repeatedly adding and removing memory.

Found by code inspection while working on bootmem_info removal.
---
Changes in v2:
- Don't mess with the altmap with PTEs and add a comment why.
- Simplify "unsigned long nr_pages" handling.
- Link to v1: https://lore.kernel.org/r/20260428-vmemmap-v1-1-b2aa1e6db2c0@kernel.org
---
 arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df2261fa4f98..7e20b22d658b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}

---

base-commit: a2ddbfd1af0f54ea84bf17f0400088815d012e8d

change-id: 20260428-vmemmap-ab4b949aa727

--

Cheers,

David



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-04-29 10:49 [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages David Hildenbrand (Arm)
@ 2026-04-29 15:29 ` Lance Yang
  2026-05-08  9:19 ` David Hildenbrand (Arm)
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Lance Yang @ 2026-04-29 15:29 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Mike Rapoport (Microsoft), Dave Hansen, Borislav Petkov,
	Jason Gunthorpe, Andy Lutomirski, linux-kernel, H. Peter Anvin,
	Andrew Morton, Peter Zijlstra, Lu Baolu, linux-mm, stable, x86,
	Thomas Gleixner, Ingo Molnar



On 2026/4/29 18:49, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.
> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---

Retested. Works as expected :)

Cheers, Lance


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-04-29 10:49 [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages David Hildenbrand (Arm)
  2026-04-29 15:29 ` Lance Yang
@ 2026-05-08  9:19 ` David Hildenbrand (Arm)
  2026-05-08  9:23   ` Peter Zijlstra
  2026-05-27 18:43 ` [tip: x86/mm] x86/mm: Fix " tip-bot2 for David Hildenbrand (Arm)
  2026-05-27 22:51 ` [PATCH v2] x86/mm: fix " Alison Schofield
  3 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-08  9:19 UTC (permalink / raw)
  To: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu,
	Andrew Morton, Lance Yang
  Cc: linux-kernel, linux-mm, stable

On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.
> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---

@x86 maintainers, do you want to take this through your tree or should we merge
this through the MM tree?

I have another MM series coming up that will touch this code (no fixes, though).

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-05-08  9:19 ` David Hildenbrand (Arm)
@ 2026-05-08  9:23   ` Peter Zijlstra
  2026-05-08 10:51     ` David Hildenbrand (Arm)
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2026-05-08  9:23 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Dave Hansen, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Mike Rapoport (Microsoft),
	Jason Gunthorpe, Lu Baolu, Andrew Morton, Lance Yang,
	linux-kernel, linux-mm, stable

On Fri, May 08, 2026 at 11:19:26AM +0200, David Hildenbrand (Arm) wrote:
> On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
> > In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> > from freeing non-boot page tables through __free_pages() to
> > pagetable_free().
> > 
> > However, the function is also called to free vmemmap pages.
> > 
> > Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> > is wrong. But worse, pagetable_free() calls
> > 
> > 	__free_pages(page, compound_order(page));
> > 
> > As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> > except for HVO, which doesn't apply here -- we will only free the first
> > page when freeing a PMD-sized vmemmap page, leaking the other ones.
> > 
> > Fix it by properly decoupling pagetable and vmemmap freeing.
> > free_pagetable() no longer has to mess with SECTION_INFO, as only the
> > vmemmap is marked like that in register_page_bootmem_memmap().
> > 
> > The indentation in remove_pmd_table() is messed up, let's fix that
> > while touching it.
> > 
> > Note that we'll try to get rid of that bootmem info handling soon. For
> > now, we'll handle it similar to free_pagetable(), just avoiding the
> > ifdef.
> > 
> > Tested-by: Lance Yang <lance.yang@linux.dev>
> > Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> > ---
> > Reproduced and tested with a simple VM with a virtio-mem device,
> > repeatedly adding and removing memory.
> > 
> > Found by code inspection while working on bootmem_info removal.
> > ---
> 
> @x86 maintainers, do you want to take this through your tree or should we merge
> this through the MM tree?
> 
> I have another MM series coming up that will touch this code (no fixes, though).

I'm thinking this should go in rather more urgent, yes?

It looks good to me, Dave you want to stick this in x86/urgent?


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-05-08  9:23   ` Peter Zijlstra
@ 2026-05-08 10:51     ` David Hildenbrand (Arm)
  2026-05-22  0:35       ` Andrew Morton
  0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-08 10:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Hansen, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, x86, H. Peter Anvin, Mike Rapoport (Microsoft),
	Jason Gunthorpe, Lu Baolu, Andrew Morton, Lance Yang,
	linux-kernel, linux-mm, stable

On 5/8/26 11:23, Peter Zijlstra wrote:
> On Fri, May 08, 2026 at 11:19:26AM +0200, David Hildenbrand (Arm) wrote:
>> On 4/29/26 12:49, David Hildenbrand (Arm) wrote:
>>> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
>>> from freeing non-boot page tables through __free_pages() to
>>> pagetable_free().
>>>
>>> However, the function is also called to free vmemmap pages.
>>>
>>> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
>>> is wrong. But worse, pagetable_free() calls
>>>
>>> 	__free_pages(page, compound_order(page));
>>>
>>> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
>>> except for HVO, which doesn't apply here -- we will only free the first
>>> page when freeing a PMD-sized vmemmap page, leaking the other ones.
>>>
>>> Fix it by properly decoupling pagetable and vmemmap freeing.
>>> free_pagetable() no longer has to mess with SECTION_INFO, as only the
>>> vmemmap is marked like that in register_page_bootmem_memmap().
>>>
>>> The indentation in remove_pmd_table() is messed up, let's fix that
>>> while touching it.
>>>
>>> Note that we'll try to get rid of that bootmem info handling soon. For
>>> now, we'll handle it similar to free_pagetable(), just avoiding the
>>> ifdef.
>>>
>>> Tested-by: Lance Yang <lance.yang@linux.dev>
>>> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>>> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
>>> ---
>>> Reproduced and tested with a simple VM with a virtio-mem device,
>>> repeatedly adding and removing memory.
>>>
>>> Found by code inspection while working on bootmem_info removal.
>>> ---
>>
>> @x86 maintainers, do you want to take this through your tree or should we merge
>> this through the MM tree?
>>
>> I have another MM series coming up that will touch this code (no fixes, though).
> 
> I'm thinking this should go in rather more urgent, yes?

Yes, please :)

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-05-08 10:51     ` David Hildenbrand (Arm)
@ 2026-05-22  0:35       ` Andrew Morton
  2026-05-22 22:35         ` David Hildenbrand (Arm)
  0 siblings, 1 reply; 10+ messages in thread
From: Andrew Morton @ 2026-05-22  0:35 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Peter Zijlstra, Dave Hansen, Andy Lutomirski, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu, Lance Yang,
	linux-kernel, linux-mm, stable

On Fri, 8 May 2026 12:51:31 +0200 "David Hildenbrand (Arm)" <david@kernel.org> wrote:

> >>> Tested-by: Lance Yang <lance.yang@linux.dev>
> >>> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> >>> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> >>> Cc: stable@vger.kernel.org
> >>> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> >>> ---
> >>> Reproduced and tested with a simple VM with a virtio-mem device,
> >>> repeatedly adding and removing memory.
> >>>
> >>> Found by code inspection while working on bootmem_info removal.
> >>> ---
> >>
> >> @x86 maintainers, do you want to take this through your tree or should we merge
> >> this through the MM tree?
> >>
> >> I have another MM series coming up that will touch this code (no fixes, though).
> > 
> > I'm thinking this should go in rather more urgent, yes?
> 
> Yes, please :)

I'm not seeing this in linux-next so I (re) queued it in mm.git's
mm-hotfixes-unstable queue, for a 7.1-rcX merge.


From: "David Hildenbrand (Arm)" <david@kernel.org>
Subject: x86/mm: fix freeing of PMD-sized vmemmap pages
Date: Wed, 29 Apr 2026 12:49:14 +0200

In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched from
freeing non-boot page tables through __free_pages() to pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the
page_ptdesc(page) is wrong.  But worse, pagetable_free() calls

	__free_pages(page, compound_order(page));

As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
except for HVO, which doesn't apply here -- we will only free the first
page when freeing a PMD-sized vmemmap page, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing. 
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up, let's fix that while
touching it.

Note that we'll try to get rid of that bootmem info handling soon.  For
now, we'll handle it similar to free_pagetable(), just avoiding the ifdef.

Link: https://lore.kernel.org/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
Tested-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baolu Lu <baolu.lu@linux.intel.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86/mm/init_64.c |   40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

--- a/arch/x86/mm/init_64.c~x86-mm-fix-freeing-of-pmd-sized-vmemmap-pages
+++ a/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(str
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(str
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsig
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsig
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}
_


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-05-22  0:35       ` Andrew Morton
@ 2026-05-22 22:35         ` David Hildenbrand (Arm)
  2026-05-27 18:31           ` Dave Hansen
  0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-22 22:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, Dave Hansen, Andy Lutomirski, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu, Lance Yang,
	linux-kernel, linux-mm, stable

On 5/22/26 02:35, Andrew Morton wrote:
> On Fri, 8 May 2026 12:51:31 +0200 "David Hildenbrand (Arm)" <david@kernel.org> wrote:
> 
>>>
>>> I'm thinking this should go in rather more urgent, yes?
>>
>> Yes, please :)
> 
> I'm not seeing this in linux-next so I (re) queued it in mm.git's
> mm-hotfixes-unstable queue, for a 7.1-rcX merge.

Thanks. Dave is aware and didn't get to it yet.

So I'll let him speak up if he wants to let this sit a bit longer here.

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-05-22 22:35         ` David Hildenbrand (Arm)
@ 2026-05-27 18:31           ` Dave Hansen
  0 siblings, 0 replies; 10+ messages in thread
From: Dave Hansen @ 2026-05-27 18:31 UTC (permalink / raw)
  To: David Hildenbrand (Arm), Andrew Morton
  Cc: Peter Zijlstra, Dave Hansen, Andy Lutomirski, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu, Lance Yang,
	linux-kernel, linux-mm, stable

On 5/22/26 15:35, David Hildenbrand (Arm) wrote:
>> I'm not seeing this in linux-next so I (re) queued it in mm.git's
>> mm-hotfixes-unstable queue, for a 7.1-rcX merge.
> Thanks. Dave is aware and didn't get to it yet.
> 
> So I'll let him speak up if he wants to let this sit a bit longer here.

This isn't a new bug and it's getting a bit late in the -rc's. I'll
queue it for the next merge window.

Thanks for the reminder.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [tip: x86/mm] x86/mm: Fix freeing of PMD-sized vmemmap pages
  2026-04-29 10:49 [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages David Hildenbrand (Arm)
  2026-04-29 15:29 ` Lance Yang
  2026-05-08  9:19 ` David Hildenbrand (Arm)
@ 2026-05-27 18:43 ` tip-bot2 for David Hildenbrand (Arm)
  2026-05-27 22:51 ` [PATCH v2] x86/mm: fix " Alison Schofield
  3 siblings, 0 replies; 10+ messages in thread
From: tip-bot2 for David Hildenbrand (Arm) @ 2026-05-27 18:43 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: David Hildenbrand (Arm), Andrew Morton, Dave Hansen,
	Mike Rapoport (Microsoft), Lance Yang, stable, x86, linux-kernel

The following commit has been merged into the x86/mm branch of tip:

Commit-ID:     39406c05f8f150f1685839acd38ffdd69ff92031
Gitweb:        https://git.kernel.org/tip/39406c05f8f150f1685839acd38ffdd69ff92031
Author:        David Hildenbrand (Arm) <david@kernel.org>
AuthorDate:    Wed, 29 Apr 2026 12:49:14 +02:00
Committer:     Dave Hansen <dave.hansen@linux.intel.com>
CommitterDate: Wed, 27 May 2026 11:39:38 -07:00

x86/mm: Fix freeing of PMD-sized vmemmap pages

Commit bf9e4e30f353 ("x86/mm: use pagetable_free()") switched from
freeing non-boot page tables through __free_pages() to
pagetable_free().

However, the function is also called to free vmemmap pages.

Given that vmemmap pages are not page tables, already the page_ptdesc(page)
is wrong. But worse, pagetable_free() calls:

	__free_pages(page, compound_order(page));

Since vmemmap pages are not compound pages (see vmemmap_alloc_block())
-- except for HVO, which doesn't apply here -- only the first page of a
PMD-sized vmemmap page is freed, leaking the other ones.

Fix it by properly decoupling pagetable and vmemmap freeing.
free_pagetable() no longer has to mess with SECTION_INFO, as only the
vmemmap is marked like that in register_page_bootmem_memmap().

The indentation in remove_pmd_table() is messed up. Fix that while
touching it.

Bootmem info handling will soon be fixed up. For now, handle it
similar to free_pagetable(), just avoiding the ifdef.

[ dhansen: changelog munging. More imperative voice ]

Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Lance Yang <lance.yang@linux.dev>
Link: https://lore.kernel.org/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Link: https://patch.msgid.link/20260429-vmemmap-v2-1-8dfcacffd877@kernel.org
Cc: stable@vger.kernel.org
---
 arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df2261f..7e20b22 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 		enum bootmem_type type = bootmem_type(page);
 
-		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
+		if (type == MIX_SECTION_INFO) {
 			while (nr_pages--)
 				put_page_bootmem(page++);
 		} else {
@@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
 	}
 }
 
-static void __meminit free_hugepage_table(struct page *page,
+static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
 		struct vmem_altmap *altmap)
 {
-	if (altmap)
-		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
-	else
-		free_pagetable(page, get_order(PMD_SIZE));
+	unsigned long nr_pages = 1u << order;
+
+	if (altmap) {
+		vmem_altmap_free(altmap, nr_pages);
+	} else if (PageReserved(page)) {
+		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
+		    bootmem_type(page) == SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			free_reserved_pages(page, nr_pages);
+		}
+	} else {
+		__free_pages(page, order);
+	}
 }
 
 static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
@@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 			return;
 
 		if (!direct)
-			free_pagetable(pte_page(*pte), 0);
+			/* We never populate base pages from the altmap. */
+			free_vmemmap_pages(pte_page(*pte), 0, NULL);
 
 		spin_lock(&init_mm.page_table_lock);
 		pte_clear(&init_mm, addr, pte);
@@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE)) {
 				if (!direct)
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
+					free_vmemmap_pages(pmd_page(*pmd),
+							   PMD_ORDER, altmap);
 
 				spin_lock(&init_mm.page_table_lock);
 				pmd_clear(pmd);
 				spin_unlock(&init_mm.page_table_lock);
 				pages++;
 			} else if (vmemmap_pmd_is_unused(addr, next)) {
-					free_hugepage_table(pmd_page(*pmd),
-							    altmap);
-					spin_lock(&init_mm.page_table_lock);
-					pmd_clear(pmd);
-					spin_unlock(&init_mm.page_table_lock);
+				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
+						   altmap);
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
 			}
 			continue;
 		}

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages
  2026-04-29 10:49 [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages David Hildenbrand (Arm)
                   ` (2 preceding siblings ...)
  2026-05-27 18:43 ` [tip: x86/mm] x86/mm: Fix " tip-bot2 for David Hildenbrand (Arm)
@ 2026-05-27 22:51 ` Alison Schofield
  3 siblings, 0 replies; 10+ messages in thread
From: Alison Schofield @ 2026-05-27 22:51 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Dave Hansen, Andy Lutomirski, Peter Zijlstra, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin,
	Mike Rapoport (Microsoft), Jason Gunthorpe, Lu Baolu,
	Andrew Morton, Lance Yang, linux-kernel, linux-mm, stable, nvdimm

On Wed, Apr 29, 2026 at 12:49:14PM +0200, David Hildenbrand (Arm) wrote:
> In commit bf9e4e30f353 ("x86/mm: use pagetable_free()"), we switched
> from freeing non-boot page tables through __free_pages() to
> pagetable_free().
> 
> However, the function is also called to free vmemmap pages.
> 
> Given that vmemmap pages are not page tables, already the page_ptdesc(page)
> is wrong. But worse, pagetable_free() calls
> 
> 	__free_pages(page, compound_order(page));
> 
> As vmemmap pages are not compound pages (see vmemmap_alloc_block()) --
> except for HVO, which doesn't apply here -- we will only free the first
> page when freeing a PMD-sized vmemmap page, leaking the other ones.

Hi David,

Sneaking in here to share with nvdimm/dax folks as this affects their
nfit_test environment usage.

+ nvdimm@lists.linux.dev

NVDIMM, DAX folks,

This fixes a memory leak present since v6.19 that surfaces during DAX
and NVDIMM unit testing, as well as ad-hoc nfit_test usage. If you are
seeing the system gradually run out of memory across repeated test runs
or namespace reconfiguration cycles, this is likely the cause.

In my setup, a VM with 5.4 GiB MemAvailable and a 4 GiB nfit_test
namespace lost about 1.1 GiB of MemAvailable per DAX or NVDIMM test suite
run. The VM OOMs partway through the 4th consecutive run of either. The
number of survivable runs scales roughly with available VM memory.

Symptoms typically begin with "page allocation failure: order 0" messages
from unrelated processes. If a test run is active when memory is
sufficiently depleted, it eventually terminates with an OOM.

I've tested both this posted fix and a revert of the Fixes commit and both
resolve the leak in my setup. If neither is an option, periodic reboot of
the test environment may be needed for longer test sessions.

-- Alison

> 
> Fix it by properly decoupling pagetable and vmemmap freeing.
> free_pagetable() no longer has to mess with SECTION_INFO, as only the
> vmemmap is marked like that in register_page_bootmem_memmap().
> 
> The indentation in remove_pmd_table() is messed up, let's fix that
> while touching it.
> 
> Note that we'll try to get rid of that bootmem info handling soon. For
> now, we'll handle it similar to free_pagetable(), just avoiding the
> ifdef.
> 
> Tested-by: Lance Yang <lance.yang@linux.dev>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Fixes: bf9e4e30f353 ("x86/mm: use pagetable_free()")
> Cc: stable@vger.kernel.org
> Signed-off-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> Reproduced and tested with a simple VM with a virtio-mem device,
> repeatedly adding and removing memory.
> 
> Found by code inspection while working on bootmem_info removal.
> ---
> Changes in v2:
> - Don't mess with the altmap with PTEs and add a comment why.
> - Simplify "unsigned long nr_pages" handling.
> - Link to v1: https://lore.kernel.org/r/20260428-vmemmap-v1-1-b2aa1e6db2c0@kernel.org
> ---
>  arch/x86/mm/init_64.c | 40 ++++++++++++++++++++++++++--------------
>  1 file changed, 26 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index df2261fa4f98..7e20b22d658b 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1014,7 +1014,7 @@ static void __meminit free_pagetable(struct page *page, int order)
>  #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
>  		enum bootmem_type type = bootmem_type(page);
>  
> -		if (type == SECTION_INFO || type == MIX_SECTION_INFO) {
> +		if (type == MIX_SECTION_INFO) {
>  			while (nr_pages--)
>  				put_page_bootmem(page++);
>  		} else {
> @@ -1028,13 +1028,24 @@ static void __meminit free_pagetable(struct page *page, int order)
>  	}
>  }
>  
> -static void __meminit free_hugepage_table(struct page *page,
> +static void __meminit free_vmemmap_pages(struct page *page, unsigned int order,
>  		struct vmem_altmap *altmap)
>  {
> -	if (altmap)
> -		vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
> -	else
> -		free_pagetable(page, get_order(PMD_SIZE));
> +	unsigned long nr_pages = 1u << order;
> +
> +	if (altmap) {
> +		vmem_altmap_free(altmap, nr_pages);
> +	} else if (PageReserved(page)) {
> +		if (IS_ENABLED(CONFIG_HAVE_BOOTMEM_INFO_NODE) &&
> +		    bootmem_type(page) == SECTION_INFO) {
> +			while (nr_pages--)
> +				put_page_bootmem(page++);
> +		} else {
> +			free_reserved_pages(page, nr_pages);
> +		}
> +	} else {
> +		__free_pages(page, order);
> +	}
>  }
>  
>  static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> @@ -1118,7 +1129,8 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
>  			return;
>  
>  		if (!direct)
> -			free_pagetable(pte_page(*pte), 0);
> +			/* We never populate base pages from the altmap. */
> +			free_vmemmap_pages(pte_page(*pte), 0, NULL);
>  
>  		spin_lock(&init_mm.page_table_lock);
>  		pte_clear(&init_mm, addr, pte);
> @@ -1153,19 +1165,19 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
>  			if (IS_ALIGNED(addr, PMD_SIZE) &&
>  			    IS_ALIGNED(next, PMD_SIZE)) {
>  				if (!direct)
> -					free_hugepage_table(pmd_page(*pmd),
> -							    altmap);
> +					free_vmemmap_pages(pmd_page(*pmd),
> +							   PMD_ORDER, altmap);
>  
>  				spin_lock(&init_mm.page_table_lock);
>  				pmd_clear(pmd);
>  				spin_unlock(&init_mm.page_table_lock);
>  				pages++;
>  			} else if (vmemmap_pmd_is_unused(addr, next)) {
> -					free_hugepage_table(pmd_page(*pmd),
> -							    altmap);
> -					spin_lock(&init_mm.page_table_lock);
> -					pmd_clear(pmd);
> -					spin_unlock(&init_mm.page_table_lock);
> +				free_vmemmap_pages(pmd_page(*pmd), PMD_ORDER,
> +						   altmap);
> +				spin_lock(&init_mm.page_table_lock);
> +				pmd_clear(pmd);
> +				spin_unlock(&init_mm.page_table_lock);
>  			}
>  			continue;
>  		}
> 
> ---
> 
> base-commit: a2ddbfd1af0f54ea84bf17f0400088815d012e8d
> 
> change-id: 20260428-vmemmap-ab4b949aa727
> 
> --
> 
> Cheers,
> 
> David
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-05-27 22:51 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-29 10:49 [PATCH v2] x86/mm: fix freeing of PMD-sized vmemmap pages David Hildenbrand (Arm)
2026-04-29 15:29 ` Lance Yang
2026-05-08  9:19 ` David Hildenbrand (Arm)
2026-05-08  9:23   ` Peter Zijlstra
2026-05-08 10:51     ` David Hildenbrand (Arm)
2026-05-22  0:35       ` Andrew Morton
2026-05-22 22:35         ` David Hildenbrand (Arm)
2026-05-27 18:31           ` Dave Hansen
2026-05-27 18:43 ` [tip: x86/mm] x86/mm: Fix " tip-bot2 for David Hildenbrand (Arm)
2026-05-27 22:51 ` [PATCH v2] x86/mm: fix " Alison Schofield

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.