* [PATCH] mm: free surplus huge pages properly on NUMA systems

From: Andrey Alekhin @ 2025-05-15 19:13 UTC
To: muchun.song; +Cc: osalvador, linux-mm, Andrey Alekhin

== History ==

Wrong values of the huge page counters were detected on Red Hat 9.0
(Linux 5.14) when running the LTP test hugemmap10. Inspection of the
Linux source code showed that the problem is not fixed even in Linux
6.14.

== Problem ==

The free_huge_folio() function does not properly free surplus huge
pages on NUMA systems. free_huge_folio() checks the surplus huge page
counter only on the current node (the node where the folio is
allocated), but gather_surplus_pages() can allocate surplus huge pages
on any node.

The following sequence is possible on a NUMA system:

  n - overall number of huge pages
  f - number of free huge pages
  s - number of surplus huge pages
  huge page counters: before -> after

Process runs on node #1.

                                    node0            node1
1) addr1 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
   before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=0
   after:  [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1

2) echo 1 > /proc/sys/vm/nr_hugepages   (cur_nid=1)
   before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1
   after:  [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1

3) addr2 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
   before: [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1
   after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
   The new surplus huge page is reserved on node0, not on node1. In
   Linux 6.14 this is unlikely but possible and legal.

4) write to the second page (touch)
   before: [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
   after:  [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
   The reserved page is mapped on node1.

5) munmap(addr2)   // 1 huge page is unmapped
   before: [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
   after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=1
   The huge page is freed, but it is not freed as a surplus page. The
   huge page counters in the system are now [nr_hugepages=2
   free_huge_pages=2 surplus_hugepages=1], but they must be
   [nr_hugepages=1 free_huge_pages=1 surplus_hugepages=0].

== Solution ==

Check the huge page counters on all available nodes when a page is
freed in free_huge_folio(). This check guarantees that surplus huge
pages are always freed correctly if any are present in the system.
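For reference, the sequence can be reproduced from userspace along the
following lines. This is only a minimal sketch of the scenario above,
not the LTP test itself: it assumes 2 MiB huge pages, at least two
online NUMA nodes and root privileges, and uses an anonymous
MAP_HUGETLB mapping instead of a hugetlbfs file; error handling is
omitted.

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPS (2UL << 20)	/* assumed huge page size: 2 MiB */

static void set_nr_hugepages(const char *val)
{
	int fd = open("/proc/sys/vm/nr_hugepages", O_WRONLY);

	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

int main(void)
{
	char *addr1, *addr2;

	set_nr_hugepages("3");	/* pool assumed to end up split across node0/node1 */

	/* 1) reserve one shared huge page */
	addr1 = mmap(NULL, HPS, PROT_READ | PROT_WRITE,
		     MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	/* 2) shrink the persistent pool below what is already reserved */
	set_nr_hugepages("1");

	/* 3) reserve a second huge page; it has to come from surplus */
	addr2 = mmap(NULL, HPS, PROT_READ | PROT_WRITE,
		     MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	/* 4) fault the second page in on the current node */
	addr2[0] = 1;

	/* 5) unmap it; the surplus page should be freed and un-accounted */
	munmap(addr2, HPS);

	/* compare HugePages_Total, HugePages_Free and HugePages_Surp here */
	munmap(addr1, HPS);
	return 0;
}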
Signed-off-by: Andrey Alekhin <andrei.aleohin@gmail.com>

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6ea1be71aa42..2d38d12f4943 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1822,6 +1822,23 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+	struct mempolicy *mpol = get_task_policy(current);
+
+	/*
+	 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+	 * (from policy_nodemask) specifically for hugetlb case
+	 */
+	if (mpol->mode == MPOL_BIND &&
+	    (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+	     cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+		return &mpol->nodes;
+#endif
+	return NULL;
+}
+
 void free_huge_folio(struct folio *folio)
 {
 	/*
@@ -1833,6 +1850,8 @@ void free_huge_folio(struct folio *folio)
 	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
 	bool restore_reserve;
 	unsigned long flags;
+	int node;
+	nodemask_t *mbind_nodemask, alloc_nodemask;
 
 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
 	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
@@ -1883,6 +1902,25 @@ void free_huge_folio(struct folio *folio)
 		remove_hugetlb_folio(h, folio, true);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
 		update_and_free_hugetlb_folio(h, folio, true);
+	} else if (h->surplus_huge_pages) {
+		mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
+		if (mbind_nodemask)
+			nodes_and(alloc_nodemask, *mbind_nodemask,
+				  cpuset_current_mems_allowed);
+		else
+			alloc_nodemask = cpuset_current_mems_allowed;
+
+		for_each_node_mask(node, alloc_nodemask) {
+			if (h->surplus_huge_pages_node[node]) {
+				h->surplus_huge_pages_node[node]--;
+				h->surplus_huge_pages--;
+				break;
+			}
+		}
+
+		remove_hugetlb_folio(h, folio, false);
+		spin_unlock_irqrestore(&hugetlb_lock, flags);
+		update_and_free_hugetlb_folio(h, folio, true);
 	} else {
 		arch_clear_hugetlb_flags(folio);
 		enqueue_hugetlb_folio(h, folio);
@@ -2389,23 +2427,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
-static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
-{
-#ifdef CONFIG_NUMA
-	struct mempolicy *mpol = get_task_policy(current);
-
-	/*
-	 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
-	 * (from policy_nodemask) specifically for hugetlb case
-	 */
-	if (mpol->mode == MPOL_BIND &&
-	    (apply_policy_zone(mpol, gfp_zone(gfp)) &&
-	     cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
-		return &mpol->nodes;
-#endif
-	return NULL;
-}
-
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
-- 
2.43.0
* Re: [PATCH] mm: free surplus huge pages properly on NUMA systems

From: David Hildenbrand @ 2025-05-16 7:56 UTC
To: Andrey Alekhin, muchun.song; +Cc: osalvador, linux-mm

On 15.05.25 21:13, Andrey Alekhin wrote:
> == History ==

Nit: The patch subject should start with "mm/hugetlb:".

-- 
Cheers,

David / dhildenb
* Re: [PATCH] mm: free surplus huge pages properly on NUMA systems

From: Oscar Salvador @ 2025-05-20 6:50 UTC
To: Andrey Alekhin; +Cc: muchun.song, linux-mm

On Thu, May 15, 2025 at 10:13:27PM +0300, Andrey Alekhin wrote:
> == History ==
>
> Wrong values of the huge page counters were detected on Red Hat 9.0
> (Linux 5.14) when running the LTP test hugemmap10. Inspection of the
> Linux source code showed that the problem is not fixed even in Linux
> 6.14.

Besides code inspection, have you tried this on a current upstream
kernel? We have recently changed the way we deal with surplus huge
pages.

> == Problem ==
>
> The free_huge_folio() function does not properly free surplus huge
> pages on NUMA systems. free_huge_folio() checks the surplus huge page
> counter only on the current node (the node where the folio is
> allocated), but gather_surplus_pages() can allocate surplus huge pages
> on any node.
>
> The following sequence is possible on a NUMA system:
>
>   n - overall number of huge pages
>   f - number of free huge pages
>   s - number of surplus huge pages
>   huge page counters: before -> after
>
> Process runs on node #1.
>
>                                     node0            node1
> 1) addr1 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
>    before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=0
>    after:  [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1

I take it that 'r' means reserved?

> void free_huge_folio(struct folio *folio)
> {
> 	/*
> @@ -1833,6 +1850,8 @@ void free_huge_folio(struct folio *folio)
> 	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
> 	bool restore_reserve;
> 	unsigned long flags;
> +	int node;
> +	nodemask_t *mbind_nodemask, alloc_nodemask;
>
> 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> 	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
> @@ -1883,6 +1902,25 @@ void free_huge_folio(struct folio *folio)
> 		remove_hugetlb_folio(h, folio, true);
> 		spin_unlock_irqrestore(&hugetlb_lock, flags);
> 		update_and_free_hugetlb_folio(h, folio, true);
> +	} else if (h->surplus_huge_pages) {
> +		mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
> +		if (mbind_nodemask)
> +			nodes_and(alloc_nodemask, *mbind_nodemask,
> +				  cpuset_current_mems_allowed);
> +		else
> +			alloc_nodemask = cpuset_current_mems_allowed;
> +
> +		for_each_node_mask(node, alloc_nodemask) {
> +			if (h->surplus_huge_pages_node[node]) {
> +				h->surplus_huge_pages_node[node]--;
> +				h->surplus_huge_pages--;
> +				break;
> +			}
> +		}

I am not really happy with this, it feels quite ad-hoc to be honest.

If we really take this approach, and I need to take a closer look, we
should join the two 'else if' branches that handle surplus pages,
because as it stands it is not really obvious what is going on. And we
might need some comments as well.

Let me think about this.

-- 
Oscar Salvador
SUSE Labs
* Re: [PATCH] mm: free surplus huge pages properly on NUMA systems

From: Oscar Salvador @ 2025-05-20 10:26 UTC
To: Andrey Alekhin; +Cc: muchun.song, linux-mm

On Thu, May 15, 2025 at 10:13:27PM +0300, Andrey Alekhin wrote:
> The following sequence is possible on a NUMA system:
>
>   n - overall number of huge pages
>   f - number of free huge pages
>   s - number of surplus huge pages
>   huge page counters: before -> after
>
> Process runs on node #1.
>
>                                     node0            node1
> 1) addr1 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
>    before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=0
>    after:  [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1
>
> 2) echo 1 > /proc/sys/vm/nr_hugepages   (cur_nid=1)
>    before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1
>    after:  [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1
>
> 3) addr2 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
>    before: [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1
>    after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
>    The new surplus huge page is reserved on node0, not on node1. In
>    Linux 6.14 this is unlikely but possible and legal.
>
> 4) write to the second page (touch)
>    before: [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
>    after:  [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
>    The reserved page is mapped on node1.
>
> 5) munmap(addr2)   // 1 huge page is unmapped
>    before: [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
>    after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=1
>    The huge page is freed, but it is not freed as a surplus page. The
>    huge page counters in the system are now [nr_hugepages=2
>    free_huge_pages=2 surplus_hugepages=1], but they must be
>    [nr_hugepages=1 free_huge_pages=1 surplus_hugepages=0].

But surely once you do the munmap for addr1, the stats will be
corrected again, right?

> void free_huge_folio(struct folio *folio)
> {
> 	/*
> @@ -1833,6 +1850,8 @@ void free_huge_folio(struct folio *folio)
> 	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
> 	bool restore_reserve;
> 	unsigned long flags;
> +	int node;
> +	nodemask_t *mbind_nodemask, alloc_nodemask;
>
> 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
> 	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
> @@ -1883,6 +1902,25 @@ void free_huge_folio(struct folio *folio)
> 		remove_hugetlb_folio(h, folio, true);
> 		spin_unlock_irqrestore(&hugetlb_lock, flags);
> 		update_and_free_hugetlb_folio(h, folio, true);
> +	} else if (h->surplus_huge_pages) {
> +		mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
> +		if (mbind_nodemask)
> +			nodes_and(alloc_nodemask, *mbind_nodemask,
> +				  cpuset_current_mems_allowed);
> +		else
> +			alloc_nodemask = cpuset_current_mems_allowed;
> +
> +		for_each_node_mask(node, alloc_nodemask) {
> +			if (h->surplus_huge_pages_node[node]) {
> +				h->surplus_huge_pages_node[node]--;
> +				h->surplus_huge_pages--;
> +				break;
> +			}
> +		}

And I am not convinced about this one. Apart from the fact that
free_huge_folio() can be called from a workqueue, why would we need to
do this dance? What if the node is not in the policy anymore? What
happens to its counters?

I have to think about this some more, but I am not really convinced we
need this.

-- 
Oscar Salvador
SUSE Labs
* Re: Re: [PATCH] mm: free surplus huge pages properly on NUMA systems

From: Andrey Alekhin @ 2025-05-21 16:28 UTC
To: osalvador; +Cc: muchun.song, linux-mm, Andrey Alekhin

On Tue, 20 May 2025 03:26:18 -0700, Oscar Salvador wrote:
> On Thu, May 15, 2025 at 10:13:27PM +0300, Andrey Alekhin wrote:
>> The following sequence is possible on a NUMA system:
>>
>>   n - overall number of huge pages
>>   f - number of free huge pages
>>   s - number of surplus huge pages
>>   huge page counters: before -> after
>>
>> Process runs on node #1.
>>
>>                                     node0            node1
>> 1) addr1 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
>>    before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=0
>>    after:  [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1
>>
>> 2) echo 1 > /proc/sys/vm/nr_hugepages   (cur_nid=1)
>>    before: [n=2 f=2 s=0]  [n=1 f=1 s=0]  r=1
>>    after:  [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1
>>
>> 3) addr2 = mmap(MAP_SHARED, ...)   // 1 huge page is mmapped (cur_nid=1)
>>    before: [n=0 f=0 s=0]  [n=1 f=1 s=0]  r=1
>>    after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
>>    The new surplus huge page is reserved on node0, not on node1. In
>>    Linux 6.14 this is unlikely but possible and legal.
>>
>> 4) write to the second page (touch)
>>    before: [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2
>>    after:  [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
>>    The reserved page is mapped on node1.
>>
>> 5) munmap(addr2)   // 1 huge page is unmapped
>>    before: [n=1 f=1 s=1]  [n=1 f=0 s=0]  r=1
>>    after:  [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=1
>>    The huge page is freed, but it is not freed as a surplus page. The
>>    huge page counters in the system are now [nr_hugepages=2
>>    free_huge_pages=2 surplus_hugepages=1], but they must be
>>    [nr_hugepages=1 free_huge_pages=1 surplus_hugepages=0].
>
> But surely once you do the munmap for addr1, the stats will be
> corrected again, right?

Yes, after the munmap for addr1 the stats are corrected again. But the
counters after the munmap for addr2 are wrong. In this example there is
only one excessively allocated huge page in the system between the
munmap of addr2 and the munmap of addr1, but the problem scales and
this number can be much bigger.

A more detailed explanation of the sequence leading to the problem is
given below. The hugemmap10 test from the LTP suite performs this
sequence.

1) mmap of 1 SHARED huge page -> addr1 is returned. 1 free huge page is
   reserved for the vma.

2) The number of huge pages in the system is forcefully decreased to 1.
   Only 1 huge page is left in the system and it is reserved for the
   vma.

3) mmap of 1 additional SHARED huge page -> addr2 is returned. The
   mappings for addr1 and addr2 use similar flags (both shared), and
   the kernel uses the same vma for them. For this vma 1 additional
   huge page is reserved. Because there are no free huge pages in the
   system, this page is allocated and added to the
   h->hugepage_freelists[nid] list by gather_surplus_pages(). The
   surplus huge page counters are adjusted too.

   There is a key difference in the huge page allocation process
   between 5.14 and the current kernel. In the 5.14 kernel
   gather_surplus_pages() starts trying to allocate the new huge page
   on node 0. In the current kernel gather_surplus_pages() starts the
   allocation on the node of the current CPU. This means that on the
   5.14 kernel the huge page is often allocated on another node,
   whereas on the current kernel the huge page is almost always
   allocated on the node of the current CPU.

   As a result 2 pages are reserved for our vma and 2 cases are
   possible:

   - 2 reserved pages in h->hugepage_freelists[1]. This is almost
     always what happens on the current kernel.

            n0               n1
     [n=0 f=0 s=0]  [n=2 f=2 s=1]  r=2

   - 1 reserved page in h->hugepage_freelists[0] and 1 reserved page in
     h->hugepage_freelists[1]. This case often occurs on the 5.14
     kernel, but it is unlikely on the current kernel.

            n0               n1
     [n=1 f=1 s=1]  [n=1 f=1 s=0]  r=2

4) Data is written to addr2 -> a page fault occurs and the huge page is
   mapped. In both cases described above the reserved huge page is
   taken from h->hugepage_freelists[1] (the current node).

5) munmap is called for addr2. Now a huge page needs to be freed in
   accordance with the h->nr_huge_pages and h->surplus_huge_pages
   counters. In our case h->nr_huge_pages = 2 and
   h->surplus_huge_pages = 1. After munmap the counters should be
   h->nr_huge_pages = 1 and h->surplus_huge_pages = 0.

   - In the case of 2 reserved pages in h->hugepage_freelists[1] this
     is indeed what happens, because h->surplus_huge_pages_node[1] = 1
     and the surplus huge page is freed in free_huge_folio(). This is
     the reason why the problem is not easily reproducible on the
     current kernel. But it is not guaranteed that the huge page is
     always allocated on the current node: when allocation on the
     current node fails, the new huge page can be allocated on another
     node. That is the second case.

   - In the case of 1 reserved page in h->hugepage_freelists[0] and 1
     in h->hugepage_freelists[1], h->surplus_huge_pages_node[1] = 0 and
     the surplus huge page is not freed properly. 1 excessively
     allocated huge page remains in the system, and the huge page
     counters have wrong values: h->nr_huge_pages = 2,
     h->surplus_huge_pages = 1.

The potential number of excessively allocated huge pages depends on the
order of unmapping and the amount of free memory on each node.

> And I am not convinced about this one. Apart from the fact that
> free_huge_folio() can be called from a workqueue, why would we need to
> do this dance?

In my opinion it does not matter from where free_huge_folio() is
called. We need to obey the restriction on the number of huge pages in
the system: all huge pages allocated on top of this number should be
considered surplus and freed when doing munmap.

>> void free_huge_folio(struct folio *folio)
>> {
>> 	/*
>> @@ -1833,6 +1850,8 @@ void free_huge_folio(struct folio *folio)
>> 	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
>> 	bool restore_reserve;
>> 	unsigned long flags;
>> +	int node;
>> +	nodemask_t *mbind_nodemask, alloc_nodemask;
>>
>> 	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
>> 	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
>> @@ -1883,6 +1902,25 @@ void free_huge_folio(struct folio *folio)
>> 		remove_hugetlb_folio(h, folio, true);
>> 		spin_unlock_irqrestore(&hugetlb_lock, flags);
>> 		update_and_free_hugetlb_folio(h, folio, true);
>> +	} else if (h->surplus_huge_pages) {
>> +		mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
>> +		if (mbind_nodemask)
>> +			nodes_and(alloc_nodemask, *mbind_nodemask,
>> +				  cpuset_current_mems_allowed);
>> +		else
>> +			alloc_nodemask = cpuset_current_mems_allowed;
>> +
>> +		for_each_node_mask(node, alloc_nodemask) {
>> +			if (h->surplus_huge_pages_node[node]) {
>> +				h->surplus_huge_pages_node[node]--;
>> +				h->surplus_huge_pages--;
>> +				break;
>> +			}
>> +		}
>
> What if the node is not in the policy anymore? What happens to its
> counters?

There is a problem in my patch here. I followed the mempolicy of the
current task when calculating the mask of nodes, similarly to the
allocation path in gather_surplus_pages(). But a change of the
mempolicy, and therefore of the mask of nodes, is possible between the
allocation of a huge page and freeing it; at least it is possible
through the set_mempolicy() system call. Such a change does not make
NUMA nodes unavailable. I suppose that all available nodes should be
scanned instead:

void free_huge_folio(struct folio *folio)
{
	/*
@@ -1833,6 +1850,8 @@ void free_huge_folio(struct folio *folio)
	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
	bool restore_reserve;
	unsigned long flags;
+	int node;

	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
@@ -1883,6 +1902,25 @@ void free_huge_folio(struct folio *folio)
	remove_hugetlb_folio(h, folio, true);
	spin_unlock_irqrestore(&hugetlb_lock, flags);
	update_and_free_hugetlb_folio(h, folio, true);
+	} else if (h->surplus_huge_pages) {
+		for_each_node_mask(node, cpuset_current_mems_allowed) {
+			if (h->surplus_huge_pages_node[node]) {
+				h->surplus_huge_pages_node[node]--;
+				h->surplus_huge_pages--;
+				break;
+			}
+		}

> I have to think about this some more, but I am not really convinced we
> need this.

Ok, that is why I have described the problem in detail above.

-- 
Andrey Alekhin
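The per-node counters this discussion revolves around (in particular
h->surplus_huge_pages_node[]) are exported through sysfs, so the
imbalance described in the thread can also be observed from userspace.
A minimal sketch, assuming 2 MiB huge pages and nodes 0 and 1 (adjust
the paths for other configurations):

#include <stdio.h>

static void dump_node(int node)
{
	static const char *names[] = {
		"nr_hugepages", "free_hugepages", "surplus_hugepages"
	};
	char path[160];
	unsigned long val;
	FILE *f;
	int i;

	printf("node%d:", node);
	for (i = 0; i < 3; i++) {
		snprintf(path, sizeof(path),
			 "/sys/devices/system/node/node%d/hugepages/hugepages-2048kB/%s",
			 node, names[i]);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%lu", &val) == 1)
				printf(" %s=%lu", names[i], val);
			fclose(f);
		}
	}
	printf("\n");
}

int main(void)
{
	dump_node(0);
	dump_node(1);
	return 0;
}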