[PATCH RFC 0/2] mm: collect the number of anon mTHP

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH RFC 0/2] mm: collect the number of anon mTHP
@ 2024-08-08  1:04 Barry Song
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
  2024-08-08  1:04 ` [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped Barry Song
  0 siblings, 2 replies; 28+ messages in thread
From: Barry Song @ 2024-08-08  1:04 UTC (permalink / raw)
  To: akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ryan.roberts,
	ioworker0, baolin.wang, ziy, hanchuanhua, Barry Song

From: Barry Song <v-songbaohua@oppo.com>

Knowing the number of anon mTHPs in the system is crucial for performance
analysis. It helps in understanding the ratio and distribution of
mTHPs versus small folios throughout the system.

Additionally, partial unmapping by userspace can lead to significant waste
of mTHPs over time and increase memory reclamation pressure. We need this
information for comprehensive system tuning.

Barry Song (2):
  mm: collect the number of anon large folios
  mm: collect the number of anon large folios partially unmapped

 Documentation/admin-guide/mm/transhuge.rst | 10 ++++++++++
 include/linux/huge_mm.h                    | 16 ++++++++++++++--
 mm/huge_memory.c                           |  8 ++++++++
 mm/rmap.c                                  |  3 +++
 4 files changed, 35 insertions(+), 2 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  1:04 [PATCH RFC 0/2] mm: collect the number of anon mTHP Barry Song
@ 2024-08-08  1:04 ` Barry Song
  2024-08-08  7:08   ` Barry Song
                     ` (3 more replies)
  2024-08-08  1:04 ` [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped Barry Song
  1 sibling, 4 replies; 28+ messages in thread
From: Barry Song @ 2024-08-08  1:04 UTC (permalink / raw)
  To: akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ryan.roberts,
	ioworker0, baolin.wang, ziy, hanchuanhua, Barry Song

From: Barry Song <v-songbaohua@oppo.com>

When a new anonymous mTHP is added to the rmap, we increase the count.
We reduce the count whenever an mTHP is completely unmapped.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 Documentation/admin-guide/mm/transhuge.rst |  5 +++++
 include/linux/huge_mm.h                    | 15 +++++++++++++--
 mm/huge_memory.c                           |  2 ++
 mm/rmap.c                                  |  3 +++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 058485daf186..715f181543f6 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -527,6 +527,11 @@ split_deferred
         it would free up some memory. Pages on split queue are going to
         be split under memory pressure, if splitting is possible.
 
+anon_num
+       the number of anon huge pages we have in the whole system.
+       These huge pages could be still entirely mapped and have partially
+       unmapped and unused subpages.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..294c348fe3cc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -281,6 +281,7 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT,
 	MTHP_STAT_SPLIT_FAILED,
 	MTHP_STAT_SPLIT_DEFERRED,
+	MTHP_STAT_NR_ANON,
 	__MTHP_STAT_COUNT
 };
 
@@ -291,14 +292,24 @@ struct mthp_stat {
 #ifdef CONFIG_SYSFS
 DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
 
-static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
 {
 	if (order <= 0 || order > PMD_ORDER)
 		return;
 
-	this_cpu_inc(mthp_stats.stats[order][item]);
+	this_cpu_add(mthp_stats.stats[order][item], delta);
+}
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	mod_mthp_stat(order, item, 1);
 }
+
 #else
+static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
+{
+}
+
 static inline void count_mthp_stat(int order, enum mthp_stat_item item)
 {
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 697fcf89f975..b6bc2a3791e3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
 DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
+DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
 
 static struct attribute *stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
 	&split_attr.attr,
 	&split_failed_attr.attr,
 	&split_deferred_attr.attr,
+	&anon_num_attr.attr,
 	NULL,
 };
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 901950200957..2b722f26224c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	}
 
 	__folio_mod_stat(folio, nr, nr_pmdmapped);
+	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
 }
 
 static __always_inline void __folio_add_file_rmap(struct folio *folio,
@@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 	    list_empty(&folio->_deferred_list))
 		deferred_split_folio(folio);
 	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
+	if (folio_test_anon(folio) && !atomic_read(mapped))
+		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
 
 	/*
 	 * It would be tidy to reset folio_test_anon mapping when fully
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped
  2024-08-08  1:04 [PATCH RFC 0/2] mm: collect the number of anon mTHP Barry Song
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
@ 2024-08-08  1:04 ` Barry Song
  2024-08-09  8:23   ` Ryan Roberts
  1 sibling, 1 reply; 28+ messages in thread
From: Barry Song @ 2024-08-08  1:04 UTC (permalink / raw)
  To: akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ryan.roberts,
	ioworker0, baolin.wang, ziy, hanchuanhua, Barry Song

From: Barry Song <v-songbaohua@oppo.com>

When an mTHP is added to the deferred_list, its partial pages
are unused, leading to wasted memory and potentially increasing
memory reclamation pressure. Tracking this number indicates
the extent to which userspace is partially unmapping mTHPs.

Detailing the specifics of how unmapping occurs is quite difficult
and not that useful, so we adopt a simple approach: each time an
mTHP enters the deferred_list, we increment the count by 1; whenever
it leaves for any reason, we decrement the count by 1.

Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 5 +++++
 include/linux/huge_mm.h                    | 1 +
 mm/huge_memory.c                           | 6 ++++++
 3 files changed, 12 insertions(+)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 715f181543f6..5028d61cbe0c 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -532,6 +532,11 @@ anon_num
        These huge pages could be still entirely mapped and have partially
        unmapped and unused subpages.
 
+anon_num_partial_unused
+       the number of anon huge pages which have been partially unmapped
+       we have in the whole system. These unmapped subpages are also
+       unused and temporarily wasting memory.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 294c348fe3cc..4b27a9797150 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -282,6 +282,7 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_FAILED,
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
+	MTHP_STAT_NR_ANON_SPLIT_DEFERRED,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6bc2a3791e3..6083144f9fa0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -579,6 +579,7 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
 DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
+DEFINE_MTHP_STAT_ATTR(anon_num_partial_unused, MTHP_STAT_NR_ANON_SPLIT_DEFERRED);
 
 static struct attribute *stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -593,6 +594,7 @@ static struct attribute *stats_attrs[] = {
 	&split_failed_attr.attr,
 	&split_deferred_attr.attr,
 	&anon_num_attr.attr,
+	&anon_num_partial_unused_attr.attr,
 	NULL,
 };
 
@@ -3229,6 +3231,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		if (folio_order(folio) > 1 &&
 		    !list_empty(&folio->_deferred_list)) {
 			ds_queue->split_queue_len--;
+			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
 			/*
 			 * Reinitialize page_deferred_list after removing the
 			 * page from the split_queue, otherwise a subsequent
@@ -3291,6 +3294,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
 	if (!list_empty(&folio->_deferred_list)) {
 		ds_queue->split_queue_len--;
+		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
 		list_del_init(&folio->_deferred_list);
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -3332,6 +3336,7 @@ void deferred_split_folio(struct folio *folio)
 		if (folio_test_pmd_mappable(folio))
 			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
 		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, 1);
 		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
 		ds_queue->split_queue_len++;
 #ifdef CONFIG_MEMCG
@@ -3379,6 +3384,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 			list_move(&folio->_deferred_list, &list);
 		} else {
 			/* We lost race with folio_put() */
+			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
 			list_del_init(&folio->_deferred_list);
 			ds_queue->split_queue_len--;
 		}
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
@ 2024-08-08  7:08   ` Barry Song
  2024-08-08  8:03     ` David Hildenbrand
  2024-08-09  5:17   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: Barry Song @ 2024-08-08  7:08 UTC (permalink / raw)
  To: 21cnbao
  Cc: akpm, baolin.wang, chrisl, david, hanchuanhua, ioworker0,
	kaleshsingh, kasong, linux-kernel, linux-mm, ryan.roberts,
	v-songbaohua, ziy

On Thu, Aug 8, 2024 at 1:05 PM Barry Song <21cnbao@gmail.com> wrote:
>
> From: Barry Song <v-songbaohua@oppo.com>
>
> When a new anonymous mTHP is added to the rmap, we increase the count.
> We reduce the count whenever an mTHP is completely unmapped.
>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>  include/linux/huge_mm.h                    | 15 +++++++++++++--
>  mm/huge_memory.c                           |  2 ++
>  mm/rmap.c                                  |  3 +++
>  4 files changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 058485daf186..715f181543f6 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -527,6 +527,11 @@ split_deferred
>          it would free up some memory. Pages on split queue are going to
>          be split under memory pressure, if splitting is possible.
>
> +anon_num
> +       the number of anon huge pages we have in the whole system.
> +       These huge pages could be still entirely mapped and have partially
> +       unmapped and unused subpages.
> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..294c348fe3cc 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>         MTHP_STAT_SPLIT,
>         MTHP_STAT_SPLIT_FAILED,
>         MTHP_STAT_SPLIT_DEFERRED,
> +       MTHP_STAT_NR_ANON,
>         __MTHP_STAT_COUNT
>  };
>
> @@ -291,14 +292,24 @@ struct mthp_stat {
>  #ifdef CONFIG_SYSFS
>  DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>
> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>  {
>         if (order <= 0 || order > PMD_ORDER)
>                 return;
>
> -       this_cpu_inc(mthp_stats.stats[order][item]);
> +       this_cpu_add(mthp_stats.stats[order][item], delta);
> +}
> +
> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +{
> +       mod_mthp_stat(order, item, 1);
>  }
> +
>  #else
> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> +{
> +}
> +
>  static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>  {
>  }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 697fcf89f975..b6bc2a3791e3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>  DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>
>  static struct attribute *stats_attrs[] = {
>         &anon_fault_alloc_attr.attr,
> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>         &split_attr.attr,
>         &split_failed_attr.attr,
>         &split_deferred_attr.attr,
> +       &anon_num_attr.attr,
>         NULL,
>  };
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 901950200957..2b722f26224c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>         }
>
>         __folio_mod_stat(folio, nr, nr_pmdmapped);
> +       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>  }
>
>  static __always_inline void __folio_add_file_rmap(struct folio *folio,
> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>             list_empty(&folio->_deferred_list))
>                 deferred_split_folio(folio);
>         __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> +       if (folio_test_anon(folio) && !atomic_read(mapped))

could have a risk here two processes unmap at the same time, so
they both get zero on atomic_read(mapped)? should read the value
of atomic_dec_return() instead to confirm we are the last one
doing unmap?

diff --git a/mm/rmap.c b/mm/rmap.c
index 2b722f26224c..f4a0f343f995 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1530,6 +1530,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 	atomic_t *mapped = &folio->_nr_pages_mapped;
 	int last, nr = 0, nr_pmdmapped = 0;
 	bool partially_mapped = false;
+	bool last_unmapped = false;
 
 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
@@ -1545,6 +1546,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 			last = atomic_add_negative(-1, &page->_mapcount);
 			if (last) {
 				last = atomic_dec_return_relaxed(mapped);
+				if (!last)
+					last_unmapped = true;
 				if (last < ENTIRELY_MAPPED)
 					nr++;
 			}
@@ -1557,6 +1560,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 		last = atomic_add_negative(-1, &folio->_entire_mapcount);
 		if (last) {
 			nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
+			if (nr == 0)
+				last_unmapped = true;
 			if (likely(nr < ENTIRELY_MAPPED)) {
 				nr_pmdmapped = folio_nr_pages(folio);
 				nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
@@ -1583,7 +1588,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 	    list_empty(&folio->_deferred_list))
 		deferred_split_folio(folio);
 	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
-	if (folio_test_anon(folio) && !atomic_read(mapped))
+	if (folio_test_anon(folio) && last_unmapped)
 		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
 
 	/*
 
> +               mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
>
>         /*
>          * It would be tidy to reset folio_test_anon mapping when fully
> --
> 2.34.1
>

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  7:08   ` Barry Song
@ 2024-08-08  8:03     ` David Hildenbrand
  2024-08-08  8:08       ` David Hildenbrand
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-08  8:03 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On 08.08.24 09:08, Barry Song wrote:
> On Thu, Aug 8, 2024 at 1:05 PM Barry Song <21cnbao@gmail.com> wrote:
>>
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> When a new anonymous mTHP is added to the rmap, we increase the count.
>> We reduce the count whenever an mTHP is completely unmapped.
>>
>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> ---
>>   Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>   include/linux/huge_mm.h                    | 15 +++++++++++++--
>>   mm/huge_memory.c                           |  2 ++
>>   mm/rmap.c                                  |  3 +++
>>   4 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 058485daf186..715f181543f6 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -527,6 +527,11 @@ split_deferred
>>           it would free up some memory. Pages on split queue are going to
>>           be split under memory pressure, if splitting is possible.
>>
>> +anon_num
>> +       the number of anon huge pages we have in the whole system.
>> +       These huge pages could be still entirely mapped and have partially
>> +       unmapped and unused subpages.
>> +
>>   As the system ages, allocating huge pages may be expensive as the
>>   system uses memory compaction to copy data around memory to free a
>>   huge page for use. There are some counters in ``/proc/vmstat`` to help
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..294c348fe3cc 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>          MTHP_STAT_SPLIT,
>>          MTHP_STAT_SPLIT_FAILED,
>>          MTHP_STAT_SPLIT_DEFERRED,
>> +       MTHP_STAT_NR_ANON,
>>          __MTHP_STAT_COUNT
>>   };
>>
>> @@ -291,14 +292,24 @@ struct mthp_stat {
>>   #ifdef CONFIG_SYSFS
>>   DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>
>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>   {
>>          if (order <= 0 || order > PMD_ORDER)
>>                  return;
>>
>> -       this_cpu_inc(mthp_stats.stats[order][item]);
>> +       this_cpu_add(mthp_stats.stats[order][item], delta);
>> +}
>> +
>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +{
>> +       mod_mthp_stat(order, item, 1);
>>   }
>> +
>>   #else
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>> +{
>> +}
>> +
>>   static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>   {
>>   }
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 697fcf89f975..b6bc2a3791e3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>   DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>   DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>   DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>>
>>   static struct attribute *stats_attrs[] = {
>>          &anon_fault_alloc_attr.attr,
>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>          &split_attr.attr,
>>          &split_failed_attr.attr,
>>          &split_deferred_attr.attr,
>> +       &anon_num_attr.attr,
>>          NULL,
>>   };
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 901950200957..2b722f26224c 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>          }
>>
>>          __folio_mod_stat(folio, nr, nr_pmdmapped);
>> +       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>   }
>>
>>   static __always_inline void __folio_add_file_rmap(struct folio *folio,
>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>              list_empty(&folio->_deferred_list))
>>                  deferred_split_folio(folio);
>>          __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>> +       if (folio_test_anon(folio) && !atomic_read(mapped))
> 
> could have a risk here two processes unmap at the same time, so
> they both get zero on atomic_read(mapped)? should read the value
> of atomic_dec_return() instead to confirm we are the last one
> doing unmap?

I would appreciate if we leave the rmap out here.

Can't we handle that when actually freeing the folio? folio_test_anon() 
is sticky until freed.

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  8:03     ` David Hildenbrand
@ 2024-08-08  8:08       ` David Hildenbrand
  2024-08-08  8:17         ` David Hildenbrand
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-08  8:08 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On 08.08.24 10:03, David Hildenbrand wrote:
> On 08.08.24 09:08, Barry Song wrote:
>> On Thu, Aug 8, 2024 at 1:05 PM Barry Song <21cnbao@gmail.com> wrote:
>>>
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> When a new anonymous mTHP is added to the rmap, we increase the count.
>>> We reduce the count whenever an mTHP is completely unmapped.
>>>
>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>> ---
>>>    Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>>    include/linux/huge_mm.h                    | 15 +++++++++++++--
>>>    mm/huge_memory.c                           |  2 ++
>>>    mm/rmap.c                                  |  3 +++
>>>    4 files changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>>> index 058485daf186..715f181543f6 100644
>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>> @@ -527,6 +527,11 @@ split_deferred
>>>            it would free up some memory. Pages on split queue are going to
>>>            be split under memory pressure, if splitting is possible.
>>>
>>> +anon_num
>>> +       the number of anon huge pages we have in the whole system.
>>> +       These huge pages could be still entirely mapped and have partially
>>> +       unmapped and unused subpages.
>>> +
>>>    As the system ages, allocating huge pages may be expensive as the
>>>    system uses memory compaction to copy data around memory to free a
>>>    huge page for use. There are some counters in ``/proc/vmstat`` to help
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..294c348fe3cc 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>>           MTHP_STAT_SPLIT,
>>>           MTHP_STAT_SPLIT_FAILED,
>>>           MTHP_STAT_SPLIT_DEFERRED,
>>> +       MTHP_STAT_NR_ANON,
>>>           __MTHP_STAT_COUNT
>>>    };
>>>
>>> @@ -291,14 +292,24 @@ struct mthp_stat {
>>>    #ifdef CONFIG_SYSFS
>>>    DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>>
>>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>>    {
>>>           if (order <= 0 || order > PMD_ORDER)
>>>                   return;
>>>
>>> -       this_cpu_inc(mthp_stats.stats[order][item]);
>>> +       this_cpu_add(mthp_stats.stats[order][item], delta);
>>> +}
>>> +
>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> +       mod_mthp_stat(order, item, 1);
>>>    }
>>> +
>>>    #else
>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>> +{
>>> +}
>>> +
>>>    static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>    {
>>>    }
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 697fcf89f975..b6bc2a3791e3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>>    DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>>    DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>>    DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>>>
>>>    static struct attribute *stats_attrs[] = {
>>>           &anon_fault_alloc_attr.attr,
>>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>>           &split_attr.attr,
>>>           &split_failed_attr.attr,
>>>           &split_deferred_attr.attr,
>>> +       &anon_num_attr.attr,
>>>           NULL,
>>>    };
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 901950200957..2b722f26224c 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>           }
>>>
>>>           __folio_mod_stat(folio, nr, nr_pmdmapped);
>>> +       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>>    }
>>>
>>>    static __always_inline void __folio_add_file_rmap(struct folio *folio,
>>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>>               list_empty(&folio->_deferred_list))
>>>                   deferred_split_folio(folio);
>>>           __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>> +       if (folio_test_anon(folio) && !atomic_read(mapped))
>>
>> could have a risk here two processes unmap at the same time, so
>> they both get zero on atomic_read(mapped)? should read the value
>> of atomic_dec_return() instead to confirm we are the last one
>> doing unmap?
> 
> I would appreciate if we leave the rmap out here.
> 
> Can't we handle that when actually freeing the folio? folio_test_anon()
> is sticky until freed.

To be clearer: we increment the counter when we set a folio anon, which 
should indeed only happen in folio_add_new_anon_rmap(). We'll have to 
ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().

Then, when we free an anon folio we decrement the counter. (hugetlb 
should clear the anon flag when an anon folio gets freed back to its 
allocator -- likely that is already done).

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  8:08       ` David Hildenbrand
@ 2024-08-08  8:17         ` David Hildenbrand
  2024-08-08  9:20           ` Barry Song
  2024-08-09  7:04           ` Barry Song
  0 siblings, 2 replies; 28+ messages in thread
From: David Hildenbrand @ 2024-08-08  8:17 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On 08.08.24 10:08, David Hildenbrand wrote:
> On 08.08.24 10:03, David Hildenbrand wrote:
>> On 08.08.24 09:08, Barry Song wrote:
>>> On Thu, Aug 8, 2024 at 1:05 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>
>>>> From: Barry Song <v-songbaohua@oppo.com>
>>>>
>>>> When a new anonymous mTHP is added to the rmap, we increase the count.
>>>> We reduce the count whenever an mTHP is completely unmapped.
>>>>
>>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>>> ---
>>>>     Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>>>     include/linux/huge_mm.h                    | 15 +++++++++++++--
>>>>     mm/huge_memory.c                           |  2 ++
>>>>     mm/rmap.c                                  |  3 +++
>>>>     4 files changed, 23 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>>>> index 058485daf186..715f181543f6 100644
>>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>>> @@ -527,6 +527,11 @@ split_deferred
>>>>             it would free up some memory. Pages on split queue are going to
>>>>             be split under memory pressure, if splitting is possible.
>>>>
>>>> +anon_num
>>>> +       the number of anon huge pages we have in the whole system.
>>>> +       These huge pages could be still entirely mapped and have partially
>>>> +       unmapped and unused subpages.
>>>> +
>>>>     As the system ages, allocating huge pages may be expensive as the
>>>>     system uses memory compaction to copy data around memory to free a
>>>>     huge page for use. There are some counters in ``/proc/vmstat`` to help
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index e25d9ebfdf89..294c348fe3cc 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>>>            MTHP_STAT_SPLIT,
>>>>            MTHP_STAT_SPLIT_FAILED,
>>>>            MTHP_STAT_SPLIT_DEFERRED,
>>>> +       MTHP_STAT_NR_ANON,
>>>>            __MTHP_STAT_COUNT
>>>>     };
>>>>
>>>> @@ -291,14 +292,24 @@ struct mthp_stat {
>>>>     #ifdef CONFIG_SYSFS
>>>>     DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>>>
>>>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>>>     {
>>>>            if (order <= 0 || order > PMD_ORDER)
>>>>                    return;
>>>>
>>>> -       this_cpu_inc(mthp_stats.stats[order][item]);
>>>> +       this_cpu_add(mthp_stats.stats[order][item], delta);
>>>> +}
>>>> +
>>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>> +{
>>>> +       mod_mthp_stat(order, item, 1);
>>>>     }
>>>> +
>>>>     #else
>>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>>> +{
>>>> +}
>>>> +
>>>>     static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>>     {
>>>>     }
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 697fcf89f975..b6bc2a3791e3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>>>     DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>>>     DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>>>     DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>>>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>>>>
>>>>     static struct attribute *stats_attrs[] = {
>>>>            &anon_fault_alloc_attr.attr,
>>>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>>>            &split_attr.attr,
>>>>            &split_failed_attr.attr,
>>>>            &split_deferred_attr.attr,
>>>> +       &anon_num_attr.attr,
>>>>            NULL,
>>>>     };
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 901950200957..2b722f26224c 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>>            }
>>>>
>>>>            __folio_mod_stat(folio, nr, nr_pmdmapped);
>>>> +       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>>>     }
>>>>
>>>>     static __always_inline void __folio_add_file_rmap(struct folio *folio,
>>>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>>>                list_empty(&folio->_deferred_list))
>>>>                    deferred_split_folio(folio);
>>>>            __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>>> +       if (folio_test_anon(folio) && !atomic_read(mapped))
>>>
>>> could have a risk here two processes unmap at the same time, so
>>> they both get zero on atomic_read(mapped)? should read the value
>>> of atomic_dec_return() instead to confirm we are the last one
>>> doing unmap?
>>
>> I would appreciate if we leave the rmap out here.
>>
>> Can't we handle that when actually freeing the folio? folio_test_anon()
>> is sticky until freed.
> 
> To be clearer: we increment the counter when we set a folio anon, which
> should indeed only happen in folio_add_new_anon_rmap(). We'll have to
> ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
> 
> Then, when we free an anon folio we decrement the counter. (hugetlb
> should clear the anon flag when an anon folio gets freed back to its
> allocator -- likely that is already done).
> 

Sorry that I am talking to myself: I'm wondering if we also have to 
adjust the counter when splitting a large folio to multiple 
smaller-but-still-large folios.

-- 
Cheers,

David / dhildenb


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  8:17         ` David Hildenbrand
@ 2024-08-08  9:20           ` Barry Song
  2024-08-09  7:04           ` Barry Song
  1 sibling, 0 replies; 28+ messages in thread
From: Barry Song @ 2024-08-08  9:20 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On Thu, Aug 8, 2024 at 8:17 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.08.24 10:08, David Hildenbrand wrote:
> > On 08.08.24 10:03, David Hildenbrand wrote:
> >> On 08.08.24 09:08, Barry Song wrote:
> >>> On Thu, Aug 8, 2024 at 1:05 PM Barry Song <21cnbao@gmail.com> wrote:
> >>>>
> >>>> From: Barry Song <v-songbaohua@oppo.com>
> >>>>
> >>>> When a new anonymous mTHP is added to the rmap, we increase the count.
> >>>> We reduce the count whenever an mTHP is completely unmapped.
> >>>>
> >>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >>>> ---
> >>>>     Documentation/admin-guide/mm/transhuge.rst |  5 +++++
> >>>>     include/linux/huge_mm.h                    | 15 +++++++++++++--
> >>>>     mm/huge_memory.c                           |  2 ++
> >>>>     mm/rmap.c                                  |  3 +++
> >>>>     4 files changed, 23 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> >>>> index 058485daf186..715f181543f6 100644
> >>>> --- a/Documentation/admin-guide/mm/transhuge.rst
> >>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
> >>>> @@ -527,6 +527,11 @@ split_deferred
> >>>>             it would free up some memory. Pages on split queue are going to
> >>>>             be split under memory pressure, if splitting is possible.
> >>>>
> >>>> +anon_num
> >>>> +       the number of anon huge pages we have in the whole system.
> >>>> +       These huge pages could be still entirely mapped and have partially
> >>>> +       unmapped and unused subpages.
> >>>> +
> >>>>     As the system ages, allocating huge pages may be expensive as the
> >>>>     system uses memory compaction to copy data around memory to free a
> >>>>     huge page for use. There are some counters in ``/proc/vmstat`` to help
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index e25d9ebfdf89..294c348fe3cc 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
> >>>>            MTHP_STAT_SPLIT,
> >>>>            MTHP_STAT_SPLIT_FAILED,
> >>>>            MTHP_STAT_SPLIT_DEFERRED,
> >>>> +       MTHP_STAT_NR_ANON,
> >>>>            __MTHP_STAT_COUNT
> >>>>     };
> >>>>
> >>>> @@ -291,14 +292,24 @@ struct mthp_stat {
> >>>>     #ifdef CONFIG_SYSFS
> >>>>     DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
> >>>>
> >>>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> >>>>     {
> >>>>            if (order <= 0 || order > PMD_ORDER)
> >>>>                    return;
> >>>>
> >>>> -       this_cpu_inc(mthp_stats.stats[order][item]);
> >>>> +       this_cpu_add(mthp_stats.stats[order][item], delta);
> >>>> +}
> >>>> +
> >>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>>> +{
> >>>> +       mod_mthp_stat(order, item, 1);
> >>>>     }
> >>>> +
> >>>>     #else
> >>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> >>>> +{
> >>>> +}
> >>>> +
> >>>>     static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>>>     {
> >>>>     }
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 697fcf89f975..b6bc2a3791e3 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
> >>>>     DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
> >>>>     DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> >>>>     DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> >>>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
> >>>>
> >>>>     static struct attribute *stats_attrs[] = {
> >>>>            &anon_fault_alloc_attr.attr,
> >>>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
> >>>>            &split_attr.attr,
> >>>>            &split_failed_attr.attr,
> >>>>            &split_deferred_attr.attr,
> >>>> +       &anon_num_attr.attr,
> >>>>            NULL,
> >>>>     };
> >>>>
> >>>> diff --git a/mm/rmap.c b/mm/rmap.c
> >>>> index 901950200957..2b722f26224c 100644
> >>>> --- a/mm/rmap.c
> >>>> +++ b/mm/rmap.c
> >>>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
> >>>>            }
> >>>>
> >>>>            __folio_mod_stat(folio, nr, nr_pmdmapped);
> >>>> +       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
> >>>>     }
> >>>>
> >>>>     static __always_inline void __folio_add_file_rmap(struct folio *folio,
> >>>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >>>>                list_empty(&folio->_deferred_list))
> >>>>                    deferred_split_folio(folio);
> >>>>            __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >>>> +       if (folio_test_anon(folio) && !atomic_read(mapped))
> >>>
> >>> could have a risk here two processes unmap at the same time, so
> >>> they both get zero on atomic_read(mapped)? should read the value
> >>> of atomic_dec_return() instead to confirm we are the last one
> >>> doing unmap?
> >>
> >> I would appreciate if we leave the rmap out here.
> >>
> >> Can't we handle that when actually freeing the folio? folio_test_anon()
> >> is sticky until freed.
> >
> > To be clearer: we increment the counter when we set a folio anon, which
> > should indeed only happen in folio_add_new_anon_rmap(). We'll have to
> > ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
> >
> > Then, when we free an anon folio we decrement the counter. (hugetlb
> > should clear the anon flag when an anon folio gets freed back to its
> > allocator -- likely that is already done).
> >
>
> Sorry that I am talking to myself: I'm wondering if we also have to
> adjust the counter when splitting a large folio to multiple
> smaller-but-still-large folios.

yes, if we don't use remove_rmap. because we could allocate them as
mTHP but free them as nr_pages small folios.

>
> --
> Cheers,
>
> David / dhildenb
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
  2024-08-08  7:08   ` Barry Song
@ 2024-08-09  5:17   ` kernel test robot
  2024-08-09  5:28   ` kernel test robot
  2024-08-09  8:13   ` Ryan Roberts
  3 siblings, 0 replies; 28+ messages in thread
From: kernel test robot @ 2024-08-09  5:17 UTC (permalink / raw)
  To: Barry Song; +Cc: oe-kbuild-all

Hi Barry,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on next-20240807]
[also build test ERROR on v6.11-rc2]
[cannot apply to akpm-mm/mm-everything linus/master v6.11-rc2 v6.11-rc1 v6.10]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Barry-Song/mm-collect-the-number-of-anon-large-folios/20240808-090627
base:   next-20240807
patch link:    https://lore.kernel.org/r/20240808010457.228753-2-21cnbao%40gmail.com
patch subject: [PATCH RFC 1/2] mm: collect the number of anon large folios
config: openrisc-allnoconfig (https://download.01.org/0day-ci/archive/20240809/202408091325.dYV3OVPV-lkp@intel.com/config)
compiler: or1k-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240809/202408091325.dYV3OVPV-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408091325.dYV3OVPV-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/rmap.c: In function 'folio_add_new_anon_rmap':
>> mm/rmap.c:1470:9: error: implicit declaration of function 'mod_mthp_stat'; did you mean 'mod_memcg_state'? [-Wimplicit-function-declaration]
    1470 |         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
         |         ^~~~~~~~~~~~~
         |         mod_memcg_state
>> mm/rmap.c:1470:43: error: 'MTHP_STAT_NR_ANON' undeclared (first use in this function)
    1470 |         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
         |                                           ^~~~~~~~~~~~~~~~~
   mm/rmap.c:1470:43: note: each undeclared identifier is reported only once for each function it appears in
   mm/rmap.c: In function '__folio_remove_rmap':
   mm/rmap.c:1587:51: error: 'MTHP_STAT_NR_ANON' undeclared (first use in this function)
    1587 |                 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
         |                                                   ^~~~~~~~~~~~~~~~~


vim +1470 mm/rmap.c

  1402	
  1403	/**
  1404	 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
  1405	 * @folio:	The folio to add the mapping to.
  1406	 * @vma:	the vm area in which the mapping is added
  1407	 * @address:	the user virtual address mapped
  1408	 * @flags:	The rmap flags
  1409	 *
  1410	 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
  1411	 * This means the inc-and-test can be bypassed.
  1412	 * The folio doesn't necessarily need to be locked while it's exclusive
  1413	 * unless two threads map it concurrently. However, the folio must be
  1414	 * locked if it's shared.
  1415	 *
  1416	 * If the folio is pmd-mappable, it is accounted as a THP.
  1417	 */
  1418	void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
  1419			unsigned long address, rmap_t flags)
  1420	{
  1421		const int nr = folio_nr_pages(folio);
  1422		const bool exclusive = flags & RMAP_EXCLUSIVE;
  1423		int nr_pmdmapped = 0;
  1424	
  1425		VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
  1426		VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
  1427		VM_BUG_ON_VMA(address < vma->vm_start ||
  1428				address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
  1429	
  1430		/*
  1431		 * VM_DROPPABLE mappings don't swap; instead they're just dropped when
  1432		 * under memory pressure.
  1433		 */
  1434		if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
  1435			__folio_set_swapbacked(folio);
  1436		__folio_set_anon(folio, vma, address, exclusive);
  1437	
  1438		if (likely(!folio_test_large(folio))) {
  1439			/* increment count (starts at -1) */
  1440			atomic_set(&folio->_mapcount, 0);
  1441			if (exclusive)
  1442				SetPageAnonExclusive(&folio->page);
  1443		} else if (!folio_test_pmd_mappable(folio)) {
  1444			int i;
  1445	
  1446			for (i = 0; i < nr; i++) {
  1447				struct page *page = folio_page(folio, i);
  1448	
  1449				/* increment count (starts at -1) */
  1450				atomic_set(&page->_mapcount, 0);
  1451				if (exclusive)
  1452					SetPageAnonExclusive(page);
  1453			}
  1454	
  1455			/* increment count (starts at -1) */
  1456			atomic_set(&folio->_large_mapcount, nr - 1);
  1457			atomic_set(&folio->_nr_pages_mapped, nr);
  1458		} else {
  1459			/* increment count (starts at -1) */
  1460			atomic_set(&folio->_entire_mapcount, 0);
  1461			/* increment count (starts at -1) */
  1462			atomic_set(&folio->_large_mapcount, 0);
  1463			atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
  1464			if (exclusive)
  1465				SetPageAnonExclusive(&folio->page);
  1466			nr_pmdmapped = nr;
  1467		}
  1468	
  1469		__folio_mod_stat(folio, nr, nr_pmdmapped);
> 1470		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
  1471	}
  1472	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
  2024-08-08  7:08   ` Barry Song
  2024-08-09  5:17   ` kernel test robot
@ 2024-08-09  5:28   ` kernel test robot
  2024-08-09  8:13   ` Ryan Roberts
  3 siblings, 0 replies; 28+ messages in thread
From: kernel test robot @ 2024-08-09  5:28 UTC (permalink / raw)
  To: Barry Song; +Cc: llvm, oe-kbuild-all

Hi Barry,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on next-20240807]
[also build test ERROR on v6.11-rc2]
[cannot apply to akpm-mm/mm-everything linus/master v6.11-rc2 v6.11-rc1 v6.10]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Barry-Song/mm-collect-the-number-of-anon-large-folios/20240808-090627
base:   next-20240807
patch link:    https://lore.kernel.org/r/20240808010457.228753-2-21cnbao%40gmail.com
patch subject: [PATCH RFC 1/2] mm: collect the number of anon large folios
config: s390-allnoconfig (https://download.01.org/0day-ci/archive/20240809/202408091324.ZqWOIwKo-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project f86594788ce93b696675c94f54016d27a6c21d18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240809/202408091324.ZqWOIwKo-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408091324.ZqWOIwKo-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from mm/rmap.c:56:
   In file included from include/linux/mm.h:2199:
   include/linux/vmstat.h:514:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     514 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   In file included from mm/rmap.c:77:
   include/linux/mm_inline.h:47:41: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      47 |         __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
         |                                    ~~~~~~~~~~~ ^ ~~~
   include/linux/mm_inline.h:49:22: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
      49 |                                 NR_ZONE_LRU_BASE + lru, nr_pages);
         |                                 ~~~~~~~~~~~~~~~~ ^ ~~~
>> mm/rmap.c:1470:2: error: call to undeclared function 'mod_mthp_stat'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1470 |         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
         |         ^
>> mm/rmap.c:1470:36: error: use of undeclared identifier 'MTHP_STAT_NR_ANON'
    1470 |         mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
         |                                           ^
   mm/rmap.c:1587:3: error: call to undeclared function 'mod_mthp_stat'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1587 |                 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
         |                 ^
   mm/rmap.c:1587:37: error: use of undeclared identifier 'MTHP_STAT_NR_ANON'
    1587 |                 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
         |                                                   ^
   3 warnings and 4 errors generated.


vim +/mod_mthp_stat +1470 mm/rmap.c

  1402	
  1403	/**
  1404	 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
  1405	 * @folio:	The folio to add the mapping to.
  1406	 * @vma:	the vm area in which the mapping is added
  1407	 * @address:	the user virtual address mapped
  1408	 * @flags:	The rmap flags
  1409	 *
  1410	 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
  1411	 * This means the inc-and-test can be bypassed.
  1412	 * The folio doesn't necessarily need to be locked while it's exclusive
  1413	 * unless two threads map it concurrently. However, the folio must be
  1414	 * locked if it's shared.
  1415	 *
  1416	 * If the folio is pmd-mappable, it is accounted as a THP.
  1417	 */
  1418	void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
  1419			unsigned long address, rmap_t flags)
  1420	{
  1421		const int nr = folio_nr_pages(folio);
  1422		const bool exclusive = flags & RMAP_EXCLUSIVE;
  1423		int nr_pmdmapped = 0;
  1424	
  1425		VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
  1426		VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
  1427		VM_BUG_ON_VMA(address < vma->vm_start ||
  1428				address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
  1429	
  1430		/*
  1431		 * VM_DROPPABLE mappings don't swap; instead they're just dropped when
  1432		 * under memory pressure.
  1433		 */
  1434		if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
  1435			__folio_set_swapbacked(folio);
  1436		__folio_set_anon(folio, vma, address, exclusive);
  1437	
  1438		if (likely(!folio_test_large(folio))) {
  1439			/* increment count (starts at -1) */
  1440			atomic_set(&folio->_mapcount, 0);
  1441			if (exclusive)
  1442				SetPageAnonExclusive(&folio->page);
  1443		} else if (!folio_test_pmd_mappable(folio)) {
  1444			int i;
  1445	
  1446			for (i = 0; i < nr; i++) {
  1447				struct page *page = folio_page(folio, i);
  1448	
  1449				/* increment count (starts at -1) */
  1450				atomic_set(&page->_mapcount, 0);
  1451				if (exclusive)
  1452					SetPageAnonExclusive(page);
  1453			}
  1454	
  1455			/* increment count (starts at -1) */
  1456			atomic_set(&folio->_large_mapcount, nr - 1);
  1457			atomic_set(&folio->_nr_pages_mapped, nr);
  1458		} else {
  1459			/* increment count (starts at -1) */
  1460			atomic_set(&folio->_entire_mapcount, 0);
  1461			/* increment count (starts at -1) */
  1462			atomic_set(&folio->_large_mapcount, 0);
  1463			atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
  1464			if (exclusive)
  1465				SetPageAnonExclusive(&folio->page);
  1466			nr_pmdmapped = nr;
  1467		}
  1468	
  1469		__folio_mod_stat(folio, nr, nr_pmdmapped);
> 1470		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
  1471	}
  1472	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  8:17         ` David Hildenbrand
  2024-08-08  9:20           ` Barry Song
@ 2024-08-09  7:04           ` Barry Song
  2024-08-09  7:22             ` David Hildenbrand
  1 sibling, 1 reply; 28+ messages in thread
From: Barry Song @ 2024-08-09  7:04 UTC (permalink / raw)
  To: david
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

> >> I would appreciate if we leave the rmap out here.
> >>
> >> Can't we handle that when actually freeing the folio? folio_test_anon()
> >> is sticky until freed.
> >
> > To be clearer: we increment the counter when we set a folio anon, which
> > should indeed only happen in folio_add_new_anon_rmap(). We'll have to
> > ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
> >
> > Then, when we free an anon folio we decrement the counter. (hugetlb
> > should clear the anon flag when an anon folio gets freed back to its
> > allocator -- likely that is already done).
> >
>
> Sorry that I am talking to myself: I'm wondering if we also have to
> adjust the counter when splitting a large folio to multiple
> smaller-but-still-large folios.

Hi David,

The conceptual code is shown below. Does this make more
sense to you? we have a line "mod_mthp_stat(new_order,
MTHP_STAT_NR_ANON, 1 << (order - new_order));"

@@ -3270,8 +3272,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 	/* reset xarray order to new order after split */
 	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
-	struct anon_vma *anon_vma = NULL;
+	bool is_anon = folio_test_anon(folio);
 	struct address_space *mapping = NULL;
+	struct anon_vma *anon_vma = NULL;
 	int order = folio_order(folio);
 	int extra_pins, ret;
 	pgoff_t end;
@@ -3283,7 +3286,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	if (new_order >= folio_order(folio))
 		return -EINVAL;
 
-	if (folio_test_anon(folio)) {
+	if (is_anon) {
 		/* order-1 is not supported for anonymous THP. */
 		if (new_order == 1) {
 			VM_WARN_ONCE(1, "Cannot split to order-1 folio");
@@ -3323,7 +3326,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	if (folio_test_writeback(folio))
 		return -EBUSY;
 
-	if (folio_test_anon(folio)) {
+	if (is_anon) {
 		/*
 		 * The caller does not necessarily hold an mmap_lock that would
 		 * prevent the anon_vma disappearing so we first we take a
@@ -3437,6 +3440,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			}
 		}
 
+		if (is_anon) {
+			mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+			mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
+		}
 		__split_huge_page(page, list, end, new_order);
 		ret = 0;
 	} else {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 408ef3d25cf5..c869d0601614 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1039,6 +1039,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 	bool skip_kasan_poison = should_skip_kasan_poison(page);
 	bool init = want_init_on_free();
 	bool compound = PageCompound(page);
+	bool anon = PageAnon(page);
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -1130,6 +1131,9 @@ __always_inline bool free_pages_prepare(struct page *page,
 
 	debug_pagealloc_unmap_pages(page, 1 << order);
 
+	if (anon && compound)
+		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+
 	return true;
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 8d432051e970..982862cbf5ba 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	}
 
 	__folio_mod_stat(folio, nr, nr_pmdmapped);
+	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
 }
 
 static __always_inline void __folio_add_file_rmap(struct folio *folio,


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  7:04           ` Barry Song
@ 2024-08-09  7:22             ` David Hildenbrand
  2024-08-11  5:20               ` Barry Song
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-09  7:22 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On 09.08.24 09:04, Barry Song wrote:
>>>> I would appreciate if we leave the rmap out here.
>>>>
>>>> Can't we handle that when actually freeing the folio? folio_test_anon()
>>>> is sticky until freed.
>>>
>>> To be clearer: we increment the counter when we set a folio anon, which
>>> should indeed only happen in folio_add_new_anon_rmap(). We'll have to
>>> ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
>>>
>>> Then, when we free an anon folio we decrement the counter. (hugetlb
>>> should clear the anon flag when an anon folio gets freed back to its
>>> allocator -- likely that is already done).
>>>
>>
>> Sorry that I am talking to myself: I'm wondering if we also have to
>> adjust the counter when splitting a large folio to multiple
>> smaller-but-still-large folios.
> 
> Hi David,
> 
> The conceptual code is shown below. Does this make more
> sense to you? we have a line "mod_mthp_stat(new_order,
> MTHP_STAT_NR_ANON, 1 << (order - new_order));"
> 
> @@ -3270,8 +3272,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>   	/* reset xarray order to new order after split */
>   	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> -	struct anon_vma *anon_vma = NULL;
> +	bool is_anon = folio_test_anon(folio);
>   	struct address_space *mapping = NULL;
> +	struct anon_vma *anon_vma = NULL;
>   	int order = folio_order(folio);
>   	int extra_pins, ret;
>   	pgoff_t end;
> @@ -3283,7 +3286,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   	if (new_order >= folio_order(folio))
>   		return -EINVAL;
>   
> -	if (folio_test_anon(folio)) {
> +	if (is_anon) {
>   		/* order-1 is not supported for anonymous THP. */
>   		if (new_order == 1) {
>   			VM_WARN_ONCE(1, "Cannot split to order-1 folio");
> @@ -3323,7 +3326,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   	if (folio_test_writeback(folio))
>   		return -EBUSY;
>   
> -	if (folio_test_anon(folio)) {
> +	if (is_anon) {
>   		/*
>   		 * The caller does not necessarily hold an mmap_lock that would
>   		 * prevent the anon_vma disappearing so we first we take a
> @@ -3437,6 +3440,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>   			}
>   		}
>   
> +		if (is_anon) {
> +			mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> +			mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
> +		}
>   		__split_huge_page(page, list, end, new_order);
>   		ret = 0;
>   	} else {
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..c869d0601614 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1039,6 +1039,7 @@ __always_inline bool free_pages_prepare(struct page *page,
>   	bool skip_kasan_poison = should_skip_kasan_poison(page);
>   	bool init = want_init_on_free();
>   	bool compound = PageCompound(page);
> +	bool anon = PageAnon(page);
>   
>   	VM_BUG_ON_PAGE(PageTail(page), page);
>   
> @@ -1130,6 +1131,9 @@ __always_inline bool free_pages_prepare(struct page *page,
>   
>   	debug_pagealloc_unmap_pages(page, 1 << order);
>   
> +	if (anon && compound)
> +		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> +
>   	return true;

I'd have placed it here, when we are already passed the "PageMappingFlags" check and
shouldn't have any added overhead for most !anon pages IIRC (mostly only anon/ksm pages should
run into that path).

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 408ef3d25cf5..a11b9dd62964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1079,8 +1079,11 @@ __always_inline bool free_pages_prepare(struct page *page,
                         (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                 }
         }
-       if (PageMappingFlags(page))
+       if (PageMappingFlags(page)) {
+               if (PageAnon(page) && compound)
+                       mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
                 page->mapping = NULL;
+       }
         if (is_check_pages_enabled()) {
                 if (free_page_is_bad(page))
                         bad++;

Conceptually LGTM. We account an anon folio as long as it's anon,
even when still GUP-pinned after unmapping it or when temporarily
unmapping+remapping it during migration.

-- 
Cheers,

David / dhildenb



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
                     ` (2 preceding siblings ...)
  2024-08-09  5:28   ` kernel test robot
@ 2024-08-09  8:13   ` Ryan Roberts
  2024-08-09  8:27     ` Ryan Roberts
  2024-08-09  8:39     ` David Hildenbrand
  3 siblings, 2 replies; 28+ messages in thread
From: Ryan Roberts @ 2024-08-09  8:13 UTC (permalink / raw)
  To: Barry Song, akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ioworker0,
	baolin.wang, ziy, hanchuanhua, Barry Song

On 08/08/2024 02:04, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> When a new anonymous mTHP is added to the rmap, we increase the count.
> We reduce the count whenever an mTHP is completely unmapped.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>  include/linux/huge_mm.h                    | 15 +++++++++++++--
>  mm/huge_memory.c                           |  2 ++
>  mm/rmap.c                                  |  3 +++
>  4 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 058485daf186..715f181543f6 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -527,6 +527,11 @@ split_deferred
>          it would free up some memory. Pages on split queue are going to
>          be split under memory pressure, if splitting is possible.
>  
> +anon_num
> +       the number of anon huge pages we have in the whole system.
> +       These huge pages could be still entirely mapped and have partially
> +       unmapped and unused subpages.

nit: "entirely mapped and have partially unmapped and unused subpages" ->
"entirely mapped or have partially unmapped/unused subpages"

> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..294c348fe3cc 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>  	MTHP_STAT_SPLIT,
>  	MTHP_STAT_SPLIT_FAILED,
>  	MTHP_STAT_SPLIT_DEFERRED,
> +	MTHP_STAT_NR_ANON,
>  	__MTHP_STAT_COUNT
>  };
>  
> @@ -291,14 +292,24 @@ struct mthp_stat {
>  #ifdef CONFIG_SYSFS
>  DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>  
> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>  {
>  	if (order <= 0 || order > PMD_ORDER)
>  		return;
>  
> -	this_cpu_inc(mthp_stats.stats[order][item]);
> +	this_cpu_add(mthp_stats.stats[order][item], delta);
> +}
> +
> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> +{
> +	mod_mthp_stat(order, item, 1);
>  }
> +
>  #else
> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> +{
> +}
> +
>  static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>  {
>  }
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 697fcf89f975..b6bc2a3791e3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>  DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>  
>  static struct attribute *stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>  	&split_attr.attr,
>  	&split_failed_attr.attr,
>  	&split_deferred_attr.attr,
> +	&anon_num_attr.attr,
>  	NULL,
>  };
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 901950200957..2b722f26224c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>  	}
>  
>  	__folio_mod_stat(folio, nr, nr_pmdmapped);
> +	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>  }
>  
>  static __always_inline void __folio_add_file_rmap(struct folio *folio,
> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>  	    list_empty(&folio->_deferred_list))
>  		deferred_split_folio(folio);
>  	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
> +	if (folio_test_anon(folio) && !atomic_read(mapped))

Agree that atomic_read() is dodgy here.

Not sure I fully understand why David prefers to do the unaccounting at
free-time though? It feels unbalanced to me to increment when first mapped but
decrement when freed. Surely its safer to either use alloc/free or use first
map/last map?

If using alloc/free isn't there a THP constructor/destructor that prepares the
deferred list? (My memory may be failing me). Could we use that?

> +		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
>  
>  	/*
>  	 * It would be tidy to reset folio_test_anon mapping when fully



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped
  2024-08-08  1:04 ` [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped Barry Song
@ 2024-08-09  8:23   ` Ryan Roberts
  2024-08-09  8:48     ` Barry Song
  0 siblings, 1 reply; 28+ messages in thread
From: Ryan Roberts @ 2024-08-09  8:23 UTC (permalink / raw)
  To: Barry Song, akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ioworker0,
	baolin.wang, ziy, hanchuanhua, Barry Song

On 08/08/2024 02:04, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
> 
> When an mTHP is added to the deferred_list, its partial pages
> are unused, leading to wasted memory and potentially increasing
> memory reclamation pressure. Tracking this number indicates
> the extent to which userspace is partially unmapping mTHPs.
> 
> Detailing the specifics of how unmapping occurs is quite difficult
> and not that useful, so we adopt a simple approach: each time an
> mTHP enters the deferred_list, we increment the count by 1; whenever
> it leaves for any reason, we decrement the count by 1.
> 
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 5 +++++
>  include/linux/huge_mm.h                    | 1 +
>  mm/huge_memory.c                           | 6 ++++++
>  3 files changed, 12 insertions(+)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 715f181543f6..5028d61cbe0c 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -532,6 +532,11 @@ anon_num
>         These huge pages could be still entirely mapped and have partially
>         unmapped and unused subpages.
>  
> +anon_num_partial_unused

Why is the user-exposed name completely different to the internal
(MTHP_STAT_NR_ANON_SPLIT_DEFERRED) name?

> +       the number of anon huge pages which have been partially unmapped
> +       we have in the whole system. These unmapped subpages are also
> +       unused and temporarily wasting memory.
> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 294c348fe3cc..4b27a9797150 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -282,6 +282,7 @@ enum mthp_stat_item {
>  	MTHP_STAT_SPLIT_FAILED,
>  	MTHP_STAT_SPLIT_DEFERRED,
>  	MTHP_STAT_NR_ANON,
> +	MTHP_STAT_NR_ANON_SPLIT_DEFERRED,

So the existing MTHP_STAT_SPLIT_DEFERRED is counting all folios that were ever
put on the list, and the new MTHP_STAT_NR_ANON_SPLIT_DEFERRED is counting the
number of folios that are currently on the list?

In which case, do we need the "ANON" in the name? It's implicit for the existing
split counters that they are anon-only. That would relate it more clearly to the
existing MTHP_STAT_SPLIT_DEFERRED too?

>  	__MTHP_STAT_COUNT
>  };
>  
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b6bc2a3791e3..6083144f9fa0 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -579,6 +579,7 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>  DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
> +DEFINE_MTHP_STAT_ATTR(anon_num_partial_unused, MTHP_STAT_NR_ANON_SPLIT_DEFERRED);
>  
>  static struct attribute *stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -593,6 +594,7 @@ static struct attribute *stats_attrs[] = {
>  	&split_failed_attr.attr,
>  	&split_deferred_attr.attr,
>  	&anon_num_attr.attr,
> +	&anon_num_partial_unused_attr.attr,
>  	NULL,
>  };
>  
> @@ -3229,6 +3231,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>  		if (folio_order(folio) > 1 &&
>  		    !list_empty(&folio->_deferred_list)) {
>  			ds_queue->split_queue_len--;
> +			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
>  			/*
>  			 * Reinitialize page_deferred_list after removing the
>  			 * page from the split_queue, otherwise a subsequent
> @@ -3291,6 +3294,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>  	if (!list_empty(&folio->_deferred_list)) {
>  		ds_queue->split_queue_len--;
> +		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
>  		list_del_init(&folio->_deferred_list);
>  	}
>  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> @@ -3332,6 +3336,7 @@ void deferred_split_folio(struct folio *folio)
>  		if (folio_test_pmd_mappable(folio))
>  			count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>  		count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> +		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, 1);
>  		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>  		ds_queue->split_queue_len++;
>  #ifdef CONFIG_MEMCG
> @@ -3379,6 +3384,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>  			list_move(&folio->_deferred_list, &list);
>  		} else {
>  			/* We lost race with folio_put() */
> +			mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
>  			list_del_init(&folio->_deferred_list);
>  			ds_queue->split_queue_len--;
>  		}



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:13   ` Ryan Roberts
@ 2024-08-09  8:27     ` Ryan Roberts
  2024-08-09  8:40       ` Barry Song
  2024-08-09  8:42       ` David Hildenbrand
  2024-08-09  8:39     ` David Hildenbrand
  1 sibling, 2 replies; 28+ messages in thread
From: Ryan Roberts @ 2024-08-09  8:27 UTC (permalink / raw)
  To: Barry Song, akpm, linux-mm
  Cc: chrisl, david, kaleshsingh, kasong, linux-kernel, ioworker0,
	baolin.wang, ziy, hanchuanhua, Barry Song

On 09/08/2024 09:13, Ryan Roberts wrote:
> On 08/08/2024 02:04, Barry Song wrote:
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> When a new anonymous mTHP is added to the rmap, we increase the count.
>> We reduce the count whenever an mTHP is completely unmapped.
>>
>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> ---
>>  Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>  include/linux/huge_mm.h                    | 15 +++++++++++++--
>>  mm/huge_memory.c                           |  2 ++
>>  mm/rmap.c                                  |  3 +++
>>  4 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 058485daf186..715f181543f6 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -527,6 +527,11 @@ split_deferred
>>          it would free up some memory. Pages on split queue are going to
>>          be split under memory pressure, if splitting is possible.
>>  
>> +anon_num
>> +       the number of anon huge pages we have in the whole system.
>> +       These huge pages could be still entirely mapped and have partially
>> +       unmapped and unused subpages.
> 
> nit: "entirely mapped and have partially unmapped and unused subpages" ->
> "entirely mapped or have partially unmapped/unused subpages"
> 
>> +
>>  As the system ages, allocating huge pages may be expensive as the
>>  system uses memory compaction to copy data around memory to free a
>>  huge page for use. There are some counters in ``/proc/vmstat`` to help
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..294c348fe3cc 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>  	MTHP_STAT_SPLIT,
>>  	MTHP_STAT_SPLIT_FAILED,
>>  	MTHP_STAT_SPLIT_DEFERRED,
>> +	MTHP_STAT_NR_ANON,
>>  	__MTHP_STAT_COUNT
>>  };
>>  
>> @@ -291,14 +292,24 @@ struct mthp_stat {
>>  #ifdef CONFIG_SYSFS
>>  DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>  
>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>  {
>>  	if (order <= 0 || order > PMD_ORDER)
>>  		return;
>>  
>> -	this_cpu_inc(mthp_stats.stats[order][item]);
>> +	this_cpu_add(mthp_stats.stats[order][item], delta);
>> +}
>> +
>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +{
>> +	mod_mthp_stat(order, item, 1);
>>  }
>> +
>>  #else
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>> +{
>> +}
>> +
>>  static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>  {
>>  }
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 697fcf89f975..b6bc2a3791e3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>  DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);

Why are the user-facing and internal names different? Perhaps it would be
clearer to call this nr_anon in sysfs?

>>  
>>  static struct attribute *stats_attrs[] = {
>>  	&anon_fault_alloc_attr.attr,
>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>  	&split_attr.attr,
>>  	&split_failed_attr.attr,
>>  	&split_deferred_attr.attr,
>> +	&anon_num_attr.attr,
>>  	NULL,
>>  };
>>  
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 901950200957..2b722f26224c 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>  	}
>>  
>>  	__folio_mod_stat(folio, nr, nr_pmdmapped);
>> +	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>  }
>>  
>>  static __always_inline void __folio_add_file_rmap(struct folio *folio,
>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>  	    list_empty(&folio->_deferred_list))
>>  		deferred_split_folio(folio);
>>  	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
>> +	if (folio_test_anon(folio) && !atomic_read(mapped))
> 
> Agree that atomic_read() is dodgy here.
> 
> Not sure I fully understand why David prefers to do the unaccounting at
> free-time though? It feels unbalanced to me to increment when first mapped but
> decrement when freed. Surely its safer to either use alloc/free or use first
> map/last map?
> 
> If using alloc/free isn't there a THP constructor/destructor that prepares the
> deferred list? (My memory may be failing me). Could we use that?

Additionally, if we wanted to extend (eventually) to track the number of shmem
and file mthps in additional counters, could we also account using similar folio
free-time hooks? If not, it might be an argument to account in rmap_unmap to be
consistent for all?


> 
>> +		mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
>>  
>>  	/*
>>  	 * It would be tidy to reset folio_test_anon mapping when fully
> 



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:13   ` Ryan Roberts
  2024-08-09  8:27     ` Ryan Roberts
@ 2024-08-09  8:39     ` David Hildenbrand
  2024-08-09  9:00       ` Ryan Roberts
  1 sibling, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-09  8:39 UTC (permalink / raw)
  To: Ryan Roberts, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

On 09.08.24 10:13, Ryan Roberts wrote:
> On 08/08/2024 02:04, Barry Song wrote:
>> From: Barry Song <v-songbaohua@oppo.com>
>>
>> When a new anonymous mTHP is added to the rmap, we increase the count.
>> We reduce the count whenever an mTHP is completely unmapped.
>>
>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>> ---
>>   Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>   include/linux/huge_mm.h                    | 15 +++++++++++++--
>>   mm/huge_memory.c                           |  2 ++
>>   mm/rmap.c                                  |  3 +++
>>   4 files changed, 23 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
>> index 058485daf186..715f181543f6 100644
>> --- a/Documentation/admin-guide/mm/transhuge.rst
>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>> @@ -527,6 +527,11 @@ split_deferred
>>           it would free up some memory. Pages on split queue are going to
>>           be split under memory pressure, if splitting is possible.
>>   
>> +anon_num
>> +       the number of anon huge pages we have in the whole system.
>> +       These huge pages could be still entirely mapped and have partially
>> +       unmapped and unused subpages.
> 
> nit: "entirely mapped and have partially unmapped and unused subpages" ->
> "entirely mapped or have partially unmapped/unused subpages"
> 
>> +
>>   As the system ages, allocating huge pages may be expensive as the
>>   system uses memory compaction to copy data around memory to free a
>>   huge page for use. There are some counters in ``/proc/vmstat`` to help
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..294c348fe3cc 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>   	MTHP_STAT_SPLIT,
>>   	MTHP_STAT_SPLIT_FAILED,
>>   	MTHP_STAT_SPLIT_DEFERRED,
>> +	MTHP_STAT_NR_ANON,
>>   	__MTHP_STAT_COUNT
>>   };
>>   
>> @@ -291,14 +292,24 @@ struct mthp_stat {
>>   #ifdef CONFIG_SYSFS
>>   DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>   
>> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>>   {
>>   	if (order <= 0 || order > PMD_ORDER)
>>   		return;
>>   
>> -	this_cpu_inc(mthp_stats.stats[order][item]);
>> +	this_cpu_add(mthp_stats.stats[order][item], delta);
>> +}
>> +
>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>> +{
>> +	mod_mthp_stat(order, item, 1);
>>   }
>> +
>>   #else
>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
>> +{
>> +}
>> +
>>   static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>   {
>>   }
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 697fcf89f975..b6bc2a3791e3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>   DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>   DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>   DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>>   
>>   static struct attribute *stats_attrs[] = {
>>   	&anon_fault_alloc_attr.attr,
>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>   	&split_attr.attr,
>>   	&split_failed_attr.attr,
>>   	&split_deferred_attr.attr,
>> +	&anon_num_attr.attr,
>>   	NULL,
>>   };
>>   
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 901950200957..2b722f26224c 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>   	}
>>   
>>   	__folio_mod_stat(folio, nr, nr_pmdmapped);
>> +	mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>   }
>>   
>>   static __always_inline void __folio_add_file_rmap(struct folio *folio,
>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>   	    list_empty(&folio->_deferred_list))
>>   		deferred_split_folio(folio);
>>   	__folio_mod_stat(folio, -nr, -nr_pmdmapped);
>> +	if (folio_test_anon(folio) && !atomic_read(mapped))
> 
> Agree that atomic_read() is dodgy here.
> 
> Not sure I fully understand why David prefers to do the unaccounting at
> free-time though? It feels unbalanced to me to increment when first mapped but
> decrement when freed. Surely its safer to either use alloc/free or use first
> map/last map?

Doing it when we set/clear folio->mapping is straight forward.

Anon folios currently come to live when we first map them, and they stay 
that way until we free them.

In the future, we'll have to move that anon handling further out, when 
if have to allocate anon-specific memdesc ahead of time, then, it will 
be clued to that lifetime.

> 
> If using alloc/free isn't there a THP constructor/destructor that prepares the
> deferred list? (My memory may be failing me). Could we use that?

Likely the deconstructor could work as well. Not sure if that is any 
better than the freeing path where folio->mapping currently gets cleared.

The generic constructor certainly won't work right now. That's not where 
the "anon" part comes to live.

Let's take a look how NR_FILE_THPS is handled:

__filemap_add_folio() increments it -- when we set folio->mapping
__filemap_remove_folio() (->filemap_unaccount_folio) decrements it -- 
after which we usually call page_cache_delete() to set folio->mapping = 
NULL;

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:27     ` Ryan Roberts
@ 2024-08-09  8:40       ` Barry Song
  2024-08-09  8:42       ` David Hildenbrand
  1 sibling, 0 replies; 28+ messages in thread
From: Barry Song @ 2024-08-09  8:40 UTC (permalink / raw)
  To: Ryan Roberts
  Cc: akpm, linux-mm, chrisl, david, kaleshsingh, kasong, linux-kernel,
	ioworker0, baolin.wang, ziy, hanchuanhua, Barry Song

On Fri, Aug 9, 2024 at 4:27 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 09/08/2024 09:13, Ryan Roberts wrote:
> > On 08/08/2024 02:04, Barry Song wrote:
> >> From: Barry Song <v-songbaohua@oppo.com>
> >>
> >> When a new anonymous mTHP is added to the rmap, we increase the count.
> >> We reduce the count whenever an mTHP is completely unmapped.
> >>
> >> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> >> ---
> >>  Documentation/admin-guide/mm/transhuge.rst |  5 +++++
> >>  include/linux/huge_mm.h                    | 15 +++++++++++++--
> >>  mm/huge_memory.c                           |  2 ++
> >>  mm/rmap.c                                  |  3 +++
> >>  4 files changed, 23 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> >> index 058485daf186..715f181543f6 100644
> >> --- a/Documentation/admin-guide/mm/transhuge.rst
> >> +++ b/Documentation/admin-guide/mm/transhuge.rst
> >> @@ -527,6 +527,11 @@ split_deferred
> >>          it would free up some memory. Pages on split queue are going to
> >>          be split under memory pressure, if splitting is possible.
> >>
> >> +anon_num
> >> +       the number of anon huge pages we have in the whole system.
> >> +       These huge pages could be still entirely mapped and have partially
> >> +       unmapped and unused subpages.
> >
> > nit: "entirely mapped and have partially unmapped and unused subpages" ->
> > "entirely mapped or have partially unmapped/unused subpages"
> >
> >> +
> >>  As the system ages, allocating huge pages may be expensive as the
> >>  system uses memory compaction to copy data around memory to free a
> >>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index e25d9ebfdf89..294c348fe3cc 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -281,6 +281,7 @@ enum mthp_stat_item {
> >>      MTHP_STAT_SPLIT,
> >>      MTHP_STAT_SPLIT_FAILED,
> >>      MTHP_STAT_SPLIT_DEFERRED,
> >> +    MTHP_STAT_NR_ANON,
> >>      __MTHP_STAT_COUNT
> >>  };
> >>
> >> @@ -291,14 +292,24 @@ struct mthp_stat {
> >>  #ifdef CONFIG_SYSFS
> >>  DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
> >>
> >> -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> >>  {
> >>      if (order <= 0 || order > PMD_ORDER)
> >>              return;
> >>
> >> -    this_cpu_inc(mthp_stats.stats[order][item]);
> >> +    this_cpu_add(mthp_stats.stats[order][item], delta);
> >> +}
> >> +
> >> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >> +{
> >> +    mod_mthp_stat(order, item, 1);
> >>  }
> >> +
> >>  #else
> >> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
> >> +{
> >> +}
> >> +
> >>  static inline void count_mthp_stat(int order, enum mthp_stat_item item)
> >>  {
> >>  }
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 697fcf89f975..b6bc2a3791e3 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
> >>  DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
> >>  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> >>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> >> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>
> Why are the user-facing and internal names different? Perhaps it would be
> clearer to call this nr_anon in sysfs?
>
> >>
> >>  static struct attribute *stats_attrs[] = {
> >>      &anon_fault_alloc_attr.attr,
> >> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
> >>      &split_attr.attr,
> >>      &split_failed_attr.attr,
> >>      &split_deferred_attr.attr,
> >> +    &anon_num_attr.attr,
> >>      NULL,
> >>  };
> >>
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index 901950200957..2b722f26224c 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
> >>      }
> >>
> >>      __folio_mod_stat(folio, nr, nr_pmdmapped);
> >> +    mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
> >>  }
> >>
> >>  static __always_inline void __folio_add_file_rmap(struct folio *folio,
> >> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >>          list_empty(&folio->_deferred_list))
> >>              deferred_split_folio(folio);
> >>      __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >> +    if (folio_test_anon(folio) && !atomic_read(mapped))
> >
> > Agree that atomic_read() is dodgy here.
> >
> > Not sure I fully understand why David prefers to do the unaccounting at
> > free-time though? It feels unbalanced to me to increment when first mapped but
> > decrement when freed. Surely its safer to either use alloc/free or use first
> > map/last map?

As long as we can account for mTHP when clearing the Anon flag for the folio,
we should be safe. It’s challenging to add +1 when allocating a large folio
because we don’t know its intended use—it could be for file, anon, or shmem.

> >
> > If using alloc/free isn't there a THP constructor/destructor that prepares the
> > deferred list? (My memory may be failing me). Could we use that?
>
> Additionally, if we wanted to extend (eventually) to track the number of shmem
> and file mthps in additional counters, could we also account using similar folio
> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
> consistent for all?

I've been struggling quite a bit with rmap. Despite trying various
approaches, I’m
still occasionally seeing a negative mTHP counter after running it some hours
on phones. It seems that rmap is really tricky to handle.  I admit that I have
failed on rmap :-)

On the other hand, for anon folios, we have cases where they are split from
order M to order N. So, we add +1 when a new anon folio is added to rmap
and subtract -1 when we either split it or free it. This approach seems clearer
to me. When we split from order M to order N, we can add 1 << (M - N) for
order N.

>
>
> >
> >> +            mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, -1);
> >>
> >>      /*
> >>       * It would be tidy to reset folio_test_anon mapping when fully

Thanks
Barry


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:27     ` Ryan Roberts
  2024-08-09  8:40       ` Barry Song
@ 2024-08-09  8:42       ` David Hildenbrand
  2024-08-09  8:58         ` David Hildenbrand
  1 sibling, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-09  8:42 UTC (permalink / raw)
  To: Ryan Roberts, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

>> Not sure I fully understand why David prefers to do the unaccounting at
>> free-time though? It feels unbalanced to me to increment when first mapped but
>> decrement when freed. Surely its safer to either use alloc/free or use first
>> map/last map?
>>
>> If using alloc/free isn't there a THP constructor/destructor that prepares the
>> deferred list? (My memory may be failing me). Could we use that?
> 
> Additionally, if we wanted to extend (eventually) to track the number of shmem
> and file mthps in additional counters, could we also account using similar folio
> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
> consistent for all?

Again, see NR_FILE_THPS handling. No rmap over-complication please.

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped
  2024-08-09  8:23   ` Ryan Roberts
@ 2024-08-09  8:48     ` Barry Song
  0 siblings, 0 replies; 28+ messages in thread
From: Barry Song @ 2024-08-09  8:48 UTC (permalink / raw)
  To: Ryan Roberts
  Cc: akpm, linux-mm, chrisl, david, kaleshsingh, kasong, linux-kernel,
	ioworker0, baolin.wang, ziy, hanchuanhua, Barry Song

On Fri, Aug 9, 2024 at 4:23 PM Ryan Roberts <ryan.roberts@arm.com> wrote:
>
> On 08/08/2024 02:04, Barry Song wrote:
> > From: Barry Song <v-songbaohua@oppo.com>
> >
> > When an mTHP is added to the deferred_list, its partial pages
> > are unused, leading to wasted memory and potentially increasing
> > memory reclamation pressure. Tracking this number indicates
> > the extent to which userspace is partially unmapping mTHPs.
> >
> > Detailing the specifics of how unmapping occurs is quite difficult
> > and not that useful, so we adopt a simple approach: each time an
> > mTHP enters the deferred_list, we increment the count by 1; whenever
> > it leaves for any reason, we decrement the count by 1.
> >
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > ---
> >  Documentation/admin-guide/mm/transhuge.rst | 5 +++++
> >  include/linux/huge_mm.h                    | 1 +
> >  mm/huge_memory.c                           | 6 ++++++
> >  3 files changed, 12 insertions(+)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index 715f181543f6..5028d61cbe0c 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -532,6 +532,11 @@ anon_num
> >         These huge pages could be still entirely mapped and have partially
> >         unmapped and unused subpages.
> >
> > +anon_num_partial_unused
>
> Why is the user-exposed name completely different to the internal
> (MTHP_STAT_NR_ANON_SPLIT_DEFERRED) name?

My point is that the user might not even know what a deferred split is;
they are more concerned with whether there's any temporary memory
waste or what the deferred list means from a user perspective.

However, since we've referred to it as SPLIT_DEFERRED in other
sys ABI, I agree with you that we should continue using that term.

>
> > +       the number of anon huge pages which have been partially unmapped
> > +       we have in the whole system. These unmapped subpages are also
> > +       unused and temporarily wasting memory.
> > +
> >  As the system ages, allocating huge pages may be expensive as the
> >  system uses memory compaction to copy data around memory to free a
> >  huge page for use. There are some counters in ``/proc/vmstat`` to help
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 294c348fe3cc..4b27a9797150 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -282,6 +282,7 @@ enum mthp_stat_item {
> >       MTHP_STAT_SPLIT_FAILED,
> >       MTHP_STAT_SPLIT_DEFERRED,
> >       MTHP_STAT_NR_ANON,
> > +     MTHP_STAT_NR_ANON_SPLIT_DEFERRED,
>
> So the existing MTHP_STAT_SPLIT_DEFERRED is counting all folios that were ever
> put on the list, and the new MTHP_STAT_NR_ANON_SPLIT_DEFERRED is counting the
> number of folios that are currently on the list?

Yep.

>
> In which case, do we need the "ANON" in the name? It's implicit for the existing
> split counters that they are anon-only. That would relate it more clearly to the
> existing MTHP_STAT_SPLIT_DEFERRED too?

ack.

>
> >       __MTHP_STAT_COUNT
> >  };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index b6bc2a3791e3..6083144f9fa0 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -579,6 +579,7 @@ DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
> >  DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> >  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> >  DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
> > +DEFINE_MTHP_STAT_ATTR(anon_num_partial_unused, MTHP_STAT_NR_ANON_SPLIT_DEFERRED);
> >
> >  static struct attribute *stats_attrs[] = {
> >       &anon_fault_alloc_attr.attr,
> > @@ -593,6 +594,7 @@ static struct attribute *stats_attrs[] = {
> >       &split_failed_attr.attr,
> >       &split_deferred_attr.attr,
> >       &anon_num_attr.attr,
> > +     &anon_num_partial_unused_attr.attr,
> >       NULL,
> >  };
> >
> > @@ -3229,6 +3231,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >               if (folio_order(folio) > 1 &&
> >                   !list_empty(&folio->_deferred_list)) {
> >                       ds_queue->split_queue_len--;
> > +                     mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
> >                       /*
> >                        * Reinitialize page_deferred_list after removing the
> >                        * page from the split_queue, otherwise a subsequent
> > @@ -3291,6 +3294,7 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >       if (!list_empty(&folio->_deferred_list)) {
> >               ds_queue->split_queue_len--;
> > +             mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
> >               list_del_init(&folio->_deferred_list);
> >       }
> >       spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> > @@ -3332,6 +3336,7 @@ void deferred_split_folio(struct folio *folio)
> >               if (folio_test_pmd_mappable(folio))
> >                       count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >               count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > +             mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, 1);
> >               list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >               ds_queue->split_queue_len++;
> >  #ifdef CONFIG_MEMCG
> > @@ -3379,6 +3384,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >                       list_move(&folio->_deferred_list, &list);
> >               } else {
> >                       /* We lost race with folio_put() */
> > +                     mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_SPLIT_DEFERRED, -1);
> >                       list_del_init(&folio->_deferred_list);
> >                       ds_queue->split_queue_len--;
> >               }
>

Thanks
Barry


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:42       ` David Hildenbrand
@ 2024-08-09  8:58         ` David Hildenbrand
  2024-08-09  9:05           ` Ryan Roberts
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-09  8:58 UTC (permalink / raw)
  To: Ryan Roberts, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

On 09.08.24 10:42, David Hildenbrand wrote:
>>> Not sure I fully understand why David prefers to do the unaccounting at
>>> free-time though? It feels unbalanced to me to increment when first mapped but
>>> decrement when freed. Surely its safer to either use alloc/free or use first
>>> map/last map?
>>>
>>> If using alloc/free isn't there a THP constructor/destructor that prepares the
>>> deferred list? (My memory may be failing me). Could we use that?
>>
>> Additionally, if we wanted to extend (eventually) to track the number of shmem
>> and file mthps in additional counters, could we also account using similar folio
>> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
>> consistent for all?
> 
> Again, see NR_FILE_THPS handling. No rmap over-complication please.

... not to mention that it is non-sensical to only count pageache folios 
that are mapped to user space ;)

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:39     ` David Hildenbrand
@ 2024-08-09  9:00       ` Ryan Roberts
  0 siblings, 0 replies; 28+ messages in thread
From: Ryan Roberts @ 2024-08-09  9:00 UTC (permalink / raw)
  To: David Hildenbrand, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

On 09/08/2024 09:39, David Hildenbrand wrote:
> On 09.08.24 10:13, Ryan Roberts wrote:
>> On 08/08/2024 02:04, Barry Song wrote:
>>> From: Barry Song <v-songbaohua@oppo.com>
>>>
>>> When a new anonymous mTHP is added to the rmap, we increase the count.
>>> We reduce the count whenever an mTHP is completely unmapped.
>>>
>>> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
>>> ---
>>>   Documentation/admin-guide/mm/transhuge.rst |  5 +++++
>>>   include/linux/huge_mm.h                    | 15 +++++++++++++--
>>>   mm/huge_memory.c                           |  2 ++
>>>   mm/rmap.c                                  |  3 +++
>>>   4 files changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/Documentation/admin-guide/mm/transhuge.rst
>>> b/Documentation/admin-guide/mm/transhuge.rst
>>> index 058485daf186..715f181543f6 100644
>>> --- a/Documentation/admin-guide/mm/transhuge.rst
>>> +++ b/Documentation/admin-guide/mm/transhuge.rst
>>> @@ -527,6 +527,11 @@ split_deferred
>>>           it would free up some memory. Pages on split queue are going to
>>>           be split under memory pressure, if splitting is possible.
>>>   +anon_num
>>> +       the number of anon huge pages we have in the whole system.
>>> +       These huge pages could be still entirely mapped and have partially
>>> +       unmapped and unused subpages.
>>
>> nit: "entirely mapped and have partially unmapped and unused subpages" ->
>> "entirely mapped or have partially unmapped/unused subpages"
>>
>>> +
>>>   As the system ages, allocating huge pages may be expensive as the
>>>   system uses memory compaction to copy data around memory to free a
>>>   huge page for use. There are some counters in ``/proc/vmstat`` to help
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..294c348fe3cc 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -281,6 +281,7 @@ enum mthp_stat_item {
>>>       MTHP_STAT_SPLIT,
>>>       MTHP_STAT_SPLIT_FAILED,
>>>       MTHP_STAT_SPLIT_DEFERRED,
>>> +    MTHP_STAT_NR_ANON,
>>>       __MTHP_STAT_COUNT
>>>   };
>>>   @@ -291,14 +292,24 @@ struct mthp_stat {
>>>   #ifdef CONFIG_SYSFS
>>>   DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
>>>   -static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int
>>> delta)
>>>   {
>>>       if (order <= 0 || order > PMD_ORDER)
>>>           return;
>>>   -    this_cpu_inc(mthp_stats.stats[order][item]);
>>> +    this_cpu_add(mthp_stats.stats[order][item], delta);
>>> +}
>>> +
>>> +static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>> +{
>>> +    mod_mthp_stat(order, item, 1);
>>>   }
>>> +
>>>   #else
>>> +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int
>>> delta)
>>> +{
>>> +}
>>> +
>>>   static inline void count_mthp_stat(int order, enum mthp_stat_item item)
>>>   {
>>>   }
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 697fcf89f975..b6bc2a3791e3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -578,6 +578,7 @@ DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge,
>>> MTHP_STAT_SHMEM_FALLBACK_CHARGE);
>>>   DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
>>>   DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>>>   DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>>> +DEFINE_MTHP_STAT_ATTR(anon_num, MTHP_STAT_NR_ANON);
>>>     static struct attribute *stats_attrs[] = {
>>>       &anon_fault_alloc_attr.attr,
>>> @@ -591,6 +592,7 @@ static struct attribute *stats_attrs[] = {
>>>       &split_attr.attr,
>>>       &split_failed_attr.attr,
>>>       &split_deferred_attr.attr,
>>> +    &anon_num_attr.attr,
>>>       NULL,
>>>   };
>>>   diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 901950200957..2b722f26224c 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1467,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio,
>>> struct vm_area_struct *vma,
>>>       }
>>>         __folio_mod_stat(folio, nr, nr_pmdmapped);
>>> +    mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
>>>   }
>>>     static __always_inline void __folio_add_file_rmap(struct folio *folio,
>>> @@ -1582,6 +1583,8 @@ static __always_inline void __folio_remove_rmap(struct
>>> folio *folio,
>>>           list_empty(&folio->_deferred_list))
>>>           deferred_split_folio(folio);
>>>       __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>> +    if (folio_test_anon(folio) && !atomic_read(mapped))
>>
>> Agree that atomic_read() is dodgy here.
>>
>> Not sure I fully understand why David prefers to do the unaccounting at
>> free-time though? It feels unbalanced to me to increment when first mapped but
>> decrement when freed. Surely its safer to either use alloc/free or use first
>> map/last map?
> 
> Doing it when we set/clear folio->mapping is straight forward.
> 
> Anon folios currently come to live when we first map them, and they stay that
> way until we free them.
> 
> In the future, we'll have to move that anon handling further out, when if have
> to allocate anon-specific memdesc ahead of time, then, it will be clued to that
> lifetime.
> 
>>
>> If using alloc/free isn't there a THP constructor/destructor that prepares the
>> deferred list? (My memory may be failing me). Could we use that?
> 
> Likely the deconstructor could work as well. Not sure if that is any better than
> the freeing path where folio->mapping currently gets cleared.
> 
> The generic constructor certainly won't work right now. That's not where the
> "anon" part comes to live.
> 
> Let's take a look how NR_FILE_THPS is handled:
> 
> __filemap_add_folio() increments it -- when we set folio->mapping
> __filemap_remove_folio() (->filemap_unaccount_folio) decrements it -- after
> which we usually call page_cache_delete() to set folio->mapping = NULL;
> 

OK got it, thanks!



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  8:58         ` David Hildenbrand
@ 2024-08-09  9:05           ` Ryan Roberts
  2024-08-09  9:22             ` David Hildenbrand
  0 siblings, 1 reply; 28+ messages in thread
From: Ryan Roberts @ 2024-08-09  9:05 UTC (permalink / raw)
  To: David Hildenbrand, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

On 09/08/2024 09:58, David Hildenbrand wrote:
> On 09.08.24 10:42, David Hildenbrand wrote:
>>>> Not sure I fully understand why David prefers to do the unaccounting at
>>>> free-time though? It feels unbalanced to me to increment when first mapped but
>>>> decrement when freed. Surely its safer to either use alloc/free or use first
>>>> map/last map?
>>>>
>>>> If using alloc/free isn't there a THP constructor/destructor that prepares the
>>>> deferred list? (My memory may be failing me). Could we use that?
>>>
>>> Additionally, if we wanted to extend (eventually) to track the number of shmem
>>> and file mthps in additional counters, could we also account using similar folio
>>> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
>>> consistent for all?
>>
>> Again, see NR_FILE_THPS handling. No rmap over-complication please.
> 
> ... not to mention that it is non-sensical to only count pageache folios that
> are mapped to user space ;)

Yes, good point. I'll get back in my box. :)



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  9:05           ` Ryan Roberts
@ 2024-08-09  9:22             ` David Hildenbrand
  2024-08-11  8:13               ` Barry Song
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-09  9:22 UTC (permalink / raw)
  To: Ryan Roberts, Barry Song, akpm, linux-mm
  Cc: chrisl, kaleshsingh, kasong, linux-kernel, ioworker0, baolin.wang,
	ziy, hanchuanhua, Barry Song

On 09.08.24 11:05, Ryan Roberts wrote:
> On 09/08/2024 09:58, David Hildenbrand wrote:
>> On 09.08.24 10:42, David Hildenbrand wrote:
>>>>> Not sure I fully understand why David prefers to do the unaccounting at
>>>>> free-time though? It feels unbalanced to me to increment when first mapped but
>>>>> decrement when freed. Surely its safer to either use alloc/free or use first
>>>>> map/last map?
>>>>>
>>>>> If using alloc/free isn't there a THP constructor/destructor that prepares the
>>>>> deferred list? (My memory may be failing me). Could we use that?
>>>>
>>>> Additionally, if we wanted to extend (eventually) to track the number of shmem
>>>> and file mthps in additional counters, could we also account using similar folio
>>>> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
>>>> consistent for all?
>>>
>>> Again, see NR_FILE_THPS handling. No rmap over-complication please.
>>
>> ... not to mention that it is non-sensical to only count pageache folios that
>> are mapped to user space ;)
> 
> Yes, good point. I'll get back in my box. :)

Well, it was a valuable discussion!

anon folios in the swapcache are interesting: they are only "anon" after 
we first mapped them (harder to change, but would be possible by using a 
NULL mapping maybe, if really worth it; with memdesc that might turn out 
interesting). But once they are anon, they will stay anon until actually 
reclaimed -> freed.

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  7:22             ` David Hildenbrand
@ 2024-08-11  5:20               ` Barry Song
  2024-08-11  6:54                 ` Barry Song
  0 siblings, 1 reply; 28+ messages in thread
From: Barry Song @ 2024-08-11  5:20 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On Fri, Aug 9, 2024 at 7:22 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 09.08.24 09:04, Barry Song wrote:
> >>>> I would appreciate if we leave the rmap out here.
> >>>>
> >>>> Can't we handle that when actually freeing the folio? folio_test_anon()
> >>>> is sticky until freed.
> >>>
> >>> To be clearer: we increment the counter when we set a folio anon, which
> >>> should indeed only happen in folio_add_new_anon_rmap(). We'll have to
> >>> ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
> >>>
> >>> Then, when we free an anon folio we decrement the counter. (hugetlb
> >>> should clear the anon flag when an anon folio gets freed back to its
> >>> allocator -- likely that is already done).
> >>>
> >>
> >> Sorry that I am talking to myself: I'm wondering if we also have to
> >> adjust the counter when splitting a large folio to multiple
> >> smaller-but-still-large folios.
> >
> > Hi David,
> >
> > The conceptual code is shown below. Does this make more
> > sense to you? we have a line "mod_mthp_stat(new_order,
> > MTHP_STAT_NR_ANON, 1 << (order - new_order));"
> >
> > @@ -3270,8 +3272,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >       struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >       /* reset xarray order to new order after split */
> >       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> > -     struct anon_vma *anon_vma = NULL;
> > +     bool is_anon = folio_test_anon(folio);
> >       struct address_space *mapping = NULL;
> > +     struct anon_vma *anon_vma = NULL;
> >       int order = folio_order(folio);
> >       int extra_pins, ret;
> >       pgoff_t end;
> > @@ -3283,7 +3286,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >       if (new_order >= folio_order(folio))
> >               return -EINVAL;
> >
> > -     if (folio_test_anon(folio)) {
> > +     if (is_anon) {
> >               /* order-1 is not supported for anonymous THP. */
> >               if (new_order == 1) {
> >                       VM_WARN_ONCE(1, "Cannot split to order-1 folio");
> > @@ -3323,7 +3326,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >       if (folio_test_writeback(folio))
> >               return -EBUSY;
> >
> > -     if (folio_test_anon(folio)) {
> > +     if (is_anon) {
> >               /*
> >                * The caller does not necessarily hold an mmap_lock that would
> >                * prevent the anon_vma disappearing so we first we take a
> > @@ -3437,6 +3440,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >                       }
> >               }
> >
> > +             if (is_anon) {
> > +                     mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> > +                     mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
> > +             }
> >               __split_huge_page(page, list, end, new_order);
> >               ret = 0;
> >       } else {
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 408ef3d25cf5..c869d0601614 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1039,6 +1039,7 @@ __always_inline bool free_pages_prepare(struct page *page,
> >       bool skip_kasan_poison = should_skip_kasan_poison(page);
> >       bool init = want_init_on_free();
> >       bool compound = PageCompound(page);
> > +     bool anon = PageAnon(page);
> >
> >       VM_BUG_ON_PAGE(PageTail(page), page);
> >
> > @@ -1130,6 +1131,9 @@ __always_inline bool free_pages_prepare(struct page *page,
> >
> >       debug_pagealloc_unmap_pages(page, 1 << order);
> >
> > +     if (anon && compound)
> > +             mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> > +
> >       return true;
>
> I'd have placed it here, when we are already passed the "PageMappingFlags" check and
> shouldn't have any added overhead for most !anon pages IIRC (mostly only anon/ksm pages should
> run into that path).
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..a11b9dd62964 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1079,8 +1079,11 @@ __always_inline bool free_pages_prepare(struct page *page,
>                          (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
>                  }
>          }
> -       if (PageMappingFlags(page))
> +       if (PageMappingFlags(page)) {
> +               if (PageAnon(page) && compound)
> +                       mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
>                  page->mapping = NULL;
> +       }
>          if (is_check_pages_enabled()) {
>                  if (free_page_is_bad(page))
>                          bad++;
>

This looks better, but we're not concerned about the bad pages, correct?
For bad pages, we're reducing the count by 1, but they aren't actually
freed. We don't need to worry about this since it's already considered
a bug, right?

> Conceptually LGTM. We account an anon folio as long as it's anon,
> even when still GUP-pinned after unmapping it or when temporarily
> unmapping+remapping it during migration.

right. but migration might be a problem? as the dst folio doesn't
call folio_add_new_anon_rmap(), dst->mapping is copied from
src. So I suspect we need the below (otherwise, src has been put
and got -1, but dst won't get +1),

diff --git a/mm/migrate.c b/mm/migrate.c
index 7e1267042a56..11ef11e59036 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1102,6 +1102,8 @@ static void migrate_folio_done(struct folio *src,
                mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
                                    folio_is_file_lru(src),
-folio_nr_pages(src));

+       mod_mthp_stat(folio_order(src), MTHP_STAT_NR_ANON, 1);
+
        if (reason != MR_MEMORY_FAILURE)
                /* We release the page in page_handle_poison. */
                folio_put(src);


>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-11  5:20               ` Barry Song
@ 2024-08-11  6:54                 ` Barry Song
  2024-08-11  8:51                   ` David Hildenbrand
  0 siblings, 1 reply; 28+ messages in thread
From: Barry Song @ 2024-08-11  6:54 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On Sun, Aug 11, 2024 at 5:20 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Fri, Aug 9, 2024 at 7:22 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 09.08.24 09:04, Barry Song wrote:
> > >>>> I would appreciate if we leave the rmap out here.
> > >>>>
> > >>>> Can't we handle that when actually freeing the folio? folio_test_anon()
> > >>>> is sticky until freed.
> > >>>
> > >>> To be clearer: we increment the counter when we set a folio anon, which
> > >>> should indeed only happen in folio_add_new_anon_rmap(). We'll have to
> > >>> ignore hugetlb here where we do it in hugetlb_add_new_anon_rmap().
> > >>>
> > >>> Then, when we free an anon folio we decrement the counter. (hugetlb
> > >>> should clear the anon flag when an anon folio gets freed back to its
> > >>> allocator -- likely that is already done).
> > >>>
> > >>
> > >> Sorry that I am talking to myself: I'm wondering if we also have to
> > >> adjust the counter when splitting a large folio to multiple
> > >> smaller-but-still-large folios.
> > >
> > > Hi David,
> > >
> > > The conceptual code is shown below. Does this make more
> > > sense to you? we have a line "mod_mthp_stat(new_order,
> > > MTHP_STAT_NR_ANON, 1 << (order - new_order));"
> > >
> > > @@ -3270,8 +3272,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >       struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> > >       /* reset xarray order to new order after split */
> > >       XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
> > > -     struct anon_vma *anon_vma = NULL;
> > > +     bool is_anon = folio_test_anon(folio);
> > >       struct address_space *mapping = NULL;
> > > +     struct anon_vma *anon_vma = NULL;
> > >       int order = folio_order(folio);
> > >       int extra_pins, ret;
> > >       pgoff_t end;
> > > @@ -3283,7 +3286,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >       if (new_order >= folio_order(folio))
> > >               return -EINVAL;
> > >
> > > -     if (folio_test_anon(folio)) {
> > > +     if (is_anon) {
> > >               /* order-1 is not supported for anonymous THP. */
> > >               if (new_order == 1) {
> > >                       VM_WARN_ONCE(1, "Cannot split to order-1 folio");
> > > @@ -3323,7 +3326,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >       if (folio_test_writeback(folio))
> > >               return -EBUSY;
> > >
> > > -     if (folio_test_anon(folio)) {
> > > +     if (is_anon) {
> > >               /*
> > >                * The caller does not necessarily hold an mmap_lock that would
> > >                * prevent the anon_vma disappearing so we first we take a
> > > @@ -3437,6 +3440,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >                       }
> > >               }
> > >
> > > +             if (is_anon) {
> > > +                     mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> > > +                     mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
> > > +             }
> > >               __split_huge_page(page, list, end, new_order);
> > >               ret = 0;
> > >       } else {
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 408ef3d25cf5..c869d0601614 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1039,6 +1039,7 @@ __always_inline bool free_pages_prepare(struct page *page,
> > >       bool skip_kasan_poison = should_skip_kasan_poison(page);
> > >       bool init = want_init_on_free();
> > >       bool compound = PageCompound(page);
> > > +     bool anon = PageAnon(page);
> > >
> > >       VM_BUG_ON_PAGE(PageTail(page), page);
> > >
> > > @@ -1130,6 +1131,9 @@ __always_inline bool free_pages_prepare(struct page *page,
> > >
> > >       debug_pagealloc_unmap_pages(page, 1 << order);
> > >
> > > +     if (anon && compound)
> > > +             mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> > > +
> > >       return true;
> >
> > I'd have placed it here, when we are already passed the "PageMappingFlags" check and
> > shouldn't have any added overhead for most !anon pages IIRC (mostly only anon/ksm pages should
> > run into that path).
> >
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 408ef3d25cf5..a11b9dd62964 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1079,8 +1079,11 @@ __always_inline bool free_pages_prepare(struct page *page,
> >                          (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
> >                  }
> >          }
> > -       if (PageMappingFlags(page))
> > +       if (PageMappingFlags(page)) {
> > +               if (PageAnon(page) && compound)
> > +                       mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
> >                  page->mapping = NULL;
> > +       }
> >          if (is_check_pages_enabled()) {
> >                  if (free_page_is_bad(page))
> >                          bad++;
> >
>
> This looks better, but we're not concerned about the bad pages, correct?
> For bad pages, we're reducing the count by 1, but they aren't actually
> freed. We don't need to worry about this since it's already considered
> a bug, right?
>
> > Conceptually LGTM. We account an anon folio as long as it's anon,
> > even when still GUP-pinned after unmapping it or when temporarily
> > unmapping+remapping it during migration.
>
> right. but migration might be a problem? as the dst folio doesn't
> call folio_add_new_anon_rmap(), dst->mapping is copied from
> src. So I suspect we need the below (otherwise, src has been put
> and got -1, but dst won't get +1),
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 7e1267042a56..11ef11e59036 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1102,6 +1102,8 @@ static void migrate_folio_done(struct folio *src,
>                 mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
>                                     folio_is_file_lru(src),
> -folio_nr_pages(src));
>
> +       mod_mthp_stat(folio_order(src), MTHP_STAT_NR_ANON, 1);
> +
>         if (reason != MR_MEMORY_FAILURE)
>                 /* We release the page in page_handle_poison. */
>                 folio_put(src);
>

sorry. wrong place as migrate_folio_done() doesn't necessarily mean
migration is done. it could be "Folio was freed from under us"

        if (folio_ref_count(src) == 1) {
                /* Folio was freed from under us. So we are done. */
                folio_clear_active(src);
                folio_clear_unevictable(src);
                /* free_pages_prepare() will clear PG_isolated. */
                list_del(&src->lru);
                migrate_folio_done(src, reason);
                return MIGRATEPAGE_SUCCESS;
        }


the correct place should be:

@@ -1329,6 +1326,10 @@ static int migrate_folio_move(free_folio_t
put_new_folio, unsigned long private,
        if (anon_vma)
                put_anon_vma(anon_vma);
        folio_unlock(src);
+
+       if (folio_test_anon(src))
+               mod_mthp_stat(folio_order(src), MTHP_STAT_NR_ANON, 1);
+
        migrate_folio_done(src, reason);

        return rc;

Without this modification in migration code, my tests fail, anon_num can
become negative.

>
> >
> > --
> > Cheers,
> >
> > David / dhildenb
> >

Thanks
Barry


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-09  9:22             ` David Hildenbrand
@ 2024-08-11  8:13               ` Barry Song
  0 siblings, 0 replies; 28+ messages in thread
From: Barry Song @ 2024-08-11  8:13 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Ryan Roberts, akpm, linux-mm, chrisl, kaleshsingh, kasong,
	linux-kernel, ioworker0, baolin.wang, ziy, hanchuanhua,
	Barry Song

On Fri, Aug 9, 2024 at 9:22 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 09.08.24 11:05, Ryan Roberts wrote:
> > On 09/08/2024 09:58, David Hildenbrand wrote:
> >> On 09.08.24 10:42, David Hildenbrand wrote:
> >>>>> Not sure I fully understand why David prefers to do the unaccounting at
> >>>>> free-time though? It feels unbalanced to me to increment when first mapped but
> >>>>> decrement when freed. Surely its safer to either use alloc/free or use first
> >>>>> map/last map?
> >>>>>
> >>>>> If using alloc/free isn't there a THP constructor/destructor that prepares the
> >>>>> deferred list? (My memory may be failing me). Could we use that?
> >>>>
> >>>> Additionally, if we wanted to extend (eventually) to track the number of shmem
> >>>> and file mthps in additional counters, could we also account using similar folio
> >>>> free-time hooks? If not, it might be an argument to account in rmap_unmap to be
> >>>> consistent for all?
> >>>
> >>> Again, see NR_FILE_THPS handling. No rmap over-complication please.
> >>
> >> ... not to mention that it is non-sensical to only count pageache folios that
> >> are mapped to user space ;)
> >
> > Yes, good point. I'll get back in my box. :)
>
> Well, it was a valuable discussion!
>
> anon folios in the swapcache are interesting: they are only "anon" after
> we first mapped them (harder to change, but would be possible by using a
> NULL mapping maybe, if really worth it; with memdesc that might turn out
> interesting). But once they are anon, they will stay anon until actually
> reclaimed -> freed.

I assume we don’t need to worry about this, as even AnonPages (NR_ANON_MAPPED)
in /proc/meminfo also entirely depends on anon pages becoming anon mappings.

>
> --
> Cheers,
>
> David / dhildenb
>

Thanks
Barry


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-11  6:54                 ` Barry Song
@ 2024-08-11  8:51                   ` David Hildenbrand
  2024-08-11  9:22                     ` Barry Song
  0 siblings, 1 reply; 28+ messages in thread
From: David Hildenbrand @ 2024-08-11  8:51 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

> the correct place should be:
> 
> @@ -1329,6 +1326,10 @@ static int migrate_folio_move(free_folio_t
> put_new_folio, unsigned long private,
>          if (anon_vma)
>                  put_anon_vma(anon_vma);
>          folio_unlock(src);
> +
> +       if (folio_test_anon(src))
> +               mod_mthp_stat(folio_order(src), MTHP_STAT_NR_ANON, 1);
> +
>          migrate_folio_done(src, reason);
> 
>          return rc;
> 
> Without this modification in migration code, my tests fail, anon_num can
> become negative.

I was wondering if we should do it in __folio_migrate_mapping().

There, we set newfolio->mapping.

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 1/2] mm: collect the number of anon large folios
  2024-08-11  8:51                   ` David Hildenbrand
@ 2024-08-11  9:22                     ` Barry Song
  0 siblings, 0 replies; 28+ messages in thread
From: Barry Song @ 2024-08-11  9:22 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: akpm, baolin.wang, chrisl, hanchuanhua, ioworker0, kaleshsingh,
	kasong, linux-kernel, linux-mm, ryan.roberts, v-songbaohua, ziy

On Sun, Aug 11, 2024 at 8:51 PM David Hildenbrand <david@redhat.com> wrote:
>
> > the correct place should be:
> >
> > @@ -1329,6 +1326,10 @@ static int migrate_folio_move(free_folio_t
> > put_new_folio, unsigned long private,
> >          if (anon_vma)
> >                  put_anon_vma(anon_vma);
> >          folio_unlock(src);
> > +
> > +       if (folio_test_anon(src))
> > +               mod_mthp_stat(folio_order(src), MTHP_STAT_NR_ANON, 1);
> > +
> >          migrate_folio_done(src, reason);
> >
> >          return rc;
> >
> > Without this modification in migration code, my tests fail, anon_num can
> > become negative.
>
> I was wondering if we should do it in __folio_migrate_mapping().
>
> There, we set newfolio->mapping.

Correct! To handle cases where the destination might not be migrated to but
could be put or freed, migrate_folio_undo_dst() will be called to release the
destination earlier. It's better to simply place the count where the
anon mapping
is set and cleared.

--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -423,6 +423,8 @@ static int __folio_migrate_mapping(struct
address_space *mapping,
                /* No turning back from here */
                newfolio->index = folio->index;
                newfolio->mapping = folio->mapping;
+               if (folio_test_anon(folio) && folio_test_large(folio))
+                       mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
                if (folio_test_swapbacked(folio))
                        __folio_set_swapbacked(newfolio);

@@ -446,6 +448,9 @@ static int __folio_migrate_mapping(struct
address_space *mapping,
         * no turning back from here.
         */
        newfolio->index = folio->index;
+
+       if (folio_test_anon(folio) && folio_test_large(folio))
+               mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
        newfolio->mapping = folio->mapping;
        folio_ref_add(newfolio, nr); /* add cache reference */
        if (folio_test_swapbacked(folio)) {


>
> --
> Cheers,
>
> David / dhildenb
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2024-08-11  9:22 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-08-08  1:04 [PATCH RFC 0/2] mm: collect the number of anon mTHP Barry Song
2024-08-08  1:04 ` [PATCH RFC 1/2] mm: collect the number of anon large folios Barry Song
2024-08-08  7:08   ` Barry Song
2024-08-08  8:03     ` David Hildenbrand
2024-08-08  8:08       ` David Hildenbrand
2024-08-08  8:17         ` David Hildenbrand
2024-08-08  9:20           ` Barry Song
2024-08-09  7:04           ` Barry Song
2024-08-09  7:22             ` David Hildenbrand
2024-08-11  5:20               ` Barry Song
2024-08-11  6:54                 ` Barry Song
2024-08-11  8:51                   ` David Hildenbrand
2024-08-11  9:22                     ` Barry Song
2024-08-09  5:17   ` kernel test robot
2024-08-09  5:28   ` kernel test robot
2024-08-09  8:13   ` Ryan Roberts
2024-08-09  8:27     ` Ryan Roberts
2024-08-09  8:40       ` Barry Song
2024-08-09  8:42       ` David Hildenbrand
2024-08-09  8:58         ` David Hildenbrand
2024-08-09  9:05           ` Ryan Roberts
2024-08-09  9:22             ` David Hildenbrand
2024-08-11  8:13               ` Barry Song
2024-08-09  8:39     ` David Hildenbrand
2024-08-09  9:00       ` Ryan Roberts
2024-08-08  1:04 ` [PATCH RFC 2/2] mm: collect the number of anon large folios partially unmapped Barry Song
2024-08-09  8:23   ` Ryan Roberts
2024-08-09  8:48     ` Barry Song

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.