* [PATCH v2 1/5] mm: rename huge_zero_page to huge_zero_folio
2025-08-08 12:11 [PATCH v2 0/5] add persistent huge zero folio support Pankaj Raghav (Samsung)
@ 2025-08-08 12:11 ` Pankaj Raghav (Samsung)
2025-08-18 6:38 ` Hannes Reinecke
2025-08-08 12:11 ` [PATCH v2 2/5] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO Pankaj Raghav (Samsung)
` (3 subsequent siblings)
4 siblings, 1 reply; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:11 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, kernel, hch,
Pankaj Raghav
From: Pankaj Raghav <p.raghav@samsung.com>
As the transition from exposing huge_zero_page to huge_zero_folio has
already happened, rename the shrinker and the other helper functions to
reflect that.
No functional changes.
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
mm/huge_memory.c | 34 +++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2b4ea5a2ce7d..6625514f622b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -237,7 +237,7 @@ static bool get_huge_zero_page(void)
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -251,11 +251,11 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
@@ -263,18 +263,18 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +287,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -849,8 +849,8 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker)
return -ENOMEM;
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
@@ -858,13 +858,13 @@ static int __init thp_shrinker_init(void)
SHRINKER_NONSLAB,
"thp-deferred_split");
if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
return -ENOMEM;
}
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
@@ -875,7 +875,7 @@ static int __init thp_shrinker_init(void)
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
--
2.49.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 1/5] mm: rename huge_zero_page to huge_zero_folio
2025-08-08 12:11 ` [PATCH v2 1/5] mm: rename huge_zero_page to huge_zero_folio Pankaj Raghav (Samsung)
@ 2025-08-18 6:38 ` Hannes Reinecke
0 siblings, 0 replies; 15+ messages in thread
From: Hannes Reinecke @ 2025-08-18 6:38 UTC (permalink / raw)
To: Pankaj Raghav (Samsung), Suren Baghdasaryan, Ryan Roberts,
Baolin Wang, Vlastimil Babka, Zi Yan, Mike Rapoport, Dave Hansen,
Michal Hocko, David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
On 8/8/25 14:11, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> As the transition already happened from exposing huge_zero_page to
> huge_zero_folio, change the name of the shrinker and the other helper
> function to reflect that.
>
> No functional changes.
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
> mm/huge_memory.c | 34 +++++++++++++++++-----------------
> 1 file changed, 17 insertions(+), 17 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 2/5] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO
2025-08-08 12:11 [PATCH v2 0/5] add persistent huge zero folio support Pankaj Raghav (Samsung)
2025-08-08 12:11 ` [PATCH v2 1/5] mm: rename huge_zero_page to huge_zero_folio Pankaj Raghav (Samsung)
@ 2025-08-08 12:11 ` Pankaj Raghav (Samsung)
2025-08-18 6:39 ` Hannes Reinecke
2025-08-08 12:11 ` [PATCH v2 3/5] mm: add persistent huge zero folio Pankaj Raghav (Samsung)
` (2 subsequent siblings)
4 siblings, 1 reply; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:11 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, kernel, hch,
Pankaj Raghav
From: Pankaj Raghav <p.raghav@samsung.com>
As all the helper functions have been renamed from *_page to *_folio,
rename the MM flag from MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO.
No functional changes.
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
include/linux/mm_types.h | 2 +-
mm/huge_memory.c | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3ed763e7ec6f..cf94df4955c7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1758,7 +1758,7 @@ enum {
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
+#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6625514f622b..ff06dee213eb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -248,13 +248,13 @@ static void put_huge_zero_folio(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
return READ_ONCE(huge_zero_folio);
if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
@@ -262,7 +262,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
put_huge_zero_folio();
}
--
2.49.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 2/5] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO
2025-08-08 12:11 ` [PATCH v2 2/5] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO Pankaj Raghav (Samsung)
@ 2025-08-18 6:39 ` Hannes Reinecke
0 siblings, 0 replies; 15+ messages in thread
From: Hannes Reinecke @ 2025-08-18 6:39 UTC (permalink / raw)
To: Pankaj Raghav (Samsung), Suren Baghdasaryan, Ryan Roberts,
Baolin Wang, Vlastimil Babka, Zi Yan, Mike Rapoport, Dave Hansen,
Michal Hocko, David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
On 8/8/25 14:11, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> As all the helper functions has been renamed from *_page to *_folio,
> rename the MM flag from MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO.
>
> No functional changes.
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
> include/linux/mm_types.h | 2 +-
> mm/huge_memory.c | 6 +++---
> 2 files changed, 4 insertions(+), 4 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 3/5] mm: add persistent huge zero folio
2025-08-08 12:11 [PATCH v2 0/5] add persistent huge zero folio support Pankaj Raghav (Samsung)
2025-08-08 12:11 ` [PATCH v2 1/5] mm: rename huge_zero_page to huge_zero_folio Pankaj Raghav (Samsung)
2025-08-08 12:11 ` [PATCH v2 2/5] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO Pankaj Raghav (Samsung)
@ 2025-08-08 12:11 ` Pankaj Raghav (Samsung)
2025-08-08 12:47 ` Pankaj Raghav (Samsung)
` (2 more replies)
2025-08-08 12:11 ` [PATCH v2 4/5] mm: add largest_zero_folio() routine Pankaj Raghav (Samsung)
2025-08-08 12:11 ` [PATCH v2 5/5] block: use largest_zero_folio in __blkdev_issue_zero_pages() Pankaj Raghav (Samsung)
4 siblings, 3 replies; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:11 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, kernel, hch,
Pankaj Raghav
From: Pankaj Raghav <p.raghav@samsung.com>
Many places in the kernel need to zero out larger chunks, but the
maximum segment that can be zeroed out at a time by ZERO_PAGE is limited
by PAGE_SIZE.
This is especially annoying in block devices and filesystems where
multiple ZERO_PAGEs are attached to the bio in different bvecs. With
multipage bvec support in the block layer, it is much more efficient to
send out larger zero pages as part of a single bvec.
This concern was raised during the review of adding Large Block Size
support to XFS[1][2].
Usually huge_zero_folio is allocated on demand, and it will be
deallocated by the shrinker if there are no users of it left. At the
moment, the refcount of the huge_zero_folio infrastructure is tied to the
lifetime of the process that created it. This might not work for the bio
layer as the completions can be async and the process that created the
huge_zero_folio might no longer be alive. And, one of the main points
that came up during discussion is to have something bigger than zero
page as a drop-in replacement.
Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in
allocating the huge zero folio during early init and never freeing the
memory, by disabling the shrinker. This makes it possible to use the
huge_zero_folio without having to pass any mm struct and does not tie
the lifetime of the zero folio to anything, making it a drop-in
replacement for ZERO_PAGE.
If PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then
mm_get_huge_zero_folio() will simply return the allocated page instead of
dynamically allocating a new PMD page.
Use this option carefully in resource constrained systems as it uses
one full PMD sized page for zeroing purposes.
[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
include/linux/huge_mm.h | 16 ++++++++++++++++
mm/Kconfig | 16 ++++++++++++++++
mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++----------
3 files changed, 62 insertions(+), 10 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b..bd547857c6c1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return NULL;
+
+ if (unlikely(!huge_zero_folio))
+ return NULL;
+
+ return huge_zero_folio;
+}
+
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
@@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
{
return 0;
}
+
+static inline struct folio *get_persistent_huge_zero_folio(void)
+{
+ return NULL;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf..fbe86ef97fd0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB
config ARCH_WANTS_THP_SWAP
def_bool n
+config PERSISTENT_HUGE_ZERO_FOLIO
+ bool "Allocate a PMD sized folio for zeroing"
+ depends on TRANSPARENT_HUGEPAGE
+ help
+ Enable this option to reduce the runtime refcounting overhead
+ of the huge zero folio and expand the places in the kernel
+ that can use huge zero folios. This can potentially improve
+ the performance while performing an I/O.
+
+ With this option enabled, the huge zero folio is allocated
+ once and never freed. One full huge page worth of memory shall
+ be used.
+
+ Say Y if your system has lots of memory. Say N if you are
+ memory constrained.
+
config MM_ID
def_bool n
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ff06dee213eb..bedda9640936 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -248,6 +248,9 @@ static void put_huge_zero_folio(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
return READ_ONCE(huge_zero_folio);
@@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
put_huge_zero_folio();
}
@@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_folio_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_folio_shrinker);
+ if (!deferred_split_shrinker)
+ return -ENOMEM;
+
+ deferred_split_shrinker->count_objects = deferred_split_count;
+ deferred_split_shrinker->scan_objects = deferred_split_scan;
+ shrinker_register(deferred_split_shrinker);
+
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early on during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating static huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
return -ENOMEM;
}
@@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void)
huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
shrinker_register(huge_zero_folio_shrinker);
- deferred_split_shrinker->count_objects = deferred_split_count;
- deferred_split_shrinker->scan_objects = deferred_split_scan;
- shrinker_register(deferred_split_shrinker);
-
return 0;
}
--
2.49.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 3/5] mm: add persistent huge zero folio
2025-08-08 12:11 ` [PATCH v2 3/5] mm: add persistent huge zero folio Pankaj Raghav (Samsung)
@ 2025-08-08 12:47 ` Pankaj Raghav (Samsung)
2025-08-08 15:47 ` Lorenzo Stoakes
2025-08-18 12:02 ` Hannes Reinecke
2 siblings, 0 replies; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:47 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
> + if (!get_huge_zero_folio())
> + pr_warn("Allocating static huge zero folio failed\n");
Oops, forgot to do s/static/persistent/ here.
I can fold this in the next version after receiving the comments.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 3/5] mm: add persistent huge zero folio
2025-08-08 12:11 ` [PATCH v2 3/5] mm: add persistent huge zero folio Pankaj Raghav (Samsung)
2025-08-08 12:47 ` Pankaj Raghav (Samsung)
@ 2025-08-08 15:47 ` Lorenzo Stoakes
2025-08-11 8:33 ` Pankaj Raghav (Samsung)
2025-08-18 12:02 ` Hannes Reinecke
2 siblings, 1 reply; 15+ messages in thread
From: Lorenzo Stoakes @ 2025-08-08 15:47 UTC (permalink / raw)
To: Pankaj Raghav (Samsung)
Cc: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Andrew Morton, Thomas Gleixner, Nico Pache,
Dev Jain, Liam R . Howlett, Jens Axboe, linux-kernel, willy,
linux-mm, Ritesh Harjani, linux-block, linux-fsdevel,
Darrick J . Wong, mcgrof, gost.dev, hch, Pankaj Raghav
On Fri, Aug 08, 2025 at 02:11:39PM +0200, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> Many places in the kernel need to zero out larger chunks, but the
> maximum segment that can be zeroed out at a time by ZERO_PAGE is limited
> by PAGE_SIZE.
>
> This is especially annoying in block devices and filesystems where
> multiple ZERO_PAGEs are attached to the bio in different bvecs. With
> multipage bvec support in block layer, it is much more efficient to send
> out larger zero pages as a part of single bvec.
>
> This concern was raised during the review of adding Large Block Size
> support to XFS[1][2].
>
> Usually huge_zero_folio is allocated on demand, and it will be
> deallocated by the shrinker if there are no users of it left. At moment,
> huge_zero_folio infrastructure refcount is tied to the process lifetime
> that created it. This might not work for bio layer as the completions
> can be async and the process that created the huge_zero_folio might no
> longer be alive. And, one of the main points that came up during
> discussion is to have something bigger than zero page as a drop-in
> replacement.
>
> Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in
> allocating the huge zero folio during early init and never free the memory
> by disabling the shrinker. This makes using the huge_zero_folio without
> having to pass any mm struct and does not tie the lifetime of the zero
> folio to anything, making it a drop-in replacement for ZERO_PAGE.
>
> If PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then
> mm_get_huge_zero_folio() will simply return the allocated page instead of
> dynamically allocating a new PMD page.
>
> Use this option carefully in resource constrained systems as it uses
> one full PMD sized page for zeroing purposes.
>
> [1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
> [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
This is much nicer and now _super_ simple, I like it.
A few nits below but generally:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> include/linux/huge_mm.h | 16 ++++++++++++++++
> mm/Kconfig | 16 ++++++++++++++++
> mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++----------
> 3 files changed, 62 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 7748489fde1b..bd547857c6c1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
> struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
> void mm_put_huge_zero_folio(struct mm_struct *mm);
>
> +static inline struct folio *get_persistent_huge_zero_folio(void)
> +{
> + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
> + return NULL;
> +
> + if (unlikely(!huge_zero_folio))
> + return NULL;
> +
> + return huge_zero_folio;
> +}
> +
> static inline bool thp_migration_supported(void)
> {
> return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
> @@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
> {
> return 0;
> }
> +
> +static inline struct folio *get_persistent_huge_zero_folio(void)
> +{
> + return NULL;
> +}
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> static inline int split_folio_to_list_to_order(struct folio *folio,
> diff --git a/mm/Kconfig b/mm/Kconfig
> index e443fe8cd6cf..fbe86ef97fd0 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB
> config ARCH_WANTS_THP_SWAP
> def_bool n
>
> +config PERSISTENT_HUGE_ZERO_FOLIO
> + bool "Allocate a PMD sized folio for zeroing"
> + depends on TRANSPARENT_HUGEPAGE
I feel like we really need to sort out what is/isn't predicated on THP... it
seems like THP is sort of short hand for 'any large folio stuff' but not
always...
But this is a more general point :)
> + help
> + Enable this option to reduce the runtime refcounting overhead
> + of the huge zero folio and expand the places in the kernel
> + that can use huge zero folios. This can potentially improve
> + the performance while performing an I/O.
NIT: I think we can drop 'an', and probably refactor this sentence to something
like 'For instance, block I/O benefits from access to large folios for zeroing
memory'.
> +
> + With this option enabled, the huge zero folio is allocated
> + once and never freed. One full huge page worth of memory shall
> + be used.
NIT: huge page worth -> huge page's worth
> +
> + Say Y if your system has lots of memory. Say N if you are
> + memory constrained.
> +
> config MM_ID
> def_bool n
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ff06dee213eb..bedda9640936 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void)
>
> struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
> {
> + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
> + return huge_zero_folio;
> +
> if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
> return READ_ONCE(huge_zero_folio);
>
> @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
>
> void mm_put_huge_zero_folio(struct mm_struct *mm)
> {
> + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
> + return;
> +
> if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
> put_huge_zero_folio();
> }
> @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
>
> static int __init thp_shrinker_init(void)
> {
> - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> - if (!huge_zero_folio_shrinker)
> - return -ENOMEM;
> -
> deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
> SHRINKER_MEMCG_AWARE |
> SHRINKER_NONSLAB,
> "thp-deferred_split");
> - if (!deferred_split_shrinker) {
> - shrinker_free(huge_zero_folio_shrinker);
> + if (!deferred_split_shrinker)
> + return -ENOMEM;
> +
> + deferred_split_shrinker->count_objects = deferred_split_count;
> + deferred_split_shrinker->scan_objects = deferred_split_scan;
> + shrinker_register(deferred_split_shrinker);
> +
> + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
> + /*
> + * Bump the reference of the huge_zero_folio and do not
> + * initialize the shrinker.
> + *
> + * huge_zero_folio will always be NULL on failure. We assume
> + * that get_huge_zero_folio() will most likely not fail as
> + * thp_shrinker_init() is invoked early on during boot.
> + */
> + if (!get_huge_zero_folio())
> + pr_warn("Allocating static huge zero folio failed\n");
> + return 0;
> + }
> +
> + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
> + if (!huge_zero_folio_shrinker) {
> + shrinker_free(deferred_split_shrinker);
> return -ENOMEM;
> }
>
> @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void)
> huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
> shrinker_register(huge_zero_folio_shrinker);
>
> - deferred_split_shrinker->count_objects = deferred_split_count;
> - deferred_split_shrinker->scan_objects = deferred_split_scan;
> - shrinker_register(deferred_split_shrinker);
> -
> return 0;
> }
>
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 3/5] mm: add persistent huge zero folio
2025-08-08 15:47 ` Lorenzo Stoakes
@ 2025-08-11 8:33 ` Pankaj Raghav (Samsung)
0 siblings, 0 replies; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-11 8:33 UTC (permalink / raw)
To: Lorenzo Stoakes
Cc: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Andrew Morton, Thomas Gleixner, Nico Pache,
Dev Jain, Liam R . Howlett, Jens Axboe, linux-kernel, willy,
linux-mm, Ritesh Harjani, linux-block, linux-fsdevel,
Darrick J . Wong, mcgrof, gost.dev, hch, Pankaj Raghav
> This is much nicer and now _super_ simple, I like it.
Thanks to you and David :)
>
> A few nits below but generally:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Thanks.
>
> > ---
> > include/linux/huge_mm.h | 16 ++++++++++++++++
> > mm/Kconfig | 16 ++++++++++++++++
> > mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++----------
> > 3 files changed, 62 insertions(+), 10 deletions(-)
> >
> > static inline int split_folio_to_list_to_order(struct folio *folio,
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index e443fe8cd6cf..fbe86ef97fd0 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB
> > config ARCH_WANTS_THP_SWAP
> > def_bool n
> >
> > +config PERSISTENT_HUGE_ZERO_FOLIO
> > + bool "Allocate a PMD sized folio for zeroing"
> > + depends on TRANSPARENT_HUGEPAGE
>
> I feel like we really need to sort out what is/isn't predicated on THP... it
> seems like THP is sort of short hand for 'any large folio stuff' but not
> always...
>
> But this is a more general point :)
I already brought this topic once during THP cabal. I am thinking of
submitting a talk about this topic for LPC Memory Management MC.
>
> > + help
> > + Enable this option to reduce the runtime refcounting overhead
> > + of the huge zero folio and expand the places in the kernel
> > + that can use huge zero folios. This can potentially improve
> > + the performance while performing an I/O.
>
> NIT: I think we can drop 'an', and probably refactor this sentence to something
> like 'For instance, block I/O benefits from access to large folios for zeroing
> memory'.
>
> > +
> > + With this option enabled, the huge zero folio is allocated
> > + once and never freed. One full huge page worth of memory shall
> > + be used.
>
> NIT: huge page worth -> huge page's worth
>
Thanks for the comments. I will make those changes and send a new
version.
--
Pankaj
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 3/5] mm: add persistent huge zero folio
2025-08-08 12:11 ` [PATCH v2 3/5] mm: add persistent huge zero folio Pankaj Raghav (Samsung)
2025-08-08 12:47 ` Pankaj Raghav (Samsung)
2025-08-08 15:47 ` Lorenzo Stoakes
@ 2025-08-18 12:02 ` Hannes Reinecke
2 siblings, 0 replies; 15+ messages in thread
From: Hannes Reinecke @ 2025-08-18 12:02 UTC (permalink / raw)
To: Pankaj Raghav (Samsung), Suren Baghdasaryan, Ryan Roberts,
Baolin Wang, Vlastimil Babka, Zi Yan, Mike Rapoport, Dave Hansen,
Michal Hocko, David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
On 8/8/25 14:11, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> Many places in the kernel need to zero out larger chunks, but the
> maximum segment that can be zeroed out at a time by ZERO_PAGE is limited
> by PAGE_SIZE.
>
> This is especially annoying in block devices and filesystems where
> multiple ZERO_PAGEs are attached to the bio in different bvecs. With
> multipage bvec support in block layer, it is much more efficient to send
> out larger zero pages as a part of single bvec.
>
> This concern was raised during the review of adding Large Block Size
> support to XFS[1][2].
>
> Usually huge_zero_folio is allocated on demand, and it will be
> deallocated by the shrinker if there are no users of it left. At the moment,
> huge_zero_folio infrastructure refcount is tied to the process lifetime
> that created it. This might not work for bio layer as the completions
> can be async and the process that created the huge_zero_folio might no
> longer be alive. And, one of the main points that came up during
> discussion is to have something bigger than zero page as a drop-in
> replacement.
>
> Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in
> allocating the huge zero folio during early init and never free the memory
> by disabling the shrinker. This allows using the huge_zero_folio without
> having to pass any mm struct and does not tie the lifetime of the zero
> folio to anything, making it a drop-in replacement for ZERO_PAGE.
>
> If PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then
> mm_get_huge_zero_folio() will simply return the allocated page instead of
> dynamically allocating a new PMD page.
>
> Use this option carefully in resource constrained systems as it uses
> one full PMD sized page for zeroing purposes.
>
> [1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
> [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/
>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
> include/linux/huge_mm.h | 16 ++++++++++++++++
> mm/Kconfig | 16 ++++++++++++++++
> mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++----------
> 3 files changed, 62 insertions(+), 10 deletions(-)
>
As mentioned, I really would like to have a kernel commandline parameter
for disabling huge zero folio.
Otherwise:
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 4/5] mm: add largest_zero_folio() routine
2025-08-08 12:11 [PATCH v2 0/5] add persistent huge zero folio support Pankaj Raghav (Samsung)
` (2 preceding siblings ...)
2025-08-08 12:11 ` [PATCH v2 3/5] mm: add persistent huge zero folio Pankaj Raghav (Samsung)
@ 2025-08-08 12:11 ` Pankaj Raghav (Samsung)
2025-08-08 15:50 ` Lorenzo Stoakes
2025-08-18 12:04 ` Hannes Reinecke
2025-08-08 12:11 ` [PATCH v2 5/5] block: use largest_zero_folio in __blkdev_issue_zero_pages() Pankaj Raghav (Samsung)
4 siblings, 2 replies; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:11 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, kernel, hch,
Pankaj Raghav
From: Pankaj Raghav <p.raghav@samsung.com>
The callers of mm_get_huge_zero_folio() have access to a mm struct and
the lifetime of the huge_zero_folio is tied to the lifetime of the mm
struct.
largest_zero_folio() will give access to huge_zero_folio when
PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not
want to tie the lifetime to a mm struct. This is very useful for
filesystem and block layers where the request completions can be async
and there is no guarantee on the mm struct lifetime.
This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO
is disabled or if we failed to allocate a huge_zero_folio during early
init.
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
include/linux/huge_mm.h | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index bd547857c6c1..14d424830fa8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -714,4 +714,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
return split_folio_to_list_to_order(folio, NULL, new_order);
}
+/**
+ * largest_zero_folio - Get the largest zero folio available
+ *
+ * This function shall be used when mm_get_huge_zero_folio() cannot be
+ * used as there is no appropriate mm lifetime to tie the huge zero folio
+ * from the caller.
+ *
+ * Deduce the size of the folio with folio_size instead of assuming the
+ * folio size.
+ *
+ * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO
+ * is enabled or a single page sized zero folio
+ */
+static inline struct folio *largest_zero_folio(void)
+{
+ struct folio *folio = get_persistent_huge_zero_folio();
+
+ if (folio)
+ return folio;
+
+ return page_folio(ZERO_PAGE(0));
+}
#endif /* _LINUX_HUGE_MM_H */
--
2.49.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 4/5] mm: add largest_zero_folio() routine
2025-08-08 12:11 ` [PATCH v2 4/5] mm: add largest_zero_folio() routine Pankaj Raghav (Samsung)
@ 2025-08-08 15:50 ` Lorenzo Stoakes
2025-08-18 12:04 ` Hannes Reinecke
1 sibling, 0 replies; 15+ messages in thread
From: Lorenzo Stoakes @ 2025-08-08 15:50 UTC (permalink / raw)
To: Pankaj Raghav (Samsung)
Cc: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Andrew Morton, Thomas Gleixner, Nico Pache,
Dev Jain, Liam R . Howlett, Jens Axboe, linux-kernel, willy,
linux-mm, Ritesh Harjani, linux-block, linux-fsdevel,
Darrick J . Wong, mcgrof, gost.dev, hch, Pankaj Raghav
On Fri, Aug 08, 2025 at 02:11:40PM +0200, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> The callers of mm_get_huge_zero_folio() have access to a mm struct and
> the lifetime of the huge_zero_folio is tied to the lifetime of the mm
> struct.
>
> largest_zero_folio() will give access to huge_zero_folio when
> PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not
> want to tie the lifetime to a mm struct. This is very useful for
> filesystem and block layers where the request completions can be async
> and there is no guarantee on the mm struct lifetime.
>
> This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO
> is disabled or if we failed to allocate a huge_zero_folio during early
> init.
>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Hm thought I R-b this already :P
LGTM, so:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> include/linux/huge_mm.h | 22 ++++++++++++++++++++++
> 1 file changed, 22 insertions(+)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index bd547857c6c1..14d424830fa8 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -714,4 +714,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
> return split_folio_to_list_to_order(folio, NULL, new_order);
> }
>
> +/**
> + * largest_zero_folio - Get the largest zero size folio available
> + *
> + * This function shall be used when mm_get_huge_zero_folio() cannot be
> + * used as there is no appropriate mm lifetime to tie the huge zero folio
> + * from the caller.
> + *
> + * Deduce the size of the folio with folio_size instead of assuming the
> + * folio size.
> + *
> + * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO
> + * is enabled or a single page sized zero folio
> + */
> +static inline struct folio *largest_zero_folio(void)
> +{
> + struct folio *folio = get_persistent_huge_zero_folio();
> +
> + if (folio)
> + return folio;
> +
> + return page_folio(ZERO_PAGE(0));
> +}
> #endif /* _LINUX_HUGE_MM_H */
> --
> 2.49.0
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2 4/5] mm: add largest_zero_folio() routine
2025-08-08 12:11 ` [PATCH v2 4/5] mm: add largest_zero_folio() routine Pankaj Raghav (Samsung)
2025-08-08 15:50 ` Lorenzo Stoakes
@ 2025-08-18 12:04 ` Hannes Reinecke
1 sibling, 0 replies; 15+ messages in thread
From: Hannes Reinecke @ 2025-08-18 12:04 UTC (permalink / raw)
To: Pankaj Raghav (Samsung), Suren Baghdasaryan, Ryan Roberts,
Baolin Wang, Vlastimil Babka, Zi Yan, Mike Rapoport, Dave Hansen,
Michal Hocko, David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
On 8/8/25 14:11, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> The callers of mm_get_huge_zero_folio() have access to a mm struct and
> the lifetime of the huge_zero_folio is tied to the lifetime of the mm
> struct.
>
> largest_zero_folio() will give access to huge_zero_folio when
> PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not
> want to tie the lifetime to a mm struct. This is very useful for
> filesystem and block layers where the request completions can be async
> and there is no guarantee on the mm struct lifetime.
>
> This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO
> is disabled or if we failed to allocate a huge_zero_folio during early
> init.
>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
> include/linux/huge_mm.h | 22 ++++++++++++++++++++++
> 1 file changed, 22 insertions(+)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2 5/5] block: use largest_zero_folio in __blkdev_issue_zero_pages()
2025-08-08 12:11 [PATCH v2 0/5] add persistent huge zero folio support Pankaj Raghav (Samsung)
` (3 preceding siblings ...)
2025-08-08 12:11 ` [PATCH v2 4/5] mm: add largest_zero_folio() routine Pankaj Raghav (Samsung)
@ 2025-08-08 12:11 ` Pankaj Raghav (Samsung)
2025-08-18 12:05 ` Hannes Reinecke
4 siblings, 1 reply; 15+ messages in thread
From: Pankaj Raghav (Samsung) @ 2025-08-08 12:11 UTC (permalink / raw)
To: Suren Baghdasaryan, Ryan Roberts, Baolin Wang, Vlastimil Babka,
Zi Yan, Mike Rapoport, Dave Hansen, Michal Hocko,
David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, kernel, hch,
Pankaj Raghav
From: Pankaj Raghav <p.raghav@samsung.com>
Use largest_zero_folio() in __blkdev_issue_zero_pages().
On systems with CONFIG_PERSISTENT_HUGE_ZERO_FOLIO enabled, we will end up
sending larger bvecs instead of multiple small ones.
Noticed a 4% increase in performance on a commercial NVMe SSD which does
not support OP_WRITE_ZEROES. The device's MDTS was 128K. The performance
gains might be bigger if the device supports a bigger MDTS.
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
block/blk-lib.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7..3030a772d3aa 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned int flags)
{
+ struct folio *zero_folio = largest_zero_folio();
+
while (nr_sects) {
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
struct bio *bio;
@@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
break;
do {
- unsigned int len, added;
+ unsigned int len;
- len = min_t(sector_t,
- PAGE_SIZE, nr_sects << SECTOR_SHIFT);
- added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
- if (added < len)
+ len = min_t(sector_t, folio_size(zero_folio),
+ nr_sects << SECTOR_SHIFT);
+ if (!bio_add_folio(bio, zero_folio, len, 0))
break;
- nr_sects -= added >> SECTOR_SHIFT;
- sector += added >> SECTOR_SHIFT;
+ nr_sects -= len >> SECTOR_SHIFT;
+ sector += len >> SECTOR_SHIFT;
} while (nr_sects);
*biop = bio_chain_and_submit(*biop, bio);
--
2.49.0
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2 5/5] block: use largest_zero_folio in __blkdev_issue_zero_pages()
2025-08-08 12:11 ` [PATCH v2 5/5] block: use largest_zero_folio in __blkdev_issue_zero_pages() Pankaj Raghav (Samsung)
@ 2025-08-18 12:05 ` Hannes Reinecke
0 siblings, 0 replies; 15+ messages in thread
From: Hannes Reinecke @ 2025-08-18 12:05 UTC (permalink / raw)
To: Pankaj Raghav (Samsung), Suren Baghdasaryan, Ryan Roberts,
Baolin Wang, Vlastimil Babka, Zi Yan, Mike Rapoport, Dave Hansen,
Michal Hocko, David Hildenbrand, Lorenzo Stoakes, Andrew Morton,
Thomas Gleixner, Nico Pache, Dev Jain, Liam R . Howlett,
Jens Axboe
Cc: linux-kernel, willy, linux-mm, Ritesh Harjani, linux-block,
linux-fsdevel, Darrick J . Wong, mcgrof, gost.dev, hch,
Pankaj Raghav
On 8/8/25 14:11, Pankaj Raghav (Samsung) wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
>
> Use largest_zero_folio() in __blkdev_issue_zero_pages().
> On systems with CONFIG_PERSISTENT_HUGE_ZERO_FOLIO enabled, we will end up
> sending larger bvecs instead of multiple small ones.
>
> Noticed a 4% increase in performance on a commercial NVMe SSD which does
> not support OP_WRITE_ZEROES. The device's MDTS was 128K. The performance
> gains might be bigger if the device supports bigger MDTS.
>
> Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
> block/blk-lib.c | 15 ++++++++-------
> 1 file changed, 8 insertions(+), 7 deletions(-)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 15+ messages in thread