linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: David Hildenbrand <david@redhat.com>
To: Kefeng Wang <wangkefeng.wang@huawei.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Oscar Salvador <osalvador@suse.de>,
	Muchun Song <muchun.song@linux.dev>
Cc: sidhartha.kumar@oracle.com, jane.chu@oracle.com,
	Zi Yan <ziy@nvidia.com>, Vlastimil Babka <vbabka@suse.cz>,
	Brendan Jackman <jackmanb@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	linux-mm@kvack.org
Subject: Re: [PATCH v3 3/6] mm: page_alloc: add alloc_contig_{range_frozen,frozen_pages}()
Date: Thu, 16 Oct 2025 22:53:18 +0200	[thread overview]
Message-ID: <e1f7ca22-a249-4b24-b308-d46afb2afb29@redhat.com> (raw)
In-Reply-To: <20251013133854.2466530-4-wangkefeng.wang@huawei.com>

On 13.10.25 15:38, Kefeng Wang wrote:
> In order to allocate given range of pages or allocate compound
> pages without incrementing their refcount, adding two new helper
> alloc_contig_{range_frozen,frozen_pages}() which may be beneficial
> to some users (eg hugetlb), also free_contig_range_frozen() is
> provided to match alloc_contig_range_frozen(), but it is better to
> use free_frozen_pages() to free frozen compound pages.
> 
> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
> ---
>   include/linux/gfp.h |  29 +++++--
>   mm/page_alloc.c     | 183 +++++++++++++++++++++++++++++---------------
>   2 files changed, 143 insertions(+), 69 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 1fefb63e0480..fbbdd8c88483 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -429,14 +429,27 @@ typedef unsigned int __bitwise acr_flags_t;
>   #define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA
>   
>   /* The below functions must be run on a range from a single zone. */
> -extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
> -				     acr_flags_t alloc_flags, gfp_t gfp_mask);
> -#define alloc_contig_range(...)			alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
> -
> -extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
> -					      int nid, nodemask_t *nodemask);
> -#define alloc_contig_pages(...)			alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))
> -
> +int alloc_contig_range_frozen_noprof(unsigned long start, unsigned long end,
> +		acr_flags_t alloc_flags, gfp_t gfp_mask);

Just wondering: given alloc_contig_pages() vs. alloc_contig_frozen_pages()

Shouldn't it be alloc_contig_range() vs. alloc_contig_frozen_range()

And then free_contig_frozen_range()?


Do we want kerneldoc here as well?

> +int alloc_contig_range_frozen_noprof(unsigned long start, unsigned long end,
> +		acr_flags_t alloc_flags, gfp_t gfp_mask)
>   {
>   	const unsigned int order = ilog2(end - start);
>   	unsigned long outer_start, outer_end;
> @@ -7003,19 +6982,18 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
>   	}
>   
>   	if (!(gfp_mask & __GFP_COMP)) {
> -		split_free_pages(cc.freepages, gfp_mask);
> +		split_free_frozen_pages(cc.freepages, gfp_mask);
>   
>   		/* Free head and tail (if any) */
>   		if (start != outer_start)
> -			free_contig_range(outer_start, start - outer_start);
> +			free_contig_range_frozen(outer_start, start - outer_start);
>   		if (end != outer_end)
> -			free_contig_range(end, outer_end - end);
> +			free_contig_range_frozen(end, outer_end - end);
>   	} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
>   		struct page *head = pfn_to_page(start);
>   
>   		check_new_pages(head, order);
>   		prep_new_page(head, order, gfp_mask, 0);
> -		set_page_refcounted(head);
>   	} else {
>   		ret = -EINVAL;
>   		WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
> @@ -7025,16 +7003,48 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
>   	undo_isolate_page_range(start, end);
>   	return ret;
>   }
> -EXPORT_SYMBOL(alloc_contig_range_noprof);
>   
> -static int __alloc_contig_pages(unsigned long start_pfn,
> -				unsigned long nr_pages, gfp_t gfp_mask)
> +/**
> + * alloc_contig_range() -- tries to allocate given range of pages
> + * @start:	start PFN to allocate
> + * @end:	one-past-the-last PFN to allocate
> + * @alloc_flags:	allocation information
> + * @gfp_mask:	GFP mask. Node/zone/placement hints are ignored; only some
> + *		action and reclaim modifiers are supported. Reclaim modifiers
> + *		control allocation behavior during compaction/migration/reclaim.
> + *
> + * The PFN range does not have to be pageblock aligned. The PFN range must
> + * belong to a single zone.
> + *
> + * The first thing this routine does is attempt to MIGRATE_ISOLATE all
> + * pageblocks in the range.  Once isolated, the pageblocks should not
> + * be modified by others.
> + *
> + * Return: zero on success or negative error code.  On success all
> + * pages which PFN is in [start, end) are allocated for the caller and
> + * need to be freed with free_contig_range().
> + */
> +int alloc_contig_range_noprof(unsigned long start, unsigned long end,
> +			      acr_flags_t alloc_flags, gfp_t gfp_mask)
>   {
> -	unsigned long end_pfn = start_pfn + nr_pages;
> +	int ret;
> +
> +	ret = alloc_contig_range_frozen_noprof(start, end, alloc_flags, gfp_mask);
> +	if (ret)
> +		return ret;
> +
> +	if (gfp_mask & __GFP_COMP) {
> +		set_page_refcounted(pfn_to_page(start));
> +	} else {
> +		unsigned long pfn;
> +
> +		for (pfn = start; pfn < end; pfn++)
> +			set_page_refcounted(pfn_to_page(pfn));
> +	}

Might read better as

unsigned long pfn;

...

if (gfp_mask & __GFP_COMP) {
	set_page_refcounted(pfn_to_page(start));
	return 0;
}

for (pfn = start; pfn < end; pfn++)
	set_page_refcounted(pfn_to_page(pfn));
return 0;


One could also do something fancy like

unsigned long pfn;
...

for (pfn = start; pfn < end; pfn++) {
	set_page_refcounted(pfn_to_page(pfn));
	if (gfp_mask & __GFP_COMP)
		break;
}
return 0;


>   
> -	return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
> -					 gfp_mask);
> +	return 0;
>   }
> +EXPORT_SYMBOL(alloc_contig_range_noprof);
>   
>   static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
>   				   unsigned long nr_pages)
> @@ -7067,31 +7077,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
>   	return zone_spans_pfn(zone, last_pfn);
>   }


... kerneldoc? :)

> +struct page *alloc_contig_frozen_pages_noprof(unsigned long nr_pages,
> +		gfp_t gfp_mask, int nid, nodemask_t *nodemask)
>   {
>   	unsigned long ret, pfn, flags;
>   	struct zonelist *zonelist;
> @@ -7114,7 +7101,9 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
>   				 * and cause alloc_contig_range() to fail...
>   				 */
>   				spin_unlock_irqrestore(&zone->lock, flags);
> -				ret = __alloc_contig_pages(pfn, nr_pages,
> +				ret = alloc_contig_range_frozen_noprof(pfn,
> +							pfn + nr_pages,
> +							ACR_FLAGS_NONE,
>   							gfp_mask);
>   				if (!ret)
>   					return pfn_to_page(pfn);
> @@ -7126,6 +7115,78 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
>   	}
>   	return NULL;
>   }
> +EXPORT_SYMBOL(alloc_contig_range_frozen_noprof);
> +

kerneldoc? :)

> +void free_contig_range_frozen(unsigned long pfn, unsigned long nr_pages)
> +{
> +	struct folio *folio = pfn_folio(pfn);
> +
> +	if (folio_test_large(folio)) {
> +		int expected = folio_nr_pages(folio);
> +
> +		WARN_ON(folio_ref_count(folio));
> +
> +		if (nr_pages == expected)
> +			free_frozen_pages(&folio->page, folio_order(folio));
> +		else
> +			WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
> +			     pfn, nr_pages, expected);
> +		return;
> +	}
> +
> +	for (; nr_pages--; pfn++) {
> +		struct page *page = pfn_to_page(pfn);
> +
> +		WARN_ON(page_ref_count(page));
> +		free_frozen_pages(page, 0);
> +	}

That's mostly a copy-and-paste of free_contig_range().

I wonder if there is some way to avoid duplicating a lot of
free_contig_range() here. Hmmm.

Also, the folio stuff in there looks a bit weird I'm afraid.

Can't we just refuse to free compound pages throught this interface and
free_contig_range() ? IIRC only hugetlb uses it and uses folio_put() either way?

Then we can just document that compound allocations are to be freed differently.

And do something like

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23d..776b4addc3685 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7123,29 +7123,25 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
  }
  #endif /* CONFIG_CONTIG_ALLOC */
  
-void free_contig_range(unsigned long pfn, unsigned long nr_pages)
+static inline void __free_contig_range(unsigned long pfn, unsigned long nr_pages, bool put_ref)
  {
-       unsigned long count = 0;
-       struct folio *folio = pfn_folio(pfn);
-
-       if (folio_test_large(folio)) {
-               int expected = folio_nr_pages(folio);
-
-               if (nr_pages == expected)
-                       folio_put(folio);
-               else
-                       WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
-                            pfn, nr_pages, expected);
-               return;
-       }
+       if (WARN_ON_ONCE(PageHead(pfn_to_page(pfn))))
+               return;
  
         for (; nr_pages--; pfn++) {
                 struct page *page = pfn_to_page(pfn);
  
-               count += page_count(page) != 1;
-               __free_page(page);
+               if (put_ref)
+                       page_ref_dec(page);
+               if (WARN_ON_ONCE(page_count(page)))
+                       continue;
+               free_frozen_pages(page, 0);
         }
-       WARN(count != 0, "%lu pages are still in use!\n", count);
+}
+
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+       __free_contig_range(pfn, nr_pages, /* put_ref= */ true);
  }
  EXPORT_SYMBOL(free_contig_range);
  

Just a thought, I dislike current free_contig_range() and the duplicated
variant.

> +}
> +EXPORT_SYMBOL(free_contig_range_frozen);
> +
> +/**
> + * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
> + * @nr_pages:	Number of contiguous pages to allocate
> + * @gfp_mask:	GFP mask. Node/zone/placement hints limit the search; only some
> + *		action and reclaim modifiers are supported. Reclaim modifiers
> + *		control allocation behavior during compaction/migration/reclaim.
> + * @nid:	Target node
> + * @nodemask:	Mask for other possible nodes
> + *
> + * This routine is a wrapper around alloc_contig_range(). It scans over zones
> + * on an applicable zonelist to find a contiguous pfn range which can then be
> + * tried for allocation with alloc_contig_range(). This routine is intended
> + * for allocation requests which can not be fulfilled with the buddy allocator.
> + *
> + * The allocated memory is always aligned to a page boundary. If nr_pages is a
> + * power of two, then allocated range is also guaranteed to be aligned to same
> + * nr_pages (e.g. 1GB request would be aligned to 1GB).
> + *
> + * Allocated pages can be freed with free_contig_range() or by manually calling
> + * __free_page() on each allocated page.
> + *
> + * Return: pointer to contiguous pages on success, or NULL if not successful.
> + */
> +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
> +		int nid, nodemask_t *nodemask)
> +{
> +	struct page *page;
> +
> +	page =  alloc_contig_frozen_pages_noprof(nr_pages, gfp_mask, nid,
> +						 nodemask);
> +	if (!page)
> +		return NULL;
> +
> +	if (gfp_mask & __GFP_COMP) {
> +		set_page_refcounted(page);
> +	} else {
> +		unsigned long pfn = page_to_pfn(page);
> +
> +		for (; nr_pages--; pfn++)
> +			set_page_refcounted(pfn_to_page(pfn));
> +	}
> +
> +	return page;

Same here, might be able to make it easier to read like I suggested for the
alloc_contig_range_noprof().

Or that part can just be factored out?

void set_pages_refcounted(struct page *page, unsigned long nr_pages, gfp_t gfp_mask)

or better

void set_pages_refcounted(struct page *page, unsigned long nr_pages)

And deriving __GFP_COMP from PageHead().

-- 
Cheers

David / dhildenb



  reply	other threads:[~2025-10-16 20:53 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-13 13:38 [PATCH v3 0/6] mm: hugetlb: allocate frozen gigantic folio Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 1/6] mm: debug_vm_pgtable: add debug_vm_pgtable_free_huge_page() Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 2/6] mm: page_alloc: add __split_page() Kefeng Wang
2025-10-13 19:44   ` David Hildenbrand
2025-10-14  3:45     ` Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 3/6] mm: page_alloc: add alloc_contig_{range_frozen,frozen_pages}() Kefeng Wang
2025-10-16 20:53   ` David Hildenbrand [this message]
2025-10-17  7:19     ` Kefeng Wang
2025-10-20 13:07       ` David Hildenbrand
2025-10-20 15:21         ` Kefeng Wang
2025-10-23 12:06           ` Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 4/6] mm: cma: add __cma_release() Kefeng Wang
2025-10-13 19:48   ` David Hildenbrand
2025-10-14  3:45     ` Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 5/6] mm: cma: add cma_alloc_frozen{_compound}() Kefeng Wang
2025-10-13 13:38 ` [PATCH v3 6/6] mm: hugetlb: allocate frozen pages in alloc_gigantic_folio() Kefeng Wang
2025-10-16  1:20 ` [PATCH v3 0/6] mm: hugetlb: allocate frozen gigantic folio Kefeng Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e1f7ca22-a249-4b24-b308-d46afb2afb29@redhat.com \
    --to=david@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=jackmanb@google.com \
    --cc=jane.chu@oracle.com \
    --cc=linux-mm@kvack.org \
    --cc=muchun.song@linux.dev \
    --cc=osalvador@suse.de \
    --cc=sidhartha.kumar@oracle.com \
    --cc=vbabka@suse.cz \
    --cc=wangkefeng.wang@huawei.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).