All of lore.kernel.org
 help / color / mirror / Atom feed
From: Minchan Kim <minchan@kernel.org>
To: Mel Gorman <mgorman@techsingularity.net>
Cc: Nicolas Saenz Julienne <nsaenzju@redhat.com>,
	Marcelo Tosatti <mtosatti@redhat.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Michal Hocko <mhocko@kernel.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Linux-MM <linux-mm@kvack.org>
Subject: Re: [PATCH 5/6] mm/page_alloc: Protect PCP lists with a spinlock
Date: Tue, 26 Apr 2022 12:24:56 -0700	[thread overview]
Message-ID: <YmhHCHaUy+zARWxi@google.com> (raw)
In-Reply-To: <20220420095906.27349-6-mgorman@techsingularity.net>

On Wed, Apr 20, 2022 at 10:59:05AM +0100, Mel Gorman wrote:
> Currently the PCP lists are protected by using local_lock_irqsave to
> prevent migration and IRQ reentrancy but this is inconvenient. Remote
> draining of the lists is impossible and a workqueue is required and
> every task allocation/free must disable then enable interrupts which is
> expensive.
> 
> As preparation for dealing with both of those problems, protect the
> lists with a spinlock. The IRQ-unsafe version of the lock is used
> because IRQs are already disabled by local_lock_irqsave. spin_trylock
> is used in preparation for a time when local_lock could be used instead
> of lock_lock_irqsave.
> 
> The per_cpu_pages still fits within the same number of cache lines after
> this patch relative to before the series.
> 
> struct per_cpu_pages {
>         spinlock_t                 lock;                 /*     0     4 */
>         int                        count;                /*     4     4 */
>         int                        high;                 /*     8     4 */
>         int                        batch;                /*    12     4 */
>         short int                  free_factor;          /*    16     2 */
>         short int                  expire;               /*    18     2 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         struct list_head           lists[13];            /*    24   208 */
> 
>         /* size: 256, cachelines: 4, members: 7 */
>         /* sum members: 228, holes: 1, sum holes: 4 */
>         /* padding: 24 */
> } __attribute__((__aligned__(64)));
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  include/linux/mmzone.h |   1 +
>  mm/page_alloc.c        | 155 +++++++++++++++++++++++++++++++++++------
>  2 files changed, 136 insertions(+), 20 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index abe530748de6..8b5757735428 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -385,6 +385,7 @@ enum zone_watermarks {
>  
>  /* Fields and list protected by pagesets local_lock in page_alloc.c */
>  struct per_cpu_pages {
> +	spinlock_t lock;	/* Protects lists field */
>  	int count;		/* number of pages in the list */
>  	int high;		/* high watermark, emptying needed */
>  	int batch;		/* chunk size for buddy add/remove */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index dc0fdeb3795c..813c84b67c65 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -132,6 +132,17 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
>  	.lock = INIT_LOCAL_LOCK(lock),
>  };
>  
> +#ifdef CONFIG_SMP
> +/* On SMP, spin_trylock is sufficient protection */
> +#define pcp_trylock_prepare(flags)	do { } while (0)
> +#define pcp_trylock_finish(flag)	do { } while (0)
> +#else
> +
> +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
> +#define pcp_trylock_prepare(flags)	local_irq_save(flags)
> +#define pcp_trylock_finish(flags)	local_irq_restore(flags)
> +#endif
> +
>  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
>  DEFINE_PER_CPU(int, numa_node);
>  EXPORT_PER_CPU_SYMBOL(numa_node);
> @@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>   */
>  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>  {
> -	unsigned long flags;
>  	int to_drain, batch;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
>  	batch = READ_ONCE(pcp->batch);
>  	to_drain = min(pcp->count, batch);
> -	if (to_drain > 0)
> +	if (to_drain > 0) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);
> +
> +		spin_lock(&pcp->lock);
>  		free_pcppages_bulk(zone, to_drain, pcp, 0);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		spin_unlock(&pcp->lock);
> +
> +		local_irq_restore(flags);
> +	}
>  }
>  #endif
>  
> @@ -3103,16 +3121,21 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>   */
>  static void drain_pages_zone(unsigned int cpu, struct zone *zone)
>  {
> -	unsigned long flags;
>  	struct per_cpu_pages *pcp;
>  
> -	local_lock_irqsave(&pagesets.lock, flags);
> -
>  	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
> -	if (pcp->count)
> +	if (pcp->count) {
> +		unsigned long flags;
> +
> +		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
> +		local_irq_save(flags);
> +
> +		spin_lock(&pcp->lock);
>  		free_pcppages_bulk(zone, pcp->count, pcp, 0);
> +		spin_unlock(&pcp->lock);
>  
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +		local_irq_restore(flags);
> +	}
>  }
>  
>  /*
> @@ -3380,18 +3403,30 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
>  	return min(READ_ONCE(pcp->batch) << 2, high);
>  }
>  
> -static void free_unref_page_commit(struct page *page, int migratetype,
> -				   unsigned int order)
> +/* Returns true if the page was committed to the per-cpu list. */
> +static bool free_unref_page_commit(struct page *page, int migratetype,
> +				   unsigned int order, bool locked)
>  {
>  	struct zone *zone = page_zone(page);
>  	struct per_cpu_pages *pcp;
>  	int high;
>  	int pindex;
>  	bool free_high;
> +	unsigned long __maybe_unused UP_flags;
>  
>  	__count_vm_event(PGFREE);
>  	pcp = this_cpu_ptr(zone->per_cpu_pageset);
>  	pindex = order_to_pindex(migratetype, order);
> +
> +	if (!locked) {
> +		/* Protect against a parallel drain. */
> +		pcp_trylock_prepare(UP_flags);
> +		if (!spin_trylock(&pcp->lock)) {
> +			pcp_trylock_finish(UP_flags);
> +			return false;
> +		}
> +	}
> +
>  	list_add(&page->pcp_list, &pcp->lists[pindex]);
>  	pcp->count += 1 << order;
>  
> @@ -3409,6 +3444,13 @@ static void free_unref_page_commit(struct page *page, int migratetype,
>  
>  		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
>  	}
> +
> +	if (!locked) {
> +		spin_unlock(&pcp->lock);
> +		pcp_trylock_finish(UP_flags);
> +	}
> +
> +	return true;
>  }
>  
>  /*
> @@ -3419,6 +3461,7 @@ void free_unref_page(struct page *page, unsigned int order)
>  	unsigned long flags;
>  	unsigned long pfn = page_to_pfn(page);
>  	int migratetype;
> +	bool freed_pcp = false;
>  
>  	if (!free_unref_page_prepare(page, pfn, order))
>  		return;
> @@ -3440,8 +3483,11 @@ void free_unref_page(struct page *page, unsigned int order)
>  	}
>  
>  	local_lock_irqsave(&pagesets.lock, flags);
> -	free_unref_page_commit(page, migratetype, order);
> +	freed_pcp = free_unref_page_commit(page, migratetype, order, false);
>  	local_unlock_irqrestore(&pagesets.lock, flags);
> +
> +	if (unlikely(!freed_pcp))
> +		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
>  }
>  
>  /*
> @@ -3450,10 +3496,19 @@ void free_unref_page(struct page *page, unsigned int order)
>  void free_unref_page_list(struct list_head *list)
>  {
>  	struct page *page, *next;
> +	struct per_cpu_pages *pcp;
> +	struct zone *locked_zone;
>  	unsigned long flags;
>  	int batch_count = 0;
>  	int migratetype;
>  
> +	/*
> +	 * An empty list is possible. Check early so that the later
> +	 * lru_to_page() does not potentially read garbage.
> +	 */
> +	if (list_empty(list))
> +		return;
> +
>  	/* Prepare pages for freeing */
>  	list_for_each_entry_safe(page, next, list, lru) {
>  		unsigned long pfn = page_to_pfn(page);
> @@ -3474,8 +3529,26 @@ void free_unref_page_list(struct list_head *list)
>  		}
>  	}
>  
> +	VM_BUG_ON(in_hardirq());

You need to check the list_empty here again and bail out if it is.

> +
>  	local_lock_irqsave(&pagesets.lock, flags);
> +
> +	page = lru_to_page(list);


  parent reply	other threads:[~2022-04-26 19:25 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-20  9:59 [RFC PATCH 0/6] Drain remote per-cpu directly Mel Gorman
2022-04-20  9:59 ` [PATCH 1/6] mm/page_alloc: Add page->buddy_list and page->pcp_list Mel Gorman
2022-04-20 20:43   ` Matthew Wilcox
2022-04-21  8:38     ` Mel Gorman
2022-04-20  9:59 ` [PATCH 2/6] mm/page_alloc: Use only one PCP list for THP-sized allocations Mel Gorman
2022-04-20  9:59 ` [PATCH 3/6] mm/page_alloc: Split out buddy removal code from rmqueue into separate helper Mel Gorman
2022-04-20  9:59 ` [PATCH 4/6] mm/page_alloc: Remove unnecessary page == NULL check in rmqueue Mel Gorman
2022-04-20  9:59 ` [PATCH 5/6] mm/page_alloc: Protect PCP lists with a spinlock Mel Gorman
2022-04-20 14:02   ` Hillf Danton
2022-04-20 14:35     ` Nicolas Saenz Julienne
2022-04-26 16:42   ` Nicolas Saenz Julienne
2022-04-26 16:48     ` Vlastimil Babka
2022-04-29  9:13     ` Mel Gorman
2022-04-26 19:24   ` Minchan Kim [this message]
2022-04-29  9:05     ` Mel Gorman
2022-04-20  9:59 ` [PATCH 6/6] mm/page_alloc: Remotely drain per-cpu lists Mel Gorman
2022-04-25 22:58 ` [RFC PATCH 0/6] Drain remote per-cpu directly Minchan Kim
2022-04-26 11:06   ` Nicolas Saenz Julienne
2022-04-27 15:21     ` Marcelo Tosatti
2022-04-26  2:49 ` Suren Baghdasaryan
2022-04-26  6:30   ` Suren Baghdasaryan
  -- strict thread matches above, loose matches on Subject: below --
2022-05-09 13:07 [RFC PATCH 0/6] Drain remote per-cpu directly v2 Mel Gorman
2022-05-09 13:08 ` [PATCH 5/6] mm/page_alloc: Protect PCP lists with a spinlock Mel Gorman
2022-05-22  2:49   ` Hugh Dickins
2022-05-24 12:12     ` Mel Gorman
2022-05-24 12:19       ` Mel Gorman
2022-05-12  8:50 [PATCH 0/6] Drain remote per-cpu directly v3 Mel Gorman
2022-05-12  8:50 ` [PATCH 5/6] mm/page_alloc: Protect PCP lists with a spinlock Mel Gorman
2022-05-13 12:22   ` Nicolas Saenz Julienne

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YmhHCHaUy+zARWxi@google.com \
    --to=minchan@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@techsingularity.net \
    --cc=mhocko@kernel.org \
    --cc=mtosatti@redhat.com \
    --cc=nsaenzju@redhat.com \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.