cgroups.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Harry Yoo <harry.yoo@oracle.com>
To: Muchun Song <songmuchun@bytedance.com>
Cc: hannes@cmpxchg.org, mhocko@kernel.org, roman.gushchin@linux.dev,
	shakeel.butt@linux.dev, muchun.song@linux.dev,
	akpm@linux-foundation.org, david@fromorbit.com,
	zhengqi.arch@bytedance.com, yosry.ahmed@linux.dev,
	nphamcs@gmail.com, chengming.zhou@linux.dev,
	linux-kernel@vger.kernel.org, cgroups@vger.kernel.org,
	linux-mm@kvack.org, hamzamahfooz@linux.microsoft.com,
	apais@linux.microsoft.com
Subject: Re: [PATCH RFC 26/28] mm: memcontrol: introduce memcg_reparent_ops
Date: Wed, 2 Jul 2025 07:12:39 +0900	[thread overview]
Message-ID: <aGRdVzB5Ao1KkEu1@hyeyoo> (raw)
In-Reply-To: <aGKHXWJl0ECKN1Zh@hyeyoo>

On Mon, Jun 30, 2025 at 09:47:25PM +0900, Harry Yoo wrote:
> On Tue, Apr 15, 2025 at 10:45:30AM +0800, Muchun Song wrote:
> > In the previous patch, we established a method to ensure the safety of the
> > lruvec lock and the split queue lock during the reparenting of LRU folios.
> > The process involves the following steps:
> > 
> >     memcg_reparent_objcgs(memcg)
> >         1) lock
> >         // lruvec belongs to memcg and lruvec_parent belongs to parent memcg.
> >         spin_lock(&lruvec->lru_lock);
> >         spin_lock(&lruvec_parent->lru_lock);
> > 
> >         2) relocate from current memcg to its parent
> >         // Move all the pages from the lruvec list to the parent lruvec list.
> > 
> >         3) unlock
> >         spin_unlock(&lruvec_parent->lru_lock);
> >         spin_unlock(&lruvec->lru_lock);
> > 
> > In addition to the folio lruvec lock, the deferred split queue lock
> > (specific to THP) also requires a similar approach. Therefore, we abstract
> > the three essential steps from the memcg_reparent_objcgs() function.
> > 
> >     memcg_reparent_objcgs(memcg)
> >         1) lock
> >         memcg_reparent_ops->lock(memcg, parent);
> > 
> >         2) relocate
> >         memcg_reparent_ops->relocate(memcg, parent);
> > 
> >         3) unlock
> >         memcg_reparent_ops->unlock(memcg, parent);
> > 
> > Currently, two distinct locks (such as the lruvec lock and the deferred
> > split queue lock) need to utilize this infrastructure. In the subsequent
> > patch, we will employ these APIs to ensure the safety of these locks
> > during the reparenting of LRU folios.
> > 
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  include/linux/memcontrol.h | 20 ++++++++++++
> >  mm/memcontrol.c            | 62 ++++++++++++++++++++++++++++++--------
> >  2 files changed, 69 insertions(+), 13 deletions(-)
> > 
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 27b23e464229..0e450623f8fa 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -311,6 +311,26 @@ struct mem_cgroup {
> >  	struct mem_cgroup_per_node *nodeinfo[];
> >  };
> >  
> > +struct memcg_reparent_ops {
> > +	/*
> > +	 * Note that interrupts are disabled before these callbacks are
> > +	 * called, so interrupts must still be disabled when they return.
> > +	 */
> > +	void (*lock)(struct mem_cgroup *src, struct mem_cgroup *dst);
> > +	void (*relocate)(struct mem_cgroup *src, struct mem_cgroup *dst);
> > +	void (*unlock)(struct mem_cgroup *src, struct mem_cgroup *dst);
> > +};
> > +
> > +#define DEFINE_MEMCG_REPARENT_OPS(name)					\
> > +	const struct memcg_reparent_ops memcg_##name##_reparent_ops = {	\
> > +		.lock		= name##_reparent_lock,			\
> > +		.relocate	= name##_reparent_relocate,		\
> > +		.unlock		= name##_reparent_unlock,		\
> > +	}
> > +
> > +#define DECLARE_MEMCG_REPARENT_OPS(name)				\
> > +	extern const struct memcg_reparent_ops memcg_##name##_reparent_ops
> > +
> >  /*
> >   * size of first charge trial.
> >   * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 1f0c6e7b69cc..3fac51179186 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -194,24 +194,60 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
> >  	return objcg;
> >  }
> >  
> > -static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
> > +static void objcg_reparent_lock(struct mem_cgroup *src, struct mem_cgroup *dst)
> > +{
> > +	spin_lock(&objcg_lock);
> > +}
> > +
> > +static void objcg_reparent_relocate(struct mem_cgroup *src, struct mem_cgroup *dst)
> >  {
> >  	struct obj_cgroup *objcg, *iter;
> > -	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
> >  
> > -	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
> > +	objcg = rcu_replace_pointer(src->objcg, NULL, true);
> > +	/* 1) Ready to reparent active objcg. */
> > +	list_add(&objcg->list, &src->objcg_list);
> > +	/* 2) Reparent active objcg and already reparented objcgs to dst. */
> > +	list_for_each_entry(iter, &src->objcg_list, list)
> > +		WRITE_ONCE(iter->memcg, dst);
> > +	/* 3) Move already reparented objcgs to the dst's list */
> > +	list_splice(&src->objcg_list, &dst->objcg_list);
> > +}
> >  
> > -	spin_lock_irq(&objcg_lock);
> > +static void objcg_reparent_unlock(struct mem_cgroup *src, struct mem_cgroup *dst)
> > +{
> > +	spin_unlock(&objcg_lock);
> > +}
> >  
> > -	/* 1) Ready to reparent active objcg. */
> > -	list_add(&objcg->list, &memcg->objcg_list);
> > -	/* 2) Reparent active objcg and already reparented objcgs to parent. */
> > -	list_for_each_entry(iter, &memcg->objcg_list, list)
> > -		WRITE_ONCE(iter->memcg, parent);
> > -	/* 3) Move already reparented objcgs to the parent's list */
> > -	list_splice(&memcg->objcg_list, &parent->objcg_list);
> > -
> > -	spin_unlock_irq(&objcg_lock);
> > +static DEFINE_MEMCG_REPARENT_OPS(objcg);
> > +
> > +static const struct memcg_reparent_ops *memcg_reparent_ops[] = {
> > +	&memcg_objcg_reparent_ops,
> > +};
> > +
> > +#define DEFINE_MEMCG_REPARENT_FUNC(phase)				\
> > +	static void memcg_reparent_##phase(struct mem_cgroup *src,	\
> > +					   struct mem_cgroup *dst)	\
> > +	{								\
> > +		int i;							\
> > +									\
> > +		for (i = 0; i < ARRAY_SIZE(memcg_reparent_ops); i++)	\
> > +			memcg_reparent_ops[i]->phase(src, dst);		\
> > +	}
> > +
> > +DEFINE_MEMCG_REPARENT_FUNC(lock)
> > +DEFINE_MEMCG_REPARENT_FUNC(relocate)
> > +DEFINE_MEMCG_REPARENT_FUNC(unlock)
> > +
> > +static void memcg_reparent_objcgs(struct mem_cgroup *src)
> > +{
> > +	struct mem_cgroup *dst = parent_mem_cgroup(src);
> > +	struct obj_cgroup *objcg = rcu_dereference_protected(src->objcg, true);
> > +
> > +	local_irq_disable();
> > +	memcg_reparent_lock(src, dst);
> > +	memcg_reparent_relocate(src, dst);
> > +	memcg_reparent_unlock(src, dst);
> > +	local_irq_enable();
> 
> Hi,
> 
> It seems unnecessarily complicated to 1) acquire objcg, lruvec and
> thp_sq locks, 2) call their ->relocate() callbacks, and
> 3) release those locks.
> 
> Why not simply do the following instead?
> 
> for (i = 0; i < ARRAY_SIZE(memcg_reparent_ops); i++) {
> 	local_irq_disable();
> 	memcg_reparent_ops[i]->lock(src, dst);
> 	memcg_reparent_ops[i]->relocate(src, dst);
> 	memcg_reparent_ops[i]->unlock(src, dst);
> 	local_irq_enable();
> }
> 
> As there is no actual lock dependency between the three.
> 
> Or am I missing something important about the locking requirements?

Hmm... looks like I was missing some important requirements!

It seems like:

1) objcg should be reparented under lruvec locks, otherwise
   users can observe folio_memcg(folio) != lruvec_memcg(lruvec)

2) Similarly, lruvec_reparent_relocate() should reparent all folios
   at once under lruvec locks, otherwise users can observe
   folio_memcg(folio) != lruvec_memcg(lruvec) for some folios.

   In other words, lruvec_reparent_relocate() cannot do something like this:
   while (lruvec is not empty) {
	   move some pages;
	   unlock lruvec locks;
	   cond_resched();
	   lock lruvec locks;
   }

Failing to satisfy 1) and 2) means users can't rely on a stable binding
between a folio and a memcg, which is a no-go.

Also, 2) makes it quite undesirable to iterate over folios and move each
one to the right generation in MGLRU as this will certainly introduce
soft lockups as the memcg size grows...

Is my reasoning correct?
If so, adding a brief comment about 1 and 2 wouldn't hurt ;)

-- 
Cheers,
Harry / Hyeonggon

  reply	other threads:[~2025-07-01 22:13 UTC|newest]

Thread overview: 69+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-15  2:45 [PATCH RFC 00/28] Eliminate Dying Memory Cgroup Muchun Song
2025-04-15  2:45 ` [PATCH RFC 01/28] mm: memcontrol: remove dead code of checking parent memory cgroup Muchun Song
2025-04-17 14:35   ` Johannes Weiner
2025-04-15  2:45 ` [PATCH RFC 02/28] mm: memcontrol: use folio_memcg_charged() to avoid potential rcu lock holding Muchun Song
2025-04-17 14:48   ` Johannes Weiner
2025-04-18  2:38     ` Muchun Song
2025-04-15  2:45 ` [PATCH RFC 03/28] mm: workingset: use folio_lruvec() in workingset_refault() Muchun Song
2025-04-17 14:52   ` Johannes Weiner
2025-04-15  2:45 ` [PATCH RFC 04/28] mm: rename unlock_page_lruvec_irq and its variants Muchun Song
2025-04-17 14:53   ` Johannes Weiner
2025-04-15  2:45 ` [PATCH RFC 05/28] mm: thp: replace folio_memcg() with folio_memcg_charged() Muchun Song
2025-04-17 14:54   ` Johannes Weiner
2025-04-15  2:45 ` [PATCH RFC 06/28] mm: thp: introduce folio_split_queue_lock and its variants Muchun Song
2025-04-17 14:58   ` Johannes Weiner
2025-04-18 19:50   ` Johannes Weiner
2025-04-19 14:20     ` Muchun Song
2025-04-15  2:45 ` [PATCH RFC 07/28] mm: thp: use folio_batch to handle THP splitting in deferred_split_scan() Muchun Song
2025-04-30 14:37   ` Johannes Weiner
2025-05-06  6:44     ` Hugh Dickins
2025-05-06 21:44       ` Hugh Dickins
2025-05-07  3:30         ` Muchun Song
2025-04-15  2:45 ` [PATCH RFC 08/28] mm: vmscan: refactor move_folios_to_lru() Muchun Song
2025-04-30 14:49   ` Johannes Weiner
2025-04-15  2:45 ` [PATCH RFC 09/28] mm: memcontrol: allocate object cgroup for non-kmem case Muchun Song
2025-04-15  2:45 ` [PATCH RFC 10/28] mm: memcontrol: return root object cgroup for root memory cgroup Muchun Song
2025-06-28  3:09   ` Chen Ridong
2025-06-30  7:16     ` Muchun Song
2025-04-15  2:45 ` [PATCH RFC 11/28] mm: memcontrol: prevent memory cgroup release in get_mem_cgroup_from_folio() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 12/28] buffer: prevent memory cgroup release in folio_alloc_buffers() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 13/28] writeback: prevent memory cgroup release in writeback module Muchun Song
2025-04-15  2:45 ` [PATCH RFC 14/28] mm: memcontrol: prevent memory cgroup release in count_memcg_folio_events() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 15/28] mm: page_io: prevent memory cgroup release in page_io module Muchun Song
2025-04-15  2:45 ` [PATCH RFC 16/28] mm: migrate: prevent memory cgroup release in folio_migrate_mapping() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 17/28] mm: mglru: prevent memory cgroup release in mglru Muchun Song
2025-04-15  2:45 ` [PATCH RFC 18/28] mm: memcontrol: prevent memory cgroup release in mem_cgroup_swap_full() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 19/28] mm: workingset: prevent memory cgroup release in lru_gen_eviction() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 20/28] mm: workingset: prevent lruvec release in workingset_refault() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 21/28] mm: zswap: prevent lruvec release in zswap_folio_swapin() Muchun Song
2025-04-17 17:39   ` Nhat Pham
2025-04-18  2:36   ` Chengming Zhou
2025-04-15  2:45 ` [PATCH RFC 22/28] mm: swap: prevent lruvec release in swap module Muchun Song
2025-04-15  2:45 ` [PATCH RFC 23/28] mm: workingset: prevent lruvec release in workingset_activation() Muchun Song
2025-04-15  2:45 ` [PATCH RFC 24/28] mm: memcontrol: prepare for reparenting LRU pages for lruvec lock Muchun Song
2025-04-15  2:45 ` [PATCH RFC 25/28] mm: thp: prepare for reparenting LRU pages for split queue lock Muchun Song
2025-04-15  2:45 ` [PATCH RFC 26/28] mm: memcontrol: introduce memcg_reparent_ops Muchun Song
2025-06-30 12:47   ` Harry Yoo
2025-07-01 22:12     ` Harry Yoo [this message]
2025-07-07  9:29       ` [External] " Muchun Song
2025-07-09  0:14         ` Harry Yoo
2025-04-15  2:45 ` [PATCH RFC 27/28] mm: memcontrol: eliminate the problem of dying memory cgroup for LRU folios Muchun Song
2025-05-20 11:27   ` Harry Yoo
2025-05-22  2:31     ` Muchun Song
2025-05-23  1:24       ` Harry Yoo
2025-04-15  2:45 ` [PATCH RFC 28/28] mm: lru: add VM_WARN_ON_ONCE_FOLIO to lru maintenance helpers Muchun Song
2025-04-15  2:53 ` [PATCH RFC 00/28] Eliminate Dying Memory Cgroup Muchun Song
2025-04-15  6:19 ` Kairui Song
2025-04-15  8:01   ` Muchun Song
2025-04-17 18:22     ` Kairui Song
2025-04-17 19:04       ` Johannes Weiner
2025-06-27  8:50         ` Chen Ridong
2025-04-17 21:45       ` Roman Gushchin
2025-04-28  3:43         ` Kairui Song
2025-06-27  9:02           ` Chen Ridong
2025-06-27 18:54             ` Kairui Song
2025-06-27 19:14               ` Shakeel Butt
2025-06-28  9:21                 ` Chen Ridong
2025-04-22 14:20       ` Yosry Ahmed
2025-05-23  1:23 ` Harry Yoo
2025-05-23  2:39   ` Muchun Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aGRdVzB5Ao1KkEu1@hyeyoo \
    --to=harry.yoo@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=apais@linux.microsoft.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chengming.zhou@linux.dev \
    --cc=david@fromorbit.com \
    --cc=hamzamahfooz@linux.microsoft.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=songmuchun@bytedance.com \
    --cc=yosry.ahmed@linux.dev \
    --cc=zhengqi.arch@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).