[PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
@ 2026-06-23  2:42 Qi Zheng
  2026-06-23  2:56 ` Qi Zheng
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Qi Zheng @ 2026-06-23  2:42 UTC (permalink / raw)
  To: akpm, david, kasong, shakeel.butt, baohua, axelrasmussen, yuanchu,
	weixugc, hannes, harry, muchun.song, peiyang_he, mhocko,
	roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable

From: Qi Zheng <zhengqi.arch@bytedance.com>

The mglru page table walker batches per-generation size deltas in
walk->nr_pages while walking page tables without holding the lruvec lock.
The reset_batch_size() later folds those deltas into walk->lruvec under
the lruvec lock.

The page table walker can run concurrently with the memcg reparenting path
as follows:

CPU0                           CPU1
====                           ====

walk_mm
--> walk_page_range
    --> update_batch_size
        --> walk->nr_pages += delta

                              mem_cgroup_css_offline
                              --> memcg_reparent_objcgs
                                  --> lock lruvec
                                      lru_gen_reparent_memcg
                                      --> reparent child folios to parent
                                      unlock lruvec

    lock lruvec
    reset_batch_size
    --> child lrugen->nr_pages += delta

This will trigger the following warning in lru_gen_exit_memcg():

	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
				   sizeof(lruvec->lrugen.nr_pages)));

To fix it, add lrugen->reparented to remember the new owner of a
reparented lruvec, and make reset_batch_size() charge pending deltas to
that owner.

Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
Cc: <stable@vger.kernel.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Barry Song <baohua@kernel.org>
---
 include/linux/mmzone.h |  4 ++++
 mm/vmscan.c            | 43 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ca2712187147..0d572db2ef64 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -584,6 +584,10 @@ struct lru_gen_folio {
 	u8 gen;
 	/* the list segment this lru_gen_folio belongs to */
 	u8 seg;
+#ifdef CONFIG_MEMCG
+	/* the lruvec this lruvec has been reparented to */
+	struct lruvec *reparented;
+#endif
 	/* per-node lru_gen_folio list for global reclaim */
 	struct hlist_nulls_node list;
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35c3bb15ae96..64362cbed814 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3262,10 +3262,37 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
 	walk->nr_pages[new_gen][type][zone] += delta;
 }
 
+#ifdef CONFIG_MEMCG
+static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
+{
+	struct lruvec *reparented;
+
+	for (;;) {
+		lruvec_lock_irq(lruvec);
+
+		reparented = lruvec->lrugen.reparented;
+		if (!reparented)
+			break;
+
+		lruvec_unlock_irq(lruvec);
+		lruvec = reparented;
+	}
+
+	return lruvec;
+}
+#else
+static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
+{
+	lruvec_lock_irq(lruvec);
+
+	return lruvec;
+}
+#endif
+
 static void reset_batch_size(struct lru_gen_mm_walk *walk)
 {
 	int gen, type, zone;
-	struct lruvec *lruvec = walk->lruvec;
+	struct lruvec *lruvec = lock_batch_lruvec(walk->lruvec);
 	struct lru_gen_folio *lrugen = &lruvec->lrugen;
 
 	walk->batched = 0;
@@ -3285,6 +3312,8 @@ static void reset_batch_size(struct lru_gen_mm_walk *walk)
 			lru += LRU_ACTIVE;
 		__update_lru_size(lruvec, lru, zone, delta);
 	}
+
+	lruvec_unlock_irq(lruvec);
 }
 
 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
@@ -3779,11 +3808,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
 			mmap_read_unlock(mm);
 		}
 
-		if (walk->batched) {
-			lruvec_lock_irq(lruvec);
+		if (walk->batched)
 			reset_batch_size(walk);
-			lruvec_unlock_irq(lruvec);
-		}
 
 		cond_resched();
 	} while (err == -EAGAIN);
@@ -4563,6 +4589,8 @@ void lru_gen_reparent_memcg(struct mem_cgroup *memcg, struct mem_cgroup *parent,
 			mem_cgroup_update_lru_size(parent_lruvec, lru, zid, size);
 		}
 	}
+
+	child_lruvec->lrugen.reparented = parent_lruvec;
 }
 
 #endif /* CONFIG_MEMCG */
@@ -4867,9 +4895,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
 	walk = current->reclaim_state->mm_walk;
 	if (walk && walk->batched) {
 		walk->lruvec = lruvec;
-		lruvec_lock_irq(lruvec);
 		reset_batch_size(walk);
-		lruvec_unlock_irq(lruvec);
 	}
 
 	mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
@@ -5784,6 +5810,9 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
+#ifdef CONFIG_MEMCG
+	lrugen->reparented = NULL;
+#endif
 
 	for (i = 0; i <= MIN_NR_GENS + 1; i++)
 		lrugen->timestamps[i] = jiffies;
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  2:42 [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting Qi Zheng
@ 2026-06-23  2:56 ` Qi Zheng
  2026-06-23  4:03 ` Baolin Wang
  2026-06-23  6:17 ` Harry Yoo
  2 siblings, 0 replies; 14+ messages in thread
From: Qi Zheng @ 2026-06-23  2:56 UTC (permalink / raw)
  To: akpm, david, kasong, shakeel.butt, baohua, axelrasmussen, yuanchu,
	weixugc, hannes, harry, muchun.song, peiyang_he, mhocko,
	roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, stable



On 6/23/26 10:42 AM, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> The mglru page table walker batches per-generation size deltas in
> walk->nr_pages while walking page tables without holding the lruvec lock.
> The reset_batch_size() later folds those deltas into walk->lruvec under
> the lruvec lock.
> 
> The page table walker can run concurrently with the memcg reparenting path
> as follows:
> 
> CPU0                           CPU1
> ====                           ====
> 
> walk_mm
> --> walk_page_range
>      --> update_batch_size
>          --> walk->nr_pages += delta
> 
>                                mem_cgroup_css_offline
>                                --> memcg_reparent_objcgs
>                                    --> lock lruvec
>                                        lru_gen_reparent_memcg
>                                        --> reparent child folios to parent
>                                        unlock lruvec
> 
>      lock lruvec
>      reset_batch_size
>      --> child lrugen->nr_pages += delta
> 
> This will trigger the following warning in lru_gen_exit_memcg():
> 
> 	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
> 				   sizeof(lruvec->lrugen.nr_pages)));
> 
> To fix it, add lrugen->reparented to remember the new owner of a
> reparented lruvec, and make reset_batch_size() charge pending deltas to
> that owner.
> 
> Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
> Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
> Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Reviewed-by: Barry Song <baohua@kernel.org>
> ---

Changes in v2:
  - update the commit message (pointed by Barry)
  - collect Reviewed-by

>   include/linux/mmzone.h |  4 ++++
>   mm/vmscan.c            | 43 +++++++++++++++++++++++++++++++++++-------
>   2 files changed, 40 insertions(+), 7 deletions(-)
> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  2:42 [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting Qi Zheng
  2026-06-23  2:56 ` Qi Zheng
@ 2026-06-23  4:03 ` Baolin Wang
  2026-06-23  6:17 ` Harry Yoo
  2 siblings, 0 replies; 14+ messages in thread
From: Baolin Wang @ 2026-06-23  4:03 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, harry, muchun.song,
	peiyang_he, mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable



On 6/23/26 10:42 AM, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> The mglru page table walker batches per-generation size deltas in
> walk->nr_pages while walking page tables without holding the lruvec lock.
> The reset_batch_size() later folds those deltas into walk->lruvec under
> the lruvec lock.
> 
> The page table walker can run concurrently with the memcg reparenting path
> as follows:
> 
> CPU0                           CPU1
> ====                           ====
> 
> walk_mm
> --> walk_page_range
>      --> update_batch_size
>          --> walk->nr_pages += delta
> 
>                                mem_cgroup_css_offline
>                                --> memcg_reparent_objcgs
>                                    --> lock lruvec
>                                        lru_gen_reparent_memcg
>                                        --> reparent child folios to parent
>                                        unlock lruvec
> 
>      lock lruvec
>      reset_batch_size
>      --> child lrugen->nr_pages += delta
> 
> This will trigger the following warning in lru_gen_exit_memcg():
> 
> 	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
> 				   sizeof(lruvec->lrugen.nr_pages)));
> 
> To fix it, add lrugen->reparented to remember the new owner of a
> reparented lruvec, and make reset_batch_size() charge pending deltas to
> that owner.
> 
> Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
> Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
> Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Reviewed-by: Barry Song <baohua@kernel.org>
> ---

Looks reasonable to me. Thanks for the fix.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  2:42 [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting Qi Zheng
  2026-06-23  2:56 ` Qi Zheng
  2026-06-23  4:03 ` Baolin Wang
@ 2026-06-23  6:17 ` Harry Yoo
  2026-06-23  7:16   ` Qi Zheng
  2 siblings, 1 reply; 14+ messages in thread
From: Harry Yoo @ 2026-06-23  6:17 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable


[-- Attachment #1.1: Type: text/plain, Size: 2571 bytes --]



On 6/23/26 11:42 AM, Qi Zheng wrote:
> From: Qi Zheng <zhengqi.arch@bytedance.com>
> 
> The mglru page table walker batches per-generation size deltas in
> walk->nr_pages while walking page tables without holding the lruvec lock.
> The reset_batch_size() later folds those deltas into walk->lruvec under
> the lruvec lock.

Ouch.

IIRC the user-visible impact of underestimated nr_pages in MGLRU
was premature OOMs because MGLRU does not try to reclaim memory when
nr_pages reaches zero, but there are still more pages.

Perhaps worth mentioning in the changelog?

> The page table walker can run concurrently with the memcg reparenting path
> as follows:
> 
> CPU0                           CPU1
> ====                           ====
> 
> walk_mm
> --> walk_page_range
>     --> update_batch_size
>         --> walk->nr_pages += delta
> 
>                               mem_cgroup_css_offline
>                               --> memcg_reparent_objcgs
>                                   --> lock lruvec
>                                       lru_gen_reparent_memcg
>                                       --> reparent child folios to parent
>                                       unlock lruvec
> 
>     lock lruvec
>     reset_batch_size
>     --> child lrugen->nr_pages += delta

The problem here is that, while grabbing a reference to memcg
(via mem_cgroup_iter(), for example) makes sure that the memcg is not
freed, it does not prevent offlining happening, and reset_batch_size()
doesn't check whether the lruvec has been reparented, or the lruvec
is going to be reparented.

> This will trigger the following warning in lru_gen_exit_memcg():
> 
> 	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
> 				   sizeof(lruvec->lrugen.nr_pages)));
> 
> To fix it, add lrugen->reparented to remember the new owner of a
> reparented lruvec, and make reset_batch_size() charge pending deltas to
> that owner.

Could you please explain why it is unavoidable to introduce the new
field and why checking whether the cgroup is dying (and charging deltas
to non-dying parent) doesn't work?

> Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
> Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
> Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> Reviewed-by: Barry Song <baohua@kernel.org>
> ---

-- 
Cheers,
Harry / Hyeonggon

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  6:17 ` Harry Yoo
@ 2026-06-23  7:16   ` Qi Zheng
  2026-06-23  8:18     ` Harry Yoo
  0 siblings, 1 reply; 14+ messages in thread
From: Qi Zheng @ 2026-06-23  7:16 UTC (permalink / raw)
  To: Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable

Hi Harry,

On 6/23/26 2:17 PM, Harry Yoo wrote:
> 
> 
> On 6/23/26 11:42 AM, Qi Zheng wrote:
>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>
>> The mglru page table walker batches per-generation size deltas in
>> walk->nr_pages while walking page tables without holding the lruvec lock.
>> The reset_batch_size() later folds those deltas into walk->lruvec under
>> the lruvec lock.
> 
> Ouch.
> 
> IIRC the user-visible impact of underestimated nr_pages in MGLRU
> was premature OOMs because MGLRU does not try to reclaim memory when
> nr_pages reaches zero, but there are still more pages.
> 
> Perhaps worth mentioning in the changelog?

Maybe this should be placed before "To fix it...".

> 
>> The page table walker can run concurrently with the memcg reparenting path
>> as follows:
>>
>> CPU0                           CPU1
>> ====                           ====
>>
>> walk_mm
>> --> walk_page_range
>>      --> update_batch_size
>>          --> walk->nr_pages += delta
>>
>>                                mem_cgroup_css_offline
>>                                --> memcg_reparent_objcgs
>>                                    --> lock lruvec
>>                                        lru_gen_reparent_memcg
>>                                        --> reparent child folios to parent
>>                                        unlock lruvec
>>
>>      lock lruvec
>>      reset_batch_size
>>      --> child lrugen->nr_pages += delta
> 
> The problem here is that, while grabbing a reference to memcg
> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
> freed, it does not prevent offlining happening, and reset_batch_size()
> doesn't check whether the lruvec has been reparented, or the lruvec
> is going to be reparented.
> 
>> This will trigger the following warning in lru_gen_exit_memcg():
>>
>> 	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>> 				   sizeof(lruvec->lrugen.nr_pages)));
>>
>> To fix it, add lrugen->reparented to remember the new owner of a
>> reparented lruvec, and make reset_batch_size() charge pending deltas to
>> that owner.
> 
> Could you please explain why it is unavoidable to introduce the new
> field and why checking whether the cgroup is dying (and charging deltas
> to non-dying parent) doesn't work?

Peiyang tried doing this [1], but it doesn't work because
ss->css_offline() is called before clearing the CSS_ONLINE flag. I
also considered using mem_cgroup_tryget_online(), but that only prevent
the memcg from being freed. It's doesn't prevent the offlining.

So in the end, I chose the approach used in this patch. Simply adding
a new field to mglru to track its reparenting status seems to be the
most straightforward and effective approach.

Thanks,
Qi

[1]. 
https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn

> 
>> Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
>> Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
>> Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
>> Cc: <stable@vger.kernel.org>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> Reviewed-by: Barry Song <baohua@kernel.org>
>> ---
> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  7:16   ` Qi Zheng
@ 2026-06-23  8:18     ` Harry Yoo
  2026-06-23  9:14       ` Qi Zheng
  0 siblings, 1 reply; 14+ messages in thread
From: Harry Yoo @ 2026-06-23  8:18 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable


[-- Attachment #1.1: Type: text/plain, Size: 4476 bytes --]



On 6/23/26 4:16 PM, Qi Zheng wrote:
> Hi Harry,

Hi Qi!

> On 6/23/26 2:17 PM, Harry Yoo wrote:
>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>
>>> The mglru page table walker batches per-generation size deltas in
>>> walk->nr_pages while walking page tables without holding the lruvec
>>> lock.
>>> The reset_batch_size() later folds those deltas into walk->lruvec under
>>> the lruvec lock.
>>
>> Ouch.
>>
>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>> was premature OOMs because MGLRU does not try to reclaim memory when
>> nr_pages reaches zero, but there are still more pages.
>>
>> Perhaps worth mentioning in the changelog?
> 
> Maybe this should be placed before "To fix it...".

Thanks!

>>> The page table walker can run concurrently with the memcg reparenting
>>> path
>>> as follows:
>>>
>>> CPU0                           CPU1
>>> ====                           ====
>>>
>>> walk_mm
>>> --> walk_page_range
>>>      --> update_batch_size
>>>          --> walk->nr_pages += delta
>>>
>>>                                mem_cgroup_css_offline
>>>                                --> memcg_reparent_objcgs
>>>                                    --> lock lruvec
>>>                                        lru_gen_reparent_memcg
>>>                                        --> reparent child folios to
>>> parent
>>>                                        unlock lruvec
>>>
>>>      lock lruvec
>>>      reset_batch_size
>>>      --> child lrugen->nr_pages += delta
>>
>> The problem here is that, while grabbing a reference to memcg
>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>> freed, it does not prevent offlining happening, and reset_batch_size()
>> doesn't check whether the lruvec has been reparented, or the lruvec
>> is going to be reparented.
>>
>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>
>>>     VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>                    sizeof(lruvec->lrugen.nr_pages)));
>>>
>>> To fix it, add lrugen->reparented to remember the new owner of a
>>> reparented lruvec, and make reset_batch_size() charge pending deltas to
>>> that owner.
>>
>> Could you please explain why it is unavoidable to introduce the new
>> field and why checking whether the cgroup is dying (and charging deltas
>> to non-dying parent) doesn't work?
> 
> Peiyang tried doing this [1], but it doesn't work because
> ss->css_offline() is called before clearing the CSS_ONLINE flag.

Right.

> I also considered using mem_cgroup_tryget_online(), but that only prevent
> the memcg from being freed. It's doesn't prevent the offlining.

Right.

I think checking CSS_DYING under RCU and grabbing the lruvec
of the first non-dying memcg should work (this pattern is already
used where we use RCU to guarantee memcgs are not freed).

If we do not observe CSS_DYING flag, it is safe to charge deltas
to the lruvec because RCU guarantees that reparenting cannot happen
under us.

If we do observe CSS_DYING, we can walk up the hierarchy and charge
deltas to the first non-dying memcg.

This requires introducing an API to grab the first non-dying
lruvec lock like folio_lruvec_lock_irq(), but it can be called only
when the caller has a reference to make sure it doens't go away.
(I wonder if there are other places that requires similar semantics)

or am I missing something?

> So in the end, I chose the approach used in this patch. Simply adding
> a new field to mglru to track its reparenting status seems to be the
> most straightforward and effective approach.

I think it would work, but we need some explanation on why this
requires special handling compared to other stuff (e.g., reparenting
of split queues).

> Thanks,
> Qi

Thanks for working on this, Qi!

> [1]. https://lore.kernel.org/all/5A9E929D82717101+12fcf643-efb8-4b9a-
> a53a-1e28cc894f0b@smail.nju.edu.cn
> 
>>> Reported-by: Peiyang He <peiyang_he@smail.nju.edu.cn>
>>> Closes: https://lore.kernel.org/all/5A9E929D82717101+12fcf643-
>>> efb8-4b9a-a53a-1e28cc894f0b@smail.nju.edu.cn
>>> Fixes: f304652609ea ("mm: vmscan: prepare for reparenting MGLRU folios")
>>> Cc: <stable@vger.kernel.org>
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> Reviewed-by: Barry Song <baohua@kernel.org>

-- 
Cheers,
Harry / Hyeonggon

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  8:18     ` Harry Yoo
@ 2026-06-23  9:14       ` Qi Zheng
  2026-06-24  4:29         ` Harry Yoo
  0 siblings, 1 reply; 14+ messages in thread
From: Qi Zheng @ 2026-06-23  9:14 UTC (permalink / raw)
  To: Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable

Hi Harry,

On 6/23/26 4:18 PM, Harry Yoo wrote:
> 
> 
> On 6/23/26 4:16 PM, Qi Zheng wrote:
>> Hi Harry,
> 
> Hi Qi!
> 
>> On 6/23/26 2:17 PM, Harry Yoo wrote:
>>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>
>>>> The mglru page table walker batches per-generation size deltas in
>>>> walk->nr_pages while walking page tables without holding the lruvec
>>>> lock.
>>>> The reset_batch_size() later folds those deltas into walk->lruvec under
>>>> the lruvec lock.
>>>
>>> Ouch.
>>>
>>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>>> was premature OOMs because MGLRU does not try to reclaim memory when
>>> nr_pages reaches zero, but there are still more pages.
>>>
>>> Perhaps worth mentioning in the changelog?
>>
>> Maybe this should be placed before "To fix it...".
> 
> Thanks!
> 
>>>> The page table walker can run concurrently with the memcg reparenting
>>>> path
>>>> as follows:
>>>>
>>>> CPU0                           CPU1
>>>> ====                           ====
>>>>
>>>> walk_mm
>>>> --> walk_page_range
>>>>       --> update_batch_size
>>>>           --> walk->nr_pages += delta
>>>>
>>>>                                 mem_cgroup_css_offline
>>>>                                 --> memcg_reparent_objcgs
>>>>                                     --> lock lruvec
>>>>                                         lru_gen_reparent_memcg
>>>>                                         --> reparent child folios to
>>>> parent
>>>>                                         unlock lruvec
>>>>
>>>>       lock lruvec
>>>>       reset_batch_size
>>>>       --> child lrugen->nr_pages += delta
>>>
>>> The problem here is that, while grabbing a reference to memcg
>>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>>> freed, it does not prevent offlining happening, and reset_batch_size()
>>> doesn't check whether the lruvec has been reparented, or the lruvec
>>> is going to be reparented.
>>>
>>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>>
>>>>      VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>>                     sizeof(lruvec->lrugen.nr_pages)));
>>>>
>>>> To fix it, add lrugen->reparented to remember the new owner of a
>>>> reparented lruvec, and make reset_batch_size() charge pending deltas to
>>>> that owner.
>>>
>>> Could you please explain why it is unavoidable to introduce the new
>>> field and why checking whether the cgroup is dying (and charging deltas
>>> to non-dying parent) doesn't work?
>>
>> Peiyang tried doing this [1], but it doesn't work because
>> ss->css_offline() is called before clearing the CSS_ONLINE flag.
> 
> Right.
> 
>> I also considered using mem_cgroup_tryget_online(), but that only prevent
>> the memcg from being freed. It's doesn't prevent the offlining.
> 
> Right.
> 
> I think checking CSS_DYING under RCU and grabbing the lruvec
> of the first non-dying memcg should work (this pattern is already
> used where we use RCU to guarantee memcgs are not freed).
> 
> If we do not observe CSS_DYING flag, it is safe to charge deltas
> to the lruvec because RCU guarantees that reparenting cannot happen
> under us.
> 
> If we do observe CSS_DYING, we can walk up the hierarchy and charge
> deltas to the first non-dying memcg.

Checking CSS_DYING looks feasible, but the rcu lock alone cannot prevent
reparenting. We should recheck CSS_DYING after acquiring the lruvec
lock, otherwise we might run into the following race:

   CPU0 reset_batch_size              CPU1 memcg teardown
   =====================              ==================

   read !CSS_DYING

                                      set CSS_DYING
                                      memcg_reparent_objcgs()
                                      lock child lruvec
                                      move child to parent
                                      zero child nr_pages
                                      unlock child lruvec

   lock child lruvec
   charge stale delta to child

So it seems lock_batch_lruvec() should be implemented like this:

static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
{
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

	rcu_read_lock();
retry:
	while (memcg && css_is_dying(&memcg->css))
		memcg = parent_mem_cgroup(memcg);

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	spin_lock_irq(&lruvec->lru_lock);
	if (memcg && unlikely(css_is_dying(&memcg->css))) {
		spin_unlock_irq(&lruvec->lru_lock);
		goto retry;
	}

	rcu_read_unlock();

	return lruvec;
}

This way, there is no need to add lrugen->reparented, right?

Thanks,
Qi



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-23  9:14       ` Qi Zheng
@ 2026-06-24  4:29         ` Harry Yoo
  2026-06-24  7:11           ` Qi Zheng
  0 siblings, 1 reply; 14+ messages in thread
From: Harry Yoo @ 2026-06-24  4:29 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable


[-- Attachment #1.1: Type: text/plain, Size: 5682 bytes --]



On 6/23/26 6:14 PM, Qi Zheng wrote:
> Hi Harry,
> 
> On 6/23/26 4:18 PM, Harry Yoo wrote:
>> On 6/23/26 4:16 PM, Qi Zheng wrote:
>>> Hi Harry,
>>
>> Hi Qi!
>>
>>> On 6/23/26 2:17 PM, Harry Yoo wrote:
>>>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>
>>>>> The mglru page table walker batches per-generation size deltas in
>>>>> walk->nr_pages while walking page tables without holding the lruvec
>>>>> lock.
>>>>> The reset_batch_size() later folds those deltas into walk->lruvec
>>>>> under
>>>>> the lruvec lock.
>>>>
>>>> Ouch.
>>>>
>>>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>>>> was premature OOMs because MGLRU does not try to reclaim memory when
>>>> nr_pages reaches zero, but there are still more pages.
>>>>
>>>> Perhaps worth mentioning in the changelog?
>>>
>>> Maybe this should be placed before "To fix it...".
>>
>> Thanks!
>>
>>>>> The page table walker can run concurrently with the memcg reparenting
>>>>> path
>>>>> as follows:
>>>>>
>>>>> CPU0                           CPU1
>>>>> ====                           ====
>>>>>
>>>>> walk_mm
>>>>> --> walk_page_range
>>>>>       --> update_batch_size
>>>>>           --> walk->nr_pages += delta
>>>>>
>>>>>                                 mem_cgroup_css_offline
>>>>>                                 --> memcg_reparent_objcgs
>>>>>                                     --> lock lruvec
>>>>>                                         lru_gen_reparent_memcg
>>>>>                                         --> reparent child folios to
>>>>> parent
>>>>>                                         unlock lruvec
>>>>>
>>>>>       lock lruvec
>>>>>       reset_batch_size
>>>>>       --> child lrugen->nr_pages += delta
>>>>
>>>> The problem here is that, while grabbing a reference to memcg
>>>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>>>> freed, it does not prevent offlining happening, and reset_batch_size()
>>>> doesn't check whether the lruvec has been reparented, or the lruvec
>>>> is going to be reparented.
>>>>
>>>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>>>
>>>>>      VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>>>                     sizeof(lruvec->lrugen.nr_pages)));
>>>>>
>>>>> To fix it, add lrugen->reparented to remember the new owner of a
>>>>> reparented lruvec, and make reset_batch_size() charge pending
>>>>> deltas to
>>>>> that owner.
>>>>
>>>> Could you please explain why it is unavoidable to introduce the new
>>>> field and why checking whether the cgroup is dying (and charging deltas
>>>> to non-dying parent) doesn't work?
>>>
>>> Peiyang tried doing this [1], but it doesn't work because
>>> ss->css_offline() is called before clearing the CSS_ONLINE flag.
>>
>> Right.
>>
>>> I also considered using mem_cgroup_tryget_online(), but that only
>>> prevent
>>> the memcg from being freed. It's doesn't prevent the offlining.
>>
>> Right.
>>
>> I think checking CSS_DYING under RCU and grabbing the lruvec
>> of the first non-dying memcg should work (this pattern is already
>> used where we use RCU to guarantee memcgs are not freed).
>>
>> If we do not observe CSS_DYING flag, it is safe to charge deltas
>> to the lruvec because RCU guarantees that reparenting cannot happen
>> under us.
>>
>> If we do observe CSS_DYING, we can walk up the hierarchy and charge
>> deltas to the first non-dying memcg.
> 
> Checking CSS_DYING looks feasible, but the rcu lock alone cannot prevent
> reparenting. We should recheck CSS_DYING after acquiring the lruvec
> lock, otherwise we might run into the following race:

Haha, actually, I was thinking of checking CSS_DYING under both RCU and
lruvec lock. (because that's the pattern)

>   CPU0 reset_batch_size              CPU1 memcg teardown
>   =====================              ==================
> 
>   read !CSS_DYING
> 
>                                      set CSS_DYING

Oh, I thought the entire critical section is covered by RCU.
(I see lock_batch_lruvec() you suggested below doesn't do that)

Isn't RCU enough to prevent reparenting because RCU guarantees that
all readers who read !CSS_DYING complete before reparenting?

Now I'm confused. Is it strictly required to check CSS_DYING under
lruvec lock? CSS_DYING is updated outside the lruvec lock anyway?

>                                      memcg_reparent_objcgs()
>                                      lock child lruvec
>                                      move child to parent
>                                      zero child nr_pages
>                                      unlock child lruvec
> 
>   lock child lruvec
>   charge stale delta to child
> 
> So it seems lock_batch_lruvec() should be implemented like this:
> 
> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
> {
>     struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> 
>     rcu_read_lock();
> retry:
>     while (memcg && css_is_dying(&memcg->css))
>         memcg = parent_mem_cgroup(memcg);

Isn't this loop unnecessary as spin_lock_irq() -> check CSS_DYING ->
goto retry does the same thing? (of course, we need to fetch the parent
memcg before retry then...)

>     lruvec = mem_cgroup_lruvec(memcg, pgdat);
>     spin_lock_irq(&lruvec->lru_lock);
>     if (memcg && unlikely(css_is_dying(&memcg->css))) {
>         spin_unlock_irq(&lruvec->lru_lock);
>         goto retry;
>     }
> 
>     rcu_read_unlock();
> 
>     return lruvec;
> }

Thanks!

-- 
Cheers,
Harry / Hyeonggon

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-24  4:29         ` Harry Yoo
@ 2026-06-24  7:11           ` Qi Zheng
  2026-06-25  4:16             ` Harry Yoo
  0 siblings, 1 reply; 14+ messages in thread
From: Qi Zheng @ 2026-06-24  7:11 UTC (permalink / raw)
  To: Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable

Hi Harry,

On 6/24/26 12:29 PM, Harry Yoo wrote:
> 
> 
> On 6/23/26 6:14 PM, Qi Zheng wrote:
>> Hi Harry,
>>
>> On 6/23/26 4:18 PM, Harry Yoo wrote:
>>> On 6/23/26 4:16 PM, Qi Zheng wrote:
>>>> Hi Harry,
>>>
>>> Hi Qi!
>>>
>>>> On 6/23/26 2:17 PM, Harry Yoo wrote:
>>>>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>
>>>>>> The mglru page table walker batches per-generation size deltas in
>>>>>> walk->nr_pages while walking page tables without holding the lruvec
>>>>>> lock.
>>>>>> The reset_batch_size() later folds those deltas into walk->lruvec
>>>>>> under
>>>>>> the lruvec lock.
>>>>>
>>>>> Ouch.
>>>>>
>>>>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>>>>> was premature OOMs because MGLRU does not try to reclaim memory when
>>>>> nr_pages reaches zero, but there are still more pages.
>>>>>
>>>>> Perhaps worth mentioning in the changelog?
>>>>
>>>> Maybe this should be placed before "To fix it...".
>>>
>>> Thanks!
>>>
>>>>>> The page table walker can run concurrently with the memcg reparenting
>>>>>> path
>>>>>> as follows:
>>>>>>
>>>>>> CPU0                           CPU1
>>>>>> ====                           ====
>>>>>>
>>>>>> walk_mm
>>>>>> --> walk_page_range
>>>>>>        --> update_batch_size
>>>>>>            --> walk->nr_pages += delta
>>>>>>
>>>>>>                                  mem_cgroup_css_offline
>>>>>>                                  --> memcg_reparent_objcgs
>>>>>>                                      --> lock lruvec
>>>>>>                                          lru_gen_reparent_memcg
>>>>>>                                          --> reparent child folios to
>>>>>> parent
>>>>>>                                          unlock lruvec
>>>>>>
>>>>>>        lock lruvec
>>>>>>        reset_batch_size
>>>>>>        --> child lrugen->nr_pages += delta
>>>>>
>>>>> The problem here is that, while grabbing a reference to memcg
>>>>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>>>>> freed, it does not prevent offlining happening, and reset_batch_size()
>>>>> doesn't check whether the lruvec has been reparented, or the lruvec
>>>>> is going to be reparented.
>>>>>
>>>>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>>>>
>>>>>>       VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>>>>                      sizeof(lruvec->lrugen.nr_pages)));
>>>>>>
>>>>>> To fix it, add lrugen->reparented to remember the new owner of a
>>>>>> reparented lruvec, and make reset_batch_size() charge pending
>>>>>> deltas to
>>>>>> that owner.
>>>>>
>>>>> Could you please explain why it is unavoidable to introduce the new
>>>>> field and why checking whether the cgroup is dying (and charging deltas
>>>>> to non-dying parent) doesn't work?
>>>>
>>>> Peiyang tried doing this [1], but it doesn't work because
>>>> ss->css_offline() is called before clearing the CSS_ONLINE flag.
>>>
>>> Right.
>>>
>>>> I also considered using mem_cgroup_tryget_online(), but that only
>>>> prevent
>>>> the memcg from being freed. It's doesn't prevent the offlining.
>>>
>>> Right.
>>>
>>> I think checking CSS_DYING under RCU and grabbing the lruvec
>>> of the first non-dying memcg should work (this pattern is already
>>> used where we use RCU to guarantee memcgs are not freed).
>>>
>>> If we do not observe CSS_DYING flag, it is safe to charge deltas
>>> to the lruvec because RCU guarantees that reparenting cannot happen
>>> under us.
>>>
>>> If we do observe CSS_DYING, we can walk up the hierarchy and charge
>>> deltas to the first non-dying memcg.
>>
>> Checking CSS_DYING looks feasible, but the rcu lock alone cannot prevent
>> reparenting. We should recheck CSS_DYING after acquiring the lruvec
>> lock, otherwise we might run into the following race:
> 
> Haha, actually, I was thinking of checking CSS_DYING under both RCU and
> lruvec lock. (because that's the pattern)
> 
>>    CPU0 reset_batch_size              CPU1 memcg teardown
>>    =====================              ==================
>>
>>    read !CSS_DYING
>>
>>                                       set CSS_DYING
> 
> Oh, I thought the entire critical section is covered by RCU.
> (I see lock_batch_lruvec() you suggested below doesn't do that)
> 
> Isn't RCU enough to prevent reparenting because RCU guarantees that
> all readers who read !CSS_DYING complete before reparenting?

Oh, I think you are right.

I forgot that offlining is executed in the rcu work context.

Let's walk through this again:

cgroup_destroy_locked
--> kill_css_sync
     --> css->flags |= CSS_DYING;                    1)
     kill_css_finish
     --> css_killed_ref_fn
         --> css_killed_work_fn  <-- RCU work !!     2)
             --> offline_css
                 --> reparent memcg

So while holding the rcu lock, if CSS_DYING is not observed,
css_killed_work_fn() will not be called until rcu_read_unlock().

So lock_batch_lruvec() can be implemented like this:

#ifdef CONFIG_MEMCG
static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);

	rcu_read_lock();

	/*
	 * The memcg can be NULL when the memory controller is disabled.
	 * Otherwise, the caller keeps the memcg owning @lruvec alive.
	 */
	if (!memcg || !css_is_dying(&memcg->css))
		goto lock;

	do {
		memcg = parent_mem_cgroup(memcg);
	} while (memcg && css_is_dying(&memcg->css));
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

lock:
	spin_lock_irq(&lruvec->lru_lock);

	return lruvec;
}
#else
static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
{
	lruvec_lock_irq(lruvec);

	return lruvec;
}
#endif

Does this make sense?

Thanks,
Qi

> 
> Now I'm confused. Is it strictly required to check CSS_DYING under
> lruvec lock? CSS_DYING is updated outside the lruvec lock anyway?
> 
>>                                       memcg_reparent_objcgs()
>>                                       lock child lruvec
>>                                       move child to parent
>>                                       zero child nr_pages
>>                                       unlock child lruvec
>>
>>    lock child lruvec
>>    charge stale delta to child
>>
>> So it seems lock_batch_lruvec() should be implemented like this:
>>
>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>> {
>>      struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>
>>      rcu_read_lock();
>> retry:
>>      while (memcg && css_is_dying(&memcg->css))
>>          memcg = parent_mem_cgroup(memcg);
> 
> Isn't this loop unnecessary as spin_lock_irq() -> check CSS_DYING ->
> goto retry does the same thing? (of course, we need to fetch the parent
> memcg before retry then...)
> 
>>      lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>      spin_lock_irq(&lruvec->lru_lock);
>>      if (memcg && unlikely(css_is_dying(&memcg->css))) {
>>          spin_unlock_irq(&lruvec->lru_lock);
>>          goto retry;
>>      }
>>
>>      rcu_read_unlock();
>>
>>      return lruvec;
>> }
> 
> Thanks!
> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-24  7:11           ` Qi Zheng
@ 2026-06-25  4:16             ` Harry Yoo
  2026-06-25  6:11               ` Qi Zheng
  0 siblings, 1 reply; 14+ messages in thread
From: Harry Yoo @ 2026-06-25  4:16 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable


[-- Attachment #1.1: Type: text/plain, Size: 6334 bytes --]



On 6/24/26 4:11 PM, Qi Zheng wrote:
> Hi Harry,
> 
> On 6/24/26 12:29 PM, Harry Yoo wrote:
>> On 6/23/26 6:14 PM, Qi Zheng wrote:
>>> Hi Harry,
>>>
>>> On 6/23/26 4:18 PM, Harry Yoo wrote:
>>>> On 6/23/26 4:16 PM, Qi Zheng wrote:
>>>>> Hi Harry,
>>>>
>>>> Hi Qi!
>>>>
>>>>> On 6/23/26 2:17 PM, Harry Yoo wrote:
>>>>>> On 6/23/26 11:42 AM, Qi Zheng wrote:
>>>>>>> From: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>>>>
>>>>>>> The mglru page table walker batches per-generation size deltas in
>>>>>>> walk->nr_pages while walking page tables without holding the lruvec
>>>>>>> lock.
>>>>>>> The reset_batch_size() later folds those deltas into walk->lruvec
>>>>>>> under
>>>>>>> the lruvec lock.
>>>>>>
>>>>>> Ouch.
>>>>>>
>>>>>> IIRC the user-visible impact of underestimated nr_pages in MGLRU
>>>>>> was premature OOMs because MGLRU does not try to reclaim memory when
>>>>>> nr_pages reaches zero, but there are still more pages.
>>>>>>
>>>>>> Perhaps worth mentioning in the changelog?
>>>>>
>>>>> Maybe this should be placed before "To fix it...".
>>>>
>>>> Thanks!
>>>>
>>>>>>> The page table walker can run concurrently with the memcg
>>>>>>> reparenting
>>>>>>> path
>>>>>>> as follows:
>>>>>>>
>>>>>>> CPU0                           CPU1
>>>>>>> ====                           ====
>>>>>>>
>>>>>>> walk_mm
>>>>>>> --> walk_page_range
>>>>>>>        --> update_batch_size
>>>>>>>            --> walk->nr_pages += delta
>>>>>>>
>>>>>>>                                  mem_cgroup_css_offline
>>>>>>>                                  --> memcg_reparent_objcgs
>>>>>>>                                      --> lock lruvec
>>>>>>>                                          lru_gen_reparent_memcg
>>>>>>>                                          --> reparent child
>>>>>>> folios to
>>>>>>> parent
>>>>>>>                                          unlock lruvec
>>>>>>>
>>>>>>>        lock lruvec
>>>>>>>        reset_batch_size
>>>>>>>        --> child lrugen->nr_pages += delta
>>>>>>
>>>>>> The problem here is that, while grabbing a reference to memcg
>>>>>> (via mem_cgroup_iter(), for example) makes sure that the memcg is not
>>>>>> freed, it does not prevent offlining happening, and
>>>>>> reset_batch_size()
>>>>>> doesn't check whether the lruvec has been reparented, or the lruvec
>>>>>> is going to be reparented.
>>>>>>
>>>>>>> This will trigger the following warning in lru_gen_exit_memcg():
>>>>>>>
>>>>>>>       VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
>>>>>>>                      sizeof(lruvec->lrugen.nr_pages)));
>>>>>>>
>>>>>>> To fix it, add lrugen->reparented to remember the new owner of a
>>>>>>> reparented lruvec, and make reset_batch_size() charge pending
>>>>>>> deltas to
>>>>>>> that owner.
>>>>>>
>>>>>> Could you please explain why it is unavoidable to introduce the new
>>>>>> field and why checking whether the cgroup is dying (and charging
>>>>>> deltas
>>>>>> to non-dying parent) doesn't work?
>>>>>
>>>>> Peiyang tried doing this [1], but it doesn't work because
>>>>> ss->css_offline() is called before clearing the CSS_ONLINE flag.
>>>>
>>>> Right.
>>>>
>>>>> I also considered using mem_cgroup_tryget_online(), but that only
>>>>> prevent
>>>>> the memcg from being freed. It's doesn't prevent the offlining.
>>>>
>>>> Right.
>>>>
>>>> I think checking CSS_DYING under RCU and grabbing the lruvec
>>>> of the first non-dying memcg should work (this pattern is already
>>>> used where we use RCU to guarantee memcgs are not freed).
>>>>
>>>> If we do not observe CSS_DYING flag, it is safe to charge deltas
>>>> to the lruvec because RCU guarantees that reparenting cannot happen
>>>> under us.
>>>>
>>>> If we do observe CSS_DYING, we can walk up the hierarchy and charge
>>>> deltas to the first non-dying memcg.
>>>
>>> Checking CSS_DYING looks feasible, but the rcu lock alone cannot prevent
>>> reparenting. We should recheck CSS_DYING after acquiring the lruvec
>>> lock, otherwise we might run into the following race:
>>
>> Haha, actually, I was thinking of checking CSS_DYING under both RCU and
>> lruvec lock. (because that's the pattern)
>>
>>>    CPU0 reset_batch_size              CPU1 memcg teardown
>>>    =====================              ==================
>>>
>>>    read !CSS_DYING
>>>
>>>                                       set CSS_DYING
>>
>> Oh, I thought the entire critical section is covered by RCU.
>> (I see lock_batch_lruvec() you suggested below doesn't do that)
>>
>> Isn't RCU enough to prevent reparenting because RCU guarantees that
>> all readers who read !CSS_DYING complete before reparenting?
> 
> Oh, I think you are right.
> 
> I forgot that offlining is executed in the rcu work context.

It's confusing :)

> Let's walk through this again:
> 
> cgroup_destroy_locked
> --> kill_css_sync
>     --> css->flags |= CSS_DYING;                    1)
>     kill_css_finish
>     --> css_killed_ref_fn
>         --> css_killed_work_fn  <-- RCU work !!     2)
>             --> offline_css
>                 --> reparent memcg
> 
> So while holding the rcu lock, if CSS_DYING is not observed,
> css_killed_work_fn() will not be called until rcu_read_unlock().

Right.

> So lock_batch_lruvec() can be implemented like this:
> 
> #ifdef CONFIG_MEMCG
> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
> {
>     struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>     struct mem_cgroup *memcg = lruvec_memcg(lruvec);
> 
>     rcu_read_lock();
> 
>     /*
>      * The memcg can be NULL when the memory controller is disabled.
>      * Otherwise, the caller keeps the memcg owning @lruvec alive.
>      */
>     if (!memcg || !css_is_dying(&memcg->css))
>         goto lock;
> 
>     do {
>         memcg = parent_mem_cgroup(memcg);
>     } while (memcg && css_is_dying(&memcg->css));
>     lruvec = mem_cgroup_lruvec(memcg, pgdat);
> 
> lock:
>     spin_lock_irq(&lruvec->lru_lock);
> 
>     return lruvec;
> }
> #else
> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
> {
>     lruvec_lock_irq(lruvec);
> 
>     return lruvec;
> }
> #endif
> 
> Does this make sense?

Yes, looks good to me!

-- 
Cheers,
Harry / Hyeonggon

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-25  4:16             ` Harry Yoo
@ 2026-06-25  6:11               ` Qi Zheng
  2026-06-25  6:32                 ` Harry Yoo
  0 siblings, 1 reply; 14+ messages in thread
From: Qi Zheng @ 2026-06-25  6:11 UTC (permalink / raw)
  To: Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable



On 6/25/26 12:16 PM, Harry Yoo wrote:
> 

[...]

> 
>> So lock_batch_lruvec() can be implemented like this:
>>
>> #ifdef CONFIG_MEMCG
>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>> {
>>      struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>      struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>
>>      rcu_read_lock();
>>
>>      /*
>>       * The memcg can be NULL when the memory controller is disabled.
>>       * Otherwise, the caller keeps the memcg owning @lruvec alive.
>>       */
>>      if (!memcg || !css_is_dying(&memcg->css))
>>          goto lock;
>>
>>      do {
>>          memcg = parent_mem_cgroup(memcg);
>>      } while (memcg && css_is_dying(&memcg->css));
>>      lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>
>> lock:
>>      spin_lock_irq(&lruvec->lru_lock);
>>
>>      return lruvec;
>> }
>> #else
>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>> {
>>      lruvec_lock_irq(lruvec);
>>
>>      return lruvec;
>> }
>> #endif
>>
>> Does this make sense?
> 
> Yes, looks good to me!

OK, this sync method makes more sense as it doesn't require adding a
new lrugen->reparente. I'll go with this method and update v3.

Hi Barry and Baolin, what do you think? Since the sync method has been
changed, I will temporarily drop your previous Reviewed-by tags in v3. ;)

Thanks,
Qi

> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-25  6:11               ` Qi Zheng
@ 2026-06-25  6:32                 ` Harry Yoo
  2026-06-25  7:37                   ` Qi Zheng
  0 siblings, 1 reply; 14+ messages in thread
From: Harry Yoo @ 2026-06-25  6:32 UTC (permalink / raw)
  To: Qi Zheng, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable


[-- Attachment #1.1: Type: text/plain, Size: 2045 bytes --]



On 6/25/26 3:11 PM, Qi Zheng wrote:
> On 6/25/26 12:16 PM, Harry Yoo wrote:
>>
> [...]
> 
>>
>>> So lock_batch_lruvec() can be implemented like this:
>>>
>>> #ifdef CONFIG_MEMCG
>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>> {
>>>      struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>      struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>>
>>>      rcu_read_lock();
>>>
>>>      /*
>>>       * The memcg can be NULL when the memory controller is disabled.
>>>       * Otherwise, the caller keeps the memcg owning @lruvec alive.
>>>       */
>>>      if (!memcg || !css_is_dying(&memcg->css))
>>>          goto lock;
>>>
>>>      do {
>>>          memcg = parent_mem_cgroup(memcg);
>>>      } while (memcg && css_is_dying(&memcg->css));
>>>      lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>>
>>> lock:
>>>      spin_lock_irq(&lruvec->lru_lock);
>>>
>>>      return lruvec;
>>> }
>>> #else
>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>> {
>>>      lruvec_lock_irq(lruvec);
>>>
>>>      return lruvec;
>>> }
>>> #endif
>>>
>>> Does this make sense?
>>
>> Yes, looks good to me!
> 
> OK, this sync method makes more sense as it doesn't require adding a
> new lrugen->reparente. I'll go with this method and update v3.

Thanks!

Just one thing to clarify...

So, when we check something that's updated _before_ grace period
(CSS_DYING), RCU is sufficient.

But in folio_lruvec_lock*(), that is not the case because reparenting
is performed in the RCU work, under the lruvec lock. So the check needs
to be done under RCU and the lruvec lock.

This is quite subtle :D

> Hi Barry and Baolin, what do you think? Since the sync method has been
> changed, I will temporarily drop your previous Reviewed-by tags in v3. ;)

And hopefully Peiyang would kindly double check v3 still not reproduced
on the machine :)

-- 
Cheers,
Harry / Hyeonggon


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-25  6:32                 ` Harry Yoo
@ 2026-06-25  7:37                   ` Qi Zheng
  2026-06-25  8:09                     ` Peiyang He
  0 siblings, 1 reply; 14+ messages in thread
From: Qi Zheng @ 2026-06-25  7:37 UTC (permalink / raw)
  To: Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, peiyang_he,
	mhocko, roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable



On 6/25/26 2:32 PM, Harry Yoo wrote:
> 
> 
> On 6/25/26 3:11 PM, Qi Zheng wrote:
>> On 6/25/26 12:16 PM, Harry Yoo wrote:
>>>
>> [...]
>>
>>>
>>>> So lock_batch_lruvec() can be implemented like this:
>>>>
>>>> #ifdef CONFIG_MEMCG
>>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>>> {
>>>>       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>>>
>>>>       rcu_read_lock();
>>>>
>>>>       /*
>>>>        * The memcg can be NULL when the memory controller is disabled.
>>>>        * Otherwise, the caller keeps the memcg owning @lruvec alive.
>>>>        */
>>>>       if (!memcg || !css_is_dying(&memcg->css))
>>>>           goto lock;
>>>>
>>>>       do {
>>>>           memcg = parent_mem_cgroup(memcg);
>>>>       } while (memcg && css_is_dying(&memcg->css));
>>>>       lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>>>
>>>> lock:
>>>>       spin_lock_irq(&lruvec->lru_lock);
>>>>
>>>>       return lruvec;
>>>> }
>>>> #else
>>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>>> {
>>>>       lruvec_lock_irq(lruvec);
>>>>
>>>>       return lruvec;
>>>> }
>>>> #endif
>>>>
>>>> Does this make sense?
>>>
>>> Yes, looks good to me!
>>
>> OK, this sync method makes more sense as it doesn't require adding a
>> new lrugen->reparente. I'll go with this method and update v3.
> 
> Thanks!
> 
> Just one thing to clarify...
> 
> So, when we check something that's updated _before_ grace period
> (CSS_DYING), RCU is sufficient.
> 
> But in folio_lruvec_lock*(), that is not the case because reparenting
> is performed in the RCU work, under the lruvec lock. So the check needs
> to be done under RCU and the lruvec lock.
> 
> This is quite subtle :D

Indeed.

And in theory, the l->nr_items check in lock_list_lru_of_memcg() could
also be replaced by the CSS_DYING check.

> 
>> Hi Barry and Baolin, what do you think? Since the sync method has been
>> changed, I will temporarily drop your previous Reviewed-by tags in v3. ;)
> 
> And hopefully Peiyang would kindly double check v3 still not reproduced
> on the machine :)

Yeah!

> 



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting
  2026-06-25  7:37                   ` Qi Zheng
@ 2026-06-25  8:09                     ` Peiyang He
  0 siblings, 0 replies; 14+ messages in thread
From: Peiyang He @ 2026-06-25  8:09 UTC (permalink / raw)
  To: Qi Zheng, Harry Yoo, akpm, david, kasong, shakeel.butt, baohua,
	axelrasmussen, yuanchu, weixugc, hannes, muchun.song, mhocko,
	roman.gushchin, ljs
  Cc: linux-mm, linux-kernel, Qi Zheng, stable



On 2026/6/25 15:37, Qi Zheng wrote:
> 
> 
> On 6/25/26 2:32 PM, Harry Yoo wrote:
>>
>>
>> On 6/25/26 3:11 PM, Qi Zheng wrote:
>>> On 6/25/26 12:16 PM, Harry Yoo wrote:
>>>>
>>> [...]
>>>
>>>>
>>>>> So lock_batch_lruvec() can be implemented like this:
>>>>>
>>>>> #ifdef CONFIG_MEMCG
>>>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>>>> {
>>>>>       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>>>>>       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
>>>>>
>>>>>       rcu_read_lock();
>>>>>
>>>>>       /*
>>>>>        * The memcg can be NULL when the memory controller is disabled.
>>>>>        * Otherwise, the caller keeps the memcg owning @lruvec alive.
>>>>>        */
>>>>>       if (!memcg || !css_is_dying(&memcg->css))
>>>>>           goto lock;
>>>>>
>>>>>       do {
>>>>>           memcg = parent_mem_cgroup(memcg);
>>>>>       } while (memcg && css_is_dying(&memcg->css));
>>>>>       lruvec = mem_cgroup_lruvec(memcg, pgdat);
>>>>>
>>>>> lock:
>>>>>       spin_lock_irq(&lruvec->lru_lock);
>>>>>
>>>>>       return lruvec;
>>>>> }
>>>>> #else
>>>>> static struct lruvec *lock_batch_lruvec(struct lruvec *lruvec)
>>>>> {
>>>>>       lruvec_lock_irq(lruvec);
>>>>>
>>>>>       return lruvec;
>>>>> }
>>>>> #endif
>>>>>
>>>>> Does this make sense?
>>>>
>>>> Yes, looks good to me!
>>>
>>> OK, this sync method makes more sense as it doesn't require adding a
>>> new lrugen->reparente. I'll go with this method and update v3.
>>
>> Thanks!
>>
>> Just one thing to clarify...
>>
>> So, when we check something that's updated _before_ grace period
>> (CSS_DYING), RCU is sufficient.
>>
>> But in folio_lruvec_lock*(), that is not the case because reparenting
>> is performed in the RCU work, under the lruvec lock. So the check needs
>> to be done under RCU and the lruvec lock.
>>
>> This is quite subtle :D
> 
> Indeed.
> 
> And in theory, the l->nr_items check in lock_list_lru_of_memcg() could
> also be replaced by the CSS_DYING check.
> 
>>
>>> Hi Barry and Baolin, what do you think? Since the sync method has been
>>> changed, I will temporarily drop your previous Reviewed-by tags in v3. ;)
>>
>> And hopefully Peiyang would kindly double check v3 still not reproduced
>> on the machine :)
> 
> Yeah!
No problem! I can help test v3.> 
>>
> 
> 




^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2026-06-25  8:10 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-23  2:42 [PATCH v2] mm: mglru: fix stale batch updates after memcg reparenting Qi Zheng
2026-06-23  2:56 ` Qi Zheng
2026-06-23  4:03 ` Baolin Wang
2026-06-23  6:17 ` Harry Yoo
2026-06-23  7:16   ` Qi Zheng
2026-06-23  8:18     ` Harry Yoo
2026-06-23  9:14       ` Qi Zheng
2026-06-24  4:29         ` Harry Yoo
2026-06-24  7:11           ` Qi Zheng
2026-06-25  4:16             ` Harry Yoo
2026-06-25  6:11               ` Qi Zheng
2026-06-25  6:32                 ` Harry Yoo
2026-06-25  7:37                   ` Qi Zheng
2026-06-25  8:09                     ` Peiyang He

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox