* [PATCH 1/4] memcg: add infra for nmi safe memcg stats
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
@ 2025-05-09 23:28 ` Shakeel Butt
2025-05-09 23:28 ` [PATCH 2/4] memcg: add nmi-safe update for MEMCG_KMEM Shakeel Butt
` (4 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-09 23:28 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
BPF programs can trigger memcg charging in nmi context, but at the moment
the memcg charging code path for kernel memory does not support nmi
context. To support kernel memory charging in nmi context, we need to
make objcg charging nmi safe and also make the relevant memcg stats updates
nmi safe.
At the moment, the memcg stats which get updated in the objcg charging
path are MEMCG_KMEM, NR_SLAB_RECLAIMABLE_B & NR_SLAB_UNRECLAIMABLE_B.
Rather than adding support for all memcg stats to be nmi safe, this patch
just adds the infra to make these three stats nmi safe.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
include/linux/memcontrol.h | 6 ++++++
mm/memcontrol.c | 43 ++++++++++++++++++++++++++++++++++++++
2 files changed, 49 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 308c01bf98f5..ed9acb68652a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -113,6 +113,9 @@ struct mem_cgroup_per_node {
CACHELINE_PADDING(_pad2_);
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter;
+ /* slab stats for nmi context */
+ atomic64_t slab_reclaimable;
+ atomic64_t slab_unreclaimable;
};
struct mem_cgroup_threshold {
@@ -236,6 +239,9 @@ struct mem_cgroup {
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];
+ /* MEMCG_KMEM for nmi context */
+ atomic64_t kmem_stat;
+
/*
* Hint of reclaim pressure for socket memroy management. Note
* that this indicator should NOT be used in legacy cgroup mode
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ea6e5591cab..7200f6930daf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4023,6 +4023,47 @@ static void mem_cgroup_stat_aggregate(struct aggregate_control *ac)
}
}
+static void flush_nmi_stats(struct mem_cgroup *memcg, struct mem_cgroup *parent,
+ int cpu)
+{
+ int nid;
+
+ if (atomic64_read(&memcg->kmem_stat)) {
+ s64 kmem = atomic64_xchg(&memcg->kmem_stat, 0);
+ int index = memcg_stats_index(MEMCG_KMEM);
+
+ memcg->vmstats->state[index] += kmem;
+ if (parent)
+ parent->vmstats->state_pending[index] += kmem;
+ }
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+ struct lruvec_stats *lstats = pn->lruvec_stats;
+ struct lruvec_stats *plstats = NULL;
+
+ if (parent)
+ plstats = parent->nodeinfo[nid]->lruvec_stats;
+
+ if (atomic64_read(&pn->slab_reclaimable)) {
+ s64 slab = atomic64_xchg(&pn->slab_reclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_RECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ if (atomic64_read(&pn->slab_unreclaimable)) {
+ s64 slab = atomic64_xchg(&pn->slab_unreclaimable, 0);
+ int index = memcg_stats_index(NR_SLAB_UNRECLAIMABLE_B);
+
+ lstats->state[index] += slab;
+ if (plstats)
+ plstats->state_pending[index] += slab;
+ }
+ }
+}
+
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -4031,6 +4072,8 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct aggregate_control ac;
int nid;
+ flush_nmi_stats(memcg, parent, cpu);
+
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
ac = (struct aggregate_control) {
--
2.47.1
* [PATCH 2/4] memcg: add nmi-safe update for MEMCG_KMEM
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
2025-05-09 23:28 ` [PATCH 1/4] memcg: add infra for nmi safe memcg stats Shakeel Butt
@ 2025-05-09 23:28 ` Shakeel Butt
2025-05-09 23:28 ` [PATCH 3/4] memcg: nmi-safe slab stats updates Shakeel Butt
` (3 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-09 23:28 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
The objcg based kmem charging and uncharging code paths need to update
MEMCG_KMEM appropriately. Let's add support to update MEMCG_KMEM in an
nmi-safe way for those code paths.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7200f6930daf..e91e4368650f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2779,6 +2779,18 @@ struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
return objcg;
}
+
+static inline void account_kmem_nmi_safe(struct mem_cgroup *memcg, int val)
+{
+ if (likely(!in_nmi())) {
+ mod_memcg_state(memcg, MEMCG_KMEM, val);
+ } else {
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ atomic64_add(val, &memcg->kmem_stat);
+ }
+}
+
+
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
* @objcg: object cgroup to uncharge
@@ -2791,7 +2803,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages);
+ account_kmem_nmi_safe(memcg, -nr_pages);
memcg1_account_kmem(memcg, -nr_pages);
if (!mem_cgroup_is_root(memcg))
refill_stock(memcg, nr_pages);
@@ -2819,7 +2831,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ account_kmem_nmi_safe(memcg, nr_pages);
memcg1_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
--
2.47.1
* [PATCH 3/4] memcg: nmi-safe slab stats updates
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
2025-05-09 23:28 ` [PATCH 1/4] memcg: add infra for nmi safe memcg stats Shakeel Butt
2025-05-09 23:28 ` [PATCH 2/4] memcg: add nmi-safe update for MEMCG_KMEM Shakeel Butt
@ 2025-05-09 23:28 ` Shakeel Butt
2025-05-09 23:28 ` [PATCH 4/4] memcg: make objcg charging nmi safe Shakeel Butt
` (2 subsequent siblings)
5 siblings, 0 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-09 23:28 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
The objcg based kmem [un]charging can be called in nmi context and may
need to update the NR_SLAB_[UN]RECLAIMABLE_B stats. So, let's correctly
handle the updates of these stats in nmi context.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e91e4368650f..bba549c1f18c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2591,8 +2591,18 @@ static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_memcg_lruvec_state(lruvec, idx, nr);
+ if (likely(!in_nmi())) {
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_memcg_lruvec_state(lruvec, idx, nr);
+ } else {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
+
+ /* TODO: add to cgroup update tree once it is nmi-safe. */
+ if (idx == NR_SLAB_RECLAIMABLE_B)
+ atomic64_add(nr, &pn->slab_reclaimable);
+ else
+ atomic64_add(nr, &pn->slab_unreclaimable);
+ }
rcu_read_unlock();
}
--
2.47.1
* [PATCH 4/4] memcg: make objcg charging nmi safe
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
` (2 preceding siblings ...)
2025-05-09 23:28 ` [PATCH 3/4] memcg: nmi-safe slab stats updates Shakeel Butt
@ 2025-05-09 23:28 ` Shakeel Butt
2025-05-13 22:25 ` Alexei Starovoitov
2025-05-10 1:26 ` [PATCH 0/4] memcg: nmi-safe kmem charging Andrew Morton
2025-05-12 15:56 ` Vlastimil Babka
5 siblings, 1 reply; 18+ messages in thread
From: Shakeel Butt @ 2025-05-09 23:28 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
To enable memcg charged kernel memory allocations from nmi context,
consume_obj_stock() and refill_obj_stock() need to be nmi safe. With
a simple in_nmi() check, take the slow path of objcg charging,
which handles the charging and memcg stats updates correctly for the nmi
context.
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---
mm/memcontrol.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bba549c1f18c..6cfa3550f300 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2965,6 +2965,9 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
bool ret = false;
+ if (unlikely(in_nmi()))
+ return ret;
+
local_lock_irqsave(&obj_stock.lock, flags);
stock = this_cpu_ptr(&obj_stock);
@@ -3068,6 +3071,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
+ if (unlikely(in_nmi())) {
+ if (pgdat)
+ __mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+ nr_bytes = nr_bytes & (PAGE_SIZE - 1);
+ atomic_add(nr_bytes, &objcg->nr_charged_bytes);
+ goto out;
+ }
+
local_lock_irqsave(&obj_stock.lock, flags);
stock = this_cpu_ptr(&obj_stock);
@@ -3091,7 +3103,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
}
local_unlock_irqrestore(&obj_stock.lock, flags);
-
+out:
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
}
--
2.47.1
* Re: [PATCH 4/4] memcg: make objcg charging nmi safe
2025-05-09 23:28 ` [PATCH 4/4] memcg: make objcg charging nmi safe Shakeel Butt
@ 2025-05-13 22:25 ` Alexei Starovoitov
2025-05-14 16:46 ` Shakeel Butt
0 siblings, 1 reply; 18+ messages in thread
From: Alexei Starovoitov @ 2025-05-13 22:25 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm,
open list:CONTROL GROUP (CGROUP), LKML, Meta kernel team
On Fri, May 9, 2025 at 4:29 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> To enable memcg charged kernel memory allocations from nmi context,
> consume_obj_stock() and refill_obj_stock() needs to be nmi safe. With
> the simple in_nmi() check, take the slow path of the objcg charging
> which handles the charging and memcg stats updates correctly for the nmi
> context.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
> mm/memcontrol.c | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index bba549c1f18c..6cfa3550f300 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2965,6 +2965,9 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> unsigned long flags;
> bool ret = false;
>
> + if (unlikely(in_nmi()))
> + return ret;
> +
> local_lock_irqsave(&obj_stock.lock, flags);
>
> stock = this_cpu_ptr(&obj_stock);
> @@ -3068,6 +3071,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> unsigned long flags;
> unsigned int nr_pages = 0;
>
> + if (unlikely(in_nmi())) {
> + if (pgdat)
> + __mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
> + nr_pages = nr_bytes >> PAGE_SHIFT;
> + nr_bytes = nr_bytes & (PAGE_SIZE - 1);
> + atomic_add(nr_bytes, &objcg->nr_charged_bytes);
> + goto out;
> + }
Now I see what I did incorrectly in my series and how this patch 4
combined with patch 3 is doing accounting properly.
The only issue here and in other patches is that in_nmi() is
an incomplete condition to check for.
The reentrance is possible through kprobe or tracepoint.
In PREEMP_RT we will be fully preemptible, but
obj_stock.lock will be already taken by the current task.
To fix it you need to use local_lock_is_locked(&obj_stock.lock)
instead of in_nmi() or use local_trylock_irqsave(&obj_stock.lock).
local_trylock_irqsave() is cleaner and works today,
while local_lock_is_locked() hasn't landed yet, but if we go the
is_locked route we can decouple reentrant obj_stock operations from normal ones.
That is, the if (!local_lock_is_locked(&obj_stock.lock)) check
can be done much higher up the stack, from
__memcg_slab_post_alloc_hook(), the way I did in my series,
and if the lock is already taken it can do atomic_add()-style charging.
So refill_obj_stock() and friends won't need to change.
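For illustration, a minimal sketch of the trylock variant suggested here,
assuming local_trylock_irqsave() takes the same arguments as
local_lock_irqsave() and returns false when the lock is already held
(a sketch, not the posted patch):

	/* consume_obj_stock(): fall back to the slow path on any reentrancy
	 * (nmi, kprobe, tracepoint on PREEMPT_RT), not just on in_nmi().
	 */
	if (!local_trylock_irqsave(&obj_stock.lock, flags))
		return ret;	/* caller takes the slow charging path */

	stock = this_cpu_ptr(&obj_stock);
	/* ... unchanged fast path, ending with local_unlock_irqrestore() ... */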
* Re: [PATCH 4/4] memcg: make objcg charging nmi safe
2025-05-13 22:25 ` Alexei Starovoitov
@ 2025-05-14 16:46 ` Shakeel Butt
0 siblings, 0 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-14 16:46 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm,
open list:CONTROL GROUP (CGROUP), LKML, Meta kernel team
On Tue, May 13, 2025 at 03:25:31PM -0700, Alexei Starovoitov wrote:
> On Fri, May 9, 2025 at 4:29 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > To enable memcg charged kernel memory allocations from nmi context,
> > consume_obj_stock() and refill_obj_stock() needs to be nmi safe. With
> > the simple in_nmi() check, take the slow path of the objcg charging
> > which handles the charging and memcg stats updates correctly for the nmi
> > context.
> >
> > Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> > ---
> > mm/memcontrol.c | 14 +++++++++++++-
> > 1 file changed, 13 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index bba549c1f18c..6cfa3550f300 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2965,6 +2965,9 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> > unsigned long flags;
> > bool ret = false;
> >
> > + if (unlikely(in_nmi()))
> > + return ret;
> > +
> > local_lock_irqsave(&obj_stock.lock, flags);
> >
> > stock = this_cpu_ptr(&obj_stock);
> > @@ -3068,6 +3071,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> > unsigned long flags;
> > unsigned int nr_pages = 0;
> >
> > + if (unlikely(in_nmi())) {
> > + if (pgdat)
> > + __mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes);
> > + nr_pages = nr_bytes >> PAGE_SHIFT;
> > + nr_bytes = nr_bytes & (PAGE_SIZE - 1);
> > + atomic_add(nr_bytes, &objcg->nr_charged_bytes);
> > + goto out;
> > + }
>
>
> Now I see what I did incorrectly in my series and how this patch 4
> combined with patch 3 is doing accounting properly.
>
> The only issue here and in other patches is that in_nmi() is
> an incomplete condition to check for.
> The reentrance is possible through kprobe or tracepoint.
> In PREEMP_RT we will be fully preemptible, but
> obj_stock.lock will be already taken by the current task.
> To fix it you need to use local_lock_is_locked(&obj_stock.lock)
> instead of in_nmi() or use local_trylock_irqsave(&obj_stock.lock).
>
> local_trylock_irqsave() is cleaner and works today,
> while local_lock_is_locked() hasn't landed yet, but if we go
> is_locked route we can decouple reentrant obj_stock operation vs normal.
> Like the if (!local_lock_is_locked(&obj_stock.lock))
> can be done much higher up the stack from
> __memcg_slab_post_alloc_hook() the way I did in my series,
> and if locked it can do atomic_add()-style charging.
> So refill_obj_stock() and friends won't need to change.
Thanks Alexei for taking a look. For now I am going with the trylock
path and will later check whether your suggested is_locked() approach makes
things better.
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
` (3 preceding siblings ...)
2025-05-09 23:28 ` [PATCH 4/4] memcg: make objcg charging nmi safe Shakeel Butt
@ 2025-05-10 1:26 ` Andrew Morton
2025-05-10 3:11 ` Shakeel Butt
2025-05-12 15:56 ` Vlastimil Babka
5 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2025-05-10 1:26 UTC (permalink / raw)
To: Shakeel Butt
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
On Fri, 9 May 2025 16:28:55 -0700 Shakeel Butt <shakeel.butt@linux.dev> wrote:
> BPF programs can trigger memcg charged kernel allocations in nmi
> context. However memcg charging infra for kernel memory is not equipped
> to handle nmi context. This series adds support for kernel memory
> charging for nmi context.
The patchset adds quite a bit of material to core MM on behalf of a
single caller. So can we please take a close look at why BPF is doing
this?
What would be involved in changing BPF to avoid doing this, or of
changing BPF to handle things locally? What would be the end-user
impact of such an alteration? IOW, what is the value to our users of
the present BPF behavior?
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-10 1:26 ` [PATCH 0/4] memcg: nmi-safe kmem charging Andrew Morton
@ 2025-05-10 3:11 ` Shakeel Butt
2025-05-10 7:00 ` Harry Yoo
2025-05-12 14:52 ` Vlastimil Babka
0 siblings, 2 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-10 3:11 UTC (permalink / raw)
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Vlastimil Babka, Alexei Starovoitov, Sebastian Andrzej Siewior,
bpf, linux-mm, cgroups, linux-kernel, Meta kernel team
Hi Andrew,
On Fri, May 09, 2025 at 06:26:32PM -0700, Andrew Morton wrote:
> On Fri, 9 May 2025 16:28:55 -0700 Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> > BPF programs can trigger memcg charged kernel allocations in nmi
> > context. However memcg charging infra for kernel memory is not equipped
> > to handle nmi context. This series adds support for kernel memory
> > charging for nmi context.
>
> The patchset adds quite a bit of material to core MM on behalf of a
> single caller. So can we please take a close look at why BPF is doing
> this?
>
> What would be involved in changing BPF to avoid doing this, or of
> changing BPF to handle things locally? What would be the end-user
> impact of such an alteration? IOW, what is the value to our users of
> the present BPF behavior?
>
Before answering the questions, let me clarify that this series is a
continuation of the work which added similar support for the page allocator
and related memcg charging, and now the work is happening for
kmalloc/slab allocations. Alexei has a proposal for reentrant kmalloc and
here I am providing how memcg charging for that (reentrant kmalloc)
should work.
Next let me take a stab at answering the questions and BPF folks can
correct me if I am wrong. From what I understand, users can attach BPF
programs at almost any place in the kernel and those BPF programs can do
memory allocations. This line of work is to make those allocations work
if any such BPF attach point is triggered in nmi context.
Before this line of work (reentrant page and slab allocators), I think
BPF had its internal cache but it was very limited and could easily fail
allocation requests (please BPF folks correct me if I am wrong). This
was discussed at LSFMM this year as well.
Now regarding the impact on users. First, there will not be any
negative impact on non-users of this feature. As for the value this
feature will provide, I think this line of work will make users' BPF
programs more reliable with better allocation behavior.
BPF folks, please add more comments on the value of these features.
thanks,
Shakeel
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-10 3:11 ` Shakeel Butt
@ 2025-05-10 7:00 ` Harry Yoo
2025-05-12 14:52 ` Vlastimil Babka
1 sibling, 0 replies; 18+ messages in thread
From: Harry Yoo @ 2025-05-10 7:00 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
Muchun Song, Vlastimil Babka, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team
On Fri, May 09, 2025 at 08:11:55PM -0700, Shakeel Butt wrote:
> Hi Andrew,
>
> On Fri, May 09, 2025 at 06:26:32PM -0700, Andrew Morton wrote:
> > On Fri, 9 May 2025 16:28:55 -0700 Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > > BPF programs can trigger memcg charged kernel allocations in nmi
> > > context. However memcg charging infra for kernel memory is not equipped
> > > to handle nmi context. This series adds support for kernel memory
> > > charging for nmi context.
> >
> > The patchset adds quite a bit of material to core MM on behalf of a
> > single caller. So can we please take a close look at why BPF is doing
> > this?
> >
> > What would be involved in changing BPF to avoid doing this, or of
> > changing BPF to handle things locally? What would be the end-user
> > impact of such an alteration? IOW, what is the value to our users of
> > the present BPF behavior?
> >
>
> Before answering the questions, let me clarify that this series is
> continuation of the work which added similar support for page allocator
> and related memcg charging and now the work is happening for
> kmalloc/slab allocations. Alexei has a proposal on reentrant kmalloc and
> here I am providing how memcg charging for that (reentrant kmalloc)
> should work.
>
> Next let me take a stab in answering the questions and BPF folks can
> correct me if I am wrong. From what I understand, users can attach BPF
> programs at almost any place in kernel and those BPF programs can do
> memory allocations. This line of work is to make those allocations work
> if any such BPF attach point is triggered in nmi context.
>
> Before this line of work (reentrant page and slab allocators), I think
> BPF had its internal cache but it was very limited and can easily fail
> allocation requests (please BPF folks correct me if I am wrong). This
> was discussed in LSFMM this year as well.
>
> Now regarding the impact to the users. First there will not be any
> negative impact on the non-users of this feature. For the value this
> feature will provide to users, I think this line of work will make BPF
> programs of the users more reliable with better allocation behavior.
> BPF folks, please add more comments for the value of these features.
If kmalloc gains NMI-context support, preallocation would no longer be
necessary, eliminating its memory overhead which has been observed to
reach up to 1.5GB in Meta's fleet [1].
[1] https://lore.kernel.org/linux-mm/20250327145159.99799-1-alexei.starovoitov@gmail.com
--
Cheers,
Harry / Hyeonggon
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-10 3:11 ` Shakeel Butt
2025-05-10 7:00 ` Harry Yoo
@ 2025-05-12 14:52 ` Vlastimil Babka
1 sibling, 0 replies; 18+ messages in thread
From: Vlastimil Babka @ 2025-05-12 14:52 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Alexei Starovoitov, Sebastian Andrzej Siewior, bpf, linux-mm,
cgroups, linux-kernel, Meta kernel team
On 5/10/25 05:11, Shakeel Butt wrote:
> Before answering the questions, let me clarify that this series is
> continuation of the work which added similar support for page allocator
> and related memcg charging and now the work is happening for
> kmalloc/slab allocations. Alexei has a proposal on reentrant kmalloc and
> here I am providing how memcg charging for that (reentrant kmalloc)
> should work.
>
> Next let me take a stab in answering the questions and BPF folks can
> correct me if I am wrong. From what I understand, users can attach BPF
> programs at almost any place in kernel and those BPF programs can do
> memory allocations. This line of work is to make those allocations work
> if any such BPF attach point is triggered in nmi context.
>
> Before this line of work (reentrant page and slab allocators), I think
> BPF had its internal cache but it was very limited and can easily fail
> allocation requests (please BPF folks correct me if I am wrong). This
> was discussed in LSFMM this year as well.
>
> Now regarding the impact to the users. First there will not be any
> negative impact on the non-users of this feature. For the value this
> feature will provide to users, I think this line of work will make BPF
> programs of the users more reliable with better allocation behavior.
> BPF folks, please add more comments for the value of these features.
Yes, and I think this part of the cover letter is also important:
> There will be a followup series which will make kernel memory charging
> reentrant for irq and will be able to do without disabling irqs.
The "without disabling irqs" part will improve performance for all users.
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-09 23:28 [PATCH 0/4] memcg: nmi-safe kmem charging Shakeel Butt
` (4 preceding siblings ...)
2025-05-10 1:26 ` [PATCH 0/4] memcg: nmi-safe kmem charging Andrew Morton
@ 2025-05-12 15:56 ` Vlastimil Babka
2025-05-12 19:12 ` Shakeel Butt
5 siblings, 1 reply; 18+ messages in thread
From: Vlastimil Babka @ 2025-05-12 15:56 UTC (permalink / raw)
To: Shakeel Butt, Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
Alexei Starovoitov, Sebastian Andrzej Siewior, bpf, linux-mm,
cgroups, linux-kernel, Meta kernel team
On 5/10/25 01:28, Shakeel Butt wrote:
> BPF programs can trigger memcg charged kernel allocations in nmi
> context. However memcg charging infra for kernel memory is not equipped
> to handle nmi context. This series adds support for kernel memory
> charging for nmi context.
>
> The initial prototype tried to make memcg charging infra for kernel
> memory re-entrant against irq and nmi. However upon realizing that
> this_cpu_* operations are not safe on all architectures (Tejun), this
I assume it was an off-list discussion?
Could we avoid this for the architectures where these are safe, which should
be the major ones I hope?
> series took a different approach targeting only nmi context. Since the
> number of stats that are updated in kernel memory charging path are 3,
> this series added special handling of those stats in nmi context rather
> than making all >100 memcg stats nmi safe.
Hmm so from patches 2 and 3 I see this relies on atomic64_add().
But AFAIU lib/atomic64.c has the generic fallback implementation for
architectures that don't know better, and that would be using the "void
generic_atomic64_##op" macro, which AFAICS is doing:
local_irq_save(flags); \
arch_spin_lock(lock); \
v->counter c_op a; \
arch_spin_unlock(lock); \
local_irq_restore(flags); \
so in case of an nmi hitting after the spin_lock this can still deadlock?
Hm or is there some assumption that we only use these paths when already
in_nmi() and then another nmi can't come in that context?
But even then, flush_nmi_stats() in patch 1 isn't done in_nmi() and uses
atomic64_xchg() which in generic_atomic64_xchg() implementation also has the
irq_save+spin_lock. So can't we deadlock there?
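Spelling out that scenario as an illustrative trace (assuming the arch falls
back to the generic lib/atomic64.c implementation; not taken from the thread):

	task context:	flush_nmi_stats()
			  atomic64_xchg()
			    local_irq_save(flags)
			    arch_spin_lock(lock)	<- lock taken
	nmi hits here:	objcg charging path
			  atomic64_add()
			    arch_spin_lock(lock)	<- spins forever, since
						   the holder is the
						   interrupted task on the
						   same CPU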
>
> There will be a followup series which will make kernel memory charging
> reentrant for irq and will be able to do without disabling irqs.
>
> We ran network intensive workload on this series and have not seen any
> significant performance differences with and without the series.
>
> Shakeel Butt (4):
> memcg: add infra for nmi safe memcg stats
> memcg: add nmi-safe update for MEMCG_KMEM
> memcg: nmi-safe slab stats updates
> memcg: make objcg charging nmi safe
>
> include/linux/memcontrol.h | 6 +++
> mm/memcontrol.c | 87 +++++++++++++++++++++++++++++++++++---
> 2 files changed, 88 insertions(+), 5 deletions(-)
>
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-12 15:56 ` Vlastimil Babka
@ 2025-05-12 19:12 ` Shakeel Butt
2025-05-13 7:15 ` Vlastimil Babka
0 siblings, 1 reply; 18+ messages in thread
From: Shakeel Butt @ 2025-05-12 19:12 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Andrew Morton, Tejun Heo, Johannes Weiner, Michal Hocko,
Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team
I forgot to CC Tejun, so doing it now.
On Mon, May 12, 2025 at 05:56:09PM +0200, Vlastimil Babka wrote:
> On 5/10/25 01:28, Shakeel Butt wrote:
> > BPF programs can trigger memcg charged kernel allocations in nmi
> > context. However memcg charging infra for kernel memory is not equipped
> > to handle nmi context. This series adds support for kernel memory
> > charging for nmi context.
> >
> > The initial prototype tried to make memcg charging infra for kernel
> > memory re-entrant against irq and nmi. However upon realizing that
> > this_cpu_* operations are not safe on all architectures (Tejun), this
>
> I assume it was an off-list discussion?
> Could we avoid this for the architectures where these are safe, which should
> be the major ones I hope?
Yes it was an off-list discussion. The discussion was more about
this_cpu_* ops vs atomic_* ops, as on x86 this_cpu_* does not have the lock
prefix, and how I should prefer this_cpu_* over atomic_* for my series on
objcg charging without disabling irqs. Tejun pointed out that this_cpu_* are
not nmi safe on some archs and it would be better to handle nmi context
separately. So, I am not that worried about optimizing for the NMI context,
but your next comment on generic_atomic64_* ops is giving me a headache.
>
> > series took a different approach targeting only nmi context. Since the
> > number of stats that are updated in kernel memory charging path are 3,
> > this series added special handling of those stats in nmi context rather
> > than making all >100 memcg stats nmi safe.
>
> Hmm so from patches 2 and 3 I see this relies on atomic64_add().
> But AFAIU lib/atomic64.c has the generic fallback implementation for
> architectures that don't know better, and that would be using the "void
> generic_atomic64_##op" macro, which AFAICS is doing:
>
> local_irq_save(flags); \
> arch_spin_lock(lock); \
> v->counter c_op a; \
> arch_spin_unlock(lock); \
> local_irq_restore(flags); \
>
> so in case of a nmi hitting after the spin_lock this can still deadlock?
>
> Hm or is there some assumption that we only use these paths when already
> in_nmi() and then another nmi can't come in that context?
>
> But even then, flush_nmi_stats() in patch 1 isn't done in_nmi() and uses
> atomic64_xchg() which in generic_atomic64_xchg() implementation also has the
> irq_save+spin_lock. So can't we deadlock there?
I was actually assuming that atomic_* ops are safe against nmis for all
archs. I looked at the atomic_* ops in include/asm-generic/atomic.h and they
use arch_cmpxchg() for CONFIG_SMP, and it seems like archs with
cmpxchg should be fine against nmi. I am not sure why atomic64_* are not
using arch_cmpxchg() instead. I will dig more.
I also have the followup series on objcg charging without irq almost
ready. I will send it out as rfc soon.
Thanks a lot for awesome and insightful comments.
Shakeel
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-12 19:12 ` Shakeel Butt
@ 2025-05-13 7:15 ` Vlastimil Babka
2025-05-13 11:41 ` Peter Zijlstra
0 siblings, 1 reply; 18+ messages in thread
From: Vlastimil Babka @ 2025-05-13 7:15 UTC (permalink / raw)
To: Shakeel Butt
Cc: Andrew Morton, Tejun Heo, Johannes Weiner, Michal Hocko,
Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team, Mathieu Desnoyers, Peter Zijlstra
On 5/12/25 21:12, Shakeel Butt wrote:
> I forgot to CC Tejun, so doing it now.
>
> On Mon, May 12, 2025 at 05:56:09PM +0200, Vlastimil Babka wrote:
>> On 5/10/25 01:28, Shakeel Butt wrote:
>> > BPF programs can trigger memcg charged kernel allocations in nmi
>> > context. However memcg charging infra for kernel memory is not equipped
>> > to handle nmi context. This series adds support for kernel memory
>> > charging for nmi context.
>> >
>> > The initial prototype tried to make memcg charging infra for kernel
>> > memory re-entrant against irq and nmi. However upon realizing that
>> > this_cpu_* operations are not safe on all architectures (Tejun), this
>>
>> I assume it was an off-list discussion?
>> Could we avoid this for the architectures where these are safe, which should
>> be the major ones I hope?
>
> Yes it was an off-list discussion. The discussion was more about the
> this_cpu_* ops vs atomic_* ops as on x86 this_cpu_* does not have lock
> prefix and how I should prefer this_cpu_* over atomic_* for my series on
> objcg charging without disabling irqs. Tejun pointed out this_cpu_* are
> not nmi safe for some archs and it would be better to handle nmi context
> separately. So, I am not that worried about optimizing for NMI context
Well, we're introducing an in_nmi() check and different execution paths into all
charging. This could be e.g. compiled out for architectures where this_cpu*
is NMI safe or which don't have NMIs in the first place.
> but your next comment on generic_atomic64_* ops is giving me headache.
>
>>
>> > series took a different approach targeting only nmi context. Since the
>> > number of stats that are updated in kernel memory charging path are 3,
>> > this series added special handling of those stats in nmi context rather
>> > than making all >100 memcg stats nmi safe.
>>
>> Hmm so from patches 2 and 3 I see this relies on atomic64_add().
>> But AFAIU lib/atomic64.c has the generic fallback implementation for
>> architectures that don't know better, and that would be using the "void
>> generic_atomic64_##op" macro, which AFAICS is doing:
>>
>> local_irq_save(flags); \
>> arch_spin_lock(lock); \
>> v->counter c_op a; \
>> arch_spin_unlock(lock); \
>> local_irq_restore(flags); \
>>
>> so in case of a nmi hitting after the spin_lock this can still deadlock?
>>
>> Hm or is there some assumption that we only use these paths when already
>> in_nmi() and then another nmi can't come in that context?
>>
>> But even then, flush_nmi_stats() in patch 1 isn't done in_nmi() and uses
>> atomic64_xchg() which in generic_atomic64_xchg() implementation also has the
>> irq_save+spin_lock. So can't we deadlock there?
>
> I was actually assuming that atomic_* ops are safe against nmis for all
> archs. I looked at atomic_* ops in include/asm-generic/atomic.h and it
> is using arch_cmpxchg() for CONFIG_SMP and it seems like for archs with
> cmpxchg should be fine against nmi. I am not sure why atomic64_* are not
> using arch_cmpxchg() instead. I will dig more.
Yeah, I've found https://docs.kernel.org/core-api/local_ops.html and since it
listed Mathieu we discussed on IRC, and he mentioned the same thing: the
atomic_ ops are fine, but the later-added 64bit variant isn't, which PeterZ
(who added it) acknowledged.
But there could be a way out if we could somehow compile-time assert that
one of the following is true:
- CONFIG_HAVE_NMI=n - we can compile out all the nmi code
- this_cpu is safe on that arch - we can also compile out the nmi code
- (if the above leaves any 64bit arch) its 64bit atomics implementation is safe
- (if there are any 32bit applicable arch left) 32bit atomics should be
enough for the nmi counters even with >4GB memory as we flush them? and we
know the 32bit ops are safe
> I also have the followup series on objcg charging without irq almost
> ready. I will send it out as rfc soon.
>
> Thanks a lot for awesome and insightful comments.
> Shakeel
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-13 7:15 ` Vlastimil Babka
@ 2025-05-13 11:41 ` Peter Zijlstra
2025-05-13 22:17 ` Shakeel Butt
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2025-05-13 11:41 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Shakeel Butt, Andrew Morton, Tejun Heo, Johannes Weiner,
Michal Hocko, Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team, Mathieu Desnoyers
On Tue, May 13, 2025 at 09:15:23AM +0200, Vlastimil Babka wrote:
> >> > The initial prototype tried to make memcg charging infra for kernel
> >> > memory re-entrant against irq and nmi. However upon realizing that
> >> > this_cpu_* operations are not safe on all architectures (Tejun), this
> >>
> >> I assume it was an off-list discussion?
> >> Could we avoid this for the architectures where these are safe, which should
> >> be the major ones I hope?
IIRC Power64 has issues here, 'funnily' their local_t is NMI safe.
Perhaps we could do the same for their this_cpu_*(), but ideally someone
with actual power hardware should do this ;-)
> > Yes it was an off-list discussion. The discussion was more about the
> > this_cpu_* ops vs atomic_* ops as on x86 this_cpu_* does not have lock
> > prefix and how I should prefer this_cpu_* over atomic_* for my series on
> > objcg charging without disabling irqs. Tejun pointed out this_cpu_* are
> > not nmi safe for some archs and it would be better to handle nmi context
> > separately. So, I am not that worried about optimizing for NMI context
>
> Well, we're introducing in_nmi() check and different execution paths to all
> charging. This could be e.g. compiled out for architectures where this_cpu*
> is NMI safe or they don't have NMIs in the first place.
Very few architectures one would care about do not have NMIs. Risc-V
seems to be the exception here ?!?
> > but your next comment on generic_atomic64_* ops is giving me headache.
> >
> >>
> >> > series took a different approach targeting only nmi context. Since the
> >> > number of stats that are updated in kernel memory charging path are 3,
> >> > this series added special handling of those stats in nmi context rather
> >> > than making all >100 memcg stats nmi safe.
> >>
> >> Hmm so from patches 2 and 3 I see this relies on atomic64_add().
> >> But AFAIU lib/atomic64.c has the generic fallback implementation for
> >> architectures that don't know better, and that would be using the "void
> >> generic_atomic64_##op" macro, which AFAICS is doing:
> >>
> >> local_irq_save(flags); \
> >> arch_spin_lock(lock); \
> >> v->counter c_op a; \
> >> arch_spin_unlock(lock); \
> >> local_irq_restore(flags); \
> >>
> >> so in case of a nmi hitting after the spin_lock this can still deadlock?
> >>
> >> Hm or is there some assumption that we only use these paths when already
> >> in_nmi() and then another nmi can't come in that context?
> >>
> >> But even then, flush_nmi_stats() in patch 1 isn't done in_nmi() and uses
> >> atomic64_xchg() which in generic_atomic64_xchg() implementation also has the
> >> irq_save+spin_lock. So can't we deadlock there?
> >
> > I was actually assuming that atomic_* ops are safe against nmis for all
> > archs.
We have HAVE_NMI_SAFE_CMPXCHG for this -- there are architectures where
this is not the case -- but again, those are typically oddball archs you
don't much care about.
But yes, *64 on 32bit archs is generally not NMI safe.
> I looked at atomic_* ops in include/asm-generic/atomic.h and it
> > is using arch_cmpxchg() for CONFIG_SMP and it seems like for archs with
> > cmpxchg should be fine against nmi. I am not sure why atomic64_* are not
> > using arch_cmpxchg() instead. I will dig more.
Not many 32bit architectures have 64bit cmpxchg. We're only now dropping
support for x86 chips without CMPXCHG8b.
> Yeah I've found https://docs.kernel.org/core-api/local_ops.html and since it
> listed Mathieu we discussed on IRC and he mentioned the same thing that
> atomic_ ops are fine, but the later added 64bit variant isn't, which PeterZ
> (who added it) acknowledged.
>
> But there could be way out if we could somehow compile-time assert that
> either is true:
> - CONFIG_HAVE_NMI=n - we can compile out all the nmi code
Note that in_nmi() does not depend on HAVE_NMI -- nor can it. Many
architectures treat various traps as NMI-like, even though they might
not have real NMIs.
> - this_cpu is safe on that arch - we can also compile out the nmi code
There is no config symbol for this presently.
> - (if the above leaves any 64bit arch) its 64bit atomics implementation is safe
True, only because HPPA does not in fact have NMIs.
> - (if there are any 32bit applicable arch left) 32bit atomics should be
> enough for the nmi counters even with >4GB memory as we flush them? and we
> know the 32bit ops are safe
Older ARM might qualify here.
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-13 11:41 ` Peter Zijlstra
@ 2025-05-13 22:17 ` Shakeel Butt
2025-05-14 7:11 ` Peter Zijlstra
2025-05-15 1:49 ` Shakeel Butt
0 siblings, 2 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-13 22:17 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Vlastimil Babka, Andrew Morton, Tejun Heo, Johannes Weiner,
Michal Hocko, Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team, Mathieu Desnoyers
On Tue, May 13, 2025 at 01:41:25PM +0200, Peter Zijlstra wrote:
> On Tue, May 13, 2025 at 09:15:23AM +0200, Vlastimil Babka wrote:
>
> > >> > The initial prototype tried to make memcg charging infra for kernel
> > >> > memory re-entrant against irq and nmi. However upon realizing that
> > >> > this_cpu_* operations are not safe on all architectures (Tejun), this
> > >>
> > >> I assume it was an off-list discussion?
> > >> Could we avoid this for the architectures where these are safe, which should
> > >> be the major ones I hope?
>
> IIRC Power64 has issues here, 'funnily' their local_t is NMI safe.
> Perhaps we could do the same for their this_cpu_*(), but ideally someone
> with actual power hardware should do this ;-)
>
Is CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS the right config to
differentiate between such archs? I see Power64 does not have that
enabled.
> > > Yes it was an off-list discussion. The discussion was more about the
> > > this_cpu_* ops vs atomic_* ops as on x86 this_cpu_* does not have lock
> > > prefix and how I should prefer this_cpu_* over atomic_* for my series on
> > > objcg charging without disabling irqs. Tejun pointed out this_cpu_* are
> > > not nmi safe for some archs and it would be better to handle nmi context
> > > separately. So, I am not that worried about optimizing for NMI context
> >
> > Well, we're introducing in_nmi() check and different execution paths to all
> > charging. This could be e.g. compiled out for architectures where this_cpu*
> > is NMI safe or they don't have NMIs in the first place.
>
> Very few architectures one would care about do not have NMIs. Risc-V
> seems to be the exception here ?!?
>
> > > but your next comment on generic_atomic64_* ops is giving me headache.
> > >
> > >>
> > >> > series took a different approach targeting only nmi context. Since the
> > >> > number of stats that are updated in kernel memory charging path are 3,
> > >> > this series added special handling of those stats in nmi context rather
> > >> > than making all >100 memcg stats nmi safe.
> > >>
> > >> Hmm so from patches 2 and 3 I see this relies on atomic64_add().
> > >> But AFAIU lib/atomic64.c has the generic fallback implementation for
> > >> architectures that don't know better, and that would be using the "void
> > >> generic_atomic64_##op" macro, which AFAICS is doing:
> > >>
> > >> local_irq_save(flags); \
> > >> arch_spin_lock(lock); \
> > >> v->counter c_op a; \
> > >> arch_spin_unlock(lock); \
> > >> local_irq_restore(flags); \
> > >>
> > >> so in case of a nmi hitting after the spin_lock this can still deadlock?
> > >>
> > >> Hm or is there some assumption that we only use these paths when already
> > >> in_nmi() and then another nmi can't come in that context?
> > >>
> > >> But even then, flush_nmi_stats() in patch 1 isn't done in_nmi() and uses
> > >> atomic64_xchg() which in generic_atomic64_xchg() implementation also has the
> > >> irq_save+spin_lock. So can't we deadlock there?
> > >
> > > I was actually assuming that atomic_* ops are safe against nmis for all
> > > archs.
>
> We have HAVE_NMI_SAFE_CMPXCHG for this -- there are architectures where
> this is not the case -- but again, those are typically oddball archs you
> don't much care about.
>
> But yes, *64 on 32bit archs is generally not NMI safe.
>
> > I looked at atomic_* ops in include/asm-generic/atomic.h and it
> > > is using arch_cmpxchg() for CONFIG_SMP and it seems like for archs with
> > > cmpxchg should be fine against nmi. I am not sure why atomic64_* are not
> > > using arch_cmpxchg() instead. I will dig more.
>
> Not many 32bit architectures have 64bit cmpxchg. We're only now dropping
> support for x86 chips without CMPXCHG8b.
>
As Vlastimil pointed out (last point), I don't think I will need 64bit
cmpxchg; 32bit cmpxchg will be fine.
> > Yeah I've found https://docs.kernel.org/core-api/local_ops.html and since it
> > listed Mathieu we discussed on IRC and he mentioned the same thing that
> > atomic_ ops are fine, but the later added 64bit variant isn't, which PeterZ
> > (who added it) acknowledged.
> >
> > But there could be way out if we could somehow compile-time assert that
> > either is true:
> > - CONFIG_HAVE_NMI=n - we can compile out all the nmi code
>
> Note that in_nmi() is not depending on HAVE_NMI -- nor can it be. Many
> architectures treat various traps as NMI-like, even though they might
> not have real NMIs.
>
> > - this_cpu is safe on that arch - we can also compile out the nmi code
>
> There is no config symbol for this presently.
Hmm what about CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS?
>
> > - (if the above leaves any 64bit arch) its 64bit atomics implementation is safe
>
> True, only because HPPA does not in fact have NMIs.
What is HPPA?
>
> > - (if there are any 32bit applicable arch left) 32bit atomics should be
> > enough for the nmi counters even with >4GB memory as we flush them? and we
> > know the 32bit ops are safe
>
> Older ARM might qualify here.
Thanks a lot Vlastimil & Peter for the suggestions. Let me summarize
what I plan to do and please point out if I am doing something wrong:
#if defined(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS) || !defined(CONFIG_HAVE_NMI)
// Do normal this_cpu* ops
#elif defined(ARCH_HAVE_NMI_SAFE_CMPXCHG)
// Do 32 bit atomic ops with in_nmi() checks
#else
// Build error or ignore nmi stats??
#endif
WDYT?
(BTW Vlastimil, I might rebase and send out the irq-safe series before this
nmi one.)
thanks,
Shakeel
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-13 22:17 ` Shakeel Butt
@ 2025-05-14 7:11 ` Peter Zijlstra
2025-05-15 1:49 ` Shakeel Butt
1 sibling, 0 replies; 18+ messages in thread
From: Peter Zijlstra @ 2025-05-14 7:11 UTC (permalink / raw)
To: Shakeel Butt
Cc: Vlastimil Babka, Andrew Morton, Tejun Heo, Johannes Weiner,
Michal Hocko, Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team, Mathieu Desnoyers, npiggin
On Tue, May 13, 2025 at 03:17:00PM -0700, Shakeel Butt wrote:
> > IIRC Power64 has issues here, 'funnily' their local_t is NMI safe.
> > Perhaps we could do the same for their this_cpu_*(), but ideally someone
> > with actual power hardware should do this ;-)
> >
>
> Is CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS the right config to
> differentiate between such archs? I see Power64 does not have that
> enabled.
> > There is no config symbol for this presently.
>
> Hmm what about CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS?
Hmm, I didn't know about that one, and it escaped my grep yesterday.
Anyway, PPC is fixable, just not sure it's worth it for them.
> >
> > > - (if the above leaves any 64bit arch) its 64bit atomics implementation is safe
> >
> > True, only because HPPA does not in fact have NMIs.
>
> What is HPPA?
arch/parisc, the worst 64bit arch ever.
They saw sparc32-smp and thought that was a great idea, or something
along those lines. Both are quite terrible. Sparc64 realized the mistake
and fixed it -- it has cmpxchg.
Nick, is this something that's useful for you guys?
---
diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h
index ecf5ac70cfae..aa188db68ef5 100644
--- a/arch/powerpc/include/asm/percpu.h
+++ b/arch/powerpc/include/asm/percpu.h
@@ -25,6 +25,11 @@ DECLARE_STATIC_KEY_FALSE(__percpu_first_chunk_is_paged);
#define percpu_first_chunk_is_paged false
#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+#define __pcpu_local_irq_save(f) powerpc_local_irq_pmu_save(f)
+#define __pcpu_local_irq_restore(s) powerpc_local_irq_pmu_restore(f)
+#endif
+
#include <asm-generic/percpu.h>
#include <asm/paca.h>
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 02aeca21479a..5c8376588dfb 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -75,6 +75,11 @@ extern void setup_per_cpu_areas(void);
#define PER_CPU_ATTRIBUTES
#endif
+#ifndef __pcpu_local_irq_save
+#define __pcpu_local_irq_save(x) raw_local_irq_save(x)
+#define __pcpu_local_irq_restore(x) raw_local_irq_restore(x)
+#endif
+
#define raw_cpu_generic_read(pcp) \
({ \
*raw_cpu_ptr(&(pcp)); \
@@ -146,9 +151,9 @@ do { \
({ \
TYPEOF_UNQUAL(pcp) ___ret; \
unsigned long ___flags; \
- raw_local_irq_save(___flags); \
+ __pcpu_local_irq_save(___flags); \
___ret = raw_cpu_generic_read(pcp); \
- raw_local_irq_restore(___flags); \
+ __pcpu_local_irq_restore(___flags); \
___ret; \
})
@@ -165,9 +170,9 @@ do { \
#define this_cpu_generic_to_op(pcp, val, op) \
do { \
unsigned long __flags; \
- raw_local_irq_save(__flags); \
+ __pcpu_local_irq_save(__flags); \
raw_cpu_generic_to_op(pcp, val, op); \
- raw_local_irq_restore(__flags); \
+ __pcpu_local_irq_restore(__flags); \
} while (0)
@@ -175,9 +180,9 @@ do { \
({ \
TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
- raw_local_irq_save(__flags); \
+ __pcpu_local_irq_save(__flags); \
__ret = raw_cpu_generic_add_return(pcp, val); \
- raw_local_irq_restore(__flags); \
+ __pcpu_local_irq_restore(__flags); \
__ret; \
})
@@ -185,9 +190,9 @@ do { \
({ \
TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
- raw_local_irq_save(__flags); \
+ __pcpu_local_irq_save(__flags); \
__ret = raw_cpu_generic_xchg(pcp, nval); \
- raw_local_irq_restore(__flags); \
+ __pcpu_local_irq_restore(__flags); \
__ret; \
})
@@ -195,9 +200,9 @@ do { \
({ \
bool __ret; \
unsigned long __flags; \
- raw_local_irq_save(__flags); \
+ __pcpu_local_irq_save(__flags); \
__ret = raw_cpu_generic_try_cmpxchg(pcp, ovalp, nval); \
- raw_local_irq_restore(__flags); \
+ __pcpu_local_irq_restore(__flags); \
__ret; \
})
@@ -205,9 +210,9 @@ do { \
({ \
TYPEOF_UNQUAL(pcp) __ret; \
unsigned long __flags; \
- raw_local_irq_save(__flags); \
+ __pcpu_local_irq_save(__flags); \
__ret = raw_cpu_generic_cmpxchg(pcp, oval, nval); \
- raw_local_irq_restore(__flags); \
+ __pcpu_local_irq_restore(__flags); \
__ret; \
})
* Re: [PATCH 0/4] memcg: nmi-safe kmem charging
2025-05-13 22:17 ` Shakeel Butt
2025-05-14 7:11 ` Peter Zijlstra
@ 2025-05-15 1:49 ` Shakeel Butt
1 sibling, 0 replies; 18+ messages in thread
From: Shakeel Butt @ 2025-05-15 1:49 UTC (permalink / raw)
To: Vlastimil Babka
Cc: Peter Zijlstra, Andrew Morton, Tejun Heo, Johannes Weiner,
Michal Hocko, Roman Gushchin, Muchun Song, Alexei Starovoitov,
Sebastian Andrzej Siewior, bpf, linux-mm, cgroups, linux-kernel,
Meta kernel team, Mathieu Desnoyers
On Tue, May 13, 2025 at 03:17:00PM -0700, Shakeel Butt wrote:
[...]
>
> Thanks a lot Vlastimil & Peter for the suggestions. Let me summarize
> what I plan to do and please point out if I am doing something wrong:
>
>
> #if defined(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS) || !defined(CONFIG_HAVE_NMI)
>
> // Do normal this_cpu* ops
>
> #elif defined(ARCH_HAVE_NMI_SAFE_CMPXCHG)
>
> // Do 32 bit atomic ops with in_nmi() checks
>
> #else
>
> // Build error or ignore nmi stats??
>
> #endif
>
>
Just wanted to circle back on the updated plan:
#if defined(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS) || !defined(CONFIG_HAVE_NMI)
- Allow memcg charging in nmi context
- Use this_cpu_ops for memcg stats even in nmi context
#elif defined(ARCH_HAVE_NMI_SAFE_CMPXCHG)
- Allow memcg charging in nmi context
- Use atomic* ops for selected memcg stats in nmi context
#else
- Do not allow memcg charging in nmi context
#endif
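A rough sketch of how this could look as preprocessor guards (illustrative
only; the macro names are made up here, and CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
is assumed to be the Kconfig spelling of ARCH_HAVE_NMI_SAFE_CMPXCHG):

#if defined(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS) || !defined(CONFIG_HAVE_NMI)
/* charging allowed in nmi context, plain this_cpu_* ops remain usable */
#define MEMCG_NMI_NEEDS_ATOMIC_STATS	0
#elif defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
/* charging allowed in nmi context, selected stats switch to atomic_* ops */
#define MEMCG_NMI_NEEDS_ATOMIC_STATS	1
#else
/* neither holds: do not allow memcg charging from nmi context */
#define MEMCG_NMI_CHARGING_DISALLOWED	1
#endif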