* [PATCH net-next 01/18] net: page_pool: Don't recycle into cache on PREEMPT_RT.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 02/18] net: dst_cache: Use nested-BH locking for dst_cache::cache Sebastian Andrzej Siewior
` (16 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Jesper Dangaard Brouer, Ilias Apalodimas
With preemptible softirqs and no per-CPU locking in local_bh_disable() on
PREEMPT_RT, the consumer can be preempted while an skb is returned.
Avoid the race by disabling recycling into the cache on PREEMPT_RT.
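Roughly, the race being closed looks like this (an illustrative sketch,
not part of the diff below):

	/* NAPI recycle path (softirq)      preempting skb consumer
	 *
	 * page_pool_napi_local() == true
	 * write pool->alloc.cache
	 *    ... preempted ...             kfree_skb()
	 *                                  write pool->alloc.cache
	 *
	 * pool->alloc.cache is a lockless per-CPU array; returning false
	 * makes the release path fall back to the ptr_ring, which has
	 * its own locking.
	 */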
Cc: Jesper Dangaard Brouer <hawk@kernel.org>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/core/page_pool.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f5e908c9e7ad8..1d669d4382729 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -800,6 +800,10 @@ static bool page_pool_napi_local(const struct page_pool *pool)
const struct napi_struct *napi;
u32 cpuid;
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
if (unlikely(!in_softirq()))
return false;
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 02/18] net: dst_cache: Use nested-BH locking for dst_cache::cache.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 01/18] net: page_pool: Don't recycle into cache on PREEMPT_RT Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 03/18] ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT Sebastian Andrzej Siewior
` (15 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior
dst_cache::cache is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Add a local_lock_t to the data structure and use
local_lock_nested_bh() for locking. This change adds only lockdep
coverage and does not alter the functional behaviour for !PREEMPT_RT.
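The resulting pattern for dynamically allocated per-CPU data looks
roughly like this (a minimal sketch with made-up names):

	struct foo_pcpu {
		local_lock_t bh_lock;
		struct dst_entry *dst;
	};

	/* init: allocate the data and initialise each CPU's lock */
	p = alloc_percpu_gfp(struct foo_pcpu, gfp | __GFP_ZERO);
	for_each_possible_cpu(i)
		local_lock_init(&per_cpu_ptr(p, i)->bh_lock);

	/* access: hold the lock across the whole this_cpu window */
	local_lock_nested_bh(&p->bh_lock);
	dst = this_cpu_ptr(p)->dst;
	local_unlock_nested_bh(&p->bh_lock);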
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/core/dst_cache.c | 30 +++++++++++++++++++++++++++---
1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 70c634b9e7b02..93a04d18e5054 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -17,6 +17,7 @@
struct dst_cache_pcpu {
unsigned long refresh_ts;
struct dst_entry *dst;
+ local_lock_t bh_lock;
u32 cookie;
union {
struct in_addr in_saddr;
@@ -65,10 +66,15 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
+ struct dst_entry *dst;
+
if (!dst_cache->cache)
return NULL;
- return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get);
@@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in_saddr.s_addr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst, 0);
idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
@@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst,
rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
@@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
+ unsigned int i;
+
dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
gfp | __GFP_ZERO);
if (!dst_cache->cache)
return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
dst_cache_reset(dst_cache);
return 0;
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 03/18] ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 01/18] net: page_pool: Don't recycle into cache on PREEMPT_RT Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 02/18] net: dst_cache: Use nested-BH locking for dst_cache::cache Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 04/18] ipv6: sr: Use nested-BH locking for hmac_storage Sebastian Andrzej Siewior
` (14 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
David Ahern
The statistics are incremented with raw_cpu_inc(), assuming this always
happens with bottom halves disabled. Without per-CPU locking in
local_bh_disable() on PREEMPT_RT this is no longer true.
Use this_cpu_inc() for the increment on PREEMPT_RT so it does not have
to worry about preemption.
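For reference, the difference between the two accessors, roughly:

	/* raw_cpu_inc(v)  - unprotected read-modify-write of this CPU's
	 *                   counter; correct only if nothing can
	 *                   interleave (non-preemptible, BH disabled).
	 * this_cpu_inc(v) - preemption-safe increment (a single
	 *                   instruction on e.g. x86, or an internally
	 *                   protected RMW on other architectures).
	 */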
Cc: David Ahern <dsahern@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/ipv4/route.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 753704f75b2c6..5d7c7efea66cc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#ifndef CONFIG_PREEMPT_RT
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+#else
+#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
+#endif
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 04/18] ipv6: sr: Use nested-BH locking for hmac_storage.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (2 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 03/18] ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 05/18] xdp: Use nested-BH locking for system_page_pool Sebastian Andrzej Siewior
` (13 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
David Ahern
hmac_storage is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Add a local_lock_t to the data structure and use
local_lock_nested_bh() for locking. This change adds only lockdep
coverage and does not alter the functional behaviour for !PREEMPT_RT.
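For reference, the two lock-initialisation styles used in this series
(illustrative):

	/* statically defined per-CPU data (this patch): */
	static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = {
		.bh_lock = INIT_LOCAL_LOCK(bh_lock),
	};

	/* alloc_percpu()-based data (the dst_cache patch): */
	for_each_possible_cpu(i)
		local_lock_init(&per_cpu_ptr(ptr, i)->bh_lock);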
Cc: David Ahern <dsahern@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/ipv6/seg6_hmac.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index bbf5b84a70fca..f78ecb6ad8383 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -40,7 +40,14 @@
#include <net/seg6_hmac.h>
#include <linux/random.h>
-static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
+struct hmac_storage {
+ local_lock_t bh_lock;
+ char hmac_ring[SEG6_HMAC_RING_SIZE];
+};
+
+static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
{
@@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
*/
local_bh_disable();
- ring = this_cpu_ptr(hmac_ring);
+ local_lock_nested_bh(&hmac_storage.bh_lock);
+ ring = this_cpu_ptr(hmac_storage.hmac_ring);
off = ring;
/* source address */
@@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
SEG6_HMAC_MAX_DIGESTSIZE);
+ local_unlock_nested_bh(&hmac_storage.bh_lock);
local_bh_enable();
if (dgsize < 0)
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 05/18] xdp: Use nested-BH locking for system_page_pool.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (3 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 04/18] ipv6: sr: Use nested-BH locking for hmac_storage Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct Sebastian Andrzej Siewior
` (12 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Andrew Lunn, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend
system_page_pool is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Make a struct with a page_pool member (original system_page_pool) and a
local_lock_t and use local_lock_nested_bh() for locking. This change
adds only lockdep coverage and does not alter the functional behaviour
for !PREEMPT_RT.
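The consumer-side rule is that the lock has to span every use of the
borrowed pool pointer, including the error paths; simplified from the
xdp_build_skb_from_zc() hunk below:

	local_lock_nested_bh(&system_page_pool.bh_lock);
	pp = this_cpu_read(system_page_pool.pool);
	data = page_pool_dev_alloc_va(pp, &truesize);
	if (unlikely(!data)) {
		local_unlock_nested_bh(&system_page_pool.bh_lock);
		return NULL;
	}
	/* ... build the skb from data/pp ... */
	local_unlock_nested_bh(&system_page_pool.bh_lock);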
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jesper Dangaard Brouer <hawk@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/netdevice.h | 7 ++++++-
net/core/dev.c | 15 ++++++++++-----
net/core/xdp.c | 11 +++++++++--
3 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ab550a89b9bfa..6b740a4d7ee6d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3465,7 +3465,12 @@ struct softnet_data {
};
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
-DECLARE_PER_CPU(struct page_pool *, system_page_pool);
+
+struct page_pool_bh {
+ struct page_pool *pool;
+ local_lock_t bh_lock;
+};
+DECLARE_PER_CPU(struct page_pool_bh, system_page_pool);
#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 30da277c5a6f8..a9955b1c33001 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -460,7 +460,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#ifdef CONFIG_LOCKDEP
/*
@@ -5253,7 +5255,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
struct sk_buff *skb = *pskb;
int err, hroom, troom;
- if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
return 0;
/* In case we have to go down the path and also linearize,
@@ -12522,7 +12527,7 @@ static int net_page_pool_create(int cpuid)
return err;
}
- per_cpu(system_page_pool, cpuid) = pp_ptr;
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
return 0;
}
@@ -12652,13 +12657,13 @@ static int __init net_dev_init(void)
for_each_possible_cpu(i) {
struct page_pool *pp_ptr;
- pp_ptr = per_cpu(system_page_pool, i);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
if (!pp_ptr)
continue;
xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
- per_cpu(system_page_pool, i) = NULL;
+ per_cpu(system_page_pool.pool, i) = NULL;
}
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 2c6ab6fb452f7..4b09c96e384dc 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -747,10 +747,10 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
*/
struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
{
- struct page_pool *pp = this_cpu_read(system_page_pool);
const struct xdp_rxq_info *rxq = xdp->rxq;
u32 len = xdp->data_end - xdp->data_meta;
u32 truesize = xdp->frame_sz;
+ struct page_pool *pp;
struct sk_buff *skb;
int metalen;
void *data;
@@ -758,13 +758,18 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
if (!IS_ENABLED(CONFIG_PAGE_POOL))
return NULL;
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
data = page_pool_dev_alloc_va(pp, &truesize);
- if (unlikely(!data))
+ if (unlikely(!data)) {
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
return NULL;
+ }
skb = napi_build_skb(data, truesize);
if (unlikely(!skb)) {
page_pool_free_va(pp, data, true);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
return NULL;
}
@@ -783,9 +788,11 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
if (unlikely(xdp_buff_has_frags(xdp)) &&
unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
napi_consume_skb(skb, true);
return NULL;
}
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
xsk_buff_free(xdp);
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (4 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 05/18] xdp: Use nested-BH locking for system_page_pool Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-17 17:29 ` Paolo Abeni
2025-03-09 14:46 ` [PATCH net-next 07/18] netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx Sebastian Andrzej Siewior
` (11 subsequent siblings)
17 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pablo Neira Ayuso, Jozsef Kadlecsik, Ingo Molnar, Peter Zijlstra,
Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
Ben Segall, Mel Gorman, Valentin Schneider
nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Due to the recursion involved, the simplest change is to make it a
per-task variable.
Move the per-CPU variable nf_skb_duplicated to task_struct and name it
in_nf_duplicate. Add it to the existing bitfield so it doesn't use
additional memory.
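The recursion guard then becomes a plain task-local flag; sketched from
the nf_dup_ipv4() hunk below:

	if (current->in_nf_duplicate)	/* already inside a duplicate */
		goto out;
	...
	current->in_nf_duplicate = true;
	ip_local_out(net, skb->sk, skb);	/* may re-enter the hooks */
	current->in_nf_duplicate = false;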
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Jozsef Kadlecsik <kadlec@netfilter.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/netfilter.h | 11 -----------
include/linux/sched.h | 1 +
net/ipv4/netfilter/ip_tables.c | 2 +-
net/ipv4/netfilter/nf_dup_ipv4.c | 6 +++---
net/ipv6/netfilter/ip6_tables.c | 2 +-
net/ipv6/netfilter/nf_dup_ipv6.c | 6 +++---
net/netfilter/core.c | 3 ---
7 files changed, 9 insertions(+), 22 deletions(-)
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 2b8aac2c70ada..892d12823ed4b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -497,17 +497,6 @@ struct nf_defrag_hook {
extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook;
extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook;
-/*
- * nf_skb_duplicated - TEE target has sent a packet
- *
- * When a xtables target sends a packet, the OUTPUT and POSTROUTING
- * hooks are traversed again, i.e. nft and xtables are invoked recursively.
- *
- * This is used by xtables TEE target to prevent the duplicated skb from
- * being duplicated again.
- */
-DECLARE_PER_CPU(bool, nf_skb_duplicated);
-
/*
* Contains bitmask of ctnetlink event subscribers, if any.
* Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9632e3318e0d6..c52b778777691 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1022,6 +1022,7 @@ struct task_struct {
/* delay due to memory thrashing */
unsigned in_thrashing:1;
#endif
+ unsigned in_nf_duplicate:1;
#ifdef CONFIG_PREEMPT_RT
struct netdev_xmit net_xmit;
#endif
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 3d101613f27fa..23c8deff8095a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -270,7 +270,7 @@ ipt_do_table(void *priv,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 25e1e8eb18dd5..ed08fb78cfa8c 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
struct iphdr *iph;
local_bh_disable();
- if (this_cpu_read(nf_skb_duplicated))
+ if (current->in_nf_duplicate)
goto out;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d5602950ae72..d585ac3c11133 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 0c39c77fe8a8a..b903c62c00c9e 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in6_addr *gw, int oif)
{
local_bh_disable();
- if (this_cpu_read(nf_skb_duplicated))
+ if (current->in_nf_duplicate)
goto out;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
@@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->hop_limit;
}
if (nf_dup_ipv6_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip6_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index b9f551f02c813..11a702065bab5 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -31,9 +31,6 @@
const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_ipv6_ops);
-DEFINE_PER_CPU(bool, nf_skb_duplicated);
-EXPORT_SYMBOL_GPL(nf_skb_duplicated);
-
#ifdef CONFIG_JUMP_LABEL
struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
EXPORT_SYMBOL(nf_hooks_needed);
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct.
2025-03-09 14:46 ` [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct Sebastian Andrzej Siewior
@ 2025-03-17 17:29 ` Paolo Abeni
2025-04-07 9:33 ` Juri Lelli
0 siblings, 1 reply; 32+ messages in thread
From: Paolo Abeni @ 2025-03-17 17:29 UTC (permalink / raw)
To: Sebastian Andrzej Siewior, Peter Zijlstra, Juri Lelli,
Valentin Schneider
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Simon Horman,
Thomas Gleixner, Pablo Neira Ayuso, Jozsef Kadlecsik, Ingo Molnar,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, netdev, linux-rt-devel
Hi,
On 3/9/25 3:46 PM, Sebastian Andrzej Siewior wrote:
> nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its
> locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
> this data structure requires explicit locking.
>
> Due to the recursion involved, the simplest change is to make it a
> per-task variable.
>
> Move the per-CPU variable nf_skb_duplicated to task_struct and name it
> in_nf_duplicate. Add it to the existing bitfield so it doesn't use
> additional memory.
>
> Cc: Pablo Neira Ayuso <pablo@netfilter.org>
> Cc: Jozsef Kadlecsik <kadlec@netfilter.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Juri Lelli <juri.lelli@redhat.com>
> Cc: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Ben Segall <bsegall@google.com>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Valentin Schneider <vschneid@redhat.com>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
I'm not a super-fan of adding more flags to 'struct task', but in this
specific case I agree it is the better option, as otherwise we would have
to acquire the local lock over a relatively large scope - the whole packet
processing by nft, right?
Still this needs some explicit ack from the relevant maintainers.
@Peter, @Juri, @Valentin: could you please have a look?
Thanks!
Paolo
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct.
2025-03-17 17:29 ` Paolo Abeni
@ 2025-04-07 9:33 ` Juri Lelli
0 siblings, 0 replies; 32+ messages in thread
From: Juri Lelli @ 2025-04-07 9:33 UTC (permalink / raw)
To: Paolo Abeni
Cc: Sebastian Andrzej Siewior, Peter Zijlstra, Valentin Schneider,
David S. Miller, Eric Dumazet, Jakub Kicinski, Simon Horman,
Thomas Gleixner, Pablo Neira Ayuso, Jozsef Kadlecsik, Ingo Molnar,
Vincent Guittot, Dietmar Eggemann, Steven Rostedt, Ben Segall,
Mel Gorman, netdev, linux-rt-devel
Hi Paolo,
On 17/03/25 18:29, Paolo Abeni wrote:
> Hi,
>
> On 3/9/25 3:46 PM, Sebastian Andrzej Siewior wrote:
> > nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its
> > locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
> > this data structure requires explicit locking.
> >
> > Due to the recursion involved, the simplest change is to make it a
> > per-task variable.
> >
> > Move the per-CPU variable nf_skb_duplicated to task_struct and name it
> > in_nf_duplicate. Add it to the existing bitfield so it doesn't use
> > additional memory.
> >
> > Cc: Pablo Neira Ayuso <pablo@netfilter.org>
> > Cc: Jozsef Kadlecsik <kadlec@netfilter.org>
> > Cc: Ingo Molnar <mingo@redhat.com>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Juri Lelli <juri.lelli@redhat.com>
> > Cc: Vincent Guittot <vincent.guittot@linaro.org>
> > Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Ben Segall <bsegall@google.com>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Valentin Schneider <vschneid@redhat.com>
> > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>
> I'm not a super-fan of adding more flags to 'struct task', but in this
> specific case I agree it is the better option, as otherwise we would have
> to acquire the local lock over a relatively large scope - the whole packet
> processing by nft, right?
>
> Still this needs some explicit ack from the relevant maintainers.
> @Peter, @Juri, @Valentin: could you please have a look?
The additional flag fills a hole, so, FWIW, I don't see particular
problems with it.
Best,
Juri
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH net-next 07/18] netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (5 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 06/18] netfilter: nf_dup{4, 6}: Move duplication check to task_struct Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 08/18] netfilter: nf_dup_netdev: Move the recursion counter to struct netdev_xmit Sebastian Andrzej Siewior
` (10 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pablo Neira Ayuso, Jozsef Kadlecsik, netfilter-devel, coreteam
nft_pcpu_tun_ctx is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Make a struct with a nft_inner_tun_ctx member (original
nft_pcpu_tun_ctx) and a local_lock_t and use local_lock_nested_bh() for
locking. This change adds only lockdep coverage and does not alter the
functional behaviour for !PREEMPT_RT.
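Note the nesting: the new lock sits inside the existing
local_bh_disable() section and is released before local_bh_enable();
roughly:

	local_bh_disable();
	local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
	/* ... access this_cpu_ptr(&nft_pcpu_tun_ctx.ctx) ... */
	local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
	local_bh_enable();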
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Jozsef Kadlecsik <kadlec@netfilter.org>
Cc: netfilter-devel@vger.kernel.org
Cc: coreteam@netfilter.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/netfilter/nft_inner.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 817ab978d24a1..c4569d4b92285 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -23,7 +23,14 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
-static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+struct nft_inner_tun_ctx_locked {
+ struct nft_inner_tun_ctx ctx;
+ local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/* Same layout as nft_expr but it embeds the private expression data area. */
struct __nft_expr {
@@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
- this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
return false;
}
*tun_ctx = *this_cpu_tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
return true;
@@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
struct nft_inner_tun_ctx *this_cpu_tun_ctx;
local_bh_disable();
- this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+ local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+ this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
*this_cpu_tun_ctx = *tun_ctx;
+ local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
local_bh_enable();
}
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 08/18] netfilter: nf_dup_netdev: Move the recursion counter to struct netdev_xmit.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (6 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 07/18] netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 09/18] xfrm: Use nested-BH locking for nat_keepalive_sk_ipv[46] Sebastian Andrzej Siewior
` (9 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pablo Neira Ayuso, Jozsef Kadlecsik, netfilter-devel, coreteam
nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers.
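Where the counter ends up, roughly:

	/* !PREEMPT_RT: softnet_data.xmit.nf_dup_skb_recursion (per CPU);
	 *              BH-disabled sections cannot interleave on a CPU,
	 *              so per-CPU storage stays race free.
	 * PREEMPT_RT:  current->net_xmit.nf_dup_skb_recursion (per task);
	 *              the task may be preempted or migrate, so the
	 *              counter has to follow the task, not the CPU.
	 */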
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Jozsef Kadlecsik <kadlec@netfilter.org>
Cc: netfilter-devel@vger.kernel.org
Cc: coreteam@netfilter.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/netdevice_xmit.h | 3 +++
net/netfilter/nf_dup_netdev.c | 22 ++++++++++++++++++----
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 38325e0702968..3bbbc1a9860a3 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -8,6 +8,9 @@ struct netdev_xmit {
#ifdef CONFIG_NET_EGRESS
u8 skip_txqueue;
#endif
+#if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
+ u8 nf_dup_skb_recursion;
+#endif
};
#endif
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index a8e2425e43b0d..fab8b9011098f 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -15,12 +15,26 @@
#define NF_RECURSION_LIMIT 2
-static DEFINE_PER_CPU(u8, nf_dup_skb_recursion);
+#ifndef CONFIG_PREEMPT_RT
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+}
+#else
+
+static u8 *nf_get_nf_dup_skb_recursion(void)
+{
+ return &current->net_xmit.nf_dup_skb_recursion;
+}
+
+#endif
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
- if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT)
+ u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
+
+ if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
goto err;
if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
@@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb_clear_tstamp(skb);
- __this_cpu_inc(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)++;
dev_queue_xmit(skb);
- __this_cpu_dec(nf_dup_skb_recursion);
+ (*nf_dup_skb_recursion)--;
return;
err:
kfree_skb(skb);
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 09/18] xfrm: Use nested-BH locking for nat_keepalive_sk_ipv[46].
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (7 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 08/18] netfilter: nf_dup_netdev: Move the recursion counter to struct netdev_xmit Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one Sebastian Andrzej Siewior
` (8 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Steffen Klassert, Herbert Xu
nat_keepalive_sk_ipv[46] are per-CPU variables and rely on disabled BH
for their locking. Without per-CPU locking in local_bh_disable() on
PREEMPT_RT these data structures require explicit locking.
Use sock_bh_locked which has a sock pointer and a local_lock_t. Use
local_lock_nested_bh() for locking. This change adds only lockdep
coverage and does not alter the functional behaviour for !PREEMPT_RT.
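As used in the hunks below, struct sock_bh_locked pairs the socket with
the lock; its layout is roughly:

	struct sock_bh_locked {
		struct sock *sock;
		local_lock_t bh_lock;
	};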
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/xfrm/xfrm_nat_keepalive.c | 30 ++++++++++++++++++++----------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c
index 82f0a301683f0..ebf95d48e86c1 100644
--- a/net/xfrm/xfrm_nat_keepalive.c
+++ b/net/xfrm/xfrm_nat_keepalive.c
@@ -9,9 +9,13 @@
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#if IS_ENABLED(CONFIG_IPV6)
-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#endif
struct nat_keepalive {
@@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb,
skb_dst_set(skb, &rt->dst);
- sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4);
+ local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
+ sk = this_cpu_read(nat_keepalive_sk_ipv4.sock);
sock_net_set(sk, net);
err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos);
sock_net_set(sk, &init_net);
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
return err;
}
@@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb,
fl6.fl6_sport = ka->encap_sport;
fl6.fl6_dport = ka->encap_dport;
- sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6);
+ local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
+ sk = this_cpu_read(nat_keepalive_sk_ipv6.sock);
sock_net_set(sk, net);
dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL);
- if (IS_ERR(dst))
+ if (IS_ERR(dst)) {
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
return PTR_ERR(dst);
+ }
skb_dst_set(skb, dst);
err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0);
sock_net_set(sk, &init_net);
+ local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
return err;
}
#endif
@@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work)
(ctx.next_run - ctx.now) * HZ);
}
-static int nat_keepalive_sk_init(struct sock * __percpu *socks,
+static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks,
unsigned short family)
{
struct sock *sk;
@@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks,
if (err < 0)
goto err;
- *per_cpu_ptr(socks, i) = sk;
+ per_cpu_ptr(socks, i)->sock = sk;
}
return 0;
err:
for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+ inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
return err;
}
-static void nat_keepalive_sk_fini(struct sock * __percpu *socks)
+static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks)
{
int i;
for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+ inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
}
void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (8 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 09/18] xfrm: Use nested-BH locking for nat_keepalive_sk_ipv[46] Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-17 17:34 ` Paolo Abeni
2025-03-09 14:46 ` [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions Sebastian Andrzej Siewior
` (7 subsequent siblings)
17 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pravin B Shelar, dev
exec_actions_level is a per-CPU integer allocated at compile time.
action_fifos and flow_keys are per-CPU pointers whose data is
allocated at module init time.
There is no gain in keeping them separate: once the module is loaded,
the structures are allocated anyway.
Merge the three per-CPU variables into ovs_action and adapt the callers.
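A side effect is that a single per-CPU dereference now reaches all
three fields (illustrative):

	struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);

	fifo = &ovs_act->action_fifos;
	keys = &ovs_act->flow_keys;
	level = ovs_act->exec_level;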
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: dev@openvswitch.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/openvswitch/actions.c | 49 +++++++++++++-------------------------
net/openvswitch/datapath.c | 9 +------
net/openvswitch/datapath.h | 3 ---
3 files changed, 17 insertions(+), 44 deletions(-)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 704c858cf2093..322ca7b30c3bc 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -78,17 +78,22 @@ struct action_flow_keys {
struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
};
-static struct action_fifo __percpu *action_fifos;
-static struct action_flow_keys __percpu *flow_keys;
-static DEFINE_PER_CPU(int, exec_actions_level);
+struct ovs_action {
+ struct action_fifo action_fifos;
+ struct action_flow_keys flow_keys;
+ int exec_level;
+};
+
+static DEFINE_PER_CPU(struct ovs_action, ovs_actions);
/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
* space. Return NULL if out of key spaces.
*/
static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
{
- struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
- int level = this_cpu_read(exec_actions_level);
+ struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);
+ struct action_flow_keys *keys = &ovs_act->flow_keys;
+ int level = ovs_act->exec_level;
struct sw_flow_key *key = NULL;
if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
@@ -132,10 +137,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
const struct nlattr *actions,
const int actions_len)
{
- struct action_fifo *fifo;
+ struct action_fifo *fifo = this_cpu_ptr(&ovs_actions.action_fifos);
struct deferred_action *da;
- fifo = this_cpu_ptr(action_fifos);
da = action_fifo_put(fifo);
if (da) {
da->skb = skb;
@@ -1615,13 +1619,13 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
if (actions) { /* Sample action */
if (clone_flow_key)
- __this_cpu_inc(exec_actions_level);
+ __this_cpu_inc(ovs_actions.exec_level);
err = do_execute_actions(dp, skb, clone,
actions, len);
if (clone_flow_key)
- __this_cpu_dec(exec_actions_level);
+ __this_cpu_dec(ovs_actions.exec_level);
} else { /* Recirc action */
clone->recirc_id = recirc_id;
ovs_dp_process_packet(skb, clone);
@@ -1657,7 +1661,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb,
static void process_deferred_actions(struct datapath *dp)
{
- struct action_fifo *fifo = this_cpu_ptr(action_fifos);
+ struct action_fifo *fifo = this_cpu_ptr(&ovs_actions.action_fifos);
/* Do not touch the FIFO in case there is no deferred actions. */
if (action_fifo_is_empty(fifo))
@@ -1688,7 +1692,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
{
int err, level;
- level = __this_cpu_inc_return(exec_actions_level);
+ level = __this_cpu_inc_return(ovs_actions.exec_level);
if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
ovs_dp_name(dp));
@@ -1705,27 +1709,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
process_deferred_actions(dp);
out:
- __this_cpu_dec(exec_actions_level);
+ __this_cpu_dec(ovs_actions.exec_level);
return err;
}
-
-int action_fifos_init(void)
-{
- action_fifos = alloc_percpu(struct action_fifo);
- if (!action_fifos)
- return -ENOMEM;
-
- flow_keys = alloc_percpu(struct action_flow_keys);
- if (!flow_keys) {
- free_percpu(action_fifos);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void action_fifos_exit(void)
-{
- free_percpu(action_fifos);
- free_percpu(flow_keys);
-}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5d548eda742df..aaa6277bb49c2 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2729,13 +2729,9 @@ static int __init dp_init(void)
pr_info("Open vSwitch switching datapath\n");
- err = action_fifos_init();
- if (err)
- goto error;
-
err = ovs_internal_dev_rtnl_link_register();
if (err)
- goto error_action_fifos_exit;
+ goto error;
err = ovs_flow_init();
if (err)
@@ -2778,8 +2774,6 @@ static int __init dp_init(void)
ovs_flow_exit();
error_unreg_rtnl_link:
ovs_internal_dev_rtnl_link_unregister();
-error_action_fifos_exit:
- action_fifos_exit();
error:
return err;
}
@@ -2795,7 +2789,6 @@ static void dp_cleanup(void)
ovs_vport_exit();
ovs_flow_exit();
ovs_internal_dev_rtnl_link_unregister();
- action_fifos_exit();
}
module_init(dp_init);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 365b9bb7f546e..1fce99992d25d 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -270,9 +270,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
void ovs_dp_notify_wq(struct work_struct *work);
-int action_fifos_init(void);
-void action_fifos_exit(void);
-
/* 'KEY' must not have any bits set outside of the 'MASK' */
#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one.
2025-03-09 14:46 ` [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one Sebastian Andrzej Siewior
@ 2025-03-17 17:34 ` Paolo Abeni
2025-03-17 17:38 ` Paolo Abeni
0 siblings, 1 reply; 32+ messages in thread
From: Paolo Abeni @ 2025-03-17 17:34 UTC (permalink / raw)
To: Sebastian Andrzej Siewior, netdev, linux-rt-devel, Aaron Conole,
Eelco Chaudron, Ilya Maximets
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Simon Horman,
Thomas Gleixner, Pravin B Shelar, dev
On 3/9/25 3:46 PM, Sebastian Andrzej Siewior wrote:
> exec_actions_level is a per-CPU integer allocated at compile time.
> action_fifos and flow_keys are per-CPU pointers whose data is
> allocated at module init time.
> There is no gain in keeping them separate: once the module is loaded,
> the structures are allocated anyway.
>
> Merge the three per-CPU variables into ovs_action and adapt the callers.
>
> Cc: Pravin B Shelar <pshelar@ovn.org>
> Cc: dev@openvswitch.org
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> net/openvswitch/actions.c | 49 +++++++++++++-------------------------
> net/openvswitch/datapath.c | 9 +------
> net/openvswitch/datapath.h | 3 ---
> 3 files changed, 17 insertions(+), 44 deletions(-)
>
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 704c858cf2093..322ca7b30c3bc 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -78,17 +78,22 @@ struct action_flow_keys {
> struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
> };
>
> -static struct action_fifo __percpu *action_fifos;
> -static struct action_flow_keys __percpu *flow_keys;
> -static DEFINE_PER_CPU(int, exec_actions_level);
> +struct ovs_action {
> + struct action_fifo action_fifos;
> + struct action_flow_keys flow_keys;
> + int exec_level;
> +};
I have the feeling this is not a very good name, as 'OVS action' has a
quite specific meaning that is not really matched here.
Also CCing more OVS people, as Pravin is not really active anymore.
Thanks,
Paolo
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one.
2025-03-17 17:34 ` Paolo Abeni
@ 2025-03-17 17:38 ` Paolo Abeni
0 siblings, 0 replies; 32+ messages in thread
From: Paolo Abeni @ 2025-03-17 17:38 UTC (permalink / raw)
To: Sebastian Andrzej Siewior, netdev, linux-rt-devel, Aaron Conole,
Eelco Chaudron, Ilya Maximets
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Simon Horman,
Thomas Gleixner, Pravin B Shelar, dev
On 3/17/25 6:34 PM, Paolo Abeni wrote:
> On 3/9/25 3:46 PM, Sebastian Andrzej Siewior wrote:
>> exec_actions_level is a per-CPU integer allocated at compile time.
>> action_fifos and flow_keys are per-CPU pointers whose data is
>> allocated at module init time.
>> There is no gain in keeping them separate: once the module is loaded,
>> the structures are allocated anyway.
>>
>> Merge the three per-CPU variables into ovs_action and adapt the callers.
>>
>> Cc: Pravin B Shelar <pshelar@ovn.org>
>> Cc: dev@openvswitch.org
>> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>> ---
>> net/openvswitch/actions.c | 49 +++++++++++++-------------------------
>> net/openvswitch/datapath.c | 9 +------
>> net/openvswitch/datapath.h | 3 ---
>> 3 files changed, 17 insertions(+), 44 deletions(-)
>>
>> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> index 704c858cf2093..322ca7b30c3bc 100644
>> --- a/net/openvswitch/actions.c
>> +++ b/net/openvswitch/actions.c
>> @@ -78,17 +78,22 @@ struct action_flow_keys {
>> struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
>> };
>>
>> -static struct action_fifo __percpu *action_fifos;
>> -static struct action_flow_keys __percpu *flow_keys;
>> -static DEFINE_PER_CPU(int, exec_actions_level);
>> +struct ovs_action {
>> + struct action_fifo action_fifos;
>> + struct action_flow_keys flow_keys;
>> + int exec_level;
>> +};
>
> I have the feeling this is not a very good name, as 'OVS action' has a
> quite specific meaning that is not really matched here.
>
> Also CCing more OVS people, as Pravin is not really active anymore.
FTR, I'm processing the PW backlog in sequence (and with latency...),
and I see only now that this point has been discussed in the next patch.
Sorry for the noise.
/P
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (9 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 10/18] openvswitch: Merge three per-CPU structures into one Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-10 14:17 ` [ovs-dev] " Ilya Maximets
2025-03-09 14:46 ` [PATCH net-next 12/18] openvswitch: Move ovs_frag_data_storage into the struct ovs_action Sebastian Andrzej Siewior
` (6 subsequent siblings)
17 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pravin B Shelar, dev
ovs_actions is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
The data structure can be referenced recursively, and there is a recursion
counter to avoid too many recursions.
Add a local_lock_t to the data structure and use
local_lock_nested_bh() for locking. Add an owner field to the struct, set
to the current task, and acquire the lock only if the structure is not
already owned by the current task.
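The intended state machine, roughly:

	/* outermost entry: owner != current -> take bh_lock, owner = current
	 * nested entry:    owner == current -> lock already held, skip it
	 * outermost exit:  level == 1, i.e. exec_level drops back to 0
	 *                  -> owner = NULL, release bh_lock
	 */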
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: dev@openvswitch.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/openvswitch/actions.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 322ca7b30c3bc..c4131e04c1284 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -82,6 +82,8 @@ struct ovs_action {
struct action_fifo action_fifos;
struct action_flow_keys flow_keys;
int exec_level;
+ struct task_struct *owner;
+ local_lock_t bh_lock;
};
static DEFINE_PER_CPU(struct ovs_action, ovs_actions);
@@ -1690,8 +1692,14 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
+ struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);
int err, level;
+ if (ovs_act->owner != current) {
+ local_lock_nested_bh(&ovs_actions.bh_lock);
+ ovs_act->owner = current;
+ }
+
level = __this_cpu_inc_return(ovs_actions.exec_level);
if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
@@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
out:
__this_cpu_dec(ovs_actions.exec_level);
+
+ if (level == 1) {
+ ovs_act->owner = NULL;
+ local_unlock_nested_bh(&ovs_actions.bh_lock);
+ }
return err;
}
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-09 14:46 ` [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions Sebastian Andrzej Siewior
@ 2025-03-10 14:17 ` Ilya Maximets
2025-03-10 14:44 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2025-03-10 14:17 UTC (permalink / raw)
To: Sebastian Andrzej Siewior, netdev, linux-rt-devel
Cc: dev, Eric Dumazet, Simon Horman, Jakub Kicinski, Thomas Gleixner,
Paolo Abeni, David S. Miller, i.maximets
On 3/9/25 15:46, Sebastian Andrzej Siewior wrote:
> ovs_actions is a per-CPU variable and relies on disabled BH for its
> locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
> this data structure requires explicit locking.
> The data structure can be referenced recursively, and there is a recursion
> counter to avoid too many recursions.
>
> Add a local_lock_t to the data structure and use
> local_lock_nested_bh() for locking. Add an owner field to the struct, set
> to the current task, and acquire the lock only if the structure is not
> already owned by the current task.
>
> Cc: Pravin B Shelar <pshelar@ovn.org>
> Cc: dev@openvswitch.org
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> net/openvswitch/actions.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 322ca7b30c3bc..c4131e04c1284 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -82,6 +82,8 @@ struct ovs_action {
> struct action_fifo action_fifos;
> struct action_flow_keys flow_keys;
> int exec_level;
> + struct task_struct *owner;
> + local_lock_t bh_lock;
> };
>
> static DEFINE_PER_CPU(struct ovs_action, ovs_actions);
> @@ -1690,8 +1692,14 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
> const struct sw_flow_actions *acts,
> struct sw_flow_key *key)
> {
> + struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);
> int err, level;
>
> + if (ovs_act->owner != current) {
> + local_lock_nested_bh(&ovs_actions.bh_lock);
Wouldn't this cause a warning when we're in a syscall/process context?
We will also be taking a spinlock in the general case here, which doesn't
sound particularly great, since we can potentially be holding it for a
long time and it's also not free to take/release on this hot path.
Is there a version of this lock that's a no-op on non-RT?
> + ovs_act->owner = current;
> + }
> +
> level = __this_cpu_inc_return(ovs_actions.exec_level);
> if (unlikely(level > OVS_RECURSION_LIMIT)) {
> net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
> @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
>
> out:
> __this_cpu_dec(ovs_actions.exec_level);
> +
> + if (level == 1) {
> + ovs_act->owner = NULL;
> + local_unlock_nested_bh(&ovs_actions.bh_lock);
> + }
Seems dangerous to lock every time the owner changes but unlock only
once on level 1. Even if this works fine, it seems unnecessarily
complicated. Maybe it's better to just lock once before calling
ovs_execute_actions() instead?
Also, the name of the struct ovs_action doesn't make a lot of sense,
I'd suggest calling it pcpu_storage or something like that instead.
I.e. have a more generic name as the fields inside are not directly
related to each other.
Best regards, Ilya Maximets.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-10 14:17 ` [ovs-dev] " Ilya Maximets
@ 2025-03-10 14:44 ` Sebastian Andrzej Siewior
2025-03-10 16:56 ` Ilya Maximets
0 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-10 14:44 UTC (permalink / raw)
To: Ilya Maximets
Cc: netdev, linux-rt-devel, dev, Eric Dumazet, Simon Horman,
Jakub Kicinski, Thomas Gleixner, Paolo Abeni, David S. Miller
On 2025-03-10 15:17:17 [+0100], Ilya Maximets wrote:
> > --- a/net/openvswitch/actions.c
> > +++ b/net/openvswitch/actions.c
> > @@ -82,6 +82,8 @@ struct ovs_action {
> > struct action_fifo action_fifos;
> > struct action_flow_keys flow_keys;
> > int exec_level;
> > + struct task_struct *owner;
> > + local_lock_t bh_lock;
> > };
> >
> > static DEFINE_PER_CPU(struct ovs_action, ovs_actions);
> > @@ -1690,8 +1692,14 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
> > const struct sw_flow_actions *acts,
> > struct sw_flow_key *key)
> > {
> > + struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);
> > int err, level;
> >
> > + if (ovs_act->owner != current) {
> > + local_lock_nested_bh(&ovs_actions.bh_lock);
>
> Wouldn't this cause a warning when we're in a syscall/process context?
My understanding is that this is only invoked in softirq context. Did I
misunderstand it? Otherwise the this_cpu_ptr() above should complain
that preemption is not disabled, and if preemption is indeed not disabled,
how do you ensure that several tasks don't get preempted after the
__this_cpu_inc_return() (at the same time), leading to exceeding the
OVS_RECURSION_LIMIT?
> We will also be taking a spinlock in the general case here, which doesn't
> sound particularly great, since we can potentially be holding it for a
> long time and it's also not free to take/release on this hot path.
> Is there a version of this lock that's a no-op on non-RT?
local_lock_nested_bh() does not acquire any lock on !PREEMPT_RT. It only
verifies that in_softirq() is true.
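Roughly, and simplified (the real macros are in include/linux/local_lock.h
and local_lock_internal.h, this is just the idea):

	#ifdef CONFIG_PREEMPT_RT
	/* local_lock_t maps to a per-CPU spinlock_t, so this really locks */
	# define local_lock_nested_bh(l)	spin_lock(this_cpu_ptr(l))
	#else
	/* no lock is taken, only a sanity check + lockdep annotation */
	# define local_lock_nested_bh(l)	lockdep_assert_in_softirq()
	#endif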
> > + ovs_act->owner = current;
> > + }
> > +
> > level = __this_cpu_inc_return(ovs_actions.exec_level);
> > if (unlikely(level > OVS_RECURSION_LIMIT)) {
> > net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
> > @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
> >
> > out:
> > __this_cpu_dec(ovs_actions.exec_level);
> > +
> > + if (level == 1) {
> > + ovs_act->owner = NULL;
> > + local_unlock_nested_bh(&ovs_actions.bh_lock);
> > + }
>
> Seems dangerous to lock every time the owner changes but unlock only
> once on level 1. Even if this works fine, it seems unnecessarily
> complicated. Maybe it's better to just lock once before calling
> ovs_execute_actions() instead?
My understanding is this can be invoked recursively. That means on first
invocation owner == NULL and then you acquire the lock at which point
exec_level goes 0->1. On the recursive invocation owner == current and
you skip the lock but exec_level goes 1 -> 2.
On the return path, once level is 1 it means that the dec made it
go 1 -> 0, so you unlock the lock.
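To spell out the sequence ('level' is the value saved at entry, as I read
the patch):

	enter #1: owner == NULL    -> take lock, owner = current; level = 1
	  enter #2: owner == current -> skip lock;                 level = 2
	  exit  #2: dec; saved level == 2 -> lock stays held
	exit  #1: dec; saved level == 1 -> owner = NULL, unlock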
The locking part happens only on PREEMPT_RT because !PREEMPT_RT has
softirqs disabled, which guarantees that there will be no preemption.
tools/testing/selftests/net/openvswitch should cover this?
> Also, the name of the struct ovs_action doesn't make a lot of sense,
> I'd suggest calling it pcpu_storage or something like that instead.
> I.e. have a more generic name as the fields inside are not directly
> related to each other.
Understood. ovs_pcpu_storage maybe?
> Best regards, Ilya Maximets.
Sebastian
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-10 14:44 ` Sebastian Andrzej Siewior
@ 2025-03-10 16:56 ` Ilya Maximets
2025-03-13 11:50 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 32+ messages in thread
From: Ilya Maximets @ 2025-03-10 16:56 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: i.maximets, netdev, linux-rt-devel, dev, Eric Dumazet,
Simon Horman, Jakub Kicinski, Thomas Gleixner, Paolo Abeni,
David S. Miller
On 3/10/25 15:44, Sebastian Andrzej Siewior wrote:
> On 2025-03-10 15:17:17 [+0100], Ilya Maximets wrote:
>>> --- a/net/openvswitch/actions.c
>>> +++ b/net/openvswitch/actions.c
>>> @@ -82,6 +82,8 @@ struct ovs_action {
>>> struct action_fifo action_fifos;
>>> struct action_flow_keys flow_keys;
>>> int exec_level;
>>> + struct task_struct *owner;
>>> + local_lock_t bh_lock;
>>> };
>>>
>>> static DEFINE_PER_CPU(struct ovs_action, ovs_actions);
>>> @@ -1690,8 +1692,14 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
>>> const struct sw_flow_actions *acts,
>>> struct sw_flow_key *key)
>>> {
>>> + struct ovs_action *ovs_act = this_cpu_ptr(&ovs_actions);
>>> int err, level;
>>>
>>> + if (ovs_act->owner != current) {
>>> + local_lock_nested_bh(&ovs_actions.bh_lock);
>>
>> Wouldn't this cause a warning when we're in a syscall/process context?
>
> My understanding is that this is only invoked in softirq context. Did I
> misunderstand it?
It can be called from the syscall/process context while processing
an OVS_PACKET_CMD_EXECUTE request.
> Otherwise the this_cpu_ptr() above should complain
> that preemption is not disabled. And if preemption is indeed not disabled,
> how do you ensure that you don't get preempted after the
> __this_cpu_inc_return() in several tasks (at the same time), leading to
> exceeding the OVS_RECURSION_LIMIT?
We disable BH in this case, so it should be safe (on non-RT). See
ovs_packet_cmd_execute() for more details.
>
>> We will also be taking a spinlock in the general case here, which doesn't
>> sound particularly great, since we can potentially be holding it for a
>> long time and it's also not free to take/release on this hot path.
>> Is there a version of this lock that's a no-op on non-RT?
>
> local_lock_nested_bh() does not acquire any lock on !PREEMPT_RT. It only
> verifies that in_softirq() is true.
Ah, you're right. It does more things, but only under lock debug.
>
>>> + ovs_act->owner = current;
>>> + }
>>> +
>>> level = __this_cpu_inc_return(ovs_actions.exec_level);
>>> if (unlikely(level > OVS_RECURSION_LIMIT)) {
>>> net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
>>> @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
>>>
>>> out:
>>> __this_cpu_dec(ovs_actions.exec_level);
>>> +
>>> + if (level == 1) {
>>> + ovs_act->owner = NULL;
>>> + local_unlock_nested_bh(&ovs_actions.bh_lock);
>>> + }
>>
>> Seems dangerous to lock every time the owner changes but unlock only
>> once on level 1. Even if this works fine, it seems unnecessarily
>> complicated. Maybe it's better to just lock once before calling
>> ovs_execute_actions() instead?
>
> My understanding is this can be invoked recursively. That means on first
> invocation owner == NULL and then you acquire the lock at which point
> exec_level goes 0->1. On the recursive invocation owner == current and
> you skip the lock but exec_level goes 1 -> 2.
> On the return path, once level is 1 it means that the dec made it
> go 1 -> 0, so you unlock the lock.
My point is: why lock here with some extra non-obvious owner-tracking
logic if we can lock (unconditionally?) in ovs_packet_cmd_execute() and
ovs_dp_process_packet() instead? We already disable BH in one of those
and take appropriate RCU locks in both. So, it feels like a better place
for the extra locking if necessary. We will also not need to move around
any code in actions.c if the code there is guaranteed to be safe by holding
locks outside of it.
> > The locking part happens only on PREEMPT_RT because !PREEMPT_RT has
> > softirqs disabled, which guarantees that there will be no preemption.
>
> tools/testing/selftests/net/openvswitch should cover this?
It's not a comprehensive test suite, it covers some cases, but it
doesn't test anything related to preemptions specifically.
>
>> Also, the name of the struct ovs_action doesn't make a lot of sense,
>> I'd suggest calling it pcpu_storage or something like that instead.
>> I.e. have a more generic name as the fields inside are not directly
>> related to each other.
>
> Understood. ovs_pcpu_storage maybe?
It's OK, I guess, but see also a point about locking inside datapath.c
instead and probably not needing to change anything in actions.c.
>
>> Best regards, Ilya Maximets.
>
> Sebastian
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-10 16:56 ` Ilya Maximets
@ 2025-03-13 11:50 ` Sebastian Andrzej Siewior
2025-03-13 13:23 ` Ilya Maximets
0 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-13 11:50 UTC (permalink / raw)
To: Ilya Maximets
Cc: netdev, linux-rt-devel, dev, Eric Dumazet, Simon Horman,
Jakub Kicinski, Thomas Gleixner, Paolo Abeni, David S. Miller
On 2025-03-10 17:56:09 [+0100], Ilya Maximets wrote:
> >>> + local_lock_nested_bh(&ovs_actions.bh_lock);
> >>
> >> Wouldn't this cause a warning when we're in a syscall/process context?
> >
> > My understanding is that this is only invoked in softirq context. Did I
> > misunderstand it?
>
> It can be called from the syscall/process context while processing
> an OVS_PACKET_CMD_EXECUTE request.
>
> > Otherwise the this_cpu_ptr() above should complain
> > that preemption is not disabled. And if preemption is indeed not disabled,
> > how do you ensure that you don't get preempted after the
> > __this_cpu_inc_return() in several tasks (at the same time), leading to
> > exceeding the OVS_RECURSION_LIMIT?
>
> We disable BH in this case, so it should be safe (on non-RT). See
> ovs_packet_cmd_execute() for more details.
Yes, exactly. So if BH is disabled then local_lock_nested_bh() can
safely acquire a per-CPU spinlock on PREEMPT_RT here. This basically
mimics the local_bh_disable() behaviour in terms of exclusive data
structures on a smaller scope.
> >>> + ovs_act->owner = current;
> >>> + }
> >>> +
> >>> level = __this_cpu_inc_return(ovs_actions.exec_level);
> >>> if (unlikely(level > OVS_RECURSION_LIMIT)) {
> >>> net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
> >>> @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
> >>>
> >>> out:
> >>> __this_cpu_dec(ovs_actions.exec_level);
> >>> +
> >>> + if (level == 1) {
> >>> + ovs_act->owner = NULL;
> >>> + local_unlock_nested_bh(&ovs_actions.bh_lock);
> >>> + }
> >>
> >> Seems dangerous to lock every time the owner changes but unlock only
> >> once on level 1. Even if this works fine, it seems unnecessarily
> >> complicated. Maybe it's better to just lock once before calling
> >> ovs_execute_actions() instead?
> >
> > My understanding is this can be invoked recursively. That means on first
> > invocation owner == NULL and then you acquire the lock at which point
> > exec_level goes 0->1. On the recursive invocation owner == current and
> > you skip the lock but exec_level goes 1 -> 2.
> > On the return path, once level is 1 it means that the dec made it
> > go 1 -> 0, so you unlock the lock.
>
> My point is: why lock here with some extra non-obvious owner-tracking
> logic if we can lock (unconditionally?) in ovs_packet_cmd_execute() and
> ovs_dp_process_packet() instead? We already disable BH in one of those
> and take appropriate RCU locks in both. So, it feels like a better place
> for the extra locking if necessary. We will also not need to move around
> any code in actions.c if the code there is guaranteed to be safe by holding
> locks outside of it.
I think I was considering it but dropped it because it looks like one
can call the other.
ovs_packet_cmd_execute() is a unique entry to ovs_execute_actions().
This one could take the lock unconditionally.
Then we have ovs_dp_process_packet() as the second entry point towards
ovs_execute_actions(), and it is the tricky one. One path originates from
netdev_frame_hook(), which is the "normal" packet-receiving path.
Then within ovs_execute_actions() there is ovs_vport_send() which could
use internal_dev_recv() for forwarding. This one throws the packet into
the networking stack so it could come back via netdev_frame_hook().
Then there is this internal forwarding via internal_dev_xmit() which
also ends up in ovs_execute_actions(). Here I don't know if this can
originate from within the recursion.
After looking at this and seeing the internal_dev_recv() I decided to
move it to within ovs_execute_actions() where the recursion check itself
is.
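As far as I can see, the call paths are (simplified):

	ovs_packet_cmd_execute()        /* OVS_PACKET_CMD_EXECUTE, process ctx */
	  -> ovs_execute_actions()

	netdev_frame_hook()             /* "normal" receive, softirq ctx */
	  -> ovs_vport_receive()
	    -> ovs_dp_process_packet()
	      -> ovs_execute_actions()
	        -> ovs_vport_send()
	          -> internal_dev_recv() /* into the stack, may come back
	                                    via netdev_frame_hook() */

	internal_dev_xmit()             /* xmit on an internal port */
	  -> ovs_vport_receive()
	    -> ovs_dp_process_packet()
	      -> ovs_execute_actions()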
> > The locking part happens only on PREEMPT_RT because !PREEMPT_RT has
> > softirqs disabled, which guarantees that there will be no preemption.
> >
> > tools/testing/selftests/net/openvswitch should cover this?
>
> It's not a comprehensive test suite, it covers some cases, but it
> doesn't test anything related to preemptions specifically.
From looking at the traces, everything originates from
netdev_frame_hook() and there is sometimes one recursion from within
ovs_execute_actions(). I haven't seen anything else.
> >> Also, the name of the struct ovs_action doesn't make a lot of sense,
> >> I'd suggest calling it pcpu_storage or something like that instead.
> >> I.e. have a more generic name as the fields inside are not directly
> >> related to each other.
> >
> > Understood. ovs_pcpu_storage maybe?
>
> It's OK, I guess, but see also a point about locking inside datapath.c
> instead and probably not needing to change anything in actions.c.
If you say that adding a lock to ovs_dp_process_packet() and another to
ovs_packet_cmd_execute() is the way to go, then I can certainly update.
However, based on what I wrote above, I am not sure.
> >> Best regards, Ilya Maximets.
Sebastian
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-13 11:50 ` Sebastian Andrzej Siewior
@ 2025-03-13 13:23 ` Ilya Maximets
2025-03-13 14:02 ` Sebastian Andrzej Siewior
2025-03-13 14:11 ` Aaron Conole
0 siblings, 2 replies; 32+ messages in thread
From: Ilya Maximets @ 2025-03-13 13:23 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: i.maximets, netdev, linux-rt-devel, dev, Eric Dumazet,
Simon Horman, Jakub Kicinski, Thomas Gleixner, Paolo Abeni,
David S. Miller, Aaron Conole, Eelco Chaudron
On 3/13/25 12:50, Sebastian Andrzej Siewior wrote:
> On 2025-03-10 17:56:09 [+0100], Ilya Maximets wrote:
>>>>> + local_lock_nested_bh(&ovs_actions.bh_lock);
>>>>
>>>> Wouldn't this cause a warning when we're in a syscall/process context?
>>>
>>> My understanding is that this is only invoked in softirq context. Did I
>>> misunderstand it?
>>
>> It can be called from the syscall/process context while processing
>> an OVS_PACKET_CMD_EXECUTE request.
>>
>>> Otherwise the this_cpu_ptr() above should complain
>>> that preemption is not disabled. And if preemption is indeed not disabled,
>>> how do you ensure that you don't get preempted after the
>>> __this_cpu_inc_return() in several tasks (at the same time), leading to
>>> exceeding the OVS_RECURSION_LIMIT?
>>
>> We disable BH in this case, so it should be safe (on non-RT). See
>> ovs_packet_cmd_execute() for more details.
>
> Yes, exactly. So if BH is disabled then local_lock_nested_bh() can
> safely acquire a per-CPU spinlock on PREEMPT_RT here. This basically
> mimics the local_bh_disable() behaviour in terms of exclusive data
> structures on a smaller scope.
OK. I missed that in_softirq() returns true when BH is disabled manually.
>
>>>>> + ovs_act->owner = current;
>>>>> + }
>>>>> +
>>>>> level = __this_cpu_inc_return(ovs_actions.exec_level);
>>>>> if (unlikely(level > OVS_RECURSION_LIMIT)) {
>>>>> net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
>>>>> @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
>>>>>
>>>>> out:
>>>>> __this_cpu_dec(ovs_actions.exec_level);
>>>>> +
>>>>> + if (level == 1) {
>>>>> + ovs_act->owner = NULL;
>>>>> + local_unlock_nested_bh(&ovs_actions.bh_lock);
>>>>> + }
>>>>
>>>> Seems dangerous to lock every time the owner changes but unlock only
>>>> once on level 1. Even if this works fine, it seems unnecessarily
>>>> complicated. Maybe it's better to just lock once before calling
>>>> ovs_execute_actions() instead?
>>>
>>> My understanding is this can be invoked recursively. That means on first
>>> invocation owner == NULL and then you acquire the lock at which point
>>> exec_level goes 0->1. On the recursive invocation owner == current and
>>> you skip the lock but exec_level goes 1 -> 2.
>>> On the return path, once level is 1 it means that the dec made it
>>> go 1 -> 0, so you unlock the lock.
>>
>> My point is: why lock here with some extra non-obvious owner-tracking
>> logic if we can lock (unconditionally?) in ovs_packet_cmd_execute() and
>> ovs_dp_process_packet() instead? We already disable BH in one of those
>> and take appropriate RCU locks in both. So, it feels like a better place
>> for the extra locking if necessary. We will also not need to move around
>> any code in actions.c if the code there is guaranteed to be safe by holding
>> locks outside of it.
>
> I think I was considering it but dropped it because it looks like one
> can call the other.
> ovs_packet_cmd_execute() is a unique entry to ovs_execute_actions().
> This one could take the lock unconditionally.
> Then we have ovs_dp_process_packet() as the second entry point towards
> ovs_execute_actions(), and it is the tricky one. One path originates from
> netdev_frame_hook(), which is the "normal" packet-receiving path.
> Then within ovs_execute_actions() there is ovs_vport_send() which could
> use internal_dev_recv() for forwarding. This one throws the packet into
> the networking stack so it could come back via netdev_frame_hook().
> Then there is this internal forwarding via internal_dev_xmit() which
> also ends up in ovs_execute_actions(). Here I don't know if this can
> originate from within the recursion.
It's true that ovs_packet_cmd_execute() cannot be re-entered, while
ovs_dp_process_packet() can be re-entered if the packet leaves OVS and
then comes back from another port. It's still better to handle all the
locking within datapath.c and not lock for RT in actions.c and for non-RT
in datapath.c.
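Roughly something like this (untested, using the ovs_pcpu_storage name
suggested earlier):

	/* ovs_packet_cmd_execute(): cannot recurse, so lock unconditionally
	 * around the execution: */
	local_lock_nested_bh(&ovs_pcpu_storage.bh_lock);
	err = ovs_execute_actions(dp, packet, sf_acts, &flow->key);
	local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock);

	/* ovs_dp_process_packet(): can be re-entered, so it would keep the
	 * owner != current check before taking the lock. */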
>
> After looking at this and seeing the internal_dev_recv() I decided to
> move it to within ovs_execute_actions() where the recursion check itself
> is.
>
>>> The locking part happens only on PREEMPT_RT because !PREEMPT_RT has
>>> softirqs disabled, which guarantees that there will be no preemption.
>>>
>>> tools/testing/selftests/net/openvswitch should cover this?
>>
>> It's not a comprehensive test suite, it covers some cases, but it
>> doesn't test anything related to preemptions specifically.
>
> From looking at the traces, everything originates from
> netdev_frame_hook() and there is sometimes one recursion from within
> ovs_execute_actions(). I haven't seen anything else.
>
>>>> Also, the name of the struct ovs_action doesn't make a lot of sense,
>>>> I'd suggest calling it pcpu_storage or something like that instead.
>>>> I.e. have a more generic name as the fields inside are not directly
>>>> related to each other.
>>>
>>> Understood. ovs_pcpu_storage maybe?
>>
>> It's OK, I guess, but see also a point about locking inside datapath.c
>> instead and probably not needing to change anything in actions.c.
>
> If you say that adding a lock to ovs_dp_process_packet() and another to
> ovs_packet_cmd_execute() is the way to go, then I can certainly update.
> However, based on what I wrote above, I am not sure.
I think it's better if we keep all the locks in datapath.c and let
actions.c assume that all the operations are always safe as it was
originally intended.
Cc: Aaron and Eelco, in case they have some thoughts on this as well.
Best regards, Ilya Maximets.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-13 13:23 ` Ilya Maximets
@ 2025-03-13 14:02 ` Sebastian Andrzej Siewior
2025-03-13 14:11 ` Aaron Conole
1 sibling, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-13 14:02 UTC (permalink / raw)
To: Ilya Maximets
Cc: netdev, linux-rt-devel, dev, Eric Dumazet, Simon Horman,
Jakub Kicinski, Thomas Gleixner, Paolo Abeni, David S. Miller,
Aaron Conole, Eelco Chaudron
On 2025-03-13 14:23:16 [+0100], Ilya Maximets wrote:
> > originate from within the recursion.
>
> It's true that ovs_packet_cmd_execute() cannot be re-entered, while
> ovs_dp_process_packet() can be re-entered if the packet leaves OVS and
> then comes back from another port. It's still better to handle all the
> locking within datapath.c and not lock for RT in actions.c and for non-RT
> in datapath.c.
Okay.
> >>>> Also, the name of the struct ovs_action doesn't make a lot of sense,
> >>>> I'd suggest to call it pcpu_storage or something like that instead.
> >>>> I.e. have a more generic name as the fields inside are not directly
> >>>> related to each other.
> >>>
> >>> Understood. ovs_pcpu_storage maybe?
> >>
> >> It's OK, I guess, but see also a point about locking inside datapath.c
> >> instead and probably not needing to change anything in actions.c.
> >
> > If you say that adding a lock to ovs_dp_process_packet() and another to
> > ovs_packet_cmd_execute() is the way to go, then I can certainly update.
> > However, based on what I wrote above, I am not sure.
>
> I think it's better if we keep all the locks in datapath.c and let
> actions.c assume that all the operations are always safe as it was
> originally intended.
If you say so. Then I will move the logic to the two callers in
datapath.c. But I would need the same recursive lock detection as I
currently have in ovs_dp_process_packet(). That means we would have the
lock in datapath.c and the data structure it protects in actions.c.
> Cc: Aaron and Eelco, in case they have some thoughts on this as well.
While at it, I would keep "openvswitch: Merge three per-CPU structures
into one." since it looks like a nice cleanup.
> Best regards, Ilya Maximets.
Sebastian
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [ovs-dev] [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions.
2025-03-13 13:23 ` Ilya Maximets
2025-03-13 14:02 ` Sebastian Andrzej Siewior
@ 2025-03-13 14:11 ` Aaron Conole
1 sibling, 0 replies; 32+ messages in thread
From: Aaron Conole @ 2025-03-13 14:11 UTC (permalink / raw)
To: Ilya Maximets
Cc: Sebastian Andrzej Siewior, netdev, linux-rt-devel, dev,
Eric Dumazet, Simon Horman, Jakub Kicinski, Thomas Gleixner,
Paolo Abeni, David S. Miller, Eelco Chaudron
Ilya Maximets <i.maximets@ovn.org> writes:
> On 3/13/25 12:50, Sebastian Andrzej Siewior wrote:
>> On 2025-03-10 17:56:09 [+0100], Ilya Maximets wrote:
>>>>>> + local_lock_nested_bh(&ovs_actions.bh_lock);
>>>>>
>>>>> Wouldn't this cause a warning when we're in a syscall/process context?
>>>>
>>>> My understanding is that this is only invoked in softirq context. Did I
>>>> misunderstand it?
>>>
>>> It can be called from the syscall/process context while processing
>>> an OVS_PACKET_CMD_EXECUTE request.
>>>
>>>> Otherwise the this_cpu_ptr() above should complain
>>>> that preemption is not disabled. And if preemption is indeed not disabled,
>>>> how do you ensure that you don't get preempted after the
>>>> __this_cpu_inc_return() in several tasks (at the same time), leading to
>>>> exceeding the OVS_RECURSION_LIMIT?
>>>
>>> We disable BH in this case, so it should be safe (on non-RT). See
>>> ovs_packet_cmd_execute() for more details.
>>
>> Yes, exactly. So if BH is disabled then local_lock_nested_bh() can
>> safely acquire a per-CPU spinlock on PREEMPT_RT here. This basically
>> mimics the local_bh_disable() behaviour in terms of exclusive data
>> structures on a smaller scope.
>
> OK. I missed that in_softirq() returns true when BH is disabled manually.
>
>>
>>>>>> + ovs_act->owner = current;
>>>>>> + }
>>>>>> +
>>>>>> level = __this_cpu_inc_return(ovs_actions.exec_level);
>>>>>> if (unlikely(level > OVS_RECURSION_LIMIT)) {
>>>>>> net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
>>>>>> @@ -1710,5 +1718,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
>>>>>>
>>>>>> out:
>>>>>> __this_cpu_dec(ovs_actions.exec_level);
>>>>>> +
>>>>>> + if (level == 1) {
>>>>>> + ovs_act->owner = NULL;
>>>>>> + local_unlock_nested_bh(&ovs_actions.bh_lock);
>>>>>> + }
>>>>>
>>>>> Seems dangerous to lock every time the owner changes but unlock only
>>>>> once on level 1. Even if this works fine, it seems unnecessarily
>>>>> complicated. Maybe it's better to just lock once before calling
>>>>> ovs_execute_actions() instead?
>>>>
>>>> My understanding is this can be invoked recursively. That means on first
>>>> invocation owner == NULL and then you acquire the lock at which point
>>>> exec_level goes 0->1. On the recursive invocation owner == current and
>>>> you skip the lock but exec_level goes 1 -> 2.
>>>> On the return path, once level is 1 it means that the dec made it
>>>> go 1 -> 0, so you unlock the lock.
>>>
>>> My point is: why lock here with some extra non-obvious owner-tracking
>>> logic if we can lock (unconditionally?) in ovs_packet_cmd_execute() and
>>> ovs_dp_process_packet() instead? We already disable BH in one of those
>>> and take appropriate RCU locks in both. So, it feels like a better place
>>> for the extra locking if necessary. We will also not need to move around
>>> any code in actions.c if the code there is guaranteed to be safe by holding
>>> locks outside of it.
>>
>> I think I was considering it but dropped it because it looks like one
>> can call the other.
>> ovs_packet_cmd_execute() is a unique entry to ovs_execute_actions().
>> This one could take the lock unconditionally.
>> Then we have ovs_dp_process_packet() as the second entry point towards
>> ovs_execute_actions(), and it is the tricky one. One path originates from
>> netdev_frame_hook(), which is the "normal" packet-receiving path.
>> Then within ovs_execute_actions() there is ovs_vport_send() which could
>> use internal_dev_recv() for forwarding. This one throws the packet into
>> the networking stack so it could come back via netdev_frame_hook().
>> Then there is this internal forwarding via internal_dev_xmit() which
>> also ends up in ovs_execute_actions(). Here I don't know if this can
>> originate from within the recursion.
Just a note that it still needs to go through `ovs_dp_process_packet()`
before it will enter the `ovs_execute_actions()` call by way of the
`ovs_vport_receive()` call. So keeping the locking in datapath.c should
not be a complex task.
> It's true that ovs_packet_cmd_execute() cannot be re-entered, while
> ovs_dp_process_packet() can be re-entered if the packet leaves OVS and
> then comes back from another port. It's still better to handle all the
> locking within datapath.c and not lock for RT in actions.c and for non-RT
> in datapath.c.
+1
>>
>> After looking at this and seeing the internal_dev_recv() I decided to
>> move it to within ovs_execute_actions() where the recursion check itself
>> is.
>>
>>>> The locking part happens only on PREEMPT_RT because !PREEMPT_RT has
>>>> softirqs disabled, which guarantees that there will be no preemption.
>>>>
>>>> tools/testing/selftests/net/openvswitch should cover this?
>>>
>>> It's not a comprehensive test suite, it covers some cases, but it
>>> doesn't test anything related to preemptions specifically.
Yes, this would be good to enhance, and the plan is to improve it as we
can. If during the course of this work you identify a nice test case,
please do feel empowered to add it.
>> From looking at the traces, everything originates from
>> netdev_frame_hook() and there is sometimes one recursion from within
>> ovs_execute_actions(). I haven't seen anything else.
>>
>>>>> Also, the name of the struct ovs_action doesn't make a lot of sense,
>>>>> I'd suggest calling it pcpu_storage or something like that instead.
>>>>> I.e. have a more generic name as the fields inside are not directly
>>>>> related to each other.
>>>>
>>>> Understood. ovs_pcpu_storage maybe?
>>>
>>> It's OK, I guess, but see also a point about locking inside datapath.c
>>> instead and probably not needing to change anything in actions.c.
>>
>> If you say that adding a lock to ovs_dp_process_packet() and another to
>> ovs_packet_cmd_execute() is the way to go, then I can certainly update.
>> However, based on what I wrote above, I am not sure.
>
> I think it's better if we keep all the locks in datapath.c and let
> actions.c assume that all the operations are always safe as it was
> originally intended.
Agreed - reading through the actions code can be complex enough without
needing to remember to do recursion checks there.
> Cc: Aaron and Eelco, in case they have some thoughts on this as well.
Thanks Ilya - I think you covered the major points.
> Best regards, Ilya Maximets.
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH net-next 12/18] openvswitch: Move ovs_frag_data_storage into the struct ovs_action.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (10 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 11/18] openvswitch: Use nested-BH locking for ovs_actions Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit Sebastian Andrzej Siewior
` (5 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Pravin B Shelar, dev
ovs_frag_data_storage is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Move ovs_frag_data_storage into the struct ovs_action which already
provides locking for the structure.
Cc: Pravin B Shelar <pshelar@ovn.org>
Cc: dev@openvswitch.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/openvswitch/actions.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c4131e04c1284..3ced7d8a946c2 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -62,8 +62,6 @@ struct ovs_frag_data {
u8 l2_data[MAX_L2_LEN];
};
-static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
-
#define DEFERRED_ACTION_FIFO_SIZE 10
#define OVS_RECURSION_LIMIT 5
#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
@@ -81,6 +79,7 @@ struct action_flow_keys {
struct ovs_action {
struct action_fifo action_fifos;
struct action_flow_keys flow_keys;
+ struct ovs_frag_data ovs_frag_data_storage;
int exec_level;
struct task_struct *owner;
local_lock_t bh_lock;
@@ -800,7 +799,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
static int ovs_vport_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
+ struct ovs_frag_data *data = this_cpu_ptr(&ovs_actions.ovs_frag_data_storage);
struct vport *vport = data->vport;
if (skb_cow_head(skb, data->l2_len) < 0) {
@@ -852,7 +851,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
- data = this_cpu_ptr(&ovs_frag_data_storage);
+ data = this_cpu_ptr(&ovs_actions.ovs_frag_data_storage);
data->dst = skb->_skb_refdst;
data->vport = vport;
data->cb = *OVS_CB(skb);
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (11 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 12/18] openvswitch: Move ovs_frag_data_storage into the struct ovs_action Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-21 17:14 ` Davide Caratti
2025-03-09 14:46 ` [PATCH net-next 14/18] net/sched: Use nested-BH locking for sch_frag_data_storage Sebastian Andrzej Siewior
` (4 subsequent siblings)
17 siblings, 1 reply; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Jamal Hadi Salim, Cong Wang, Jiri Pirko
mirred_nest_level is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers.
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/netdevice_xmit.h | 3 +++
net/sched/act_mirred.c | 28 +++++++++++++++++++++++++---
2 files changed, 28 insertions(+), 3 deletions(-)
diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 3bbbc1a9860a3..4793ec42b1faa 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -11,6 +11,9 @@ struct netdev_xmit {
#if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
u8 nf_dup_skb_recursion;
#endif
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+ u8 sched_mirred_nest;
+#endif
};
#endif
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5b38143659249..8d8cfac6cc6af 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
#define MIRRED_NEST_LIMIT 4
-static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
+
+#ifndef CONFIG_PREEMPT_RT
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
+}
+
+#else
+static u8 tcf_mirred_nest_level_inc_return(void)
+{
+ return current->net_xmit.nf_dup_skb_recursion++;
+}
+
+static void tcf_mirred_nest_level_dec(void)
+{
+ current->net_xmit.nf_dup_skb_recursion--;
+}
+#endif
static bool tcf_mirred_is_act_redirect(int action)
{
@@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
int m_eaction;
u32 blockid;
- nest_level = __this_cpu_inc_return(mirred_nest_level);
+ nest_level = tcf_mirred_nest_level_inc_return();
if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
@@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
retval);
dec_nest_level:
- __this_cpu_dec(mirred_nest_level);
+ tcf_mirred_nest_level_dec();
return retval;
}
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit.
2025-03-09 14:46 ` [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit Sebastian Andrzej Siewior
@ 2025-03-21 17:14 ` Davide Caratti
2025-04-11 13:12 ` Sebastian Andrzej Siewior
0 siblings, 1 reply; 32+ messages in thread
From: Davide Caratti @ 2025-03-21 17:14 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: netdev, linux-rt-devel, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Thomas Gleixner,
Jamal Hadi Salim, Cong Wang, Jiri Pirko
hi,
On Sun, Mar 9, 2025 at 3:48 PM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> mirred_nest_level is a per-CPU variable and relies on disabled BH for its
> locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
> this data structure requires explicit locking.
>
> Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers.
>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: Cong Wang <xiyou.wangcong@gmail.com>
> Cc: Jiri Pirko <jiri@resnulli.us>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> include/linux/netdevice_xmit.h | 3 +++
> net/sched/act_mirred.c | 28 +++++++++++++++++++++++++---
> 2 files changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
> index 3bbbc1a9860a3..4793ec42b1faa 100644
> --- a/include/linux/netdevice_xmit.h
> +++ b/include/linux/netdevice_xmit.h
> @@ -11,6 +11,9 @@ struct netdev_xmit {
> #if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
> u8 nf_dup_skb_recursion;
> #endif
> +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
> + u8 sched_mirred_nest;
> +#endif
> };
>
> #endif
> diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
> index 5b38143659249..8d8cfac6cc6af 100644
> --- a/net/sched/act_mirred.c
> +++ b/net/sched/act_mirred.c
> @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list);
> static DEFINE_SPINLOCK(mirred_list_lock);
>
> #define MIRRED_NEST_LIMIT 4
> -static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
> +
> +#ifndef CONFIG_PREEMPT_RT
> +static u8 tcf_mirred_nest_level_inc_return(void)
> +{
> + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
> +}
> +
> +static void tcf_mirred_nest_level_dec(void)
> +{
> + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
> +}
> +
> +#else
> +static u8 tcf_mirred_nest_level_inc_return(void)
> +{
> + return current->net_xmit.nf_dup_skb_recursion++;
> +}
> +
> +static void tcf_mirred_nest_level_dec(void)
> +{
> + current->net_xmit.nf_dup_skb_recursion--;
> +}
> +#endif
sorry for reviewing this late - but shouldn't we use sched_mirred_nest
instead of nf_dup_skb_recursion in case CONFIG_PREEMPT_RT is set?
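i.e. something like this (untested; note also the pre-increment, to match
the __this_cpu_inc_return() semantics above):

	#else
	static u8 tcf_mirred_nest_level_inc_return(void)
	{
		return ++current->net_xmit.sched_mirred_nest;
	}

	static void tcf_mirred_nest_level_dec(void)
	{
		current->net_xmit.sched_mirred_nest--;
	}
	#endif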
thanks,
--
davide
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit.
2025-03-21 17:14 ` Davide Caratti
@ 2025-04-11 13:12 ` Sebastian Andrzej Siewior
0 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-04-11 13:12 UTC (permalink / raw)
To: Davide Caratti
Cc: netdev, linux-rt-devel, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Thomas Gleixner,
Jamal Hadi Salim, Cong Wang, Jiri Pirko
On 2025-03-21 18:14:50 [+0100], Davide Caratti wrote:
> hi,
Hi,
> > index 5b38143659249..8d8cfac6cc6af 100644
> > --- a/net/sched/act_mirred.c
> > +++ b/net/sched/act_mirred.c
> > @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list);
…
> > +#else
> > +static u8 tcf_mirred_nest_level_inc_return(void)
> > +{
> > + return current->net_xmit.nf_dup_skb_recursion++;
> > +}
> > +
> > +static void tcf_mirred_nest_level_dec(void)
> > +{
> > + current->net_xmit.nf_dup_skb_recursion--;
> > +}
> > +#endif
>
> sorry for reviewing this late - but shouldn't we use sched_mirred_nest
> instead of nf_dup_skb_recursion in case CONFIG_PREEMPT_RT is set?
you are correct, thank you.
> thanks,
Sebastian
^ permalink raw reply [flat|nested] 32+ messages in thread
* [PATCH net-next 14/18] net/sched: Use nested-BH locking for sch_frag_data_storage.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (12 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 13/18] net/sched: act_mirred: Move the recursion counter into struct netdev_xmit Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 15/18] mptcp: Use nested-BH locking for hmac_storage Sebastian Andrzej Siewior
` (3 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Jamal Hadi Salim, Cong Wang, Jiri Pirko
sch_frag_data_storage is a per-CPU variable and relies on disabled BH
for its locking. Without per-CPU locking in local_bh_disable() on
PREEMPT_RT this data structure requires explicit locking.
Add local_lock_t to the struct and use local_lock_nested_bh() for locking.
This change adds only lockdep coverage and does not alter the functional
behaviour for !PREEMPT_RT.
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/sched/sch_frag.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index ce63414185fd6..d1d87dce7f3f7 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -16,14 +16,18 @@ struct sch_frag_data {
unsigned int l2_len;
u8 l2_data[VLAN_ETH_HLEN];
int (*xmit)(struct sk_buff *skb);
+ local_lock_t bh_lock;
};
-static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);
+ lockdep_assert_held(&data->bh_lock);
if (skb_cow_head(skb, data->l2_len) < 0) {
kfree_skb(skb);
return -ENOMEM;
@@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
struct rtable sch_frag_rt = { 0 };
unsigned long orig_dst;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
IPCB(skb)->frag_max_size = mru;
ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
unsigned long orig_dst;
struct rt6_info sch_frag_rt;
+ local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
sch_frag_prepare_frag(skb, xmit);
memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
@@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
sch_frag_xmit);
+ local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
refdst_drop(orig_dst);
} else {
net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 15/18] mptcp: Use nested-BH locking for hmac_storage.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (13 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 14/18] net/sched: Use nested-BH locking for sch_frag_data_storage Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 16/18] rds: Disable only bottom halves in rds_page_remainder_alloc() Sebastian Andrzej Siewior
` (2 subsequent siblings)
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Matthieu Baerts, Mat Martineau, Geliang Tang, mptcp
mptcp_delegated_actions is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data
structure requires explicit locking.
Add a local_lock_t to the data structure and use local_lock_nested_bh() for
locking. This change adds only lockdep coverage and does not alter the
functional behaviour for !PREEMPT_RT.
Cc: Matthieu Baerts <matttbe@kernel.org>
Cc: Mat Martineau <martineau@kernel.org>
Cc: Geliang Tang <geliang@kernel.org>
Cc: mptcp@lists.linux.dev
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/mptcp/protocol.c | 4 +++-
net/mptcp/protocol.h | 9 ++++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 6bd8190474706..922b5baf3372a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -46,7 +46,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm
static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
-DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index ad21925af0612..8705d699e9870 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -480,6 +480,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk)
struct mptcp_delegated_action {
struct napi_struct napi;
+ local_lock_t bh_lock;
struct list_head head;
};
@@ -671,9 +672,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow,
if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node)))
return;
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
delegated = this_cpu_ptr(&mptcp_delegated_actions);
schedule = list_empty(&delegated->head);
list_add_tail(&subflow->delegated_node, &delegated->head);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
sock_hold(mptcp_subflow_tcp_sock(subflow));
if (schedule)
napi_schedule(&delegated->napi);
@@ -685,11 +688,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
{
struct mptcp_subflow_context *ret;
- if (list_empty(&delegated->head))
+ local_lock_nested_bh(&mptcp_delegated_actions.bh_lock);
+ if (list_empty(&delegated->head)) {
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return NULL;
+ }
ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
list_del_init(&ret->delegated_node);
+ local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock);
return ret;
}
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 16/18] rds: Disable only bottom halves in rds_page_remainder_alloc().
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (14 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 15/18] mptcp: Use nested-BH locking for hmac_storage Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 17/18] rds: Acquire per-CPU pointer within BH disabled section Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 18/18] rds: Use nested-BH locking for rds_page_remainder Sebastian Andrzej Siewior
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Allison Henderson, linux-rdma
rds_page_remainder_alloc() is invoked from a preemptible context or a
tasklet. There is no need to disable interrupts for locking.
Use local_bh_disable() instead of local_irq_save() for locking.
Cc: Allison Henderson <allison.henderson@oracle.com>
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/rds/page.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/net/rds/page.c b/net/rds/page.c
index 7cc57e098ddb9..e0dd4f62ea47a 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -69,7 +69,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
gfp_t gfp)
{
struct rds_page_remainder *rem;
- unsigned long flags;
struct page *page;
int ret;
@@ -88,7 +87,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
rem = &per_cpu(rds_page_remainders, get_cpu());
- local_irq_save(flags);
+ local_bh_disable();
while (1) {
/* avoid a tiny region getting stuck by tossing it */
@@ -116,13 +115,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
/* alloc if there is nothing for us to use */
- local_irq_restore(flags);
+ local_bh_enable();
put_cpu();
page = alloc_page(gfp);
rem = &per_cpu(rds_page_remainders, get_cpu());
- local_irq_save(flags);
+ local_bh_disable();
if (!page) {
ret = -ENOMEM;
@@ -140,7 +139,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
rem->r_offset = 0;
}
- local_irq_restore(flags);
+ local_bh_enable();
put_cpu();
out:
rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 17/18] rds: Acquire per-CPU pointer within BH disabled section.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (15 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 16/18] rds: Disable only bottom halves in rds_page_remainder_alloc() Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
2025-03-09 14:46 ` [PATCH net-next 18/18] rds: Use nested-BH locking for rds_page_remainder Sebastian Andrzej Siewior
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Allison Henderson, linux-rdma
rds_page_remainder_alloc() obtains the current CPU with get_cpu() while
disabling preemption. Then the CPU number is used to access the per-CPU
data structure via per_cpu().
This can be optimized by relying on local_bh_disable() to provide a
stable CPU number (i.e. to avoid migration) and then using this_cpu_ptr()
to retrieve the data structure.
Cc: Allison Henderson <allison.henderson@oracle.com>
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/rds/page.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/net/rds/page.c b/net/rds/page.c
index e0dd4f62ea47a..58a8548a915a9 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -86,8 +86,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
goto out;
}
- rem = &per_cpu(rds_page_remainders, get_cpu());
local_bh_disable();
+ rem = this_cpu_ptr(&rds_page_remainders);
while (1) {
/* avoid a tiny region getting stuck by tossing it */
@@ -116,12 +116,11 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
/* alloc if there is nothing for us to use */
local_bh_enable();
- put_cpu();
page = alloc_page(gfp);
- rem = &per_cpu(rds_page_remainders, get_cpu());
local_bh_disable();
+ rem = this_cpu_ptr(&rds_page_remainders);
if (!page) {
ret = -ENOMEM;
@@ -140,7 +139,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
local_bh_enable();
- put_cpu();
out:
rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net-next 18/18] rds: Use nested-BH locking for rds_page_remainder.
2025-03-09 14:46 [PATCH net-next 00/18] net: Cover more per-CPU storage with local nested BH locking Sebastian Andrzej Siewior
` (16 preceding siblings ...)
2025-03-09 14:46 ` [PATCH net-next 17/18] rds: Acquire per-CPU pointer within BH disabled section Sebastian Andrzej Siewior
@ 2025-03-09 14:46 ` Sebastian Andrzej Siewior
17 siblings, 0 replies; 32+ messages in thread
From: Sebastian Andrzej Siewior @ 2025-03-09 14:46 UTC (permalink / raw)
To: netdev, linux-rt-devel
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Thomas Gleixner, Sebastian Andrzej Siewior,
Allison Henderson, linux-rdma
rds_page_remainder is a per-CPU variable and relies on disabled BH for its
locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT
this data structure requires explicit locking.
Add a local_lock_t to the data structure and use
local_lock_nested_bh() for locking. This change adds only lockdep
coverage and does not alter the functional behaviour for !PREEMPT_RT.
Cc: Allison Henderson <allison.henderson@oracle.com>
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/rds/page.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/net/rds/page.c b/net/rds/page.c
index 58a8548a915a9..afb151eac271c 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -40,10 +40,12 @@
struct rds_page_remainder {
struct page *r_page;
unsigned long r_offset;
+ local_lock_t bh_lock;
};
-static
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
/**
* rds_page_remainder_alloc - build up regions of a message.
@@ -87,6 +89,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
local_bh_disable();
+ local_lock_nested_bh(&rds_page_remainders.bh_lock);
rem = this_cpu_ptr(&rds_page_remainders);
while (1) {
@@ -115,11 +118,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
}
/* alloc if there is nothing for us to use */
+ local_unlock_nested_bh(&rds_page_remainders.bh_lock);
local_bh_enable();
page = alloc_page(gfp);
local_bh_disable();
+ local_lock_nested_bh(&rds_page_remainders.bh_lock);
rem = this_cpu_ptr(&rds_page_remainders);
if (!page) {
@@ -138,6 +143,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
rem->r_offset = 0;
}
+ local_unlock_nested_bh(&rds_page_remainders.bh_lock);
local_bh_enable();
out:
rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
--
2.47.2
^ permalink raw reply related [flat|nested] 32+ messages in thread