* [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity
@ 2026-04-28 2:54 Chuang Wang
2026-04-28 5:09 ` Eric Dumazet
0 siblings, 1 reply; 2+ messages in thread
From: Chuang Wang @ 2026-04-28 2:54 UTC (permalink / raw)
Cc: Chuang Wang, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Stanislav Fomichev, Kuniyuki Iwashima,
Samiullah Khawaja, Hangbin Liu, Krishna Kumar, Neal Cardwell,
Martin KaFai Lau, netdev, linux-kernel
The current implementation of rps_record_sock_flow() updates the flow
table every time a socket is processed on a different CPU. In high-load
scenarios, especially with Accelerated RFS (ARFS), this triggers
frequent flow steering updates via ndo_rx_flow_steer.
For drivers like mlx5 that implement hardware flow steering, these
constant updates lead to significant contention on internal driver locks
(e.g., arfs_lock). This contention often becomes a performance
bottleneck that outweighs the steering benefits.
This patch introduces a cache-aware update strategy: the flow record is
only updated if the flow migrates across Last Level Cache (LLC)
boundaries. This minimizes expensive hardware reconfigurations while
preserving cache locality for the application. A new sysctl,
net.core.rps_feat_llc_affinity, is added to toggle this feature.
Performance Test Results:
The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.
rpc_press Commands:
for i in {1..8}; do
./rpc_press -proto=./echo.proto -method=example.EchoService.Echo
-server=<IP>:8000 -input='{"message":"hello"}'
-qps=0 -thread_num=512 -connection_type=pooled &
done
Monitor mlx5e_rx_flow_steer frequency:
/usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer
Frequency of mlx5e_rx_flow_steer (via funccount[2]):
Before: ~335,000 counts/sec
After: ~23,000 counts/sec (reduced by ~93%)
System Metrics (after enabling rps_feat_llc_affinity):
CPU Utilization: 38% -> 32%
CPU PSI (Pressure Stall Information): 20% -> 10%
These results demonstrate that filtering updates by LLC affinity
significantly reduces driver lock contention and improves overall
CPU efficiency under heavy network load.
[1] https://github.com/apache/brpc/
[2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py
Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
---
v2 -> v3: patch net -> net-next
v1 -> v2: add rps_feat_llc_affinity; add brpc tests
include/net/rps.h | 18 ++--------
net/core/dev.c | 72 ++++++++++++++++++++++++++++++++++++++
net/core/sysctl_net_core.c | 34 ++++++++++++++++++
3 files changed, 108 insertions(+), 16 deletions(-)
diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bb..37bbb7009c36 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -12,6 +12,7 @@
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
+extern struct static_key_false rps_feat_llc_affinity;
/*
* This structure holds an RPS map which can be of variable length. The
@@ -55,22 +56,7 @@ struct rps_sock_flow_table {
#define RPS_NO_CPU 0xffff
-static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
-{
- unsigned int index = hash & rps_tag_to_mask(tag_ptr);
- u32 val = hash & ~net_hotdata.rps_cpu_mask;
- struct rps_sock_flow_table *table;
-
- /* We only give a hint, preemption can change CPU under us */
- val |= raw_smp_processor_id();
-
- table = rps_tag_to_table(tag_ptr);
- /* The following WRITE_ONCE() is paired with the READ_ONCE()
- * here, and another one in get_rps_cpu().
- */
- if (READ_ONCE(table[index].ent) != val)
- WRITE_ONCE(table[index].ent, val);
-}
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
static inline void _sock_rps_record_flow_hash(__u32 hash)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index 203dc36aaed5..630a7f21d8de 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4964,6 +4964,8 @@ struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+struct static_key_false rps_feat_llc_affinity __read_mostly;
+EXPORT_SYMBOL(rps_feat_llc_affinity);
static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
{
@@ -5175,6 +5177,76 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
return cpu;
}
+/**
+ * rps_record_cond - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value
+ * @new_val: Target flow record value
+ *
+ * Returns true if the record needs an update.
+ */
+/**
+ * rps_record_cond - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value (hash bits | recorded CPU id)
+ * @new_val: Target flow record value (hash bits | current CPU id)
+ *
+ * Returns true if the record needs an update.
+ */
+static inline bool rps_record_cond(u32 old_val, u32 new_val)
+{
+	/*
+	 * The CPU id lives in the low bits of the record, selected by
+	 * net_hotdata.rps_cpu_mask (see rps_record_sock_flow(), which does
+	 * "val |= raw_smp_processor_id()", and get_rps_cpu(), which reads it
+	 * back with "ident & net_hotdata.rps_cpu_mask").  Masking with the
+	 * complement would extract the hash bits instead, making every
+	 * check below operate on a bogus "CPU" number and forcing an
+	 * update on nearly every call.
+	 */
+	u32 old_cpu = old_val & net_hotdata.rps_cpu_mask;
+	u32 new_cpu = new_val & net_hotdata.rps_cpu_mask;
+
+	if (old_val == new_val)
+		return false;
+
+	/*
+	 * RPS LLC Affinity Feature:
+	 * Reduce RFS/ARFS flow updates by checking LLC affinity.
+	 *
+	 * Frequent flow table updates can trigger constant hardware steering
+	 * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
+	 * contention on driver internal locks (like mlx5's arfs_lock).
+	 *
+	 * This strategy only updates the flow record if it migrates across LLC
+	 * boundaries. This minimizes expensive hardware updates while preserving
+	 * cache locality for the application.
+	 */
+	if (static_branch_unlikely(&rps_feat_llc_affinity)) {
+		/* Force update if the recorded CPU is invalid or has gone offline */
+		if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
+			return true;
+
+		/*
+		 * Force an update if the current task is no longer permitted
+		 * to run on the old_cpu.
+		 */
+		if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
+			return true;
+
+		/*
+		 * If CPUs do not share a cache, allow the update to prevent
+		 * expensive remote memory accesses and cache misses.
+		 */
+		if (!cpus_share_cache(old_cpu, new_cpu))
+			return true;
+
+		return false;
+	}
+
+	return true;
+}
+
+void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
+{
+ unsigned int index = hash & rps_tag_to_mask(tag_ptr);
+ u32 val = hash & ~net_hotdata.rps_cpu_mask;
+ struct rps_sock_flow_table *table;
+
+ /* We only give a hint, preemption can change CPU under us */
+ val |= raw_smp_processor_id();
+
+ table = rps_tag_to_table(tag_ptr);
+ /* The following WRITE_ONCE() is paired with the READ_ONCE()
+ * here, and another one in get_rps_cpu().
+ */
+ if (rps_record_cond(READ_ONCE(table[index].ent), val))
+ WRITE_ONCE(table[index].ent, val);
+}
+EXPORT_SYMBOL(rps_record_sock_flow);
+
#ifdef CONFIG_RFS_ACCEL
/**
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 502705e04649..dbc99aea7bb0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -210,6 +210,32 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
kvfree_rcu_mightsleep(tofree);
return ret;
}
+
+static int rps_feat_llc_affinity_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ u8 curr_state;
+ int ret;
+ const struct ctl_table tmp = {
+ .data = &curr_state,
+ .maxlen = sizeof(curr_state),
+ .mode = table->mode,
+ .extra1 = table->extra1,
+ .extra2 = table->extra2
+ };
+
+ curr_state = static_branch_unlikely(&rps_feat_llc_affinity) ? 1 : 0;
+
+ ret = proc_dou8vec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && ret == 0) {
+ if (curr_state && !static_branch_unlikely(&rps_feat_llc_affinity))
+ static_branch_enable(&rps_feat_llc_affinity);
+ else if (!curr_state && static_branch_unlikely(&rps_feat_llc_affinity))
+ static_branch_disable(&rps_feat_llc_affinity);
+ }
+
+ return ret;
+}
#endif /* CONFIG_RPS */
#ifdef CONFIG_NET_FLOW_LIMIT
@@ -531,6 +557,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = rps_sock_flow_sysctl
},
+ {
+ .procname = "rps_feat_llc_affinity",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = rps_feat_llc_affinity_sysctl,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
{
--
2.47.3
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity
2026-04-28 2:54 [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity Chuang Wang
@ 2026-04-28 5:09 ` Eric Dumazet
0 siblings, 0 replies; 2+ messages in thread
From: Eric Dumazet @ 2026-04-28 5:09 UTC (permalink / raw)
To: Chuang Wang
Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
Stanislav Fomichev, Kuniyuki Iwashima, Samiullah Khawaja,
Hangbin Liu, Krishna Kumar, Neal Cardwell, Martin KaFai Lau,
netdev, linux-kernel
On Mon, Apr 27, 2026 at 7:56 PM Chuang Wang <nashuiliang@gmail.com> wrote:
>
> The current implementation of rps_record_sock_flow() updates the flow
> table every time a socket is processed on a different CPU. In high-load
> scenarios, especially with Accelerated RFS (ARFS), this triggers
> frequent flow steering updates via ndo_rx_flow_steer.
>
> For drivers like mlx5 that implement hardware flow steering, these
> constant updates lead to significant contention on internal driver locks
> (e.g., arfs_lock). This contention often becomes a performance
> bottleneck that outweighs the steering benefits.
>
> This patch introduces a cache-aware update strategy: the flow record is
> only updated if the flow migrates across Last Level Cache (LLC)
> boundaries. This minimizes expensive hardware reconfigurations while
> preserving cache locality for the application. A new sysctl,
> net.core.rps_feat_llc_affinity, is added to toggle this feature.
>
> Performance Test Results:
> The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
> with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.
>
> rpc_press Commands:
>
> for i in {1..8}; do
> ./rpc_press -proto=./echo.proto -method=example.EchoService.Echo
> -server=<IP>:8000 -input='{"message":"hello"}'
> -qps=0 -thread_num=512 -connection_type=pooled &
> done
>
> Monitor mlx5e_rx_flow_steer frequency:
>
> /usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer
>
> Frequency of mlx5e_rx_flow_steer (via funccount[2]):
>
> Before: ~335,000 counts/sec
> After: ~23,000 counts/sec (reduced by ~93%)
>
> System Metrics (after enabling rps_feat_llc_affinity):
>
> CPU Utilization: 38% -> 32%
> CPU PSI (Pressure Stall Information): 20% -> 10%
>
> These results demonstrate that filtering updates by LLC affinity
> significantly reduces driver lock contention and improves overall
> CPU efficiency under heavy network load.
>
> [1] https://github.com/apache/brpc/
> [2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py
>
> Signed-off-by: Chuang Wang <nashuiliang@gmail.com>
> ---
> v2 -> v3: patch net -> net-next
> v1 -> v2: add rps_feat_llc_affinity; add brpc tests
>
> include/net/rps.h | 18 ++--------
> net/core/dev.c | 72 ++++++++++++++++++++++++++++++++++++++
> net/core/sysctl_net_core.c | 34 ++++++++++++++++++
> 3 files changed, 108 insertions(+), 16 deletions(-)
>
> diff --git a/include/net/rps.h b/include/net/rps.h
> index e33c6a2fa8bb..37bbb7009c36 100644
> --- a/include/net/rps.h
> +++ b/include/net/rps.h
> @@ -12,6 +12,7 @@
>
> extern struct static_key_false rps_needed;
> extern struct static_key_false rfs_needed;
> +extern struct static_key_false rps_feat_llc_affinity;
>
> /*
> * This structure holds an RPS map which can be of variable length. The
> @@ -55,22 +56,7 @@ struct rps_sock_flow_table {
>
> #define RPS_NO_CPU 0xffff
>
> -static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> -{
> - unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> - u32 val = hash & ~net_hotdata.rps_cpu_mask;
> - struct rps_sock_flow_table *table;
> -
> - /* We only give a hint, preemption can change CPU under us */
> - val |= raw_smp_processor_id();
> -
> - table = rps_tag_to_table(tag_ptr);
> - /* The following WRITE_ONCE() is paired with the READ_ONCE()
> - * here, and another one in get_rps_cpu().
> - */
> - if (READ_ONCE(table[index].ent) != val)
> - WRITE_ONCE(table[index].ent, val);
> -}
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash);
>
> static inline void _sock_rps_record_flow_hash(__u32 hash)
> {
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 203dc36aaed5..630a7f21d8de 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4964,6 +4964,8 @@ struct static_key_false rps_needed __read_mostly;
> EXPORT_SYMBOL(rps_needed);
> struct static_key_false rfs_needed __read_mostly;
> EXPORT_SYMBOL(rfs_needed);
> +struct static_key_false rps_feat_llc_affinity __read_mostly;
> +EXPORT_SYMBOL(rps_feat_llc_affinity);
>
> static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
> {
> @@ -5175,6 +5177,76 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
> return cpu;
> }
>
> +/**
> + * rps_record_cond - Determine if RPS flow table should be updated
> + * @old_val: Previous flow record value
> + * @new_val: Target flow record value
> + *
> + * Returns true if the record needs an update.
> + */
> +static inline bool rps_record_cond(u32 old_val, u32 new_val)
> +{
> + u32 old_cpu = old_val & ~net_hotdata.rps_cpu_mask;
> + u32 new_cpu = new_val & ~net_hotdata.rps_cpu_mask;
> +
> + if (old_val == new_val)
> + return false;
> +
> + /*
> + * RPS LLC Affinity Feature:
> + * Reduce RFS/ARFS flow updates by checking LLC affinity.
> + *
> + * Frequent flow table updates can trigger constant hardware steering
> + * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
> + * contention on driver internal locks (like mlx5's arfs_lock).
> + *
> + * This strategy only updates the flow record if it migrates across LLC
> + * boundaries. This minimizes expensive hardware updates while preserving
> + * cache locality for the application.
> + */
> + if (static_branch_unlikely(&rps_feat_llc_affinity)) {
> + /* Force update if the recorded CPU is invalid or has gone offline */
> + if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
> + return true;
> +
> + /*
> + * Force an update if the current task is no longer permitted
> + * to run on the old_cpu.
> + */
> + if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
> + return true;
> +
> + /*
> + * If CPUs do not share a cache, allow the update to prevent
> + * expensive remote memory accesses and cache misses.
> + */
> + if (!cpus_share_cache(old_cpu, new_cpu))
> + return true;
> +
> + return false;
> + }
> +
> + return true;
> +}
> +
> +void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
> +{
> + unsigned int index = hash & rps_tag_to_mask(tag_ptr);
> + u32 val = hash & ~net_hotdata.rps_cpu_mask;
> + struct rps_sock_flow_table *table;
> +
> + /* We only give a hint, preemption can change CPU under us */
> + val |= raw_smp_processor_id();
> +
> + table = rps_tag_to_table(tag_ptr);
> + /* The following WRITE_ONCE() is paired with the READ_ONCE()
> + * here, and another one in get_rps_cpu().
> + */
> + if (rps_record_cond(READ_ONCE(table[index].ent), val))
> + WRITE_ONCE(table[index].ent, val);
> +}
> +EXPORT_SYMBOL(rps_record_sock_flow);
We do not want to put rps_record_sock_flow out of line.
rps_llc_check() is probably fine, it should not be called often.
diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bbca3555ecccbbf9132d01cc433c36..7e98918d8751eb824b7057cca9e5d40c28e5f18a
100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -55,10 +55,12 @@ struct rps_sock_flow_table {
#define RPS_NO_CPU 0xffff
+bool rps_llc_check(u32 old_val, u32 new_val);
+
static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
{
unsigned int index = hash & rps_tag_to_mask(tag_ptr);
- u32 val = hash & ~net_hotdata.rps_cpu_mask;
+ u32 old_val, val = hash & ~net_hotdata.rps_cpu_mask;
struct rps_sock_flow_table *table;
/* We only give a hint, preemption can change CPU under us */
@@ -68,7 +70,8 @@ static inline void rps_record_sock_flow(rps_tag_ptr
tag_ptr, u32 hash)
/* The following WRITE_ONCE() is paired with the READ_ONCE()
* here, and another one in get_rps_cpu().
*/
- if (READ_ONCE(table[index].ent) != val)
+ old_val = READ_ONCE(table[index].ent);
+ if (old_val != val && rps_llc_check(old_val, val))
WRITE_ONCE(table[index].ent, val);
}
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-04-28 5:09 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-28 2:54 [PATCH net-next v3] net: reduce RFS/ARFS flow updates by checking LLC affinity Chuang Wang
2026-04-28 5:09 ` Eric Dumazet
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox