* [PATCH net-next] net: rfs: hash function change
@ 2025-03-21 17:13 Eric Dumazet
2025-03-24 14:16 ` Simon Horman
2025-03-25 15:30 ` patchwork-bot+netdevbpf
0 siblings, 2 replies; 5+ messages in thread
From: Eric Dumazet @ 2025-03-21 17:13 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Kuniyuki Iwashima, Neal Cardwell, Simon Horman,
netdev, eric.dumazet, Eric Dumazet, Tom Herbert
RFS is using two kinds of hash tables.
First one is controlled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
and using the N low order bits of the l4 hash is good enough.
Then each RX queue has its own hash table, controlled by
/sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X
Current hash function, using the X low order bits is suboptimal,
because RSS is usually using Func(hash) = (hash % power_of_two);
For example, with 32 RX queues, 6 low order bits have no entropy
for a given queue.
Switch this hash function to hash_32(hash, log) to increase
chances to use all possible slots and reduce collisions.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
---
include/net/rps.h | 2 +-
net/core/dev.c | 13 +++++++++----
net/core/net-sysfs.c | 4 ++--
3 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/include/net/rps.h b/include/net/rps.h
index a93401d23d66e45210acc73f0326087813b69d59..e358e9711f27523534fdf4cbf57729cbdf629b8a 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -39,7 +39,7 @@ struct rps_dev_flow {
* The rps_dev_flow_table structure contains a table of flow mappings.
*/
struct rps_dev_flow_table {
- unsigned int mask;
+ u8 log;
struct rcu_head rcu;
struct rps_dev_flow flows[];
};
diff --git a/net/core/dev.c b/net/core/dev.c
index 2355603417650fe10d075c8e85416a488e00626d..c5bf3b1684fe6dc7141b1c341b14423111fe8894 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4731,6 +4731,11 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
@@ -4757,7 +4762,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+ flow_id = rfs_slot(skb_get_hash(skb), flow_table);
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -4836,7 +4841,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/*
@@ -4903,13 +4908,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
READ_ONCE(rflow->last_qtail)) <
- (int)(10 * flow_table->mask)))
+ (int)(10 << flow_table->log)))
expire = false;
}
rcu_read_unlock();
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index abaa1c919b984cb3340e99bf1af26864a0cc3405..b6fbe629ccee1ea874a796c6146e4a6e8296c5dc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1056,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = (unsigned long)flow_table->mask + 1;
+ val = 1UL << flow_table->log;
rcu_read_unlock();
return sysfs_emit(buf, "%lu\n", val);
@@ -1109,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
if (!table)
return -ENOMEM;
- table->mask = mask;
+ table->log = ilog2(mask) + 1;
for (count = 0; count <= mask; count++)
table->flows[count].cpu = RPS_NO_CPU;
} else {
--
2.49.0.395.g12beb8f557-goog
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH net-next] net: rfs: hash function change
2025-03-21 17:13 [PATCH net-next] net: rfs: hash function change Eric Dumazet
@ 2025-03-24 14:16 ` Simon Horman
2025-03-24 14:35 ` Eric Dumazet
2025-03-25 15:30 ` patchwork-bot+netdevbpf
1 sibling, 1 reply; 5+ messages in thread
From: Simon Horman @ 2025-03-24 14:16 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet,
Tom Herbert
On Fri, Mar 21, 2025 at 05:13:09PM +0000, Eric Dumazet wrote:
> RFS is using two kinds of hash tables.
>
> First one is controled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
> and using the N low order bits of the l4 hash is good enough.
>
> Then each RX queue has its own hash table, controled by
> /sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X
>
> Current hash function, using the X low order bits is suboptimal,
> because RSS is usually using Func(hash) = (hash % power_of_two);
>
> For example, with 32 RX queues, 6 low order bits have no entropy
> for a given queue.
>
> Switch this hash function to hash_32(hash, log) to increase
> chances to use all possible slots and reduce collisions.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Tom Herbert <tom@herbertland.com>
Reviewed-by: Simon Horman <horms@kernel.org>
...
> @@ -4903,13 +4908,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
>
> rcu_read_lock();
> flow_table = rcu_dereference(rxqueue->rps_flow_table);
> - if (flow_table && flow_id <= flow_table->mask) {
> + if (flow_table && flow_id < (1UL << flow_table->log)) {
> rflow = &flow_table->flows[flow_id];
> cpu = READ_ONCE(rflow->cpu);
> if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
> ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
> READ_ONCE(rflow->last_qtail)) <
> - (int)(10 * flow_table->mask)))
> + (int)(10 << flow_table->log)))
I am assuming that we don't care that (10 * flow_table->mask) and
(10 << flow_table->log) are close but not exactly the same.
e.g. mask = 0x3f => log = 6
10 * 0x3f = 630
10 << 6 = 640
> expire = false;
> }
> rcu_read_unlock();
...
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH net-next] net: rfs: hash function change
2025-03-24 14:16 ` Simon Horman
@ 2025-03-24 14:35 ` Eric Dumazet
2025-03-25 15:14 ` Simon Horman
0 siblings, 1 reply; 5+ messages in thread
From: Eric Dumazet @ 2025-03-24 14:35 UTC (permalink / raw)
To: Simon Horman
Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet,
Tom Herbert
On Mon, Mar 24, 2025 at 3:16 PM Simon Horman <horms@kernel.org> wrote:
>
> On Fri, Mar 21, 2025 at 05:13:09PM +0000, Eric Dumazet wrote:
> > RFS is using two kinds of hash tables.
> >
> > First one is controled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
> > and using the N low order bits of the l4 hash is good enough.
> >
> > Then each RX queue has its own hash table, controled by
> > /sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X
> >
> > Current hash function, using the X low order bits is suboptimal,
> > because RSS is usually using Func(hash) = (hash % power_of_two);
> >
> > For example, with 32 RX queues, 6 low order bits have no entropy
> > for a given queue.
> >
> > Switch this hash function to hash_32(hash, log) to increase
> > chances to use all possible slots and reduce collisions.
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Cc: Tom Herbert <tom@herbertland.com>
>
> Reviewed-by: Simon Horman <horms@kernel.org>
>
> ...
>
> > @@ -4903,13 +4908,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
> >
> > rcu_read_lock();
> > flow_table = rcu_dereference(rxqueue->rps_flow_table);
> > - if (flow_table && flow_id <= flow_table->mask) {
> > + if (flow_table && flow_id < (1UL << flow_table->log)) {
> > rflow = &flow_table->flows[flow_id];
> > cpu = READ_ONCE(rflow->cpu);
> > if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
> > ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
> > READ_ONCE(rflow->last_qtail)) <
> > - (int)(10 * flow_table->mask)))
> > + (int)(10 << flow_table->log)))
>
> I am assuming that we don't care that (10 * flow_table->mask) and
> (10 << flow_table->log) are close but not exactly the same.
>
> e.g. mask = 0x3f => log = 6
Yes, I doubt we care.
The 10 constant seems quite arbitrary anyway.
We also could keep both fields in flow_table : ->log and ->mask
I chose to remove ->mask mostly to detect all places needing scrutiny
for the hash function change.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH net-next] net: rfs: hash function change
2025-03-24 14:35 ` Eric Dumazet
@ 2025-03-25 15:14 ` Simon Horman
0 siblings, 0 replies; 5+ messages in thread
From: Simon Horman @ 2025-03-25 15:14 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Kuniyuki Iwashima, Neal Cardwell, netdev, eric.dumazet,
Tom Herbert
On Mon, Mar 24, 2025 at 03:35:39PM +0100, Eric Dumazet wrote:
> On Mon, Mar 24, 2025 at 3:16 PM Simon Horman <horms@kernel.org> wrote:
> >
> > On Fri, Mar 21, 2025 at 05:13:09PM +0000, Eric Dumazet wrote:
> > > RFS is using two kinds of hash tables.
> > >
> > > First one is controled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
> > > and using the N low order bits of the l4 hash is good enough.
> > >
> > > Then each RX queue has its own hash table, controled by
> > > /sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X
> > >
> > > Current hash function, using the X low order bits is suboptimal,
> > > because RSS is usually using Func(hash) = (hash % power_of_two);
> > >
> > > For example, with 32 RX queues, 6 low order bits have no entropy
> > > for a given queue.
> > >
> > > Switch this hash function to hash_32(hash, log) to increase
> > > chances to use all possible slots and reduce collisions.
> > >
> > > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > > Cc: Tom Herbert <tom@herbertland.com>
> >
> > Reviewed-by: Simon Horman <horms@kernel.org>
> >
> > ...
> >
> > > @@ -4903,13 +4908,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
> > >
> > > rcu_read_lock();
> > > flow_table = rcu_dereference(rxqueue->rps_flow_table);
> > > - if (flow_table && flow_id <= flow_table->mask) {
> > > + if (flow_table && flow_id < (1UL << flow_table->log)) {
> > > rflow = &flow_table->flows[flow_id];
> > > cpu = READ_ONCE(rflow->cpu);
> > > if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
> > > ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
> > > READ_ONCE(rflow->last_qtail)) <
> > > - (int)(10 * flow_table->mask)))
> > > + (int)(10 << flow_table->log)))
> >
> > I am assuming that we don't care that (10 * flow_table->mask) and
> > (10 << flow_table->log) are close but not exactly the same.
> >
> > e.g. mask = 0x3f => log = 6
>
> Yes, I doubt we care.
> The 10 constant seems quite arbitrary anyway.
>
> We also could keep both fields in flow_table : ->log and ->mask
>
> I chose to remove ->mask mostly to detect all places needing scrutiny
> for the hash function change.
Thanks for the clarification. I agree that 10 seems arbitrary
and that it is nice to remove ->mask entirely.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH net-next] net: rfs: hash function change
2025-03-21 17:13 [PATCH net-next] net: rfs: hash function change Eric Dumazet
2025-03-24 14:16 ` Simon Horman
@ 2025-03-25 15:30 ` patchwork-bot+netdevbpf
1 sibling, 0 replies; 5+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-03-25 15:30 UTC (permalink / raw)
To: Eric Dumazet
Cc: davem, kuba, pabeni, willemb, kuniyu, ncardwell, horms, netdev,
eric.dumazet, tom
Hello:
This patch was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Fri, 21 Mar 2025 17:13:09 +0000 you wrote:
> RFS is using two kinds of hash tables.
>
> First one is controled by /proc/sys/net/core/rps_sock_flow_entries = 2^N
> and using the N low order bits of the l4 hash is good enough.
>
> Then each RX queue has its own hash table, controled by
> /sys/class/net/eth1/queues/rx-$q/rps_flow_cnt = 2^X
>
> [...]
Here is the summary with links:
- [net-next] net: rfs: hash function change
https://git.kernel.org/netdev/net-next/c/f3483c8e1da6
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-03-25 15:30 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-21 17:13 [PATCH net-next] net: rfs: hash function change Eric Dumazet
2025-03-24 14:16 ` Simon Horman
2025-03-24 14:35 ` Eric Dumazet
2025-03-25 15:14 ` Simon Horman
2025-03-25 15:30 ` patchwork-bot+netdevbpf
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).