From: Eric Dumazet <edumazet@google.com>
To: "David S . Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>,
Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>,
Kuniyuki Iwashima <kuniyu@google.com>,
netdev@vger.kernel.org, eric.dumazet@gmail.com,
Eric Dumazet <edumazet@google.com>
Subject: [PATCH v3 net-next 7/7] net-sysfs: use rps_tag_ptr and remove metadata from rps_dev_flow_table
Date: Mon, 2 Mar 2026 18:14:32 +0000 [thread overview]
Message-ID: <20260302181432.1836150-8-edumazet@google.com> (raw)
In-Reply-To: <20260302181432.1836150-1-edumazet@google.com>
Instead of storing the @log at the beginning of rps_dev_flow_table,
use the 5 low-order bits of the rps_tag_ptr to store the log of the size.
This removes a potential cache line miss (for light traffic).
This allows us to switch to one high-order allocation instead of vmalloc()
when CONFIG_RFS_ACCEL is not set.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/netdev_rx_queue.h | 3 +-
include/net/rps.h | 10 -----
net/core/dev.c | 53 +++++++++++++++-----------
net/core/net-sysfs.c | 70 +++++++++++++++++------------------
4 files changed, 67 insertions(+), 69 deletions(-)
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index cfa72c4853876c6fcb84b5c551580d9205f7b29d..08f81329fc11dc86767f9da661be8c7194dc1da2 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -8,13 +8,14 @@
#include <net/xdp.h>
#include <net/page_pool/types.h>
#include <net/netdev_queues.h>
+#include <net/rps-types.h>
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_RPS
struct rps_map __rcu *rps_map;
- struct rps_dev_flow_table __rcu *rps_flow_table;
+ rps_tag_ptr rps_flow_table;
#endif
struct kobject kobj;
const struct attribute_group **groups;
diff --git a/include/net/rps.h b/include/net/rps.h
index e900480e828b487c721b3ef392f4abb427ad442c..e33c6a2fa8bbca3555ecccbbf9132d01cc433c36 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -39,16 +39,6 @@ struct rps_dev_flow {
};
#define RPS_NO_FILTER 0xffff
-/*
- * The rps_dev_flow_table structure contains a table of flow mappings.
- */
-struct rps_dev_flow_table {
- u8 log;
- struct rps_dev_flow flows[];
-};
-#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
- ((_num) * sizeof(struct rps_dev_flow)))
-
/*
* The rps_sock_flow_table contains mappings of flows to the last CPU
* on which they were processed by the application (set in recvmsg).
diff --git a/net/core/dev.c b/net/core/dev.c
index d4837b058b2ff02e94f9590e310edbcb06dad0f2..053a30a8c0ea4464d3b61c7dde8ad916eeef1c19 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4968,16 +4968,16 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
-static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
{
- return hash_32(hash, flow_table->log);
+ return hash_32(hash, rps_tag_to_log(tag_ptr));
}
#ifdef CONFIG_RFS_ACCEL
/**
* rps_flow_is_active - check whether the flow is recently active.
* @rflow: Specific flow to check activity.
- * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @log: ilog2(hashsize).
* @cpu: CPU saved in @rflow.
*
* If the CPU has processed many packets since the flow's last activity
@@ -4986,7 +4986,7 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
* Return: true if flow was recently active.
*/
static bool rps_flow_is_active(struct rps_dev_flow *rflow,
- struct rps_dev_flow_table *flow_table,
+ u8 log,
unsigned int cpu)
{
unsigned int flow_last_active;
@@ -4999,7 +4999,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow,
flow_last_active = READ_ONCE(rflow->last_qtail);
return (int)(sd_input_head - flow_last_active) <
- (int)(10 << flow_table->log);
+ (int)(10 << log);
}
#endif
@@ -5011,9 +5011,10 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
u32 head;
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
- struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *flow_table;
struct rps_dev_flow *old_rflow;
struct rps_dev_flow *tmp_rflow;
+ rps_tag_ptr q_tag_ptr;
unsigned int tmp_cpu;
u16 rxq_index;
u32 flow_id;
@@ -5028,16 +5029,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
goto out;
rxqueue = dev->_rx + rxq_index;
- flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (!flow_table)
+ q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
+ if (!q_tag_ptr)
goto out;
- flow_id = rfs_slot(hash, flow_table);
- tmp_rflow = &flow_table->flows[flow_id];
+ flow_id = rfs_slot(hash, q_tag_ptr);
+ flow_table = rps_tag_to_table(q_tag_ptr);
+ tmp_rflow = flow_table + flow_id;
tmp_cpu = READ_ONCE(tmp_rflow->cpu);
if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
- if (rps_flow_is_active(tmp_rflow, flow_table,
+ if (rps_flow_is_active(tmp_rflow,
+ rps_tag_to_log(q_tag_ptr),
tmp_cpu)) {
if (hash != READ_ONCE(tmp_rflow->hash) ||
next_cpu == tmp_cpu)
@@ -5076,8 +5079,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
struct netdev_rx_queue *rxqueue = dev->_rx;
- struct rps_dev_flow_table *flow_table;
- rps_tag_ptr global_tag_ptr;
+ rps_tag_ptr global_tag_ptr, q_tag_ptr;
struct rps_map *map;
int cpu = -1;
u32 tcpu;
@@ -5098,9 +5100,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
- flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
- if (!flow_table && !map)
+ if (!q_tag_ptr && !map)
goto done;
skb_reset_network_header(skb);
@@ -5109,8 +5111,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
goto done;
global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
- if (flow_table && global_tag_ptr) {
+ if (q_tag_ptr && global_tag_ptr) {
struct rps_sock_flow_table *sock_flow_table;
+ struct rps_dev_flow *flow_table;
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 flow_id;
@@ -5130,7 +5133,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+ flow_id = rfs_slot(hash, q_tag_ptr);
+ flow_table = rps_tag_to_table(q_tag_ptr);
+ rflow = flow_table + flow_id;
tcpu = rflow->cpu;
/*
@@ -5190,19 +5195,23 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
u32 flow_id, u16 filter_id)
{
struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
- struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *flow_table;
struct rps_dev_flow *rflow;
+ rps_tag_ptr q_tag_ptr;
bool expire = true;
+ u8 log;
rcu_read_lock();
- flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id < (1UL << flow_table->log)) {
+ q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
+ log = rps_tag_to_log(q_tag_ptr);
+ if (q_tag_ptr && flow_id < (1UL << log)) {
unsigned int cpu;
- rflow = &flow_table->flows[flow_id];
+ flow_table = rps_tag_to_table(q_tag_ptr);
+ rflow = flow_table + flow_id;
cpu = READ_ONCE(rflow->cpu);
if (READ_ONCE(rflow->filter) == filter_id &&
- rps_flow_is_active(rflow, flow_table, cpu))
+ rps_flow_is_active(rflow, log, cpu))
expire = false;
}
rcu_read_unlock();
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fd6f81930bc6437957f32206c84db87ee242fede..2ce011fae2490b3bd950cf8d9089e7d71cc0fd7a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1060,14 +1060,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
char *buf)
{
- struct rps_dev_flow_table *flow_table;
unsigned long val = 0;
+ rps_tag_ptr tag_ptr;
- rcu_read_lock();
- flow_table = rcu_dereference(queue->rps_flow_table);
- if (flow_table)
- val = 1UL << flow_table->log;
- rcu_read_unlock();
+ tag_ptr = READ_ONCE(queue->rps_flow_table);
+ if (tag_ptr)
+ val = 1UL << rps_tag_to_log(tag_ptr);
return sysfs_emit(buf, "%lu\n", val);
}
@@ -1075,8 +1073,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
const char *buf, size_t len)
{
+ rps_tag_ptr otag, tag_ptr = 0UL;
+ struct rps_dev_flow *table;
unsigned long mask, count;
- struct rps_dev_flow_table *table, *old_table;
+ size_t sz;
int rc;
if (!capable(CAP_NET_ADMIN))
@@ -1093,38 +1093,36 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
*/
while ((mask | (mask >> 1)) != mask)
mask |= (mask >> 1);
- /* On 64 bit arches, must check mask fits in table->mask (u32),
- * and on 32bit arches, must check
- * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
- */
-#if BITS_PER_LONG > 32
- if (mask > (unsigned long)(u32)mask)
- return -EINVAL;
-#else
- if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
- / sizeof(struct rps_dev_flow)) {
- /* Enforce a limit to prevent overflow */
+
+ /* Do not accept too large tables. */
+ if (mask > (INT_MAX / sizeof(*table) - 1))
return -EINVAL;
- }
-#endif
- table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+
+ sz = max_t(size_t, sizeof(*table) * (mask + 1),
+ PAGE_SIZE);
+ if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
+ is_power_of_2(sizeof(*table)))
+ table = kvmalloc(sz, GFP_KERNEL);
+ else
+ table = vmalloc(sz);
if (!table)
return -ENOMEM;
-
- table->log = ilog2(mask) + 1;
+ tag_ptr = (rps_tag_ptr)table;
+ if (rps_tag_to_log(tag_ptr)) {
+ pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n");
+ kvfree(table);
+ return -ENOMEM;
+ }
+ tag_ptr |= (ilog2(mask) + 1);
for (count = 0; count <= mask; count++) {
- table->flows[count].cpu = RPS_NO_CPU;
- table->flows[count].filter = RPS_NO_FILTER;
+ table[count].cpu = RPS_NO_CPU;
+ table[count].filter = RPS_NO_FILTER;
}
- } else {
- table = NULL;
}
- old_table = unrcu_pointer(xchg(&queue->rps_flow_table,
- RCU_INITIALIZER(table)));
-
- if (old_table)
- kvfree_rcu_mightsleep(old_table);
+ otag = xchg(&queue->rps_flow_table, tag_ptr);
+ if (otag)
+ kvfree_rcu_mightsleep(rps_tag_to_table(otag));
return len;
}
@@ -1150,7 +1148,7 @@ static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
#ifdef CONFIG_RPS
- struct rps_dev_flow_table *old_table;
+ rps_tag_ptr tag_ptr;
struct rps_map *map;
map = rcu_dereference_protected(queue->rps_map, 1);
@@ -1159,9 +1157,9 @@ static void rx_queue_release(struct kobject *kobj)
kfree_rcu(map, rcu);
}
- old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL));
- if (old_table)
- kvfree_rcu_mightsleep(old_table);
+ tag_ptr = xchg(&queue->rps_flow_table, 0UL);
+ if (tag_ptr)
+ kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr));
#endif
memset(kobj, 0, sizeof(*kobj));
--
2.53.0.473.g4a7958ca14-goog
next prev parent reply other threads:[~2026-03-02 18:14 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-02 18:14 [PATCH v3 net-next 0/7] rfs: use high-order allocations for hash tables Eric Dumazet
2026-03-02 18:14 ` [PATCH v3 net-next 1/7] net: add rps_tag_ptr type and helpers Eric Dumazet
2026-03-03 19:47 ` Kuniyuki Iwashima
2026-03-02 18:14 ` [PATCH v3 net-next 2/7] net-sysfs: remove rcu field from 'struct rps_sock_flow_table' Eric Dumazet
2026-03-03 19:24 ` Kuniyuki Iwashima
2026-03-02 18:14 ` [PATCH v3 net-next 3/7] net-sysfs: add rps_sock_flow_table_mask() helper Eric Dumazet
2026-03-03 19:28 ` Kuniyuki Iwashima
2026-03-02 18:14 ` [PATCH v3 net-next 4/7] net-sysfs: use rps_tag_ptr and remove metadata from rps_sock_flow_table Eric Dumazet
2026-03-03 19:46 ` Kuniyuki Iwashima
2026-03-02 18:14 ` [PATCH v3 net-next 5/7] net-sysfs: get rid of rps_dev_flow_lock Eric Dumazet
2026-03-03 19:48 ` Kuniyuki Iwashima
2026-03-02 18:14 ` [PATCH v3 net-next 6/7] net-sysfs: remove rcu field from 'struct rps_dev_flow_table' Eric Dumazet
2026-03-03 19:49 ` Kuniyuki Iwashima
2026-03-02 18:14 ` Eric Dumazet [this message]
2026-03-03 20:06 ` [PATCH v3 net-next 7/7] net-sysfs: use rps_tag_ptr and remove metadata from rps_dev_flow_table Kuniyuki Iwashima
2026-03-05 1:00 ` [PATCH v3 net-next 0/7] rfs: use high-order allocations for hash tables patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260302181432.1836150-8-edumazet@google.com \
--to=edumazet@google.com \
--cc=davem@davemloft.net \
--cc=eric.dumazet@gmail.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.