* [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
@ 2025-11-10 9:44 Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 01/10] net_sched: make room for (struct qdisc_skb_cb)->pkt_segs Eric Dumazet
` (10 more replies)
0 siblings, 11 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:44 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while the qdisc spinlock is held.
The idea is to cache gso_segs at enqueue time, before the spinlock is
acquired, in the first skb cache line, where we already have
qdisc_skb_cb(skb)->pkt_len.
This series gives an 8% improvement in a TX intensive workload.
(120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
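For readers skimming the series, a rough sketch of the idea follows
(simplified; the two helpers below are illustrative stand-ins, the real
changes live in qdisc_pkt_len_segs_init() and bstats_update() later in
the series):

/* Enqueue path, before the qdisc spinlock is taken: dereference the
 * (possibly cold) shinfo cache line once and cache the segment count
 * in the first skb->cb cache line, next to qdisc_skb_cb(skb)->pkt_len.
 */
static void cache_pkt_segs(struct sk_buff *skb)
{
	qdisc_skb_cb(skb)->pkt_segs = skb_is_gso(skb) ?
				      skb_shinfo(skb)->gso_segs : 1;
}

/* Dequeue path, under the qdisc spinlock: only the already hot first
 * cb cache line is read, no skb_shinfo() dereference needed.
 */
static void stats_update_sketch(struct gnet_stats_basic_sync *bstats,
				const struct sk_buff *skb)
{
	_bstats_update(bstats, qdisc_pkt_len(skb),
		       qdisc_skb_cb(skb)->pkt_segs);
}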
Eric Dumazet (10):
net_sched: make room for (struct qdisc_skb_cb)->pkt_segs
net: init shinfo->gso_segs from qdisc_pkt_len_init()
net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in
qdisc_pkt_len_init()
net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update()
net_sched: cake: use qdisc_pkt_segs()
net_sched: add Qdisc_read_mostly and Qdisc_write groups
net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb()
net_sched: sch_fq: prefetch one skb ahead in dequeue()
net: prefetch skb->priority in __dev_xmit_skb()
net: annotate a data-race in __dev_xmit_skb()
include/net/sch_generic.h | 60 ++++++++++++++++++++++++---------------
net/core/dev.c | 23 ++++++++++-----
net/sched/act_ct.c | 8 +++---
net/sched/cls_api.c | 6 ++--
net/sched/cls_flower.c | 2 +-
net/sched/sch_cake.c | 13 +++------
net/sched/sch_dualpi2.c | 1 +
net/sched/sch_fq.c | 9 ++++--
net/sched/sch_netem.c | 1 +
net/sched/sch_qfq.c | 2 +-
net/sched/sch_taprio.c | 1 +
net/sched/sch_tbf.c | 1 +
12 files changed, 76 insertions(+), 51 deletions(-)
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH net-next 01/10] net_sched: make room for (struct qdisc_skb_cb)->pkt_segs
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
@ 2025-11-10 9:44 ` Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 02/10] net: init shinfo->gso_segs from qdisc_pkt_len_init() Eric Dumazet
` (9 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:44 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Add a new u16 field, pkt_segs, next to pkt_len.
It will cache shinfo->gso_segs to speed up qdisc dequeue().
Move slave_dev_queue_mapping to the end of qdisc_skb_cb,
and move three bits from tc_skb_cb:
- post_ct
- post_ct_snat
- post_ct_dnat
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sch_generic.h | 18 +++++++++---------
net/core/dev.c | 2 +-
net/sched/act_ct.c | 8 ++++----
net/sched/cls_api.c | 6 +++---
net/sched/cls_flower.c | 2 +-
5 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 94966692ccdf51db085c236319705aecba8c30cf..9cd8b5d4b23698fd8959ef40c303468e31c1d4af 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -429,13 +429,16 @@ struct tcf_proto {
};
struct qdisc_skb_cb {
- struct {
- unsigned int pkt_len;
- u16 slave_dev_queue_mapping;
- u16 tc_classid;
- };
+ unsigned int pkt_len;
+ u16 pkt_segs;
+ u16 tc_classid;
#define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN];
+
+ u16 slave_dev_queue_mapping;
+ u8 post_ct:1;
+ u8 post_ct_snat:1;
+ u8 post_ct_dnat:1;
};
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
@@ -1064,11 +1067,8 @@ struct tc_skb_cb {
struct qdisc_skb_cb qdisc_cb;
u32 drop_reason;
- u16 zone; /* Only valid if post_ct = true */
+ u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */
u16 mru;
- u8 post_ct:1;
- u8 post_ct_snat:1;
- u8 post_ct_dnat:1;
};
static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
diff --git a/net/core/dev.c b/net/core/dev.c
index 69515edd17bc6a157046f31b3dd343a59ae192ab..46ce6c6107805132b1322128e86634eca91e3340 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
return ret;
tc_skb_cb(skb)->mru = 0;
- tc_skb_cb(skb)->post_ct = false;
+ qdisc_skb_cb(skb)->post_ct = false;
tcf_set_drop_reason(skb, *drop_reason);
mini_qdisc_bstats_cpu_update(miniq, skb);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6749a4a9a9cd0a43897fcd20d228721ce057cb88..2b6ac7069dc168da2c534bddc5d4398e5e7a18c4 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb,
return err & NF_VERDICT_MASK;
if (action & BIT(NF_NAT_MANIP_SRC))
- tc_skb_cb(skb)->post_ct_snat = 1;
+ qdisc_skb_cb(skb)->post_ct_snat = 1;
if (action & BIT(NF_NAT_MANIP_DST))
- tc_skb_cb(skb)->post_ct_dnat = 1;
+ qdisc_skb_cb(skb)->post_ct_dnat = 1;
return err;
#else
@@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
tcf_action_update_bstats(&c->common, skb);
if (clear) {
- tc_skb_cb(skb)->post_ct = false;
+ qdisc_skb_cb(skb)->post_ct = false;
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
nf_ct_put(ct);
@@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
out_push:
skb_push_rcsum(skb, nh_ofs);
- tc_skb_cb(skb)->post_ct = true;
+ qdisc_skb_cb(skb)->post_ct = true;
tc_skb_cb(skb)->zone = p->zone;
out_clear:
if (defrag)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f751cd5eeac8d72b4c4d138f45d25a8ba62fb1bd..ebca4b926dcf76daa3abb8ffe221503e33de30e3 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb,
}
ext->chain = last_executed_chain;
ext->mru = cb->mru;
- ext->post_ct = cb->post_ct;
- ext->post_ct_snat = cb->post_ct_snat;
- ext->post_ct_dnat = cb->post_ct_dnat;
+ ext->post_ct = qdisc_skb_cb(skb)->post_ct;
+ ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat;
+ ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat;
ext->zone = cb->zone;
}
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 099ff6a3e1f516a50cfac578666f6d5f4fbe8f29..7669371c1354c27ede83c2c83aaea5c0402e6552 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb,
struct tcf_result *res)
{
struct cls_fl_head *head = rcu_dereference_bh(tp->root);
- bool post_ct = tc_skb_cb(skb)->post_ct;
+ bool post_ct = qdisc_skb_cb(skb)->post_ct;
u16 zone = tc_skb_cb(skb)->zone;
struct fl_flow_key skb_key;
struct fl_flow_mask *mask;
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 02/10] net: init shinfo->gso_segs from qdisc_pkt_len_init()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 01/10] net_sched: make room for (struct qdisc_skb_cb)->pkt_segs Eric Dumazet
@ 2025-11-10 9:44 ` Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 03/10] net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in qdisc_pkt_len_init() Eric Dumazet
` (8 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:44 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Qdiscs use shinfo->gso_segs for their packet stats in bstats_update(),
but this field needs to be initialized for SKB_GSO_DODGY users
(GSO packets coming from untrusted sources), for which
qdisc_pkt_len_init() already recomputes a segment count: store that
result back into shinfo->gso_segs.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/core/dev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 46ce6c6107805132b1322128e86634eca91e3340..dba9eef8bd83dda89b5edd870b47373722264f48 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4071,7 +4071,7 @@ EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
qdisc_skb_cb(skb)->pkt_len = skb->len;
@@ -4112,6 +4112,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
if (payload <= 0)
return;
gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ shinfo->gso_segs = gso_segs;
}
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 03/10] net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in qdisc_pkt_len_init()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 01/10] net_sched: make room for (struct qdisc_skb_cb)->pkt_segs Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 02/10] net: init shinfo->gso_segs from qdisc_pkt_len_init() Eric Dumazet
@ 2025-11-10 9:44 ` Eric Dumazet
2025-11-10 9:44 ` [PATCH net-next 04/10] net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update() Eric Dumazet
` (7 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:44 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
qdisc_pkt_len_init() currently initializes qdisc_skb_cb(skb)->pkt_len.
Also initialize qdisc_skb_cb(skb)->pkt_segs there, and rename the
function to qdisc_pkt_len_segs_init().
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/core/dev.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index dba9eef8bd83dda89b5edd870b47373722264f48..895c3e37e686f0f625bd5eec7079a43cbd33a7eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
-static void qdisc_pkt_len_init(struct sk_buff *skb)
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u16 gso_segs;
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ if (!shinfo->gso_size) {
+ qdisc_skb_cb(skb)->pkt_segs = 1;
+ return;
+ }
+
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
- if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
- u16 gso_segs = shinfo->gso_segs;
+ if (skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
/* mac layer + network layer */
@@ -4113,6 +4119,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
return;
gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
shinfo->gso_segs = gso_segs;
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs;
}
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
@@ -4738,7 +4745,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_update_prio(skb);
- qdisc_pkt_len_init(skb);
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 04/10] net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (2 preceding siblings ...)
2025-11-10 9:44 ` [PATCH net-next 03/10] net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in qdisc_pkt_len_init() Eric Dumazet
@ 2025-11-10 9:44 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 05/10] net_sched: cake: use qdisc_pkt_segs() Eric Dumazet
` (6 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:44 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Avoid up to two cache line misses in qdisc dequeue() to fetch
skb_shinfo(skb)->gso_segs/gso_size while the qdisc spinlock is held.
This gives a 5% improvement in a TX intensive workload.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sch_generic.h | 13 ++++++++++---
net/sched/sch_cake.c | 1 +
net/sched/sch_dualpi2.c | 1 +
net/sched/sch_netem.c | 1 +
net/sched/sch_qfq.c | 2 +-
net/sched/sch_taprio.c | 1 +
net/sched/sch_tbf.c | 1 +
7 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9cd8b5d4b23698fd8959ef40c303468e31c1d4af..ae037e56088208ad17f664c43689aee895569cab 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
return qdisc_skb_cb(skb)->pkt_len;
}
+static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
+{
+ u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
+
+ DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
+ skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+ return pkt_segs;
+}
+
/* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
enum net_xmit_qdisc_t {
__NET_XMIT_STOLEN = 0x00010000,
@@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
const struct sk_buff *skb)
{
- _bstats_update(bstats,
- qdisc_pkt_len(skb),
- skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+ _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb));
}
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 32bacfc314c260dccf94178d309ccb2be22d69e4..993ce808230fb7d4769c926f6c8368d927f5a45f 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
cobalt_set_enqueue_time(segs, now);
get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
segs);
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 4b975feb52b1f3d3b37b31713d1477de5f5806d9..6d7e6389758dc8e645b1116efe4e11fb7290ac86 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
*/
qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+ qdisc_skb_cb(nskb)->pkt_segs = 1;
dualpi2_skb_cb(nskb)->classified =
dualpi2_skb_cb(skb)->classified;
dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index eafc316ae319e3f8c23b0cb0c58fdf54be102213..32a5f33040461f3be952055c097b5f2fe760a858 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff *segs;
netdev_features_t features = netif_skb_features(skb);
+ qdisc_skb_cb(skb)->pkt_segs = 1;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 2255355e51d350eded4549c1584b60d4d9b00fff..d920f57dc6d7659c510a98956c6dd2ed9e5ee5b8 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
}
- gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+ gso_segs = qdisc_pkt_segs(skb);
err = qdisc_enqueue(skb, cl->qdisc, to_free);
if (unlikely(err != NET_XMIT_SUCCESS)) {
pr_debug("qfq_enqueue: enqueue failed %d\n", err);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 39b735386996eb59712a1fc28f7bb903ec1b2220..300d577b328699eb42d2b829ecfc76464fd7b186 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
skb_list_walk_safe(segs, segs, nskb) {
skb_mark_not_on_list(segs);
qdisc_skb_cb(segs)->pkt_len = segs->len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
slen += segs->len;
/* FIXME: we should be segmenting to a smaller size
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 4c977f049670a600eafd219c898e5f29597be2c1..f2340164f579a25431979e12ec3d23ab828edd16 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
skb_mark_not_on_list(segs);
seg_len = segs->len;
qdisc_skb_cb(segs)->pkt_len = seg_len;
+ qdisc_skb_cb(segs)->pkt_segs = 1;
ret = qdisc_enqueue(segs, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 05/10] net_sched: cake: use qdisc_pkt_segs()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (3 preceding siblings ...)
2025-11-10 9:44 ` [PATCH net-next 04/10] net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update() Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 06/10] net_sched: add Qdisc_read_mostly and Qdisc_write groups Eric Dumazet
` (5 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Use the new qdisc_pkt_segs() helper to avoid a cache line miss in
cake_enqueue() for non-GSO packets.
cake_overhead() no longer has to recompute the segment count.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/sched/sch_cake.c | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 993ce808230fb7d4769c926f6c8368d927f5a45f..312f5b000ffb67d74faf70f26d808e26315b4ab8 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -1398,12 +1398,12 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
const struct skb_shared_info *shinfo = skb_shinfo(skb);
unsigned int hdr_len, last_len = 0;
u32 off = skb_network_offset(skb);
+ u16 segs = qdisc_pkt_segs(skb);
u32 len = qdisc_pkt_len(skb);
- u16 segs = 1;
q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
- if (!shinfo->gso_size)
+ if (segs == 1)
return cake_calc_overhead(q, len, off);
/* borrowed from qdisc_pkt_len_init() */
@@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
hdr_len += sizeof(struct udphdr);
}
- if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
- segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
- else
- segs = shinfo->gso_segs;
-
len = shinfo->gso_size + hdr_len;
last_len = skb->len - shinfo->gso_size * (segs - 1);
@@ -1788,7 +1782,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(len > b->max_skblen))
b->max_skblen = len;
- if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
unsigned int slen = 0, numsegs = 0;
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 06/10] net_sched: add Qdisc_read_mostly and Qdisc_write groups
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (4 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 05/10] net_sched: cake: use qdisc_pkt_segs() Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 07/10] net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb() Eric Dumazet
` (4 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
It is possible to reorganize struct Qdisc to avoid always dirtying two
cache lines in the fast path, reducing this to a single dirtied cache line.
In the current layout, we change only four (or six) fields in the first cache line:
- q.spinlock
- q.qlen
- bstats.bytes
- bstats.packets
- some qdiscs also change q.next/q.prev
In the second cache line, the fast path changes:
- running
- state
- qstats.backlog
/* --- cacheline 2 boundary (128 bytes) --- */
struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */
struct qdisc_skb_head q; /* 0x98 0x18 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct gnet_stats_queue qstats; /* 0xc0 0x14 */
bool running; /* 0xd4 0x1 */
/* XXX 3 bytes hole, try to pack */
unsigned long state; /* 0xd8 0x8 */
struct Qdisc * next_sched; /* 0xe0 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */
/* --- cacheline 4 boundary (256 bytes) --- */
Reorganize the struct so that the first cache line is mostly read,
followed by a mostly written one.
This gives a ~3% performance increase under TX stress.
Note that there is an additional hole because @qstats now spills into a third cache line.
/* --- cacheline 2 boundary (128 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */
struct sk_buff_head gso_skb; /* 0x80 0x18 */
struct Qdisc * next_sched; /* 0x98 0x8 */
struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */
__u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */
/* XXX 8 bytes hole, try to pack */
/* --- cacheline 3 boundary (192 bytes) --- */
__u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */
struct qdisc_skb_head q; /* 0xc0 0x18 */
unsigned long state; /* 0xd8 0x8 */
struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */
bool running; /* 0xf0 0x1 */
/* XXX 3 bytes hole, try to pack */
struct gnet_stats_queue qstats; /* 0xf4 0x14 */
/* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */
__u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */
/* XXX 56 bytes hole, try to pack */
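(For reference, struct layouts like the two dumps above can be
regenerated from a built kernel with pahole, e.g.: pahole -C Qdisc vmlinux)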
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sch_generic.h | 29 ++++++++++++++++++-----------
1 file changed, 18 insertions(+), 11 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ae037e56088208ad17f664c43689aee895569cab..b76436ec3f4aa412bac1be3371f5c7c6245cc362 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -103,17 +103,24 @@ struct Qdisc {
int pad;
refcount_t refcnt;
- /*
- * For performance sake on SMP, we put highly modified fields at the end
- */
- struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
- struct qdisc_skb_head q;
- struct gnet_stats_basic_sync bstats;
- struct gnet_stats_queue qstats;
- bool running; /* must be written under qdisc spinlock */
- unsigned long state;
- struct Qdisc *next_sched;
- struct sk_buff_head skb_bad_txq;
+ /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+ __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+ struct sk_buff_head gso_skb;
+ struct Qdisc *next_sched;
+ struct sk_buff_head skb_bad_txq;
+ __cacheline_group_end(Qdisc_read_mostly);
+
+ /* Fields dirtied in dequeue() fast path. */
+ __cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+ struct qdisc_skb_head q;
+ unsigned long state;
+ struct gnet_stats_basic_sync bstats;
+ bool running; /* must be written under qdisc spinlock */
+
+ /* Note : we only change qstats.backlog in fast path. */
+ struct gnet_stats_queue qstats;
+ __cacheline_group_end(Qdisc_write);
+
atomic_long_t defer_count ____cacheline_aligned_in_smp;
struct llist_head defer_list;
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 07/10] net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (5 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 06/10] net_sched: add Qdisc_read_mostly and Qdisc_write groups Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 08/10] net_sched: sch_fq: prefetch one skb ahead in dequeue() Eric Dumazet
` (3 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Group together the changes to qdisc fields to reduce the chance of false
sharing when another CPU attempts to acquire the qdisc spinlock:
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
qdisc_bstats_update(sch, skb);
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/sched/sch_fq.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index fee922da2f99c0c7ac6d86569cf3bbce47898951..0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -497,6 +497,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
skb_mark_not_on_list(skb);
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
+ qdisc_bstats_update(sch, skb);
}
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
@@ -776,7 +777,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
f->time_next_packet = now + len;
}
out:
- qdisc_bstats_update(sch, skb);
return skb;
}
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 08/10] net_sched: sch_fq: prefetch one skb ahead in dequeue()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (6 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 07/10] net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb() Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 09/10] net: prefetch skb->priority in __dev_xmit_skb() Eric Dumazet
` (2 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Prefetch the skb that we are likely to dequeue at the next dequeue() call.
Also call fq_dequeue_skb() a bit sooner in fq_dequeue().
This reduces the window between the read of q.qlen and the changes to
fields in the cache line that could be dirtied by another CPU trying to
enqueue a packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/sched/sch_fq.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 0b0ca1aa9251f959e87dd5dc504fbe0f4cbc75eb..6e5f2f4f241546605f8ba37f96275446c8836eee 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
struct sk_buff *skb)
{
if (skb == flow->head) {
- flow->head = skb->next;
+ struct sk_buff *next = skb->next;
+
+ prefetch(next);
+ flow->head = next;
} else {
rb_erase(&skb->rbnode, &flow->t_root);
skb->dev = qdisc_dev(sch);
@@ -712,6 +715,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
goto begin;
}
prefetch(&skb->end);
+ fq_dequeue_skb(sch, f, skb);
if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
@@ -719,7 +723,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
if (--f->qlen == 0)
q->inactive_flows++;
q->band_pkt_count[fq_skb_cb(skb)->band]--;
- fq_dequeue_skb(sch, f, skb);
} else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 09/10] net: prefetch skb->priority in __dev_xmit_skb()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (7 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 08/10] net_sched: sch_fq: prefetch one skb ahead in dequeue() Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 9:45 ` [PATCH net-next 10/10] net: annotate a data-race " Eric Dumazet
2025-11-10 16:44 ` [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Jakub Kicinski
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
Most qdiscs need to read skb->priority at enqueue time.
In commit 100dfa74cad9 ("net: dev_queue_xmit() llist adoption")
I added a prefetch(next) in __dev_xmit_skb(); let's add another one
for the second half of the skb.
Note that skb->priority and skb->hash share a common cache line,
so this patch helps qdiscs needing both fields.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/core/dev.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/core/dev.c b/net/core/dev.c
index 895c3e37e686f0f625bd5eec7079a43cbd33a7eb..44022fdec655e40e70ff5e1894f55fc76235b00c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4246,6 +4246,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
prefetch(next);
+ prefetch(&next->priority);
skb_mark_not_on_list(skb);
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
count++;
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH net-next 10/10] net: annotate a data-race in __dev_xmit_skb()
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (8 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 09/10] net: prefech skb->priority in __dev_xmit_skb() Eric Dumazet
@ 2025-11-10 9:45 ` Eric Dumazet
2025-11-10 16:44 ` [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Jakub Kicinski
10 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 9:45 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
Toke Høiland-Jørgensen, Kuniyuki Iwashima,
Willem de Bruijn, netdev, eric.dumazet, Eric Dumazet
q->limit is read locklessly, add a READ_ONCE().
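(The defer_count check runs without the qdisc spinlock held, while
q->limit can be updated concurrently, e.g. by a qdisc change operation;
READ_ONCE() annotates the lockless read and ensures a single, non-torn load.)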
Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/core/dev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 44022fdec655e40e70ff5e1894f55fc76235b00c..ac994974e2a81889fcc0a2e664edcdb7cfd0496d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4194,7 +4194,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
do {
if (first_n && !defer_count) {
defer_count = atomic_long_inc_return(&q->defer_count);
- if (unlikely(defer_count > q->limit)) {
+ if (unlikely(defer_count > READ_ONCE(q->limit))) {
kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
return NET_XMIT_DROP;
}
--
2.51.2.1041.gc1ab5b90ca-goog
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
2025-11-10 9:44 [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Eric Dumazet
` (9 preceding siblings ...)
2025-11-10 9:45 ` [PATCH net-next 10/10] net: annotate a data-race " Eric Dumazet
@ 2025-11-10 16:44 ` Jakub Kicinski
2025-11-10 17:15 ` Eric Dumazet
10 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2025-11-10 16:44 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Paolo Abeni, Simon Horman, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, Toke Høiland-Jørgensen,
Kuniyuki Iwashima, Willem de Bruijn, netdev, eric.dumazet
On Mon, 10 Nov 2025 09:44:55 +0000 Eric Dumazet wrote:
> Avoid up to two cache line misses in qdisc dequeue() to fetch
> skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.
>
> Idea is to cache gso_segs at enqueue time before spinlock is
> acquired, in the first skb cache line, where we already
> have qdisc_skb_cb(skb)->pkt_len.
>
> This series gives a 8 % improvement in a TX intensive workload.
>
> (120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
According to CI this breaks a bunch of tests.
https://netdev.bots.linux.dev/contest.html?branch=net-next-2025-11-10--12-00
I think they all hit:
[ 20.682474][ T231] WARNING: CPU: 3 PID: 231 at ./include/net/sch_generic.h:843 __dev_xmit_skb+0x786/0x1550
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
2025-11-10 16:44 ` [PATCH net-next 00/10] net_sched: speedup qdisc dequeue Jakub Kicinski
@ 2025-11-10 17:15 ` Eric Dumazet
2025-11-10 17:27 ` Jakub Kicinski
0 siblings, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 17:15 UTC (permalink / raw)
To: Jakub Kicinski
Cc: David S . Miller, Paolo Abeni, Simon Horman, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, Toke Høiland-Jørgensen,
Kuniyuki Iwashima, Willem de Bruijn, netdev, eric.dumazet
On Mon, Nov 10, 2025 at 8:44 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 10 Nov 2025 09:44:55 +0000 Eric Dumazet wrote:
> > Avoid up to two cache line misses in qdisc dequeue() to fetch
> > skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.
> >
> > Idea is to cache gso_segs at enqueue time before spinlock is
> > acquired, in the first skb cache line, where we already
> > have qdisc_skb_cb(skb)->pkt_len.
> >
> > This series gives a 8 % improvement in a TX intensive workload.
> >
> > (120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
>
> According to CI this breaks a bunch of tests.
>
> https://netdev.bots.linux.dev/contest.html?branch=net-next-2025-11-10--12-00
>
> I think they all hit:
>
> [ 20.682474][ T231] WARNING: CPU: 3 PID: 231 at ./include/net/sch_generic.h:843 __dev_xmit_skb+0x786/0x1550
Oh well, I will add this in V2, thank you !
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index b76436ec3f4aa412bac1be3371f5c7c6245cc362..79501499dafba56271b9ebd97a8f379ffdc83cac 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -841,7 +841,7 @@ static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
- skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+ (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
return pkt_segs;
}
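(Without the added parentheses, != binds tighter than ?:, so the check
was effectively "(pkt_segs != skb_is_gso(skb)) ? skb_shinfo(skb)->gso_segs : 1",
which is non-zero for any GSO packet and therefore tripped the warning.)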
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
2025-11-10 17:15 ` Eric Dumazet
@ 2025-11-10 17:27 ` Jakub Kicinski
2025-11-10 18:05 ` Eric Dumazet
0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2025-11-10 17:27 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Paolo Abeni, Simon Horman, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, Toke Høiland-Jørgensen,
Kuniyuki Iwashima, Willem de Bruijn, netdev, eric.dumazet
On Mon, 10 Nov 2025 09:15:46 -0800 Eric Dumazet wrote:
> On Mon, Nov 10, 2025 at 8:44 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Mon, 10 Nov 2025 09:44:55 +0000 Eric Dumazet wrote:
> > > Avoid up to two cache line misses in qdisc dequeue() to fetch
> > > skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.
> > >
> > > Idea is to cache gso_segs at enqueue time before spinlock is
> > > acquired, in the first skb cache line, where we already
> > > have qdisc_skb_cb(skb)->pkt_len.
> > >
> > > This series gives a 8 % improvement in a TX intensive workload.
> > >
> > > (120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
> >
> > According to CI this breaks a bunch of tests.
> >
> > https://netdev.bots.linux.dev/contest.html?branch=net-next-2025-11-10--12-00
> >
> > I think they all hit:
> >
> > [ 20.682474][ T231] WARNING: CPU: 3 PID: 231 at ./include/net/sch_generic.h:843 __dev_xmit_skb+0x786/0x1550
>
> Oh well, I will add this in V2, thank you !
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index b76436ec3f4aa412bac1be3371f5c7c6245cc362..79501499dafba56271b9ebd97a8f379ffdc83cac
> 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -841,7 +841,7 @@ static inline unsigned int qdisc_pkt_segs(const
> struct sk_buff *skb)
> u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
>
> DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
> - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
> + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
> return pkt_segs;
> }
Hm, I think we need more..
The non-debug workers are also failing and they have DEBUG_NET=n
Looks like most of the non-debug tests are tunnel and bridge related.
VxLAN, GRE etc.
https://netdev.bots.linux.dev/contest.html?pass=0&branch=net-next-2025-11-10--12-00&executor=vmksft-forwarding
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
2025-11-10 17:27 ` Jakub Kicinski
@ 2025-11-10 18:05 ` Eric Dumazet
2025-11-10 18:22 ` Eric Dumazet
0 siblings, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 18:05 UTC (permalink / raw)
To: Jakub Kicinski
Cc: David S . Miller, Paolo Abeni, Simon Horman, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, Toke Høiland-Jørgensen,
Kuniyuki Iwashima, Willem de Bruijn, netdev, eric.dumazet
On Mon, Nov 10, 2025 at 9:27 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 10 Nov 2025 09:15:46 -0800 Eric Dumazet wrote:
> > On Mon, Nov 10, 2025 at 8:44 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > On Mon, 10 Nov 2025 09:44:55 +0000 Eric Dumazet wrote:
> > > > Avoid up to two cache line misses in qdisc dequeue() to fetch
> > > > skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.
> > > >
> > > > Idea is to cache gso_segs at enqueue time before spinlock is
> > > > acquired, in the first skb cache line, where we already
> > > > have qdisc_skb_cb(skb)->pkt_len.
> > > >
> > > > This series gives a 8 % improvement in a TX intensive workload.
> > > >
> > > > (120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
> > >
> > > According to CI this breaks a bunch of tests.
> > >
> > > https://netdev.bots.linux.dev/contest.html?branch=net-next-2025-11-10--12-00
> > >
> > > I think they all hit:
> > >
> > > [ 20.682474][ T231] WARNING: CPU: 3 PID: 231 at ./include/net/sch_generic.h:843 __dev_xmit_skb+0x786/0x1550
> >
> > Oh well, I will add this in V2, thank you !
> >
> > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > index b76436ec3f4aa412bac1be3371f5c7c6245cc362..79501499dafba56271b9ebd97a8f379ffdc83cac
> > 100644
> > --- a/include/net/sch_generic.h
> > +++ b/include/net/sch_generic.h
> > @@ -841,7 +841,7 @@ static inline unsigned int qdisc_pkt_segs(const
> > struct sk_buff *skb)
> > u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
> >
> > DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
> > - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
> > + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
> > return pkt_segs;
> > }
>
> Hm, I think we need more..
>
> The non-debug workers are also failing and they have DEBUG_NET=n
>
> Looks like most of the non-debug tests are tunnel and bridge related.
> VxLAN, GRE etc.
>
> https://netdev.bots.linux.dev/contest.html?pass=0&branch=net-next-2025-11-10--12-00&executor=vmksft-forwarding
Nice !
tc_run()
mini_qdisc_bstats_cpu_update() //
I am not sure this path was setting qdisc_pkt_len() either...
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH net-next 00/10] net_sched: speedup qdisc dequeue
2025-11-10 18:05 ` Eric Dumazet
@ 2025-11-10 18:22 ` Eric Dumazet
0 siblings, 0 replies; 16+ messages in thread
From: Eric Dumazet @ 2025-11-10 18:22 UTC (permalink / raw)
To: Jakub Kicinski
Cc: David S . Miller, Paolo Abeni, Simon Horman, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, Toke Høiland-Jørgensen,
Kuniyuki Iwashima, Willem de Bruijn, netdev, eric.dumazet
On Mon, Nov 10, 2025 at 10:05 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Nov 10, 2025 at 9:27 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Mon, 10 Nov 2025 09:15:46 -0800 Eric Dumazet wrote:
> > > On Mon, Nov 10, 2025 at 8:44 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > > >
> > > > On Mon, 10 Nov 2025 09:44:55 +0000 Eric Dumazet wrote:
> > > > > Avoid up to two cache line misses in qdisc dequeue() to fetch
> > > > > skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held.
> > > > >
> > > > > Idea is to cache gso_segs at enqueue time before spinlock is
> > > > > acquired, in the first skb cache line, where we already
> > > > > have qdisc_skb_cb(skb)->pkt_len.
> > > > >
> > > > > This series gives a 8 % improvement in a TX intensive workload.
> > > > >
> > > > > (120 Mpps -> 130 Mpps on a Turin host, IDPF with 32 TX queues)
> > > >
> > > > According to CI this breaks a bunch of tests.
> > > >
> > > > https://netdev.bots.linux.dev/contest.html?branch=net-next-2025-11-10--12-00
> > > >
> > > > I think they all hit:
> > > >
> > > > [ 20.682474][ T231] WARNING: CPU: 3 PID: 231 at ./include/net/sch_generic.h:843 __dev_xmit_skb+0x786/0x1550
> > >
> > > Oh well, I will add this in V2, thank you !
> > >
> > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> > > index b76436ec3f4aa412bac1be3371f5c7c6245cc362..79501499dafba56271b9ebd97a8f379ffdc83cac
> > > 100644
> > > --- a/include/net/sch_generic.h
> > > +++ b/include/net/sch_generic.h
> > > @@ -841,7 +841,7 @@ static inline unsigned int qdisc_pkt_segs(const
> > > struct sk_buff *skb)
> > > u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
> > >
> > > DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
> > > - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
> > > + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
> > > return pkt_segs;
> > > }
> >
> > Hm, I think we need more..
> >
> > The non-debug workers are also failing and they have DEBUG_NET=n
> >
> > Looks like most of the non-debug tests are tunnel and bridge related.
> > VxLAN, GRE etc.
> >
> > https://netdev.bots.linux.dev/contest.html?pass=0&branch=net-next-2025-11-10--12-00&executor=vmksft-forwarding
>
> Nice !
>
> tc_run()
> mini_qdisc_bstats_cpu_update() //
>
> I am not sure this path was setting qdisc_pkt_len() either...
pkt_len was set in sch_handle_ingress(), I will add in V2 :
diff --git a/net/core/dev.c b/net/core/dev.c
index ac994974e2a81889fcc0a2e664edcdb7cfd0496d..10042139dbb054b9a93dfb019477a80263feb029 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4435,7 +4435,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
*pt_prev = NULL;
}
- qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_pkt_len_segs_init(skb);
tcx_set_ingress(skb, true);
if (static_branch_unlikely(&tcx_needed_key)) {
^ permalink raw reply [flat|nested] 16+ messages in thread