All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Toke Høiland-Jørgensen" <toke@redhat.com>
To: "Jonas Köppeler" <j.koeppeler@tu-berlin.de>,
	"Eric Dumazet" <edumazet@google.com>
Cc: "David S . Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Simon Horman <horms@kernel.org>,
	Jamal Hadi Salim <jhs@mojatatu.com>,
	Cong Wang <xiyou.wangcong@gmail.com>,
	Jiri Pirko <jiri@resnulli.us>,
	Kuniyuki Iwashima <kuniyu@google.com>,
	Willem de Bruijn <willemb@google.com>,
	netdev@vger.kernel.org, eric.dumazet@gmail.com
Subject: Re: [PATCH v1 net-next 5/5] net: dev_queue_xmit() llist adoption
Date: Tue, 11 Nov 2025 17:42:07 +0100	[thread overview]
Message-ID: <87ms4sldwg.fsf@toke.dk> (raw)
In-Reply-To: <99ba3114-10a3-4df2-b49e-63ce8687836d@tu-berlin.de>

Jonas Köppeler <j.koeppeler@tu-berlin.de> writes:

> On 11/10/25 18:34, Eric Dumazet wrote:
>> On Mon, Nov 10, 2025 at 6:49 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>> Eric Dumazet <edumazet@google.com> writes:
>>>
>>>> I can work on a patch today.
>>> This sounds like an excellent idea in any case - thanks! :)
>> The following (on top of my last series) seems to work for me
>>
>>   include/net/pkt_sched.h   |    5 +++--
>>   include/net/sch_generic.h |   24 +++++++++++++++++++++++-
>>   net/core/dev.c            |   33 +++++++++++++++++++--------------
>>   net/sched/sch_cake.c      |    4 +++-
>>   4 files changed, 48 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
>> index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664
>> 100644
>> --- a/include/net/pkt_sched.h
>> +++ b/include/net/pkt_sched.h
>> @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
>>
>>   void __qdisc_run(struct Qdisc *q);
>>
>> -static inline void qdisc_run(struct Qdisc *q)
>> +static inline struct sk_buff *qdisc_run(struct Qdisc *q)
>>   {
>>          if (qdisc_run_begin(q)) {
>>                  __qdisc_run(q);
>> -               qdisc_run_end(q);
>> +               return qdisc_run_end(q);
>>          }
>> +       return NULL;
>>   }
>>
>>   extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 79501499dafba56271b9ebd97a8f379ffdc83cac..19cd2bc13bdba48f941b1599f896c15c8c7860ae
>> 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -88,6 +88,8 @@ struct Qdisc {
>>   #define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
>>   #define TCQ_F_NOLOCK           0x100 /* qdisc does not require locking */
>>   #define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
>> +#define TCQ_F_DEQUEUE_DROPS    0x400 /* ->dequeue() can drop packets
>> in q->to_free */
>> +
>>          u32                     limit;
>>          const struct Qdisc_ops  *ops;
>>          struct qdisc_size_table __rcu *stab;
>> @@ -119,6 +121,8 @@ struct Qdisc {
>>
>>                  /* Note : we only change qstats.backlog in fast path. */
>>                  struct gnet_stats_queue qstats;
>> +
>> +               struct sk_buff          *to_free;
>>          __cacheline_group_end(Qdisc_write);
>>
>>
>> @@ -218,8 +222,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
>>          return true;
>>   }
>>
>> -static inline void qdisc_run_end(struct Qdisc *qdisc)
>> +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
>>   {
>> +       struct sk_buff *to_free = NULL;
>> +
>> +       if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
>> +               to_free = qdisc->to_free;
>> +               if (to_free)
>> +                       qdisc->to_free = NULL;
>> +       }
>>          if (qdisc->flags & TCQ_F_NOLOCK) {
>>                  spin_unlock(&qdisc->seqlock);
>>
>> @@ -235,6 +246,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
>>          } else {
>>                  WRITE_ONCE(qdisc->running, false);
>>          }
>> +       return to_free;
>>   }
>>
>>   static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
>> @@ -1105,6 +1117,16 @@ static inline void tcf_set_drop_reason(const
>> struct sk_buff *skb,
>>          tc_skb_cb(skb)->drop_reason = reason;
>>   }
>>
>> +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
>> +                                     enum skb_drop_reason reason)
>> +{
>> +       DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
>> +
>> +       tcf_set_drop_reason(skb, reason);
>> +       skb->next = q->to_free;
>> +       q->to_free = skb;
>> +}
>> +
>>   /* Instead of calling kfree_skb() while root qdisc lock is held,
>>    * queue the skb for future freeing at end of __dev_xmit_skb()
>>    */
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index ac994974e2a81889fcc0a2e664edcdb7cfd0496d..18cfcd765b1b3e4af1c5339e36df517e7abc914f
>> 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                                   struct net_device *dev,
>>                                   struct netdev_queue *txq)
>>   {
>> -       struct sk_buff *next, *to_free = NULL;
>> +       struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
>>          spinlock_t *root_lock = qdisc_lock(q);
>>          struct llist_node *ll_list, *first_n;
>>          unsigned long defer_count = 0;
>> @@ -4160,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                          if (unlikely(!nolock_qdisc_is_empty(q))) {
>>                                  rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>>                                  __qdisc_run(q);
>> -                               qdisc_run_end(q);
>> +                               to_free2 = qdisc_run_end(q);
>>
>> -                               goto no_lock_out;
>> +                               goto free_out;
>>                          }
>>
>>                          qdisc_bstats_cpu_update(q, skb);
>> @@ -4170,18 +4170,15 @@ static inline int __dev_xmit_skb(struct
>> sk_buff *skb, struct Qdisc *q,
>>                              !nolock_qdisc_is_empty(q))
>>                                  __qdisc_run(q);
>>
>> -                       qdisc_run_end(q);
>> -                       return NET_XMIT_SUCCESS;
>> +                       to_free2 = qdisc_run_end(q);
>> +                       rc = NET_XMIT_SUCCESS;
>> +                       goto free_out;
>>                  }
>>
>>                  rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>> -               qdisc_run(q);
>> +               to_free2 = qdisc_run(q);
>>
>> -no_lock_out:
>> -               if (unlikely(to_free))
>> -                       kfree_skb_list_reason(to_free,
>> -                                             tcf_get_drop_reason(to_free));
>> -               return rc;
>> +               goto free_out;
>>          }
>>
>>          /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
>> @@ -4239,7 +4236,7 @@ static inline int __dev_xmit_skb(struct sk_buff
>> *skb, struct Qdisc *q,
>>                  qdisc_bstats_update(q, skb);
>>                  if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
>>                          __qdisc_run(q);
>> -               qdisc_run_end(q);
>> +               to_free2 = qdisc_run_end(q);
>>                  rc = NET_XMIT_SUCCESS;
>>          } else {
>>                  int count = 0;
>> @@ -4251,15 +4248,19 @@ static inline int __dev_xmit_skb(struct
>> sk_buff *skb, struct Qdisc *q,
>>                          rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
>>                          count++;
>>                  }
>> -               qdisc_run(q);
>> +               to_free2 = qdisc_run(q);
>>                  if (count != 1)
>>                          rc = NET_XMIT_SUCCESS;
>>          }
>>   unlock:
>>          spin_unlock(root_lock);
>> +free_out:
>>          if (unlikely(to_free))
>>                  kfree_skb_list_reason(to_free,
>>                                        tcf_get_drop_reason(to_free));
>> +       if (unlikely(to_free2))
>> +               kfree_skb_list_reason(to_free2,
>> +                                     tcf_get_drop_reason(to_free2));
>>          return rc;
>>   }
>>
>> @@ -5741,6 +5742,7 @@ static __latent_entropy void net_tx_action(void)
>>          }
>>
>>          if (sd->output_queue) {
>> +               struct sk_buff *to_free;
>>                  struct Qdisc *head;
>>
>>                  local_irq_disable();
>> @@ -5780,9 +5782,12 @@ static __latent_entropy void net_tx_action(void)
>>                          }
>>
>>                          clear_bit(__QDISC_STATE_SCHED, &q->state);
>> -                       qdisc_run(q);
>> +                       to_free = qdisc_run(q);
>>                          if (root_lock)
>>                                  spin_unlock(root_lock);
>> +                       if (unlikely(to_free))
>> +                               kfree_skb_list_reason(to_free,
>> +                                             tcf_get_drop_reason(to_free));
>>                  }
>>
>>                  rcu_read_unlock();
>> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
>> index 312f5b000ffb67d74faf70f26d808e26315b4ab8..a717cc4e0606e80123ec9c76331d454dad699b69
>> 100644
>> --- a/net/sched/sch_cake.c
>> +++ b/net/sched/sch_cake.c
>> @@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
>>                  b->tin_dropped++;
>>                  qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
>>                  qdisc_qstats_drop(sch);
>> -               kfree_skb_reason(skb, reason);
>> +               qdisc_dequeue_drop(sch, skb, reason);
>>                  if (q->rate_flags & CAKE_FLAG_INGRESS)
>>                          goto retry;
>>          }
>> @@ -2569,6 +2569,8 @@ static void cake_reconfigure(struct Qdisc *sch)
>>
>>          sch->flags &= ~TCQ_F_CAN_BYPASS;
>>
>> +       sch->flags |= TCQ_F_DEQUEUE_DROPS;
>> +
>>          q->buffer_limit = min(q->buffer_limit,
>>                                max(sch->limit * psched_mtu(qdisc_dev(sch)),
>>                                    q->buffer_config_limit));
>
> Thanks for the patches. I experimented with these patches and these are the results:
> Running UDP flood (~21 Mpps load) over 2 minutes.
> pre-patch (baseline):
>    cake achieves stable packet rate of 0.476 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 3593552166 bytes 56149224 pkt (dropped 2183, overlimits 0
>          requeues 311)
>        backlog 0b 0p requeues 311
>        memory used: 15503616b of 15140Kb
>
> net-next/main:
>    cake throughput drops from 0.61 Mpps to 0.15 Mpps (the expected collapse
>    we've seen before)
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 1166199994 bytes 18221827 pkt (dropped 71317773, overlimits 0
>          requeues 1914)
>        backlog 0b 0p requeues 1914
>        memory used: 15504576b of 15140Kb
>
> net-next/main + this dequeue patch:
>    cake throughput drops in the first 6 seconds from 0.65 Mpps  and then
>    oscillates between 0.27–0.36 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 2627464378 bytes 41054083 pkt (dropped 50102108, overlimits 0
>          requeues 1008)
>        backlog 0b 0p requeues 1008
>        memory used: 15503616b of 15140Kb
>
> net-next/main + this dequeue patch + limiting the number of deferred packets:
>    cake throughput drops in the first 6 seconds from 0.65 Mpps  and then
>    oscillates between 0.35–0.43 Mpps
>
>      qdisc cake 8001: dev enp7s0np1 root refcnt 33 bandwidth unlimited
>          besteffort flows nonat nowash no-ack-filter split-gso rtt 100ms
>          noatm overhead 18 mpu 64
>        Sent 2969529126 bytes 46398843 pkt (dropped 43618919, overlimits 0
>          requeues 814)
>        backlog 0b 0p requeues 814
>        memory used: 15503616b of 15140Kb
>
> I can provide the full throughput traces if needed.
> So the last patches definitely mitigate cake's performance degradation.

I also did a series of test runs with the variant in Eric's v2 patch set
that includes the dequeue side improvement, and it definitely helps, to
the point where I can now get up to 50% higher PPS with a cake instance
serving multiple flows.

However, it's unstable: if I add enough flows I end up back in a state
where throughput collapses to a dozen Kpps. The exact number of flows
where this happens varies a bit, but it's quite distinct when it does.

The number of flows it takes before things break down varies a bit; I
have not found a definite cause for this yet, but I'll keep looking.

-Toke

[0] https://lore.kernel.org/r/20251111093204.1432437-1-edumazet@google.com


  reply	other threads:[~2025-11-11 16:42 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-13 14:54 [PATCH v1 net-next 0/5] net: optimize TX throughput and efficiency Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 1/5] net: add add indirect call wrapper in skb_release_head_state() Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 2/5] net/sched: act_mirred: add loop detection Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 3/5] Revert "net/sched: Fix mirred deadlock on device recursion" Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 4/5] net: sched: claim one cache line in Qdisc Eric Dumazet
2025-10-13 14:54 ` [PATCH v1 net-next 5/5] net: dev_queue_xmit() llist adoption Eric Dumazet
2025-11-07 15:28   ` Toke Høiland-Jørgensen
2025-11-07 15:37     ` Eric Dumazet
2025-11-07 15:46       ` Eric Dumazet
2025-11-09 10:09         ` Eric Dumazet
2025-11-09 12:54           ` Eric Dumazet
2025-11-09 16:33             ` Toke Høiland-Jørgensen
2025-11-09 17:14               ` Eric Dumazet
2025-11-09 19:18               ` Jonas Köppeler
2025-11-09 19:28                 ` Eric Dumazet
2025-11-09 20:18                   ` Eric Dumazet
2025-11-09 20:29                     ` Eric Dumazet
2025-11-10 11:31                       ` Toke Høiland-Jørgensen
2025-11-10 13:26                         ` Eric Dumazet
2025-11-10 14:49                           ` Toke Høiland-Jørgensen
2025-11-10 17:34                             ` Eric Dumazet
2025-11-11 13:44                               ` Jonas Köppeler
2025-11-11 16:42                                 ` Toke Høiland-Jørgensen [this message]
2025-10-13 16:23 ` [PATCH v1 net-next 0/5] net: optimize TX throughput and efficiency Toke Høiland-Jørgensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87ms4sldwg.fsf@toke.dk \
    --to=toke@redhat.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=eric.dumazet@gmail.com \
    --cc=horms@kernel.org \
    --cc=j.koeppeler@tu-berlin.de \
    --cc=jhs@mojatatu.com \
    --cc=jiri@resnulli.us \
    --cc=kuba@kernel.org \
    --cc=kuniyu@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=willemb@google.com \
    --cc=xiyou.wangcong@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.