* [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup
2026-05-06 14:10 [PATCH net-next v10 0/4] tun/tap & vhost-net: apply qdisc backpressure on full ptr_ring to reduce TX drops Simon Schippers
@ 2026-05-06 14:10 ` Simon Schippers
2026-05-06 22:18 ` Michael S. Tsirkin
2026-05-06 14:10 ` [PATCH net-next v10 2/4] vhost-net: wake queue of tun/tap after ptr_ring consume Simon Schippers
` (2 subsequent siblings)
3 siblings, 1 reply; 11+ messages in thread
From: Simon Schippers @ 2026-05-06 14:10 UTC (permalink / raw)
To: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, mst, eperezma, leiyang, stephen, jon, tim.gebauer,
simon.schippers, netdev, linux-kernel, kvm, virtualization
Introduce tun_ring_consume() that wraps ptr_ring_consume() and calls
__tun_wake_queue(). The latter wakes the stopped netdev subqueue once
half of the ring capacity has been consumed, tracked via the new
cons_cnt field in tun_file. cons_cnt is updated while holding the ring
consumer lock, avoiding races. As a safety net, the queue is also woken
when the ring becomes empty. The point is to allow the queue to be
stopped when it gets full, which is required for traffic shaping -
implemented by the following "avoid ptr_ring tail-drop when a qdisc
is present". That patch also explains the pairing of the smp_mb()
of __tun_wake_queue().
Without the corresponding queue stopping, this patch alone causes no
regression for a tap setup sending to a qemu VM: 1.132 Mpps
to 1.144 Mpps.
Details: AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU
threads, pktgen sender; Avg over 50 runs @ 100,000,000 packets;
SRSO and spectre v2 mitigations disabled.
Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
drivers/net/tun.c | 54 +++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 50 insertions(+), 4 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b183189f1853..00ecf128fe8e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -145,6 +145,7 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct ptr_ring tx_ring;
+ int cons_cnt;
struct xdp_rxq_info xdp_rxq;
};
@@ -557,6 +558,13 @@ void tun_ptr_free(void *ptr)
}
EXPORT_SYMBOL_GPL(tun_ptr_free);
+static void tun_reset_cons_cnt(struct tun_file *tfile)
+{
+ spin_lock(&tfile->tx_ring.consumer_lock);
+ tfile->cons_cnt = 0;
+ spin_unlock(&tfile->tx_ring.consumer_lock);
+}
+
static void tun_queue_purge(struct tun_file *tfile)
{
void *ptr;
@@ -564,6 +572,7 @@ static void tun_queue_purge(struct tun_file *tfile)
while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
tun_ptr_free(ptr);
+ tun_reset_cons_cnt(tfile);
skb_queue_purge(&tfile->sk.sk_write_queue);
skb_queue_purge(&tfile->sk.sk_error_queue);
}
@@ -730,6 +739,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
goto out;
}
+ tun_reset_cons_cnt(tfile);
tfile->queue_index = tun->numqueues;
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
@@ -2115,13 +2125,46 @@ static ssize_t tun_put_user(struct tun_struct *tun,
return total;
}
-static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
+/* Callers must hold ring.consumer_lock */
+static void __tun_wake_queue(struct tun_struct *tun,
+ struct tun_file *tfile, int consumed)
+{
+ struct netdev_queue *txq = netdev_get_tx_queue(tun->dev,
+ tfile->queue_index);
+
+ /* Paired with smp_mb__after_atomic() in tun_net_xmit() */
+ smp_mb();
+ if (netif_tx_queue_stopped(txq)) {
+ tfile->cons_cnt += consumed;
+ if (tfile->cons_cnt >= tfile->tx_ring.size / 2 ||
+ __ptr_ring_empty(&tfile->tx_ring)) {
+ netif_tx_wake_queue(txq);
+ tfile->cons_cnt = 0;
+ }
+ }
+}
+
+static void *tun_ring_consume(struct tun_struct *tun, struct tun_file *tfile)
+{
+ void *ptr;
+
+ spin_lock(&tfile->tx_ring.consumer_lock);
+ ptr = __ptr_ring_consume(&tfile->tx_ring);
+ if (ptr)
+ __tun_wake_queue(tun, tfile, 1);
+
+ spin_unlock(&tfile->tx_ring.consumer_lock);
+ return ptr;
+}
+
+static void *tun_ring_recv(struct tun_struct *tun, struct tun_file *tfile,
+ int noblock, int *err)
{
DECLARE_WAITQUEUE(wait, current);
void *ptr = NULL;
int error = 0;
- ptr = ptr_ring_consume(&tfile->tx_ring);
+ ptr = tun_ring_consume(tun, tfile);
if (ptr)
goto out;
if (noblock) {
@@ -2133,7 +2176,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- ptr = ptr_ring_consume(&tfile->tx_ring);
+ ptr = tun_ring_consume(tun, tfile);
if (ptr)
break;
if (signal_pending(current)) {
@@ -2170,7 +2213,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
if (!ptr) {
/* Read frames from ring */
- ptr = tun_ring_recv(tfile, noblock, &err);
+ ptr = tun_ring_recv(tun, tfile, noblock, &err);
if (!ptr)
return err;
}
@@ -3406,6 +3449,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
return -ENOMEM;
}
+ tun_reset_cons_cnt(tfile);
+
mutex_init(&tfile->napi_mutex);
RCU_INIT_POINTER(tfile->tun, NULL);
tfile->flags = 0;
@@ -3614,6 +3659,7 @@ static int tun_queue_resize(struct tun_struct *tun)
for (i = 0; i < tun->numqueues; i++) {
tfile = rtnl_dereference(tun->tfiles[i]);
rings[i] = &tfile->tx_ring;
+ tun_reset_cons_cnt(tfile);
}
list_for_each_entry(tfile, &tun->disabled, next)
rings[i++] = &tfile->tx_ring;
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup
2026-05-06 14:10 ` [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup Simon Schippers
@ 2026-05-06 22:18 ` Michael S. Tsirkin
2026-05-07 6:21 ` Simon Schippers
0 siblings, 1 reply; 11+ messages in thread
From: Michael S. Tsirkin @ 2026-05-06 22:18 UTC (permalink / raw)
To: Simon Schippers
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On Wed, May 06, 2026 at 04:10:30PM +0200, Simon Schippers wrote:
> Introduce tun_ring_consume() that wraps ptr_ring_consume() and calls
> __tun_wake_queue(). The latter wakes the stopped netdev subqueue once
> half of the ring capacity has been consumed, tracked via the new
> cons_cnt field in tun_file. cons_cnt is updated while holding the ring
> consumer lock, avoiding races. As a safety net, the queue is also woken
> when the ring becomes empty. The point is to allow the queue to be
> stopped when it gets full, which is required for traffic shaping -
> implemented by the following "avoid ptr_ring tail-drop when a qdisc
> is present". That patch also explains the pairing of the smp_mb()
> of __tun_wake_queue().
>
> Without the corresponding queue stopping, this patch alone causes no
> regression for a tap setup sending to a qemu VM: 1.132 Mpps
> to 1.144 Mpps.
>
> Details: AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU
> threads, pktgen sender; Avg over 50 runs @ 100,000,000 packets;
> SRSO and spectre v2 mitigations disabled.
>
> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> ---
> drivers/net/tun.c | 54 +++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b183189f1853..00ecf128fe8e 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -145,6 +145,7 @@ struct tun_file {
> struct list_head next;
> struct tun_struct *detached;
> struct ptr_ring tx_ring;
> + int cons_cnt;
> struct xdp_rxq_info xdp_rxq;
> };
>
> @@ -557,6 +558,13 @@ void tun_ptr_free(void *ptr)
> }
> EXPORT_SYMBOL_GPL(tun_ptr_free);
>
> +static void tun_reset_cons_cnt(struct tun_file *tfile)
> +{
> + spin_lock(&tfile->tx_ring.consumer_lock);
> + tfile->cons_cnt = 0;
> + spin_unlock(&tfile->tx_ring.consumer_lock);
> +}
> +
> static void tun_queue_purge(struct tun_file *tfile)
> {
> void *ptr;
> @@ -564,6 +572,7 @@ static void tun_queue_purge(struct tun_file *tfile)
> while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
> tun_ptr_free(ptr);
>
> + tun_reset_cons_cnt(tfile);
> skb_queue_purge(&tfile->sk.sk_write_queue);
> skb_queue_purge(&tfile->sk.sk_error_queue);
> }
> @@ -730,6 +739,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> goto out;
> }
>
> + tun_reset_cons_cnt(tfile);
> tfile->queue_index = tun->numqueues;
> tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
>
> @@ -2115,13 +2125,46 @@ static ssize_t tun_put_user(struct tun_struct *tun,
> return total;
> }
>
> -static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
> +/* Callers must hold ring.consumer_lock */
> +static void __tun_wake_queue(struct tun_struct *tun,
> + struct tun_file *tfile, int consumed)
> +{
> + struct netdev_queue *txq = netdev_get_tx_queue(tun->dev,
> + tfile->queue_index);
> +
> + /* Paired with smp_mb__after_atomic() in tun_net_xmit() */
> + smp_mb();
> + if (netif_tx_queue_stopped(txq)) {
> + tfile->cons_cnt += consumed;
> + if (tfile->cons_cnt >= tfile->tx_ring.size / 2 ||
> + __ptr_ring_empty(&tfile->tx_ring)) {
> + netif_tx_wake_queue(txq);
> + tfile->cons_cnt = 0;
> + }
> + }
> +}
> +
> +static void *tun_ring_consume(struct tun_struct *tun, struct tun_file *tfile)
> +{
> + void *ptr;
> +
> + spin_lock(&tfile->tx_ring.consumer_lock);
> + ptr = __ptr_ring_consume(&tfile->tx_ring);
> + if (ptr)
> + __tun_wake_queue(tun, tfile, 1);
> +
> + spin_unlock(&tfile->tx_ring.consumer_lock);
> + return ptr;
> +}
> +
> +static void *tun_ring_recv(struct tun_struct *tun, struct tun_file *tfile,
> + int noblock, int *err)
> {
> DECLARE_WAITQUEUE(wait, current);
> void *ptr = NULL;
> int error = 0;
>
> - ptr = ptr_ring_consume(&tfile->tx_ring);
> + ptr = tun_ring_consume(tun, tfile);
> if (ptr)
> goto out;
> if (noblock) {
> @@ -2133,7 +2176,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>
> while (1) {
> set_current_state(TASK_INTERRUPTIBLE);
> - ptr = ptr_ring_consume(&tfile->tx_ring);
> + ptr = tun_ring_consume(tun, tfile);
> if (ptr)
> break;
> if (signal_pending(current)) {
So based on commit log I expected all calls to ptr_ring_consume to
be replaced with tun_ring_consume, but it looks like tun_queue_purge
still calls ptr_ring_consume.
I suspect that together with patch 4 it can sometimes leave us stuck
with a stopped queue and an empty ring, forever.
> @@ -2170,7 +2213,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>
> if (!ptr) {
> /* Read frames from ring */
> - ptr = tun_ring_recv(tfile, noblock, &err);
> + ptr = tun_ring_recv(tun, tfile, noblock, &err);
> if (!ptr)
> return err;
> }
> @@ -3406,6 +3449,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> return -ENOMEM;
> }
>
> + tun_reset_cons_cnt(tfile);
> +
> mutex_init(&tfile->napi_mutex);
> RCU_INIT_POINTER(tfile->tun, NULL);
> tfile->flags = 0;
> @@ -3614,6 +3659,7 @@ static int tun_queue_resize(struct tun_struct *tun)
> for (i = 0; i < tun->numqueues; i++) {
> tfile = rtnl_dereference(tun->tfiles[i]);
> rings[i] = &tfile->tx_ring;
> + tun_reset_cons_cnt(tfile);
> }
> list_for_each_entry(tfile, &tun->disabled, next)
> rings[i++] = &tfile->tx_ring;
> --
> 2.43.0
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup
2026-05-06 22:18 ` Michael S. Tsirkin
@ 2026-05-07 6:21 ` Simon Schippers
0 siblings, 0 replies; 11+ messages in thread
From: Simon Schippers @ 2026-05-07 6:21 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On 5/7/26 00:18, Michael S. Tsirkin wrote:
> On Wed, May 06, 2026 at 04:10:30PM +0200, Simon Schippers wrote:
>> Introduce tun_ring_consume() that wraps ptr_ring_consume() and calls
>> __tun_wake_queue(). The latter wakes the stopped netdev subqueue once
>> half of the ring capacity has been consumed, tracked via the new
>> cons_cnt field in tun_file. cons_cnt is updated while holding the ring
>> consumer lock, avoiding races. As a safety net, the queue is also woken
>> when the ring becomes empty. The point is to allow the queue to be
>> stopped when it gets full, which is required for traffic shaping -
>> implemented by the following "avoid ptr_ring tail-drop when a qdisc
>> is present". That patch also explains the pairing of the smp_mb()
>> of __tun_wake_queue().
>>
>> Without the corresponding queue stopping, this patch alone causes no
>> regression for a tap setup sending to a qemu VM: 1.132 Mpps
>> to 1.144 Mpps.
>>
>> Details: AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU
>> threads, pktgen sender; Avg over 50 runs @ 100,000,000 packets;
>> SRSO and spectre v2 mitigations disabled.
>>
>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>> ---
>> drivers/net/tun.c | 54 +++++++++++++++++++++++++++++++++++++++++++----
>> 1 file changed, 50 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index b183189f1853..00ecf128fe8e 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -145,6 +145,7 @@ struct tun_file {
>> struct list_head next;
>> struct tun_struct *detached;
>> struct ptr_ring tx_ring;
>> + int cons_cnt;
>> struct xdp_rxq_info xdp_rxq;
>> };
>>
>> @@ -557,6 +558,13 @@ void tun_ptr_free(void *ptr)
>> }
>> EXPORT_SYMBOL_GPL(tun_ptr_free);
>>
>> +static void tun_reset_cons_cnt(struct tun_file *tfile)
>> +{
>> + spin_lock(&tfile->tx_ring.consumer_lock);
>> + tfile->cons_cnt = 0;
>> + spin_unlock(&tfile->tx_ring.consumer_lock);
>> +}
>> +
>> static void tun_queue_purge(struct tun_file *tfile)
>> {
>> void *ptr;
>> @@ -564,6 +572,7 @@ static void tun_queue_purge(struct tun_file *tfile)
>> while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
>> tun_ptr_free(ptr);
>>
>> + tun_reset_cons_cnt(tfile);
>> skb_queue_purge(&tfile->sk.sk_write_queue);
>> skb_queue_purge(&tfile->sk.sk_error_queue);
>> }
>> @@ -730,6 +739,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>> goto out;
>> }
>>
>> + tun_reset_cons_cnt(tfile);
>> tfile->queue_index = tun->numqueues;
>> tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
>>
>> @@ -2115,13 +2125,46 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>> return total;
>> }
>>
>> -static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>> +/* Callers must hold ring.consumer_lock */
>> +static void __tun_wake_queue(struct tun_struct *tun,
>> + struct tun_file *tfile, int consumed)
>> +{
>> + struct netdev_queue *txq = netdev_get_tx_queue(tun->dev,
>> + tfile->queue_index);
>> +
>> + /* Paired with smp_mb__after_atomic() in tun_net_xmit() */
>> + smp_mb();
>> + if (netif_tx_queue_stopped(txq)) {
>> + tfile->cons_cnt += consumed;
>> + if (tfile->cons_cnt >= tfile->tx_ring.size / 2 ||
>> + __ptr_ring_empty(&tfile->tx_ring)) {
>> + netif_tx_wake_queue(txq);
>> + tfile->cons_cnt = 0;
>> + }
>> + }
>> +}
>> +
>> +static void *tun_ring_consume(struct tun_struct *tun, struct tun_file *tfile)
>> +{
>> + void *ptr;
>> +
>> + spin_lock(&tfile->tx_ring.consumer_lock);
>> + ptr = __ptr_ring_consume(&tfile->tx_ring);
>> + if (ptr)
>> + __tun_wake_queue(tun, tfile, 1);
>> +
>> + spin_unlock(&tfile->tx_ring.consumer_lock);
>> + return ptr;
>> +}
>> +
>> +static void *tun_ring_recv(struct tun_struct *tun, struct tun_file *tfile,
>> + int noblock, int *err)
>> {
>> DECLARE_WAITQUEUE(wait, current);
>> void *ptr = NULL;
>> int error = 0;
>>
>> - ptr = ptr_ring_consume(&tfile->tx_ring);
>> + ptr = tun_ring_consume(tun, tfile);
>> if (ptr)
>> goto out;
>> if (noblock) {
>> @@ -2133,7 +2176,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>>
>> while (1) {
>> set_current_state(TASK_INTERRUPTIBLE);
>> - ptr = ptr_ring_consume(&tfile->tx_ring);
>> + ptr = tun_ring_consume(tun, tfile);
>> if (ptr)
>> break;
>> if (signal_pending(current)) {
>
>
> So based on commit log I expected all calls to ptr_ring_consume to
> be replaced with tun_ring_consume, but it looks like tun_queue_purge
> still calls ptr_ring_consume.
> I suspect that together with patch 4 it can sometimes leave us stuck
> with a stopped queue and an empty ring, forever.
>
I see. I will replace ptr_ring_consume() with tun_ring_consume().
>
>
>
>
>> @@ -2170,7 +2213,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>>
>> if (!ptr) {
>> /* Read frames from ring */
>> - ptr = tun_ring_recv(tfile, noblock, &err);
>> + ptr = tun_ring_recv(tun, tfile, noblock, &err);
>> if (!ptr)
>> return err;
>> }
>> @@ -3406,6 +3449,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>> return -ENOMEM;
>> }
>>
>> + tun_reset_cons_cnt(tfile);
>> +
>> mutex_init(&tfile->napi_mutex);
>> RCU_INIT_POINTER(tfile->tun, NULL);
>> tfile->flags = 0;
>> @@ -3614,6 +3659,7 @@ static int tun_queue_resize(struct tun_struct *tun)
>> for (i = 0; i < tun->numqueues; i++) {
>> tfile = rtnl_dereference(tun->tfiles[i]);
>> rings[i] = &tfile->tx_ring;
>> + tun_reset_cons_cnt(tfile);
>> }
>> list_for_each_entry(tfile, &tun->disabled, next)
>> rings[i++] = &tfile->tx_ring;
>> --
>> 2.43.0
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH net-next v10 2/4] vhost-net: wake queue of tun/tap after ptr_ring consume
2026-05-06 14:10 [PATCH net-next v10 0/4] tun/tap & vhost-net: apply qdisc backpressure on full ptr_ring to reduce TX drops Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup Simon Schippers
@ 2026-05-06 14:10 ` Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 3/4] ptr_ring: move free-space check into separate helper Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present Simon Schippers
3 siblings, 0 replies; 11+ messages in thread
From: Simon Schippers @ 2026-05-06 14:10 UTC (permalink / raw)
To: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, mst, eperezma, leiyang, stephen, jon, tim.gebauer,
simon.schippers, netdev, linux-kernel, kvm, virtualization
Add tun_wake_queue() to tun.c and export it for use by vhost-net. The
function validates that the file belongs to a tun/tap device,
dereferences the tun_struct under RCU, and delegates to
__tun_wake_queue().
vhost_net_buf_produce() now calls tun_wake_queue() after a successful
batched consume of the ring to allow the netdev subqueue to be woken up.
The point is to allow the queue to be stopped when it gets full, which
is required for traffic shaping - implemented by the following
"avoid ptr_ring tail-drop when a qdisc is present".
Without the corresponding queue stopping, this patch alone causes no
throughput regression for a tap+vhost-net setup sending to a qemu VM:
3.857 Mpps to 3.891 Mpps.
Details: AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU
threads, XDP drop program active in VM, pktgen sender; Avg over
50 runs @ 100,000,000 packets. SRSO and spectre v2 mitigations disabled.
Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
drivers/net/tun.c | 23 +++++++++++++++++++++++
drivers/vhost/net.c | 21 +++++++++++++++------
include/linux/if_tun.h | 3 +++
3 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 00ecf128fe8e..fc358c4c355b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3776,6 +3776,29 @@ struct ptr_ring *tun_get_tx_ring(struct file *file)
}
EXPORT_SYMBOL_GPL(tun_get_tx_ring);
+/* Callers must hold ring.consumer_lock */
+void tun_wake_queue(struct file *file, int consumed)
+{
+ struct tun_file *tfile;
+ struct tun_struct *tun;
+
+ if (file->f_op != &tun_fops)
+ return;
+
+ tfile = file->private_data;
+ if (!tfile)
+ return;
+
+ rcu_read_lock();
+
+ tun = rcu_dereference(tfile->tun);
+ if (tun)
+ __tun_wake_queue(tun, tfile, consumed);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tun_wake_queue);
+
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 80965181920c..ee583d6cc0fa 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -176,13 +176,21 @@ static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
return ret;
}
-static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_produce(struct sock *sk,
+ struct vhost_net_virtqueue *nvq)
{
+ struct file *file = sk->sk_socket->file;
struct vhost_net_buf *rxq = &nvq->rxq;
rxq->head = 0;
- rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
- VHOST_NET_BATCH);
+ spin_lock(&nvq->rx_ring->consumer_lock);
+ rxq->tail = __ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
+ VHOST_NET_BATCH);
+
+ if (rxq->tail)
+ tun_wake_queue(file, rxq->tail);
+
+ spin_unlock(&nvq->rx_ring->consumer_lock);
return rxq->tail;
}
@@ -209,14 +217,15 @@ static int vhost_net_buf_peek_len(void *ptr)
return __skb_array_len_with_tag(ptr);
}
-static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
+static int vhost_net_buf_peek(struct sock *sk,
+ struct vhost_net_virtqueue *nvq)
{
struct vhost_net_buf *rxq = &nvq->rxq;
if (!vhost_net_buf_is_empty(rxq))
goto out;
- if (!vhost_net_buf_produce(nvq))
+ if (!vhost_net_buf_produce(sk, nvq))
return 0;
out:
@@ -995,7 +1004,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
unsigned long flags;
if (rvq->rx_ring)
- return vhost_net_buf_peek(rvq);
+ return vhost_net_buf_peek(sk, rvq);
spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
head = skb_peek(&sk->sk_receive_queue);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 80166eb62f41..5f3e206c7a73 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -22,6 +22,7 @@ struct tun_msg_ctl {
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
+void tun_wake_queue(struct file *file, int consumed);
static inline bool tun_is_xdp_frame(void *ptr)
{
@@ -55,6 +56,8 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
return ERR_PTR(-EINVAL);
}
+static inline void tun_wake_queue(struct file *f, int consumed) {}
+
static inline bool tun_is_xdp_frame(void *ptr)
{
return false;
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH net-next v10 3/4] ptr_ring: move free-space check into separate helper
2026-05-06 14:10 [PATCH net-next v10 0/4] tun/tap & vhost-net: apply qdisc backpressure on full ptr_ring to reduce TX drops Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 1/4] tun/tap: add ptr_ring consume helper with netdev queue wakeup Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 2/4] vhost-net: wake queue of tun/tap after ptr_ring consume Simon Schippers
@ 2026-05-06 14:10 ` Simon Schippers
2026-05-06 14:10 ` [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present Simon Schippers
3 siblings, 0 replies; 11+ messages in thread
From: Simon Schippers @ 2026-05-06 14:10 UTC (permalink / raw)
To: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, mst, eperezma, leiyang, stephen, jon, tim.gebauer,
simon.schippers, netdev, linux-kernel, kvm, virtualization
This patch moves the check for available free space for a new entry into
a separate function. As a result, __ptr_ring_produce() remains logically
unchanged, while the new helper allows callers to determine in advance
whether subsequent __ptr_ring_produce() calls will succeed. This
information can, for example, be used to temporarily stop producing until
__ptr_ring_produce_peek() indicates that space is available again.
Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
include/linux/ptr_ring.h | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d2c3629bbe45..0887284e5b43 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -96,6 +96,17 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
return ret;
}
+/* Note: callers invoking this in a loop must use a compiler barrier,
+ * for example cpu_relax(). Callers must hold producer_lock.
+ */
+static inline int __ptr_ring_produce_peek(struct ptr_ring *r)
+{
+ if (unlikely(!r->size) || data_race(r->queue[r->producer]))
+ return -ENOSPC;
+
+ return 0;
+}
+
/* Note: callers invoking this in a loop must use a compiler barrier,
* for example cpu_relax(). Callers must hold producer_lock.
* Callers are responsible for making sure pointer that is being queued
@@ -103,8 +114,10 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
*/
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
- if (unlikely(!r->size) || data_race(r->queue[r->producer]))
- return -ENOSPC;
+ int p = __ptr_ring_produce_peek(r);
+
+ if (p)
+ return p;
/* Make sure the pointer we are storing points to a valid data. */
/* Pairs with the dependency ordering in __ptr_ring_consume. */
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present
2026-05-06 14:10 [PATCH net-next v10 0/4] tun/tap & vhost-net: apply qdisc backpressure on full ptr_ring to reduce TX drops Simon Schippers
` (2 preceding siblings ...)
2026-05-06 14:10 ` [PATCH net-next v10 3/4] ptr_ring: move free-space check into separate helper Simon Schippers
@ 2026-05-06 14:10 ` Simon Schippers
2026-05-06 22:28 ` Michael S. Tsirkin
3 siblings, 1 reply; 11+ messages in thread
From: Simon Schippers @ 2026-05-06 14:10 UTC (permalink / raw)
To: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, mst, eperezma, leiyang, stephen, jon, tim.gebauer,
simon.schippers, netdev, linux-kernel, kvm, virtualization
This commit prevents tail-drop when a qdisc is present and the ptr_ring
becomes full. Once an entry is successfully produced and the ptr_ring
reaches capacity, the netdev queue is stopped instead of dropping
subsequent packets. If no qdisc is present, the previous tail-drop
behavior is preserved.
If producing an entry fails anyway due to a race, tun_net_xmit() drops
the packet. Such races are expected because LLTX is enabled and the
transmit path operates without the usual locking.
The __tun_wake_queue() function of the consumer races with the producer
for waking/stopping the netdev queue, which could result in a stalled
queue. Therefore, an smp_mb__after_atomic() is introduced that pairs
with the smp_mb() of the consumer. It follows the principle of store
buffering described in tools/memory-model/Documentation/recipes.txt:
- The producer in tun_net_xmit() first sets __QUEUE_STATE_DRV_XOFF,
followed by an smp_mb__after_atomic() (= smp_mb()), and then reads the
ring with __ptr_ring_produce_peek().
- The consumer in __tun_wake_queue() first writes zero to the ring in
__ptr_ring_consume(), followed by an smp_mb(), and then reads the queue
status with netif_tx_queue_stopped().
=> Following the aforementioned principle, it is impossible for the
producer to see a full ring (and therefore not wake the queue on the
re-check) while the consumer simultaneously fails to see a stopped
queue (and therefore also does not wake it).
Benchmarks:
The benchmarks show a slight regression in raw transmission performance
when using two sending threads. Packet loss also occurs only in the
two-thread sending case; no packet loss was observed with a single
sending thread.
Test setup:
AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU threads;
Average over 50 runs @ 100,000,000 packets. SRSO and spectre v2
mitigations disabled.
Note for tap+vhost-net:
XDP drop program active in VM -> ~2.5x faster; slower for tap due to
more syscalls (high utilization of entry_SYSRETQ_unsafe_stack in perf)
+--------------------------+--------------+----------------+----------+
| 1 thread | Stock | Patched with | diff |
| sending | | fq_codel qdisc | |
+------------+-------------+--------------+----------------+----------+
| TAP | Received | 1.132 Mpps | 1.133 Mpps | +0.1% |
| +-------------+--------------+----------------+----------+
| | Lost/s | 3.765 Mpps | 0 pps | |
+------------+-------------+--------------+----------------+----------+
| TAP | Received | 3.857 Mpps | 3.905 Mpps | +1.2% |
| +-------------+--------------+----------------+----------+
| +vhost-net | Lost/s | 0.802 Mpps | 0 pps | |
+------------+-------------+--------------+----------------+----------+
+--------------------------+--------------+----------------+----------+
| 2 threads | Stock | Patched with | diff |
| sending | | fq_codel qdisc | |
+------------+-------------+--------------+----------------+----------+
| TAP | Received | 1.115 Mpps | 1.092 Mpps | -2.1% |
| +-------------+--------------+----------------+----------+
| | Lost/s | 8.490 Mpps | 359 pps | |
+------------+-------------+--------------+----------------+----------+
| TAP | Received | 3.664 Mpps | 3.549 Mpps | -3.1% |
| +-------------+--------------+----------------+----------+
| +vhost-net | Lost/s | 5.330 Mpps | 832 pps | |
+------------+-------------+--------------+----------------+----------+
Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
---
drivers/net/tun.c | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index fc358c4c355b..d9ffbf88cfd8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1018,6 +1018,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
struct netdev_queue *queue;
struct tun_file *tfile;
int len = skb->len;
+ int ret;
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
@@ -1072,13 +1073,33 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset_ct(skb);
- if (ptr_ring_produce(&tfile->tx_ring, skb)) {
+ queue = netdev_get_tx_queue(dev, txq);
+
+ spin_lock(&tfile->tx_ring.producer_lock);
+ ret = __ptr_ring_produce(&tfile->tx_ring, skb);
+ if (!qdisc_txq_has_no_queue(queue) &&
+ (__ptr_ring_produce_peek(&tfile->tx_ring) || ret)) {
+ netif_tx_stop_queue(queue);
+ /* Paired with smp_mb() in __tun_wake_queue() */
+ smp_mb__after_atomic();
+ if (!__ptr_ring_produce_peek(&tfile->tx_ring))
+ netif_tx_wake_queue(queue);
+ }
+ spin_unlock(&tfile->tx_ring.producer_lock);
+
+ if (ret) {
+ /* This should be a rare case if a qdisc is present, but
+ * can happen due to lltx.
+ * Since skb_tx_timestamp(), skb_orphan(),
+ * run_ebpf_filter() and pskb_trim() could have tinkered
+ * with the SKB, returning NETDEV_TX_BUSY is unsafe and
+ * we must drop instead.
+ */
drop_reason = SKB_DROP_REASON_FULL_RING;
goto drop;
}
/* dev->lltx requires to do our own update of trans_start */
- queue = netdev_get_tx_queue(dev, txq);
txq_trans_cond_update(queue);
/* Notify and wake up reader process */
--
2.43.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present
2026-05-06 14:10 ` [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present Simon Schippers
@ 2026-05-06 22:28 ` Michael S. Tsirkin
2026-05-06 22:56 ` Michael S. Tsirkin
0 siblings, 1 reply; 11+ messages in thread
From: Michael S. Tsirkin @ 2026-05-06 22:28 UTC (permalink / raw)
To: Simon Schippers
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On Wed, May 06, 2026 at 04:10:33PM +0200, Simon Schippers wrote:
> This commit prevents tail-drop when a qdisc is present and the ptr_ring
> becomes full. Once an entry is successfully produced and the ptr_ring
> reaches capacity, the netdev queue is stopped instead of dropping
> subsequent packets. If no qdisc is present, the previous tail-drop
> behavior is preserved.
>
> If producing an entry fails anyways due to a race, tun_net_xmit() drops
> the packet. Such races are expected because LLTX is enabled and the
> transmit path operates without the usual locking.
>
> The __tun_wake_queue() function of the consumer races with the producer
> for waking/stopping the netdev queue, which could result in a stalled
> queue. Therefore, an smp_mb__after_atomic() is introduced that pairs
> with the smp_mb() of the consumer. It follows the principle of store
> buffering described in tools/memory-model/Documentation/recipes.txt:
>
> - The producer in tun_net_xmit() first sets __QUEUE_STATE_DRV_XOFF,
> followed by an smp_mb__after_atomic() (= smp_mb()), and then reads the
> ring with __ptr_ring_produce_peek().
>
> - The consumer in __tun_wake_queue() first writes zero to the ring in
> __ptr_ring_consume(), followed by an smp_mb(), and then reads the queue
> status with netif_tx_queue_stopped().
>
> => Following the aforementioned principle, it is impossible for the
> producer to see a full ring (and therefore not wake the queue on the
> re-check) while the consumer simultaneously fails to see a stopped
> queue (and therefore also does not wake it).
>
> Benchmarks:
> The benchmarks show a slight regression in raw transmission performance
> when using two sending threads. Packet loss also occurs only in the
> two-thread sending case; no packet loss was observed with a single
> sending thread.
>
> Test setup:
> AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU threads;
> Average over 50 runs @ 100,000,000 packets. SRSO and spectre v2
> mitigations disabled.
>
> Note for tap+vhost-net:
> XDP drop program active in VM -> ~2.5x faster; slower for tap due to
> more syscalls (high utilization of entry_SYSRETQ_unsafe_stack in perf)
>
> +--------------------------+--------------+----------------+----------+
> | 1 thread | Stock | Patched with | diff |
> | sending | | fq_codel qdisc | |
> +------------+-------------+--------------+----------------+----------+
> | TAP | Received | 1.132 Mpps | 1.133 Mpps | +0.1% |
> | +-------------+--------------+----------------+----------+
> | | Lost/s | 3.765 Mpps | 0 pps | |
> +------------+-------------+--------------+----------------+----------+
> | TAP | Received | 3.857 Mpps | 3.905 Mpps | +1.2% |
> | +-------------+--------------+----------------+----------+
> | +vhost-net | Lost/s | 0.802 Mpps | 0 pps | |
> +------------+-------------+--------------+----------------+----------+
>
> +--------------------------+--------------+----------------+----------+
> | 2 threads | Stock | Patched with | diff |
> | sending | | fq_codel qdisc | |
> +------------+-------------+--------------+----------------+----------+
> | TAP | Received | 1.115 Mpps | 1.092 Mpps | -2.1% |
> | +-------------+--------------+----------------+----------+
> | | Lost/s | 8.490 Mpps | 359 pps | |
> +------------+-------------+--------------+----------------+----------+
> | TAP | Received | 3.664 Mpps | 3.549 Mpps | -3.1% |
> | +-------------+--------------+----------------+----------+
> | +vhost-net | Lost/s | 5.330 Mpps | 832 pps | |
> +------------+-------------+--------------+----------------+----------+
>
> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> ---
> drivers/net/tun.c | 25 +++++++++++++++++++++++--
> 1 file changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index fc358c4c355b..d9ffbf88cfd8 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1018,6 +1018,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> struct netdev_queue *queue;
> struct tun_file *tfile;
> int len = skb->len;
> + int ret;
>
> rcu_read_lock();
> tfile = rcu_dereference(tun->tfiles[txq]);
> @@ -1072,13 +1073,33 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>
> nf_reset_ct(skb);
>
> - if (ptr_ring_produce(&tfile->tx_ring, skb)) {
> + queue = netdev_get_tx_queue(dev, txq);
> +
> + spin_lock(&tfile->tx_ring.producer_lock);
> + ret = __ptr_ring_produce(&tfile->tx_ring, skb);
> + if (!qdisc_txq_has_no_queue(queue) &&
> + (__ptr_ring_produce_peek(&tfile->tx_ring) || ret)) {
> + netif_tx_stop_queue(queue);
> + /* Paired with smp_mb() in __tun_wake_queue() */
> + smp_mb__after_atomic();
> + if (!__ptr_ring_produce_peek(&tfile->tx_ring))
> + netif_tx_wake_queue(queue);
> + }
> + spin_unlock(&tfile->tx_ring.producer_lock);
> +
There's a weird corner case here when tx_queue_len is 0
but a qdisc has been configured: it looks like the current
code just drops all packets, whereas with this change
the qdisc will get stuck permanently.
I suspect just checking tx_ring.size should fix it.
Or, if you feel adventurous, change the return code of __ptr_ring_produce
to distinguish between "no ring" and "no space".
> + if (ret) {
> + /* This should be a rare case if a qdisc is present, but
> + * can happen due to lltx.
> + * Since skb_tx_timestamp(), skb_orphan(),
> + * run_ebpf_filter() and pskb_trim() could have tinkered
> + * with the SKB, returning NETDEV_TX_BUSY is unsafe and
> + * we must drop instead.
> + */
> drop_reason = SKB_DROP_REASON_FULL_RING;
> goto drop;
> }
>
> /* dev->lltx requires to do our own update of trans_start */
> - queue = netdev_get_tx_queue(dev, txq);
> txq_trans_cond_update(queue);
>
> /* Notify and wake up reader process */
> --
> 2.43.0
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present
2026-05-06 22:28 ` Michael S. Tsirkin
@ 2026-05-06 22:56 ` Michael S. Tsirkin
2026-05-07 6:32 ` Simon Schippers
0 siblings, 1 reply; 11+ messages in thread
From: Michael S. Tsirkin @ 2026-05-06 22:56 UTC (permalink / raw)
To: Simon Schippers
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On Wed, May 06, 2026 at 06:28:06PM -0400, Michael S. Tsirkin wrote:
> On Wed, May 06, 2026 at 04:10:33PM +0200, Simon Schippers wrote:
> > This commit prevents tail-drop when a qdisc is present and the ptr_ring
> > becomes full. Once an entry is successfully produced and the ptr_ring
> > reaches capacity, the netdev queue is stopped instead of dropping
> > subsequent packets. If no qdisc is present, the previous tail-drop
> > behavior is preserved.
> >
> > If producing an entry fails anyways due to a race, tun_net_xmit() drops
> > the packet. Such races are expected because LLTX is enabled and the
> > transmit path operates without the usual locking.
> >
> > The __tun_wake_queue() function of the consumer races with the producer
> > for waking/stopping the netdev queue, which could result in a stalled
> > queue. Therefore, an smp_mb__after_atomic() is introduced that pairs
> > with the smp_mb() of the consumer. It follows the principle of store
> > buffering described in tools/memory-model/Documentation/recipes.txt:
> >
> > - The producer in tun_net_xmit() first sets __QUEUE_STATE_DRV_XOFF,
> > followed by an smp_mb__after_atomic() (= smp_mb()), and then reads the
> > ring with __ptr_ring_produce_peek().
> >
> > - The consumer in __tun_wake_queue() first writes zero to the ring in
> > __ptr_ring_consume(), followed by an smp_mb(), and then reads the queue
> > status with netif_tx_queue_stopped().
> >
> > => Following the aforementioned principle, it is impossible for the
> > producer to see a full ring (and therefore not wake the queue on the
> > re-check) while the consumer simultaneously fails to see a stopped
> > queue (and therefore also does not wake it).
> >
> > Benchmarks:
> > The benchmarks show a slight regression in raw transmission performance
> > when using two sending threads. Packet loss also occurs only in the
> > two-thread sending case; no packet loss was observed with a single
> > sending thread.
> >
> > Test setup:
> > AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU threads;
> > Average over 50 runs @ 100,000,000 packets. SRSO and spectre v2
> > mitigations disabled.
> >
> > Note for tap+vhost-net:
> > XDP drop program active in VM -> ~2.5x faster; slower for tap due to
> > more syscalls (high utilization of entry_SYSRETQ_unsafe_stack in perf)
> >
> > +--------------------------+--------------+----------------+----------+
> > | 1 thread | Stock | Patched with | diff |
> > | sending | | fq_codel qdisc | |
> > +------------+-------------+--------------+----------------+----------+
> > | TAP | Received | 1.132 Mpps | 1.133 Mpps | +0.1% |
> > | +-------------+--------------+----------------+----------+
> > | | Lost/s | 3.765 Mpps | 0 pps | |
> > +------------+-------------+--------------+----------------+----------+
> > | TAP | Received | 3.857 Mpps | 3.905 Mpps | +1.2% |
> > | +-------------+--------------+----------------+----------+
> > | +vhost-net | Lost/s | 0.802 Mpps | 0 pps | |
> > +------------+-------------+--------------+----------------+----------+
> >
> > +--------------------------+--------------+----------------+----------+
> > | 2 threads | Stock | Patched with | diff |
> > | sending | | fq_codel qdisc | |
> > +------------+-------------+--------------+----------------+----------+
> > | TAP | Received | 1.115 Mpps | 1.092 Mpps | -2.1% |
> > | +-------------+--------------+----------------+----------+
> > | | Lost/s | 8.490 Mpps | 359 pps | |
> > +------------+-------------+--------------+----------------+----------+
> > | TAP | Received | 3.664 Mpps | 3.549 Mpps | -3.1% |
> > | +-------------+--------------+----------------+----------+
> > | +vhost-net | Lost/s | 5.330 Mpps | 832 pps | |
> > +------------+-------------+--------------+----------------+----------+
> >
> > Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> > Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
> > Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> > ---
> > drivers/net/tun.c | 25 +++++++++++++++++++++++--
> > 1 file changed, 23 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index fc358c4c355b..d9ffbf88cfd8 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -1018,6 +1018,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> > struct netdev_queue *queue;
> > struct tun_file *tfile;
> > int len = skb->len;
> > + int ret;
> >
> > rcu_read_lock();
> > tfile = rcu_dereference(tun->tfiles[txq]);
> > @@ -1072,13 +1073,33 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >
> > nf_reset_ct(skb);
> >
> > - if (ptr_ring_produce(&tfile->tx_ring, skb)) {
> > + queue = netdev_get_tx_queue(dev, txq);
> > +
> > + spin_lock(&tfile->tx_ring.producer_lock);
> > + ret = __ptr_ring_produce(&tfile->tx_ring, skb);
> > + if (!qdisc_txq_has_no_queue(queue) &&
> > + (__ptr_ring_produce_peek(&tfile->tx_ring) || ret)) {
> > + netif_tx_stop_queue(queue);
> > + /* Paired with smp_mb() in __tun_wake_queue() */
> > + smp_mb__after_atomic();
> > + if (!__ptr_ring_produce_peek(&tfile->tx_ring))
> > + netif_tx_wake_queue(queue);
> > + }
> > + spin_unlock(&tfile->tx_ring.producer_lock);
> > +
>
> There's a weird corner case here when tx_queue_len is 0
> but a qdisc has been configured - it looks like that
> currently it just drops all packets, with this change,
> the qdisc will get stuck permanently.
>
> I suspect just checking tx_ring.size should fix it.
> Or if you feel adventurous, change return code for __ptr_ring_produce
> to distinguish between "no ring" and "no space".
I meant __ptr_ring_produce_peek, really.
>
> > + if (ret) {
> > + /* This should be a rare case if a qdisc is present, but
> > + * can happen due to lltx.
> > + * Since skb_tx_timestamp(), skb_orphan(),
> > + * run_ebpf_filter() and pskb_trim() could have tinkered
> > + * with the SKB, returning NETDEV_TX_BUSY is unsafe and
> > + * we must drop instead.
> > + */
> > drop_reason = SKB_DROP_REASON_FULL_RING;
> > goto drop;
> > }
> >
> > /* dev->lltx requires to do our own update of trans_start */
> > - queue = netdev_get_tx_queue(dev, txq);
> > txq_trans_cond_update(queue);
> >
> > /* Notify and wake up reader process */
> > --
> > 2.43.0
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present
2026-05-06 22:56 ` Michael S. Tsirkin
@ 2026-05-07 6:32 ` Simon Schippers
2026-05-07 15:19 ` Simon Schippers
0 siblings, 1 reply; 11+ messages in thread
From: Simon Schippers @ 2026-05-07 6:32 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On 5/7/26 00:56, Michael S. Tsirkin wrote:
> On Wed, May 06, 2026 at 06:28:06PM -0400, Michael S. Tsirkin wrote:
>> On Wed, May 06, 2026 at 04:10:33PM +0200, Simon Schippers wrote:
>>> This commit prevents tail-drop when a qdisc is present and the ptr_ring
>>> becomes full. Once an entry is successfully produced and the ptr_ring
>>> reaches capacity, the netdev queue is stopped instead of dropping
>>> subsequent packets. If no qdisc is present, the previous tail-drop
>>> behavior is preserved.
>>>
>>> If producing an entry fails anyways due to a race, tun_net_xmit() drops
>>> the packet. Such races are expected because LLTX is enabled and the
>>> transmit path operates without the usual locking.
>>>
>>> The __tun_wake_queue() function of the consumer races with the producer
>>> for waking/stopping the netdev queue, which could result in a stalled
>>> queue. Therefore, an smp_mb__after_atomic() is introduced that pairs
>>> with the smp_mb() of the consumer. It follows the principle of store
>>> buffering described in tools/memory-model/Documentation/recipes.txt:
>>>
>>> - The producer in tun_net_xmit() first sets __QUEUE_STATE_DRV_XOFF,
>>> followed by an smp_mb__after_atomic() (= smp_mb()), and then reads the
>>> ring with __ptr_ring_produce_peek().
>>>
>>> - The consumer in __tun_wake_queue() first writes zero to the ring in
>>> __ptr_ring_consume(), followed by an smp_mb(), and then reads the queue
>>> status with netif_tx_queue_stopped().
>>>
>>> => Following the aforementioned principle, it is impossible for the
>>> producer to see a full ring (and therefore not wake the queue on the
>>> re-check) while the consumer simultaneously fails to see a stopped
>>> queue (and therefore also does not wake it).
>>>
>>> Benchmarks:
>>> The benchmarks show a slight regression in raw transmission performance
>>> when using two sending threads. Packet loss also occurs only in the
>>> two-thread sending case; no packet loss was observed with a single
>>> sending thread.
>>>
>>> Test setup:
>>> AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU threads;
>>> Average over 50 runs @ 100,000,000 packets. SRSO and spectre v2
>>> mitigations disabled.
>>>
>>> Note for tap+vhost-net:
>>> XDP drop program active in VM -> ~2.5x faster; slower for tap due to
>>> more syscalls (high utilization of entry_SYSRETQ_unsafe_stack in perf)
>>>
>>> +--------------------------+--------------+----------------+----------+
>>> | 1 thread | Stock | Patched with | diff |
>>> | sending | | fq_codel qdisc | |
>>> +------------+-------------+--------------+----------------+----------+
>>> | TAP | Received | 1.132 Mpps | 1.133 Mpps | +0.1% |
>>> | +-------------+--------------+----------------+----------+
>>> | | Lost/s | 3.765 Mpps | 0 pps | |
>>> +------------+-------------+--------------+----------------+----------+
>>> | TAP | Received | 3.857 Mpps | 3.905 Mpps | +1.2% |
>>> | +-------------+--------------+----------------+----------+
>>> | +vhost-net | Lost/s | 0.802 Mpps | 0 pps | |
>>> +------------+-------------+--------------+----------------+----------+
>>>
>>> +--------------------------+--------------+----------------+----------+
>>> | 2 threads | Stock | Patched with | diff |
>>> | sending | | fq_codel qdisc | |
>>> +------------+-------------+--------------+----------------+----------+
>>> | TAP | Received | 1.115 Mpps | 1.092 Mpps | -2.1% |
>>> | +-------------+--------------+----------------+----------+
>>> | | Lost/s | 8.490 Mpps | 359 pps | |
>>> +------------+-------------+--------------+----------------+----------+
>>> | TAP | Received | 3.664 Mpps | 3.549 Mpps | -3.1% |
>>> | +-------------+--------------+----------------+----------+
>>> | +vhost-net | Lost/s | 5.330 Mpps | 832 pps | |
>>> +------------+-------------+--------------+----------------+----------+
>>>
>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>> ---
>>> drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>> 1 file changed, 23 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index fc358c4c355b..d9ffbf88cfd8 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -1018,6 +1018,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>> struct netdev_queue *queue;
>>> struct tun_file *tfile;
>>> int len = skb->len;
>>> + int ret;
>>>
>>> rcu_read_lock();
>>> tfile = rcu_dereference(tun->tfiles[txq]);
>>> @@ -1072,13 +1073,33 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>
>>> nf_reset_ct(skb);
>>>
>>> - if (ptr_ring_produce(&tfile->tx_ring, skb)) {
>>> + queue = netdev_get_tx_queue(dev, txq);
>>> +
>>> + spin_lock(&tfile->tx_ring.producer_lock);
>>> + ret = __ptr_ring_produce(&tfile->tx_ring, skb);
>>> + if (!qdisc_txq_has_no_queue(queue) &&
>>> + (__ptr_ring_produce_peek(&tfile->tx_ring) || ret)) {
>>> + netif_tx_stop_queue(queue);
>>> + /* Paired with smp_mb() in __tun_wake_queue() */
>>> + smp_mb__after_atomic();
>>> + if (!__ptr_ring_produce_peek(&tfile->tx_ring))
>>> + netif_tx_wake_queue(queue);
>>> + }
>>> + spin_unlock(&tfile->tx_ring.producer_lock);
>>> +
>>
>> There's a weird corner case here when tx_queue_len is 0
>> but a qdisc has been configured - it looks like that
>> currently it just drops all packets, with this change,
>> the qdisc will get stuck permanently.
>>
>> I suspect just checking tx_ring.size should fix it.
>> Or if you feel adventurous, change return code for __ptr_ring_produce
>> to distinguish between "no ring" and "no space".
>
>
> __ptr_ring_produce_peek really.
>
Yes, I like the approach of returning this from
__ptr_ring_produce_peek(). Then I will do a switch on the return value
in tun_net_xmit().
Additionally, I should wake up in tun_queue_resize() after calling
ptr_ring_resize_multiple_bh(). For a new dev->tx_queue_len > 0, it
should be fine without waking, but for 0 it is not.
>
>>
>>> + if (ret) {
>>> + /* This should be a rare case if a qdisc is present, but
>>> + * can happen due to lltx.
>>> + * Since skb_tx_timestamp(), skb_orphan(),
>>> + * run_ebpf_filter() and pskb_trim() could have tinkered
>>> + * with the SKB, returning NETDEV_TX_BUSY is unsafe and
>>> + * we must drop instead.
>>> + */
>>> drop_reason = SKB_DROP_REASON_FULL_RING;
>>> goto drop;
>>> }
>>>
>>> /* dev->lltx requires to do our own update of trans_start */
>>> - queue = netdev_get_tx_queue(dev, txq);
>>> txq_trans_cond_update(queue);
>>>
>>> /* Notify and wake up reader process */
>>> --
>>> 2.43.0
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH net-next v10 4/4] tun/tap & vhost-net: avoid ptr_ring tail-drop when a qdisc is present
2026-05-07 6:32 ` Simon Schippers
@ 2026-05-07 15:19 ` Simon Schippers
0 siblings, 0 replies; 11+ messages in thread
From: Simon Schippers @ 2026-05-07 15:19 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: willemdebruijn.kernel, jasowang, andrew+netdev, davem, edumazet,
kuba, pabeni, eperezma, leiyang, stephen, jon, tim.gebauer,
netdev, linux-kernel, kvm, virtualization
On 5/7/26 08:32, Simon Schippers wrote:
> On 5/7/26 00:56, Michael S. Tsirkin wrote:
>> On Wed, May 06, 2026 at 06:28:06PM -0400, Michael S. Tsirkin wrote:
>>> On Wed, May 06, 2026 at 04:10:33PM +0200, Simon Schippers wrote:
>>>> This commit prevents tail-drop when a qdisc is present and the ptr_ring
>>>> becomes full. Once an entry is successfully produced and the ptr_ring
>>>> reaches capacity, the netdev queue is stopped instead of dropping
>>>> subsequent packets. If no qdisc is present, the previous tail-drop
>>>> behavior is preserved.
>>>>
>>>> If producing an entry fails anyways due to a race, tun_net_xmit() drops
>>>> the packet. Such races are expected because LLTX is enabled and the
>>>> transmit path operates without the usual locking.
>>>>
>>>> The __tun_wake_queue() function of the consumer races with the producer
>>>> for waking/stopping the netdev queue, which could result in a stalled
>>>> queue. Therefore, an smp_mb__after_atomic() is introduced that pairs
>>>> with the smp_mb() of the consumer. It follows the principle of store
>>>> buffering described in tools/memory-model/Documentation/recipes.txt:
>>>>
>>>> - The producer in tun_net_xmit() first sets __QUEUE_STATE_DRV_XOFF,
>>>> followed by an smp_mb__after_atomic() (= smp_mb()), and then reads the
>>>> ring with __ptr_ring_produce_peek().
>>>>
>>>> - The consumer in __tun_wake_queue() first writes zero to the ring in
>>>> __ptr_ring_consume(), followed by an smp_mb(), and then reads the queue
>>>> status with netif_tx_queue_stopped().
>>>>
>>>> => Following the aforementioned principle, it is impossible for the
>>>> producer to see a full ring (and therefore not wake the queue on the
>>>> re-check) while the consumer simultaneously fails to see a stopped
>>>> queue (and therefore also does not wake it).
>>>>
>>>> Benchmarks:
>>>> The benchmarks show a slight regression in raw transmission performance
>>>> when using two sending threads. Packet loss also occurs only in the
>>>> two-thread sending case; no packet loss was observed with a single
>>>> sending thread.
>>>>
>>>> Test setup:
>>>> AMD Ryzen 5 5600X at 4.3 GHz, 3200 MHz RAM, isolated QEMU threads;
>>>> Average over 50 runs @ 100,000,000 packets. SRSO and spectre v2
>>>> mitigations disabled.
>>>>
>>>> Note for tap+vhost-net:
>>>> XDP drop program active in VM -> ~2.5x faster; slower for tap due to
>>>> more syscalls (high utilization of entry_SYSRETQ_unsafe_stack in perf)
>>>>
>>>> +--------------------------+--------------+----------------+----------+
>>>> | 1 thread | Stock | Patched with | diff |
>>>> | sending | | fq_codel qdisc | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>> | TAP | Received | 1.132 Mpps | 1.133 Mpps | +0.1% |
>>>> | +-------------+--------------+----------------+----------+
>>>> | | Lost/s | 3.765 Mpps | 0 pps | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>> | TAP | Received | 3.857 Mpps | 3.905 Mpps | +1.2% |
>>>> | +-------------+--------------+----------------+----------+
>>>> | +vhost-net | Lost/s | 0.802 Mpps | 0 pps | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>>
>>>> +--------------------------+--------------+----------------+----------+
>>>> | 2 threads | Stock | Patched with | diff |
>>>> | sending | | fq_codel qdisc | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>> | TAP | Received | 1.115 Mpps | 1.092 Mpps | -2.1% |
>>>> | +-------------+--------------+----------------+----------+
>>>> | | Lost/s | 8.490 Mpps | 359 pps | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>> | TAP | Received | 3.664 Mpps | 3.549 Mpps | -3.1% |
>>>> | +-------------+--------------+----------------+----------+
>>>> | +vhost-net | Lost/s | 5.330 Mpps | 832 pps | |
>>>> +------------+-------------+--------------+----------------+----------+
>>>>
>>>> Co-developed-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>> Signed-off-by: Tim Gebauer <tim.gebauer@tu-dortmund.de>
>>>> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
>>>> ---
>>>> drivers/net/tun.c | 25 +++++++++++++++++++++++--
>>>> 1 file changed, 23 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index fc358c4c355b..d9ffbf88cfd8 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -1018,6 +1018,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>> struct netdev_queue *queue;
>>>> struct tun_file *tfile;
>>>> int len = skb->len;
>>>> + int ret;
>>>>
>>>> rcu_read_lock();
>>>> tfile = rcu_dereference(tun->tfiles[txq]);
>>>> @@ -1072,13 +1073,33 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>
>>>> nf_reset_ct(skb);
>>>>
>>>> - if (ptr_ring_produce(&tfile->tx_ring, skb)) {
>>>> + queue = netdev_get_tx_queue(dev, txq);
>>>> +
>>>> + spin_lock(&tfile->tx_ring.producer_lock);
>>>> + ret = __ptr_ring_produce(&tfile->tx_ring, skb);
>>>> + if (!qdisc_txq_has_no_queue(queue) &&
>>>> + (__ptr_ring_produce_peek(&tfile->tx_ring) || ret)) {
>>>> + netif_tx_stop_queue(queue);
>>>> + /* Paired with smp_mb() in __tun_wake_queue() */
>>>> + smp_mb__after_atomic();
>>>> + if (!__ptr_ring_produce_peek(&tfile->tx_ring))
>>>> + netif_tx_wake_queue(queue);
>>>> + }
>>>> + spin_unlock(&tfile->tx_ring.producer_lock);
>>>> +
>>>
>>> There's a weird corner case here when tx_queue_len is 0
>>> but a qdisc has been configured - it looks like that
>>> currently it just drops all packets, with this change,
>>> the qdisc will get stuck permanently.
>>>
>>> I suspect just checking tx_ring.size should fix it.
>>> Or if you feel adventurous, change return code for __ptr_ring_produce
>>> to distinguish between "no ring" and "no space".
>>
>>
>> __ptr_ring_produce_peek really.
>>
>
> Yes, I like the approach of returning this from
> __ptr_ring_produce_peek(). Then I will do a switch on the return value
> in tun_net_xmit().
Sashiko reports the same :)
So for v11 I will:
- Change __ptr_ring_produce_peek() to return -ENOSPC / -EINVAL? (for a
0-sized ring) / 0
- Take the ring.consumer_lock in __tun_detach() to avoid a race with the
consumer (Sashiko).
>
> Additionally, I should wake up in tun_queue_resize() after calling
> ptr_ring_resize_multiple_bh(). For a new dev->tx_queue_len > 0, it
> should be fine without waking, but for 0 it is not.
>
>>
>>>
>>>> + if (ret) {
>>>> + /* This should be a rare case if a qdisc is present, but
>>>> + * can happen due to lltx.
>>>> + * Since skb_tx_timestamp(), skb_orphan(),
>>>> + * run_ebpf_filter() and pskb_trim() could have tinkered
>>>> + * with the SKB, returning NETDEV_TX_BUSY is unsafe and
>>>> + * we must drop instead.
>>>> + */
>>>> drop_reason = SKB_DROP_REASON_FULL_RING;
>>>> goto drop;
>>>> }
>>>>
>>>> /* dev->lltx requires to do our own update of trans_start */
>>>> - queue = netdev_get_tx_queue(dev, txq);
>>>> txq_trans_cond_update(queue);
>>>>
>>>> /* Notify and wake up reader process */
>>>> --
>>>> 2.43.0
>>
^ permalink raw reply [flat|nested] 11+ messages in thread